---
title: "05.3-Apul-lncRNA-discovery - with splice"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%d %B, %Y')`"  
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)
library(kableExtra)
library(DT)
library(Biostrings)
library(tm)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center", # Align plots to the center
  comment = ""         # Prevents appending '##' to beginning of lines in code output
)
```


# Run HiSat on RNA-seq

Will end up with 5 sorted bam files. 

## Grab Trimmed RNA-seq Reads

```{r, engine='bash'}

wget -r \
--no-directories --no-parent \
-P ../data/fastq/ \
-A "*fastq.gz" https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/A_pulchra/trimmed/

```
## Genome

```{r, engine='bash'}
cd ../data

curl -O http://gannet.fish.washington.edu/seashell/snaps/GCF_013753865.1_Amil_v2.1_genomic.fna
```


## HiSat

```{r, engine='bash'}
/home/shared/hisat2-2.2.1/hisat2_extract_exons.py \
../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gtf \
> ../output/05.32-lncRNA-discovery/m_exon.tab
```

```{r, engine='bash', eval=TRUE}
head ../output/05.32-lncRNA-discovery/m_exon.tab
wc -l ../output/05.32-lncRNA-discovery/m_exon.tab
```

```{r, engine='bash'}

/home/shared/hisat2-2.2.1/hisat2_extract_splice_sites.py \
../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gtf \
> ../output/05.32-lncRNA-discovery/m_splice_sites.tab

```


```{r, engine='bash', eval=TRUE}
head ../output/05.32-lncRNA-discovery/m_splice_sites.tab
```

```{r, engine='bash'}
/home/shared/hisat2-2.2.1/hisat2-build \
../data/GCF_013753865.1_Amil_v2.1_genomic.fna \
../output/05.32-lncRNA-discovery/GCF_013753865.1_Amil_v2.1.index \
--exon ../output/05.32-lncRNA-discovery/m_exon.tab \
--ss ../output/05.32-lncRNA-discovery/m_splice_sites.tab \
-p 24 \
../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gtf \
2> ../output/05.32-lncRNA-discovery/hisat2-build_stats.txt
```

```{r, engine='bash', eval=TRUE}
cat ../output/05.32-lncRNA-discovery/hisat2-build_stats.txt
```


```{r, engine='bash', eval=TRUE}
find ../data/fastq/*gz
```

```{r, engine='bash', eval=TRUE}
find ../data/fastq/*R2_001.fastp-trim.20230519.fastq.gz \
| xargs basename -s -S1-TP2_R2_001.fastp-trim.20230519.fastq.gz | xargs -I{} \
echo {}
```

```{r, engine='bash'}
find ../data/fastq/*R2_001.fastp-trim.20230519.fastq.gz \
| xargs basename -s -S1-TP2_R2_001.fastp-trim.20230519.fastq.gz | xargs -I{} \
/home/shared/hisat2-2.2.1/hisat2 \
-x ../output/05.32-lncRNA-discovery/GCF_013753865.1_Amil_v2.1.index \
-p 48 \
-1 ../data/fastq/{}-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz \
-2 ../data/fastq/{}-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz \
-S ../output/05.32-lncRNA-discovery/{}.sam \
2> ../output/05.32-lncRNA-discovery/hisat.out
```

```{r, engine='bash', eval=TRUE}
cat ../output/05.32-lncRNA-discovery/hisat.out
```

## convert to bams

```{r, engine='bash'}
for samfile in ../output/05.32-lncRNA-discovery/*.sam; do
  bamfile="${samfile%.sam}.bam"
  sorted_bamfile="${samfile%.sam}.sorted.bam"
  
  # Convert SAM to BAM
  /home/shared/samtools-1.12/samtools view -bS -@ 20 "$samfile" > "$bamfile"
  
  # Sort BAM
  /home/shared/samtools-1.12/samtools sort -@ 20 "$bamfile" -o "$sorted_bamfile"
  
  # Index sorted BAM
  /home/shared/samtools-1.12/samtools index -@ 20 "$sorted_bamfile"
done
```


## Looking at Bams 

![igv](https//gannet.fish.washington.edu/seashell/snaps/Monosnap_IGV_2023-11-05_09-57-16.png)


```{r, engine='bash', eval=TRUE}
ls ../output/05.32-lncRNA-discovery/*xml
```

# StringTie
StringTie uses the sorted BAM files to assemble transcripts for each sample, outputting them as GTF (Gene Transfer Format) files. And then merges all individual GTF assemblies into a single merged GTF file. This step extracts transcript information and merges GTFs from all samples--an important step in creating a canonical list of lncRNAs across all samples included in the pipeline.


```{r, engine='bash'}
find ../output/05.32-lncRNA-discovery/*sorted.bam \
| xargs basename -s .sorted.bam | xargs -I{} \
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \
-p 16 \
-G ../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gff \
-o ../output/05.32-lncRNA-discovery/{}.gtf \
../output/05.32-lncRNA-discovery/{}.sorted.bam
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/05.32-lncRNA-discovery/RNA*.gtf
ls ../output/05.32-lncRNA-discovery/RNA*.gtf
#head ../output/05.32-lncRNA-discovery/RNA*.gtf
```

![gtf](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_IGV_-_Session_Userssr320DesktopApul_lncRNA_igv_session.xml_2023-11-05_10-11-31.png)


![igv2](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_IGV_-_Session_Userssr320DesktopApul_lncRNA_igv_session.xml_2023-11-05_12-33-58.png)

```{r, engine='bash', eval=TRUE}
ls ../output/05.32-lncRNA-discovery/*xml
```


Merges all individual GTF assemblies into a single merged GTF file.

This is used to create a non-redundant set of transcripts after running StringTie separately on multiple RNA-Seq datasets.

```{r, engine='bash'}
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \
--merge \
-G ../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gff \
-o ../output/05.32-lncRNA-discovery/stringtie_merged.gtf \
../output/05.32-lncRNA-discovery/*.gtf
```

```{bash}
tail ../output/05.32-lncRNA-discovery/stringtie_merged.gtf
```


```{r, engine='bash', eval=TRUE}

wc -l ../output/05.32-lncRNA-discovery/stringtie_merged.gtf
head ../output/05.32-lncRNA-discovery/stringtie_merged.gtf


echo "what is possible"

grep -v '^#' ../output/05.32-lncRNA-discovery/stringtie_merged.gtf | cut -f3 | sort | uniq


```

# GFFcompare
https://ccb.jhu.edu/software/stringtie/gffcompare.shtml

```{r, engine='bash'}
/home/shared/gffcompare-0.12.6.Linux_x86_64/gffcompare \
-r ../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gff \
-o ../output/05.32-lncRNA-discovery/gffcompare_merged \
../output/05.32-lncRNA-discovery/stringtie_merged.gtf
```


```{r, engine='bash', eval=TRUE}
ls ../output/05.32-lncRNA-discovery/gffcompare_merged*
```


```{r, engine='bash', eval=TRUE}
head -10 ../output/05.32-lncRNA-discovery/gffcompare_merged.annotated.gtf
```

![gff](http://gannet.fish.washington.edu/seashell/snaps/2023-11-03_09-25-24.png)


# Filter
Filters the combined GTF output from GFFcompare to select only the lines representing "transcripts" and excluding lines starting with "#" (these are lines in the output format from GFFcompare that don't contain transcript information). This step further filters for a class code of "u", and keep only those with lengths greater than 199 bases. The "u' class code from the GFFcompare step is for "unknown" transcripts, that is those that are not previously annotated in our reference GFF as protein coding. The size filter of +200nt is a common filtering step for isolating lncRNAs.

```{r, engine='bash'}
awk '$3 == "transcript" && $1 !~ /^#/' \
../output/05.32-lncRNA-discovery/gffcompare_merged.annotated.gtf | grep 'class_code "u"\|class_code "x"|\class_code "i"\|class_code "y"' | awk '($5 - $4 > 199) || ($4 - $5 > 199)' > ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.gtf
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.gtf
head ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.gtf
```

# Bedtools
Extracts the sequence data from the `$FASTA` reference file based on the coordinates from the filtered GTF. The resulting sequences represent potential lncRNA candidates.

```{r, engine='bash'}
/home/shared/bedtools2/bin/fastaFromBed \
-fi ../data/GCF_013753865.1_Amil_v2.1_genomic.fna \
-bed ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.gtf \
-fo ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta \
-name -split
```

```{r, engine='bash'}
fgrep -c ">" ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta
head ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta
```


# CPC2
Initializes a conda environment (Anaconda) and runs CPC2, a software to predict whether a transcript is coding or non-coding. The results are saved to the $OUTPUT_DIR. CPC2 uses ORF (Open Reading Frame) Analysis, Isometric Feature Mapping (Isomap), Sequence Homology, RNA Sequence Features, and Quality of Translation to assess coding potential and flag any transcripts we would want to exclude using the FASTA generated in the previous step.

```{r, engine='bash'}
eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)"
python /home/shared/CPC2_standalone-1.0.1/bin/CPC2.py \
-i ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta \
-o ../output/05.32-lncRNA-discovery/Apul_CPC2
```


```{r, engine='bash', eval=TRUE}
wc -l head ../output/05.32-lncRNA-discovery/Apul_CPC2.txt
head ../output/05.32-lncRNA-discovery/Apul_CPC2.txt
```

#Filter
Filters the CPC2 results to get only noncoding transcripts (using the class "noncoding" from the CPC2 results) and extracts their IDs and matches these IDs with the sequences from the previous step to generate a GTF of long noncoding transcripts.

Matches these IDs with the sequences from the previous step to generate a GTF of noncoding transcripts.

```{r, engine='bash'}
awk '$8 == "noncoding" {print $1}' ../output/05.32-lncRNA-discovery/Apul_CPC2.txt > ../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt
```

```{r, engine='bash'}
wc -l ../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt
head ../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt
```


```{r, engine='bash', eval=TRUE}
head ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta
fgrep -c ">" ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta
```


17098 original number

# subsetting fasta
```{r, engine='bash'}
/home/shared/samtools-1.12/samtools faidx ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta \
-r ../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt > ../output/05.32-lncRNA-discovery/Apul_lncRNA.fasta

```


```{r, engine='bash', eval=TRUE}
fgrep -c ">" ../output/05.32-lncRNA-discovery/Apul_lncRNA.fasta
fgrep ">" ../output/05.32-lncRNA-discovery/Apul_lncRNA.fasta | head -5

head ../output/05.32-lncRNA-discovery/Apul_lncRNA.fasta
```

# Getting genome feature track

```{r, engine='python'}

# Open the input file and the output file
with open('../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt', 'r') as infile, open('../output/05.32-lncRNA-discovery/Apul_lncRNA.bed', 'w') as outfile:
    # Process each line in the input file
    for line in infile:
        # Remove 'transcript::' and then split the line by ':' to extract the relevant parts
        parts = line.strip().replace('transcript::', '').split(':')
        chromosome = parts[0]
        # Split the position part by '-' to get start and end positions
        start, end = parts[1].split('-')
        
        # BED format requires the start position to be 0-based
        # Convert the start position to 0-based by subtracting 1
        start = str(int(start) - 1)
        
        # Write the chromosome, start, and end positions to the output file
        # Separate each field with a tab character
        outfile.write(f'{chromosome}\t{start}\t{end}\n')

# After running this script, 'output.bed' will contain the converted data in BED format.

```

```{bash}
head ../output/05.32-lncRNA-discovery/Apul_lncRNA.bed 
```