--- title: "05.3-Apul-lncRNA-discovery - with splice" author: "Steven Roberts" date: "`r format(Sys.time(), '%d %B, %Y')`" output: github_document: toc: true toc_depth: 3 number_sections: true html_preview: true html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DT) library(Biostrings) library(tm) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center", # Align plots to the center comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` # Run HiSat on RNA-seq Will end up with 5 sorted bam files. ## Grab Trimmed RNA-seq Reads ```{r, engine='bash'} wget -r \ --no-directories --no-parent \ -P ../data/fastq/ \ -A "*fastq.gz" https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/A_pulchra/trimmed/ ``` ## Genome ```{r, engine='bash'} cd ../data curl -O http://gannet.fish.washington.edu/seashell/snaps/GCF_013753865.1_Amil_v2.1_genomic.fna ``` ## HiSat ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2_extract_exons.py \ ../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gtf \ > ../output/05.32-lncRNA-discovery/m_exon.tab ``` ```{r, engine='bash', eval=TRUE} head ../output/05.32-lncRNA-discovery/m_exon.tab wc -l ../output/05.32-lncRNA-discovery/m_exon.tab ``` ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2_extract_splice_sites.py \ ../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gtf \ > ../output/05.32-lncRNA-discovery/m_splice_sites.tab ``` ```{r, engine='bash', eval=TRUE} head ../output/05.32-lncRNA-discovery/m_splice_sites.tab ``` ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2-build \ ../data/GCF_013753865.1_Amil_v2.1_genomic.fna \ ../output/05.32-lncRNA-discovery/GCF_013753865.1_Amil_v2.1.index \ --exon ../output/05.32-lncRNA-discovery/m_exon.tab \ --ss ../output/05.32-lncRNA-discovery/m_splice_sites.tab \ -p 24 \ ../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gtf \ 2> ../output/05.32-lncRNA-discovery/hisat2-build_stats.txt ``` ```{r, engine='bash', eval=TRUE} cat ../output/05.32-lncRNA-discovery/hisat2-build_stats.txt ``` ```{r, engine='bash', eval=TRUE} find ../data/fastq/*gz ``` ```{r, engine='bash', eval=TRUE} find ../data/fastq/*R2_001.fastp-trim.20230519.fastq.gz \ | xargs basename -s -S1-TP2_R2_001.fastp-trim.20230519.fastq.gz | xargs -I{} \ echo {} ``` ```{r, engine='bash'} find ../data/fastq/*R2_001.fastp-trim.20230519.fastq.gz \ | xargs basename -s -S1-TP2_R2_001.fastp-trim.20230519.fastq.gz | xargs -I{} \ /home/shared/hisat2-2.2.1/hisat2 \ -x ../output/05.32-lncRNA-discovery/GCF_013753865.1_Amil_v2.1.index \ -p 48 \ -1 ../data/fastq/{}-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz \ -2 ../data/fastq/{}-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz \ -S ../output/05.32-lncRNA-discovery/{}.sam \ 2> ../output/05.32-lncRNA-discovery/hisat.out ``` ```{r, engine='bash', eval=TRUE} cat ../output/05.32-lncRNA-discovery/hisat.out ``` ## convert to bams ```{r, engine='bash'} for samfile in ../output/05.32-lncRNA-discovery/*.sam; do bamfile="${samfile%.sam}.bam" sorted_bamfile="${samfile%.sam}.sorted.bam" # Convert SAM to BAM /home/shared/samtools-1.12/samtools view -bS -@ 20 "$samfile" > "$bamfile" # Sort BAM 
## Looking at BAMs

![igv](https://gannet.fish.washington.edu/seashell/snaps/Monosnap_IGV_2023-11-05_09-57-16.png)

```{r, engine='bash', eval=TRUE}
ls ../output/05.32-lncRNA-discovery/*xml
```

# StringTie

StringTie uses the sorted BAM files to assemble transcripts for each sample, outputting them as GTF (Gene Transfer Format) files. It then merges all individual GTF assemblies into a single merged GTF file. This step extracts transcript information and merges GTFs from all samples, an important step in creating a canonical list of lncRNAs across all samples included in the pipeline.

```{r, engine='bash'}
find ../output/05.32-lncRNA-discovery/*sorted.bam \
| xargs basename -s .sorted.bam | xargs -I{} \
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \
-p 16 \
-G ../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gff \
-o ../output/05.32-lncRNA-discovery/{}.gtf \
../output/05.32-lncRNA-discovery/{}.sorted.bam
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/05.32-lncRNA-discovery/RNA*.gtf
ls ../output/05.32-lncRNA-discovery/RNA*.gtf
#head ../output/05.32-lncRNA-discovery/RNA*.gtf
```

![gtf](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_IGV_-_Session_Userssr320DesktopApul_lncRNA_igv_session.xml_2023-11-05_10-11-31.png)

![igv2](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_IGV_-_Session_Userssr320DesktopApul_lncRNA_igv_session.xml_2023-11-05_12-33-58.png)

```{r, engine='bash', eval=TRUE}
ls ../output/05.32-lncRNA-discovery/*xml
```

`stringtie --merge` combines all individual GTF assemblies into a single merged GTF file. This creates a non-redundant set of transcripts after running StringTie separately on multiple RNA-seq datasets.

```{r, engine='bash'}
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \
--merge \
-G ../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gff \
-o ../output/05.32-lncRNA-discovery/stringtie_merged.gtf \
../output/05.32-lncRNA-discovery/*.gtf
```

```{bash}
tail ../output/05.32-lncRNA-discovery/stringtie_merged.gtf
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/05.32-lncRNA-discovery/stringtie_merged.gtf
head ../output/05.32-lncRNA-discovery/stringtie_merged.gtf

echo "what is possible"

grep -v '^#' ../output/05.32-lncRNA-discovery/stringtie_merged.gtf | cut -f3 | sort | uniq
```

# GFFcompare

https://ccb.jhu.edu/software/stringtie/gffcompare.shtml

```{r, engine='bash'}
/home/shared/gffcompare-0.12.6.Linux_x86_64/gffcompare \
-r ../data/Amil/ncbi_dataset/data/GCF_013753865.1/genomic.gff \
-o ../output/05.32-lncRNA-discovery/gffcompare_merged \
../output/05.32-lncRNA-discovery/stringtie_merged.gtf
```

```{r, engine='bash', eval=TRUE}
ls ../output/05.32-lncRNA-discovery/gffcompare_merged*
```

```{r, engine='bash', eval=TRUE}
head -10 ../output/05.32-lncRNA-discovery/gffcompare_merged.annotated.gtf
```

![gff](http://gannet.fish.washington.edu/seashell/snaps/2023-11-03_09-25-24.png)

# Filter

Filters the combined GTF output from GFFcompare to select only the lines representing transcripts, excluding lines starting with "#" (lines in the GFFcompare output that don't contain transcript information). This step then filters for class codes "u", "x", "i", and "y", and keeps only transcripts longer than 199 bases.

The "u" class code from the GFFcompare step marks "unknown" (intergenic) transcripts, that is, those not previously annotated in our reference GFF as protein coding. The 200-nt minimum length is a common filtering step for isolating lncRNAs.
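
Before applying the filter, it can help to see how the assembled transcripts break down by class code. A minimal tally sketch (not part of the original pipeline), assuming the annotated GTF path from the GFFcompare step above:

```{r, engine='bash'}
# Tally GFFcompare class codes among assembled transcripts (sketch)
awk '$3 == "transcript"' ../output/05.32-lncRNA-discovery/gffcompare_merged.annotated.gtf \
| grep -o 'class_code "[^"]*"' \
| sort | uniq -c | sort -rn
```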
The "u' class code from the GFFcompare step is for "unknown" transcripts, that is those that are not previously annotated in our reference GFF as protein coding. The size filter of +200nt is a common filtering step for isolating lncRNAs. ```{r, engine='bash'} awk '$3 == "transcript" && $1 !~ /^#/' \ ../output/05.32-lncRNA-discovery/gffcompare_merged.annotated.gtf | grep 'class_code "u"\|class_code "x"|\class_code "i"\|class_code "y"' | awk '($5 - $4 > 199) || ($4 - $5 > 199)' > ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.gtf ``` ```{r, engine='bash', eval=TRUE} wc -l ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.gtf head ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.gtf ``` # Bedtools Extracts the sequence data from the `$FASTA` reference file based on the coordinates from the filtered GTF. The resulting sequences represent potential lncRNA candidates. ```{r, engine='bash'} /home/shared/bedtools2/bin/fastaFromBed \ -fi ../data/GCF_013753865.1_Amil_v2.1_genomic.fna \ -bed ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.gtf \ -fo ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta \ -name -split ``` ```{r, engine='bash'} fgrep -c ">" ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta head ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta ``` # CPC2 Initializes a conda environment (Anaconda) and runs CPC2, a software to predict whether a transcript is coding or non-coding. The results are saved to the $OUTPUT_DIR. CPC2 uses ORF (Open Reading Frame) Analysis, Isometric Feature Mapping (Isomap), Sequence Homology, RNA Sequence Features, and Quality of Translation to assess coding potential and flag any transcripts we would want to exclude using the FASTA generated in the previous step. ```{r, engine='bash'} eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" python /home/shared/CPC2_standalone-1.0.1/bin/CPC2.py \ -i ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta \ -o ../output/05.32-lncRNA-discovery/Apul_CPC2 ``` ```{r, engine='bash', eval=TRUE} wc -l head ../output/05.32-lncRNA-discovery/Apul_CPC2.txt head ../output/05.32-lncRNA-discovery/Apul_CPC2.txt ``` #Filter Filters the CPC2 results to get only noncoding transcripts (using the class "noncoding" from the CPC2 results) and extracts their IDs and matches these IDs with the sequences from the previous step to generate a GTF of long noncoding transcripts. Matches these IDs with the sequences from the previous step to generate a GTF of noncoding transcripts. 
# Filter

Filters the CPC2 results to keep only noncoding transcripts (the "noncoding" label in the CPC2 output) and extracts their IDs. These IDs are then matched against the sequences from the previous step to pull out the final set of long noncoding transcripts.

```{r, engine='bash'}
awk '$8 == "noncoding" {print $1}' ../output/05.32-lncRNA-discovery/Apul_CPC2.txt > ../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt
```

```{r, engine='bash'}
wc -l ../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt
head ../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt
```

```{r, engine='bash', eval=TRUE}
head ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta
fgrep -c ">" ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta
```

17,098 sequences in the original candidate set.

# Subsetting FASTA

```{r, engine='bash'}
/home/shared/samtools-1.12/samtools faidx ../output/05.32-lncRNA-discovery/Apul_lncRNA_candidates.fasta \
-r ../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt > ../output/05.32-lncRNA-discovery/Apul_lncRNA.fasta
```

```{r, engine='bash', eval=TRUE}
fgrep -c ">" ../output/05.32-lncRNA-discovery/Apul_lncRNA.fasta
fgrep ">" ../output/05.32-lncRNA-discovery/Apul_lncRNA.fasta | head -5

head ../output/05.32-lncRNA-discovery/Apul_lncRNA.fasta
```

# Getting genome feature track

```{r, engine='python'}
# Open the input file and the output file
with open('../output/05.32-lncRNA-discovery/Apul_noncoding_transcripts_ids.txt', 'r') as infile, open('../output/05.32-lncRNA-discovery/Apul_lncRNA.bed', 'w') as outfile:
    # Process each line in the input file
    for line in infile:
        # Remove 'transcript::' and then split the line by ':' to extract the relevant parts
        parts = line.strip().replace('transcript::', '').split(':')
        chromosome = parts[0]
        # Split the position part by '-' to get start and end positions
        start, end = parts[1].split('-')

        # BED format requires the start position to be 0-based
        # Convert the start position to 0-based by subtracting 1
        start = str(int(start) - 1)

        # Write the chromosome, start, and end positions to the output file
        # Separate each field with a tab character
        outfile.write(f'{chromosome}\t{start}\t{end}\n')

# After running this script, 'Apul_lncRNA.bed' will contain the converted data in BED format.
```

```{bash}
head ../output/05.32-lncRNA-discovery/Apul_lncRNA.bed
```
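
To close out, a short summary of the final BED can confirm how many lncRNAs were retained and their length range; a sketch (not part of the original pipeline), assuming the three-column BED written above:

```{r, engine='bash'}
# Summarize the final lncRNA set: count, mean length, and min/max interval lengths from the BED
awk 'BEGIN {min = -1}
     {len = $3 - $2; total++; sum += len; if (min < 0 || len < min) min = len; if (len > max) max = len}
     END {if (total > 0) printf "%d lncRNAs; mean length %.1f nt; min %d; max %d\n", total, sum/total, min, max}' \
../output/05.32-lncRNA-discovery/Apul_lncRNA.bed
```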