--- title: "05-lncRNA-discovery-overview" author: "Zach Bengtsson" date: "2023-08-16" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # lncRNA Discovery (using Acropora pulchra as example) ## HISAT HISAT2 build to create an index for the Acropora millipora FASTA file... ```{bash hisat-build} #FASTA file path input ref_genome="/home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/data/GCF_013753865.1_Amil_v2.1_genomic.fna" index="/home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/Amil.index" /home/shared/hisat2-2.2.1/hisat2-build \ -f "$ref_genome" \ "$index" ``` HISAT2 to align paired-end RNA-Seq reads to the Pver_genome_assembly_v1.0.fasta index... ```{bash} find /home/shared/8TB_HDD_01/apul/trimmed/*gz \ | xargs basename -s _R1_001.fastp-trim.20230519.fastq.gz | xargs -I{} \ /home/shared/hisat2-2.2.1/hisat2 \ -x /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/Amil.index \ -p 20 \ -1 /home/shared/8TB_HDD_01/apul/trimmed/{}_R1_001.fastp-trim.20230519.fastq.gz \ -2 /home/shared/8TB_HDD_01/apul/trimmed/{}_R2_001.fastp-trim.20230519.fastq.gz \ -S /home/shared/8TB_HDD_01/apul/sam/{}.sam ``` ## Samtools Samtools to convert the SAM files output from the previous HISAT2 command into sorted BAM files... ```{bash samtools} for file in /home/shared/8TB_HDD_01/apul/sam/*.sam; do base=$(basename "$file" .sam) /home/shared/samtools-1.12/samtools view -bS "$file" | \ /home/shared/samtools-1.12/samtools sort \ -o /home/shared/8TB_HDD_01/apul/bam/"$base".bam done ``` ## StringTie StringTie to assemble transcripts from the sorted BAM files generated by the previous Samtools commands... ```{bash stringtie} find /home/shared/8TB_HDD_01/apul/bam/*bam \ | xargs basename -s .bam | xargs -I{} \ /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \ -p 8 \ -G ../data/GCF_013753865.1_Amil_v2.1_genomic.gff \ -o /home/shared/8TB_HDD_01/apul/stringtie-assembly/{}.gtf \ /home/shared/8TB_HDD_01/apul/bam/{}.bam \ ``` Use StringTie merge to merge all the individual GTF files into a single merged GTF file... ```{bash stringtie} /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \ --merge \ -G ../data/GCF_013753865.1_Amil_v2.1_genomic.gff \ -o /home/shared/8TB_HDD_01/apul/merged-gtf/stringtie_merged.gtf \ /home/shared/8TB_HDD_01/apul/stringtie-assembly/*.gtf ``` ```{bash} head /home/shared/8TB_HDD_01/apul/merged-gtf/stringtie_merged.gtf tail /home/shared/8TB_HDD_01/apul/merged-gtf/stringtie_merged.gtf ``` ## GFFcompare Use gffcompare to compare annotation file generated by StringTie to a reference annotation file and to produce a set of output files summarizing the results of the comparison, including classification of each transcript... ```{bash gffcompare} /home/shared/gffcompare-0.12.6.Linux_x86_64/gffcompare \ -r ../data/GCF_013753865.1_Amil_v2.1_genomic.gff \ -G ../data/GCF_013753865.1_Amil_v2.1_genomic.gff \ -o /home/shared/8TB_HDD_01/apul/gffcompare/gffcompare_merged \ /home/shared/8TB_HDD_01/apul/merged-gtf/stringtie_merged.gtf \ ``` ```{bash} head /home/shared/8TB_HDD_01/apul/gffcompare/gffcompare_merged.combined.gtf ``` ## Filter Filter to get a subset of the transcripts from the original GTF file that are putative lncRNA candidates based on their length and lack of overlap with known reference transcripts… ```{bash filter} awk '$3 == "transcript" && $1 !~ /^#/ {print}' /home/shared/8TB_HDD_01/apul/gffcompare/gffcompare_merged.combined.gtf | grep 'class_code "u"' | awk '$5 - $4 > 199 {print}' > /home/shared/8TB_HDD_01/apul/filter-one/merged_lncRNA_candidates.gtf ``` ```{bash} head /home/shared/8TB_HDD_01/pver/filter-output/merged_lncRNA_candidates.gtf ``` ## Bedtools Get fasta of lncRNA candidate regions with Bedtools… ```{bash} /home/shared/bedtools2/bin/bedtools \ getfasta -fi /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/data/GCF_013753865.1_Amil_v2.1_genomic.fna -bed /home/shared/8TB_HDD_01/apul/filter-one/merged_lncRNA_candidates.gtf -fo /home/shared/8TB_HDD_01/apul/get-fasta/merged_lncRNA_candidates.fasta -name -split ``` ```{bash} head /home/shared/8TB_HDD_01/pver/bedtools-output/merged_lncRNA_candidates.fasta ``` ## CPC2 Evaluate coding potential of transcripts with Coding Potential Calculator 2… ```{bash} eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" python /home/shared/CPC2_standalone-1.0.1/bin/CPC2.py -i /home/shared/8TB_HDD_01/apul/get-fasta/merged_lncRNA_candidates.fasta -o ~/github/coral-lncRNA/output/apul_merged_cpc2_results ``` ## Final Filter Filter for those transcripts with label “noncoding”… ```{bash} # Filter out transcripts with coding potential awk '$8 == "noncoding" {print $1}' /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/apul_merged_cpc2_results.txt > /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/apul_noncoding_transcripts_ids.txt grep -Fwf /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/apul_noncoding_transcripts_ids.txt /home/shared/8TB_HDD_01/apul/get-fasta/merged_lncRNA_candidates.fasta > /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/apul_merged_final_lncRNAs.gtf ``` ## De-duplicate GTF Remove duplicates of transcript IDs that appear more than once within the GTF... ```{bash} awk '!seen[$0]++' ~/github/coral-lncRNA/ouput/apul_merged_final_lncRNAs.gtf > ~/github/coral-lncRNA/ouput/apul_deduplicated_final_lncRNAs.gtf ``` This results in the final GTF list with transcript IDs containing scaffold and base range, information used to filter the reference FASTA for only lncRNAs. ## Reformat to BED Make GTF into BED format so Bedtools can recognize scaffold and base range... ```{bash} awk -F":|-" '{print $3 "\t" $4 "\t" $5}' /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/apul_deduplicated_final_lncRNAs.gtf > /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/apul_deduplicated_final_lncRNAs.bed ``` ## BEDTools Use BEDTools to extract sequences of lncRNAs using the reformatted GTF list of transcript IDs... ```{bash} /home/shared/bedtools2/bin/bedtools \ getfasta -fi /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/data/GCF_013753865.1_Amil_v2.1_genomic.fna -bed /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/apul_deduplicated_final_lncRNAs.bed -fo /home/shared/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput/apul_bedtools_lncRNAs.fasta -name ``` # Untested Push-button after defining directory paths and bioinformatic tool locations ```{bash} # Define directories and files ROOT_DIR="/home/shared" DATA_DIR="$ROOT_DIR/8TB_HDD_02/zbengt/github/coral-lncRNA/data" OUTPUT_DIR="$ROOT_DIR/8TB_HDD_02/zbengt/github/coral-lncRNA/ouput" APUL_DIR="$ROOT_DIR/8TB_HDD_01/apul" FASTA="$DATA_DIR/GCF_013753865.1_Amil_v2.1_genomic.fna" GFF="$DATA_DIR/GCF_013753865.1_Amil_v2.1_genomic.gff" INDEX="$OUTPUT_DIR/Amil.index" # Software paths HISAT_PATH="$ROOT_DIR/hisat2-2.2.1" SAMTOOLS_PATH="$ROOT_DIR/samtools-1.12" STRINGTIE_PATH="$ROOT_DIR/stringtie-2.2.1.Linux_x86_64" GFFCOMPARE_PATH="$ROOT_DIR/gffcompare-0.12.6.Linux_x86_64" BEDTOOLS_PATH="$ROOT_DIR/bedtools2/bin" CPC2_PATH="$ROOT_DIR/CPC2_standalone-1.0.1/bin" # HISAT $HISAT_PATH/hisat2-build -f "$FASTA" "$INDEX" find $APUL_DIR/trimmed/*gz \ | xargs basename -s _R1_001.fastp-trim.20230519.fastq.gz | xargs -I{} \ $HISAT_PATH/hisat2 \ -x $INDEX \ -p 20 \ -1 $APUL_DIR/trimmed/{}_R1_001.fastp-trim.20230519.fastq.gz \ -2 $APUL_DIR/trimmed/{}_R2_001.fastp-trim.20230519.fastq.gz \ -S $APUL_DIR/sam/{}.sam # Samtools for file in $APUL_DIR/sam/*.sam; do base=$(basename "$file" .sam) $SAMTOOLS_PATH/samtools view -bS "$file" | \ $SAMTOOLS_PATH/samtools sort \ -o $APUL_DIR/bam/"$base".bam done # StringTie find $APUL_DIR/bam/*bam \ | xargs basename -s .bam | xargs -I{} \ $STRINGTIE_PATH/stringtie \ -p 8 \ -G $GFF \ -o $APUL_DIR/stringtie-assembly/{}.gtf \ $APUL_DIR/bam/{}.bam $STRINGTIE_PATH/stringtie \ --merge \ -G $GFF \ -o $APUL_DIR/merged-gtf/stringtie_merged.gtf \ $APUL_DIR/stringtie-assembly/*.gtf # GFFcompare $GFFCOMPARE_PATH/gffcompare \ -r $GFF \ -G $GFF \ -o $APUL_DIR/gffcompare/gffcompare_merged \ $APUL_DIR/merged-gtf/stringtie_merged.gtf # Filter awk '$3 == "transcript" && $1 !~ /^#/ {print}' $APUL_DIR/gffcompare/gffcompare_merged.combined.gtf | grep 'class_code "u"' | awk '$5 - $4 > 199 {print}' > $APUL_DIR/filter-one/merged_lncRNA_candidates.gtf # Bedtools $BEDTOOLS_PATH/bedtools \ getfasta -fi $FASTA -bed $APUL_DIR/filter-one/merged_lncRNA_candidates.gtf -fo $APUL_DIR/get-fasta/merged_lncRNA_candidates.fasta -name -split # CPC2 eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" python $CPC2_PATH/CPC2.py -i $APUL_DIR/get-fasta/merged_lncRNA_candidates.fasta -o $OUTPUT_DIR/apul_merged_cpc2_results # Final Filter awk '$8 == "noncoding" {print $1}' $OUTPUT_DIR/apul_merged_cpc2_results.txt > $OUTPUT_DIR/apul_noncoding_transcripts_ids.txt grep -Fwf $OUTPUT_DIR/apul_noncoding_transcripts_ids.txt $APUL_DIR/get-fasta/merged_lncRNA_candidates.fasta > $OUTPUT_DIR/apul_merged_final_lncRNAs.gtf # De-duplicate GTF awk '!seen[$0]++' $OUTPUT_DIR/apul_merged_final_lncRNAs.gtf > $OUTPUT_DIR/apul_deduplicated_final_lncRNAs.gtf # Reformat to BED awk -F":|-" '{print $3 "\t" $4 "\t" $5}' $OUTPUT_DIR/apul_deduplicated_final_lncRNAs.gtf > $OUTPUT_DIR/apul_deduplicated_final_lncRNAs.bed # BEDTools $BEDTOOLS_PATH/bedtools \ getfasta -fi $FASTA -bed $OUTPUT_DIR/apul_deduplicated_final_lncRNAs.bed -fo $OUTPUT_DIR/apul_bedtools_lncRNAs.fasta -name ```