--- title: "07.2- A pulcra HiSat2 - with splice" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` # Run HiSat on RNA-seq Will end up with 5 sorted bam files. ## Grab Trimmed RNA-seq Reads ```{r, engine='bash'} cd ../data/fastq/ echo "*.fq" >> .gitignore ``` ```{r, engine='bash'} wget -r \ --no-directories --no-parent \ -P ../data/fastq/ \ -A "*fastq.gz" https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/A_pulchra/trimmed/ ``` ## Genome ```{r, engine='bash'} cd ../data wget -O Apulcra-genome.fa "https://osf.io/download/kn96u/" ``` ```{r, engine='bash'} head ../data/Apulcra-genome.fa ``` ## Grab GTF ```{r, engine='bash'} cd ../data wget -O Apulchra-genome.gtf "https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/D-Apul/data/Apulchra-genome.gtf" ``` ## HiSat ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2_extract_exons.py \ ../data/Apulchra-genome.gtf \ > ../output/07.2-Apul-Hisat/m_exon.tab ``` ```{r, engine='bash', eval=TRUE} head ../output/07.2-Apul-Hisat/m_exon.tab wc -l ../output/07.2-Apul-Hisat/m_exon.tab ``` ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2_extract_splice_sites.py \ ../data/Apulchra-genome.gtf \ > ../output/07.2-Apul-Hisat/m_splice_sites.tab ``` ```{r, engine='bash', eval=TRUE} head ../output/07.2-Apul-Hisat/m_splice_sites.tab ``` ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2-build \ ../data/Apulcra-genome.fa \ ../output/07.2-Apul-Hisat/Apulcra-genome.index \ --exon ../output/07.2-Apul-Hisat/m_exon.tab \ --ss ../output/07.2-Apul-Hisat/m_splice_sites.tab \ -p 48 \ ../data/Apulcra-genome.gtf \ 2> ../output/07.2-Apul-Hisat/hisat2-build_stats.txt ``` ```{r, engine='bash', eval=TRUE} tail ../output/07.2-Apul-Hisat/hisat2-build_stats.txt ``` ```{r, engine='bash'} cd ../output/07-Apul-Hisat/ echo "*sam" >> .gitignore ``` ```{r, engine='bash'} find ../data/fastq/*R2_001.fastp-trim.20230519.fastq.gz \ | xargs basename -s -S1-TP2_R2_001.fastp-trim.20230519.fastq.gz | xargs -I{} \ /home/shared/hisat2-2.2.1/hisat2 \ -x ../output/07.2-Apul-Hisat/Apulcra-genome.index \ -p 48 \ -1 ../data/fastq/{}-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz \ -2 ../data/fastq/{}-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz \ -S ../output/07.2-Apul-Hisat/{}.sam \ 2> ../output/07.2-Apul-Hisat/hisat.out ``` ```{r, engine='bash', eval=TRUE} cat ../output/07.2-Apul-Hisat/hisat.out ``` ## convert to bams ```{r, engine='bash'} for samfile in ../output/07.2-Apul-Hisat/*.sam; do bamfile="${samfile%.sam}.bam" sorted_bamfile="${samfile%.sam}.sorted.bam" # Convert SAM to BAM /home/shared/samtools-1.12/samtools view -bS -@ 20 "$samfile" > "$bamfile" # Sort BAM /home/shared/samtools-1.12/samtools sort -@ 20 "$bamfile" -o "$sorted_bamfile" # Index sorted BAM /home/shared/samtools-1.12/samtools index -@ 20 "$sorted_bamfile" done ``` dd # StringTie StringTie uses the sorted BAM files to assemble transcripts for each sample, outputting them as GTF (Gene Transfer Format) files. And then merges all individual GTF assemblies into a single merged GTF file. This step extracts transcript information and merges GTFs from all samples--an important step in creating a canonical list of lncRNAs across all samples included in the pipeline. Getting gff ```{r, engine='bash'} ``` ```{r, engine='bash'} head ../data/Apulcra-genome.gff ``` ``` -eB \ -e: Only estimate expression for transcripts that are present in the reference annotation provided with the -G option. -B: Enables the creation of a ballgown output table, useful for downstream analysis with the R ballgown package. ``` ```{r, engine='bash'} find ../output/07.2-Apul-Hisat/*sorted.bam \ | xargs basename -s .sorted.bam | xargs -I{} \ /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \ -p 32 \ -eB \ -G ../data/Apulcra-genome.gff \ -o ../output/07.2-Apul-Hisat/{}.gtf \ ../output/07.2-Apul-Hisat/{}.sorted.bam ``` ```{r, engine='bash', eval=TRUE} wc -l ../output/07.2-Apul-Hisat/RNA*.gtf ls ../output/07.2-Apul-Hisat/RNA*.gtf #head ../output/07.2-Apul-Hisat/RNA*.gtf ``` # Count Matrix list format RNA-ACR-140 ../output/15-Apul-hisat/RNA-ACR-140.gtf RNA-ACR-145 ../output/15-Apul-hisat/RNA-ACR-145.gtf RNA-ACR-173 ../output/15-Apul-hisat/RNA-ACR-173.gtf RNA-ACR-178 ../output/15-Apul-hisat/RNA-ACR-178.gtf ```{r, engine='bash'} cat ../output/07.2-Apul-Hisat/list01.txt ``` ```{r, engine='bash'} python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \ -i ../output/07.2-Apul-Hisat/list01.txt \ -g ../output/07.2-Apul-Hisat/gene_count_matrix.csv \ -t ../output/07.2-Apul-Hisat/transcript_count_matrix.csv head ../output/07.2-Apul-Hisat/*matrix.csv ``` # how does this change without AS very small differences ```{bash} head ../output/07.2-Apul-Hisat/*matrix.csv head ../output/07-Apul-Hisat/*matrix.csv ```