--- title: "07- A pulcra HiSat2" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` # Run HiSat on RNA-seq Will end up with 5 sorted bam files, without building index with splice site. ## Grab Trimmed RNA-seq Reads ```{r, engine='bash'} cd ../data/fastq/ echo "*.fq" >> .gitignore ``` ```{r, engine='bash'} wget -r \ --no-directories --no-parent \ -P ../data/fastq/ \ -A "*fastq.gz" https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/A_pulchra/trimmed/ ``` ## Genome ```{r, engine='bash'} cd ../data wget -O Apulcra-genome.fa "https://osf.io/download/kn96u/" ``` ```{r, engine='bash'} head ../data/Apulcra-genome.fa ``` ## HiSat ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2-build \ ../data/Apulcra-genome.fa \ ../output/07-Apul-Hisat/Apulcra-genome.index \ -p 24 \ 2> ../output/07-Apul-Hisat/hisat2-build_stats.txt ``` ```{r, engine='bash', eval=TRUE} cat ../output/07-Apul-Hisat/hisat2-build_stats.txt ``` ```{r, engine='bash', eval=TRUE} find ../data/fastq/*gz ``` ```{r, engine='bash', eval=TRUE} find ../data/fastq/*R2_001.fastp-trim.20230519.fastq.gz \ | xargs basename -s -S1-TP2_R2_001.fastp-trim.20230519.fastq.gz | xargs -I{} \ echo {} ``` ```{r, engine='bash'} cd ../output/07-Apul-Hisat/ echo "*sam" >> .gitignore ``` ```{r, engine='bash'} find ../data/fastq/*R2_001.fastp-trim.20230519.fastq.gz \ | xargs basename -s -S1-TP2_R2_001.fastp-trim.20230519.fastq.gz | xargs -I{} \ /home/shared/hisat2-2.2.1/hisat2 \ -x ../output/07-Apul-Hisat/Apulcra-genome.index \ -p 48 \ -1 ../data/fastq/{}-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz \ -2 ../data/fastq/{}-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz \ -S ../output/07-Apul-Hisat/{}.sam \ 2> ../output/07-Apul-Hisat/hisat.out ``` ```{r, engine='bash', eval=TRUE} cat ../output/07-Apul-Hisat/hisat.out ``` ## convert to bams ```{r, engine='bash'} for samfile in ../output/07-Apul-Hisat/*.sam; do bamfile="${samfile%.sam}.bam" sorted_bamfile="${samfile%.sam}.sorted.bam" # Convert SAM to BAM /home/shared/samtools-1.12/samtools view -bS -@ 20 "$samfile" > "$bamfile" # Sort BAM /home/shared/samtools-1.12/samtools sort -@ 20 "$bamfile" -o "$sorted_bamfile" # Index sorted BAM /home/shared/samtools-1.12/samtools index -@ 20 "$sorted_bamfile" done ``` # StringTie StringTie uses the sorted BAM files to assemble transcripts for each sample, outputting them as GTF (Gene Transfer Format) files. And then merges all individual GTF assemblies into a single merged GTF file. This step extracts transcript information and merges GTFs from all samples--an important step in creating a canonical list of lncRNAs across all samples included in the pipeline. Getting gff ```{r, engine='bash'} cd ../data wget -O Apulcra-genome.gff "https://osf.io/download/f9dbr/" ``` ```{r, engine='bash'} head ../data/Apulcra-genome.gff ``` ```{r, engine='bash'} find ../output/07-Apul-Hisat/*sorted.bam \ | xargs basename -s .sorted.bam | xargs -I{} \ /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \ -p 32 \ -eB \ -G ../data/Apulcra-genome.gff \ -o ../output/07-Apul-Hisat/{}.gtf \ ../output/07-Apul-Hisat/{}.sorted.bam ``` ```{r, engine='bash', eval=TRUE} wc -l ../output/07-Apul-Hisat/RNA*.gtf ls ../output/07-Apul-Hisat/RNA*.gtf #head ../output/07-Apul-Hisat/RNA*.gtf ``` # Count Matrix list format RNA-ACR-140 ../output/15-Apul-hisat/RNA-ACR-140.gtf RNA-ACR-145 ../output/15-Apul-hisat/RNA-ACR-145.gtf RNA-ACR-173 ../output/15-Apul-hisat/RNA-ACR-173.gtf RNA-ACR-178 ../output/15-Apul-hisat/RNA-ACR-178.gtf ```{r, engine='bash'} head ../output/07-Apul-Hisat/list01.txt ``` ```{r, engine='bash'} python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \ -i ../output/07-Apul-Hisat/list01.txt \ -g ../output/07-Apul-Hisat/gene_count_matrix.csv \ -t ../output/07-Apul-Hisat/transcript_count_matrix.csv head ../output/07-Apul-Hisat/*matrix.csv ``` ```{bash} cp ../output/07-Apul-Hisat/gene_count_matrix.csv ../output/07-Apul-Hisat/Apul-gene_count_matrix.csv ``` # Getting FASTA for anno $ bedtools getfasta [OPTIONS] -fi -bed grep -w "mRNA" annotation.gff > mRNA_only.gff ```{r, engine='bash'} grep -w "mRNA" ../data/Apulcra-genome.gff > ../data/Apulcra-genome-mRNA_only.gff ``` awk '{if ($0 !~ /^#/ && $3 != "") {feature[$3]++}} END {for (f in feature) print f, feature[f]}' ../data/Apulcra-genome.gff ```{r, engine='bash'} awk '{if ($0 !~ /^#/ && $3 != "") {feature[$3]++}} END {for (f in feature) print f, feature[f]}' ../data/Apulcra-genome.gff ``` ```{r, engine='bash'} /home/shared/bedtools2/bin/fastaFromBed \ -fi ../data/Apulcra-genome.fa \ -bed ../data/Apulcra-genome-mRNA_only.gff \ -fo ../output/07-Apul-Hisat/genes.fasta \ -name -split ``` ```{r, engine='bash'} head ../output/07-Apul-Hisat/genes.fasta grep -c ">" ../output/07-Apul-Hisat/genes.fasta ``` # SP Download ```{r, engine='bash'} cd ../data/blast_dbs curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_05.fasta.gz gunzip -k uniprot_sprot_r2024_05.fasta.gz head ../../data/blast_dbs/uniprot_sprot_r2024_05.fasta echo "Number of Sequences" grep -c ">" ../../data/blast_dbs/uniprot_sprot_r2024_05.fasta /home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \ -in ../../data/blast_dbs/uniprot_sprot_r2024_05.fasta \ -dbtype prot \ -out ../../data/blast_dbs/uniprot_sprot_r2024_05 ``` # Blastp ```{r, engine='bash'} fasta="../output/07-Apul-Hisat/genes.fasta" /home/shared/ncbi-blast-2.15.0+/bin/blastx \ -query $fasta \ -db ../data/blast_dbs/uniprot_sprot_r2024_05 \ -out ../output/07-Apul-Hisat/blastx_out.tab \ -evalue 1E-05 \ -num_threads 48 \ -max_target_seqs 1 \ -max_hsps 1 \ -outfmt 6 wc -l ../output/07-Apul-Hisat/blastx_out.tab tr '|' '\t' < ../output/07-Apul-Hisat/blastx_out.tab \ > ../output/07-Apul-Hisat/blastx_out_sep.tab head -1 ../output/07-Apul-Hisat/blastx_out_sep.tab ``` # Download GO info ```{bash} cd ../output/07-Apul-Hisat curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" -o SwissProt-Annot-GO.tsv wc -l ../output/07-Apul-Hisat/SwissProt-Annot-GO.tsv ``` Join blast with GO info ```{r} bltabl <- read.csv("../output/07-Apul-Hisat/blastx_out_sep.tab", sep = '\t', header = FALSE) spgo <- read.csv("../output/07-Apul-Hisat/SwissProt-Annot-GO.tsv", sep = '\t', header = TRUE) annot_tab <- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>% select( query = V1, blast_hit = V3, evalue = V13, ProteinNames = Protein.names, BiologicalProcess = Gene.Ontology..biological.process., GeneOntologyIDs = Gene.Ontology.IDs ) head(annot_tab) write.table(annot_tab, file = "../output/07-Apul-Hisat/Apul-gene-annot-GO.tsv", sep = "\t", row.names = FALSE, quote = FALSE) system("head ../output/07-Apul-Hisat/Apul-gene-annot-GO.tsv") ```