---
title: "07- Ptuh HiSat2"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
---

```{r setup, include=FALSE}
library(knitr)
library(tidyverse)

# Global chunk defaults for this notebook.
knitr::opts_chunk$set(
  echo = TRUE,          # Display code chunks
  eval = FALSE,         # Do NOT run chunks by default; opt in per chunk with eval=TRUE
  warning = FALSE,      # Hide warnings
  message = FALSE,      # Hide messages
  fig.width = 6,        # Set plot width in inches
  fig.height = 4,       # Set plot height in inches
  fig.align = "center"  # Align plots to the center
)
```

# Run HiSat on RNA-seq

This will produce 5 sorted BAM files. The HISAT2 index is built without splice-site information.

## Grab Trimmed RNA-seq Reads

```{r, engine='bash'}
# Keep the downloaded reads out of git. The reads fetched below end in
# ".fastq.gz", so that is the pattern to ignore. Chaining with && ensures the
# pattern is never appended to a .gitignore in the wrong directory if cd fails.
mkdir -p ../data/fastq/ \
&& cd ../data/fastq/ \
&& echo "*.fastq.gz" >> .gitignore
```

```{r, engine='bash'}
# Download every file matching *fastq.gz from the trimmed-reads directory into
# ../data/fastq/ without recreating the remote directory tree.
wget -r \
--no-directories --no-parent \
-P ../data/fastq/ \
-A "*fastq.gz" https://gannet.fish.washington.edu/Atumefaciens/20230519-E5_coral-fastqc-fastp-multiqc-RNAseq/P_meandrina/trimmed/
```

## Genome

```{r, engine='bash'}
cd ../data

# wget's stderr (progress and errors) is captured in error_log.txt.
wget https://owl.fish.washington.edu/halfshell/genomic-databank/Pocillopora_meandrina_HIv1.assembly.fasta 2> error_log.txt
```

```{r, engine='bash'}
head ../data/Pocillopora_meandrina_HIv1.assembly.fasta
```

## HiSat

```{r, engine='bash'}
# The index and the build log are both written here, so make sure it exists.
mkdir -p ../output/06-Ptuh-Hisat

# Build the HISAT2 genome index using 24 threads; stdout and stderr go to the log.
/home/shared/hisat2-2.2.1/hisat2-build \
../data/Pocillopora_meandrina_HIv1.assembly.fasta \
../output/06-Ptuh-Hisat/Ptuh-genome.index \
-p 24 &> ../output/06-Ptuh-Hisat/hisat2_build.log
```

```{r, engine='bash'}
# Keep SAM files out of git; only append if the cd succeeded.
cd ../output/06-Ptuh-Hisat/ \
&& echo "*sam" >> .gitignore
```

```{r, engine='bash'}
# Derive each sample prefix from its R2 file name, then align the R1/R2 pair
# against the index. One SAM plus one stdout/stderr log is written per sample.
find ../data/fastq/*R2_001.fastp-trim.20230519.fastq.gz \
| xargs -I{} basename -s -S1-TP2_R2_001.fastp-trim.20230519.fastq.gz {} \
| xargs -I{} sh -c '/home/shared/hisat2-2.2.1/hisat2 \
-x ../output/06-Ptuh-Hisat/Ptuh-genome.index \
-p 38 \
-1 ../data/fastq/{}-S1-TP2_R1_001.fastp-trim.20230519.fastq.gz \
-2 ../data/fastq/{}-S1-TP2_R2_001.fastp-trim.20230519.fastq.gz \
-S ../output/06-Ptuh-Hisat/{}.sam \
> ../output/06-Ptuh-Hisat/{}_hisat.stdout 2> ../output/06-Ptuh-Hisat/{}_hisat.stderr'
```

## convert to bams

```{r, engine='bash'}
for samfile in ../output/06-Ptuh-Hisat/*.sam; do
  # When no SAM files exist the glob stays literal; skip instead of failing.
  [ -e "$samfile" ] || continue

  bamfile="${samfile%.sam}.bam"
  sorted_bamfile="${samfile%.sam}.sorted.bam"

  # Convert SAM to BAM
  /home/shared/samtools-1.12/samtools view -bS -@ 40 "$samfile" > "$bamfile"

  # Sort BAM
  /home/shared/samtools-1.12/samtools sort -@ 40 "$bamfile" -o "$sorted_bamfile"

  # Index sorted BAM
  /home/shared/samtools-1.12/samtools index -@ 40 "$sorted_bamfile"
done
```

## remove sams

```{r, engine='bash'}
rm ../output/06-Ptuh-Hisat/*sam
```

# StringTie

StringTie uses the sorted BAM files to assemble transcripts for each sample, outputting them as GTF (Gene Transfer Format) files. It then merges all individual GTF assemblies into a single merged GTF file. This step extracts transcript information and merges GTFs from all samples--an important step in creating a canonical list of lncRNAs across all samples included in the pipeline.
Getting gff

```{r, engine='bash'}
cd ../data

wget https://owl.fish.washington.edu/halfshell/genomic-databank/Pocillopora_meandrina_HIv1.genes.gff3
```

```{r, engine='bash'}
head ../data/Pocillopora_meandrina_HIv1.genes.gff3
```

```{r, engine='bash'}
# Run StringTie once per sorted BAM, using the gene models in the GFF3 as the
# reference annotation (-G). Output is one GTF per sample, named after the BAM.
find ../output/06-Ptuh-Hisat/*sorted.bam \
| xargs basename -s .sorted.bam | xargs -I{} \
/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \
-p 32 \
-eB \
-G ../data/Pocillopora_meandrina_HIv1.genes.gff3 \
-o ../output/06-Ptuh-Hisat/{}.gtf \
../output/06-Ptuh-Hisat/{}.sorted.bam
```

```{r, engine='bash', eval=TRUE}
wc -l ../output/06-Ptuh-Hisat/RNA*.gtf
ls ../output/06-Ptuh-Hisat/RNA*.gtf
#head ../output/06-Ptuh-Hisat/RNA*.gtf
```

# Count Matrix

```{bash}
# Build the "<sample> <path-to-gtf>" list that prepDE.py reads via -i.
# Iterate over the glob directly rather than parsing the output of ls.
for file in ../output/06-Ptuh-Hisat/*.gtf; do
  # When no GTF files exist the glob stays literal; skip instead of listing it.
  [ -e "$file" ] || continue
  sample=$(basename "$file" .gtf)  # Extract sample name from filename
  echo "$sample $file"
done > ../output/06-Ptuh-Hisat/list01.txt
```

```{r, engine='bash'}
head ../output/06-Ptuh-Hisat/list01.txt
```

```{r, engine='bash'}
python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \
-i ../output/06-Ptuh-Hisat/list01.txt \
-g ../output/06-Ptuh-Hisat/gene_count_matrix.csv \
-t ../output/06-Ptuh-Hisat/transcript_count_matrix.csv

head ../output/06-Ptuh-Hisat/*matrix.csv
```

```{bash}
cp ../output/06-Ptuh-Hisat/gene_count_matrix.csv ../output/06-Ptuh-Hisat/Ptuh-gene_count_matrix.csv
```

```{bash}
head ../output/06-Ptuh-Hisat/Ptuh-gene_count_matrix.csv
```

```{r}
library(tidyverse)

# Load the data. Every column is read as text so that values are written back
# exactly as they appear in the input (no numeric round-trip, no scientific
# notation).
file_path <- "../output/06-Ptuh-Hisat/Ptuh-gene_count_matrix.csv"
df <- read_csv(file_path, col_types = cols(.default = col_character()))

# The first column is treated as the row identifier.
# NOTE(review): presumably the gene_id column written by prepDE.py - confirm.
id_col <- names(df)[1]

# Transpose: each remaining input column becomes one output row (its name is
# stored in `original_column`), and each identifier becomes one output column.
transposed_df <- df %>%
  pivot_longer(
    cols = -all_of(id_col),
    names_to = "original_column",
    values_to = "value"
  ) %>%
  pivot_wider(names_from = all_of(id_col), values_from = value)

# Save the transposed data frame
output_path <- "../output/06-Ptuh-Hisat/Ptuh-gene_count_matrix_transposed.csv"
write_csv(transposed_df, output_path)

print(paste("Transposed CSV saved to", output_path))
```

```{bash}
head ../output/06-Ptuh-Hisat/Ptuh-gene_count_matrix_transposed.csv
```

# Getting FASTA for anno

Reference usage (not run):

```
$ bedtools getfasta [OPTIONS] -fi <input FASTA> -bed <BED/GFF/VCF>

grep -w "mRNA" annotation.gff > mRNA_only.gff
```

```{r, engine='bash'}
# Keep only the lines that contain "mRNA" as a whole word.
# NOTE(review): these paths point at Apulcra-genome.* although this notebook
# works on the Pocillopora_meandrina_HIv1 files downloaded above - confirm the
# intended annotation file.
grep -w "mRNA" ../data/Apulcra-genome.gff > ../data/Apulcra-genome-mRNA_only.gff
```

Count how many times each feature type (GFF column 3) occurs, ignoring comment lines:

```{r, engine='bash'}
awk '{if ($0 !~ /^#/ && $3 != "") {feature[$3]++}} END {for (f in feature) print f, feature[f]}' ../data/Apulcra-genome.gff
```

```{r, engine='bash'}
# Extract one FASTA record per mRNA feature.
# NOTE(review): the genome FASTA and GFF are the Apulcra files - confirm they
# are the intended inputs for this Ptuh output directory.
/home/shared/bedtools2/bin/fastaFromBed \
-fi ../data/Apulcra-genome.fa \
-bed ../data/Apulcra-genome-mRNA_only.gff \
-fo ../output/06-Ptuh-Hisat/genes.fasta \
-name -split
```

```{r, engine='bash'}
head ../output/06-Ptuh-Hisat/genes.fasta

# Number of sequences in the FASTA
grep -c ">" ../output/06-Ptuh-Hisat/genes.fasta
```

# SP Download

```{r, engine='bash'}
# Everything below runs inside ../data/blast_dbs; create it if needed and stop
# if it cannot be entered, so nothing is downloaded into the wrong directory.
mkdir -p ../data/blast_dbs
cd ../data/blast_dbs || exit 1

curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz

# Tag the download with the release label and keep the compressed copy (-k).
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_05.fasta.gz
gunzip -k uniprot_sprot_r2024_05.fasta.gz

head ../../data/blast_dbs/uniprot_sprot_r2024_05.fasta

echo "Number of Sequences"
grep -c ">" ../../data/blast_dbs/uniprot_sprot_r2024_05.fasta

# Build the protein BLAST database from the uncompressed FASTA.
/home/shared/ncbi-blast-2.15.0+/bin/makeblastdb \
-in ../../data/blast_dbs/uniprot_sprot_r2024_05.fasta \
-dbtype prot \
-out ../../data/blast_dbs/uniprot_sprot_r2024_05
```

# Blastx

```{r, engine='bash'}
fasta="../output/06-Ptuh-Hisat/genes.fasta"

# Nucleotide queries against the Swiss-Prot protein database; at most one
# target and one HSP per query, tabular output (-outfmt 6).
/home/shared/ncbi-blast-2.15.0+/bin/blastx \
-query "$fasta" \
-db ../data/blast_dbs/uniprot_sprot_r2024_05 \
-out ../output/06-Ptuh-Hisat/blastx_out.tab \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

wc -l ../output/06-Ptuh-Hisat/blastx_out.tab

# Replace every "|" with a tab so the pipe-delimited parts of the subject ID
# land in separate columns.
tr '|' '\t' < ../output/06-Ptuh-Hisat/blastx_out.tab \
> ../output/06-Ptuh-Hisat/blastx_out_sep.tab

head -1 ../output/06-Ptuh-Hisat/blastx_out_sep.tab
```

# Download GO info

```{bash}
cd ../output/06-Ptuh-Hisat || exit 1

curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo_c%2Cgo%2Cgo_f%2Cgo_id&format=tsv&query=%28*%29+AND+%28reviewed%3Atrue%29" -o SwissProt-Annot-GO.tsv

# Already inside the output directory, so refer to the file by name.
wc -l SwissProt-Annot-GO.tsv
```

Join blast with GO info

```{r}
# Tab-separated BLAST hits (no header row, so columns are V1, V2, ...) and the
# Swiss-Prot annotation table (with header row).
bltabl <- read.csv(
  "../output/06-Ptuh-Hisat/blastx_out_sep.tab",
  sep = "\t",
  header = FALSE
)
spgo <- read.csv(
  "../output/06-Ptuh-Hisat/SwissProt-Annot-GO.tsv",
  sep = "\t",
  header = TRUE
)

# Match column V3 of the BLAST table to the `Entry` column of the annotation
# table, then keep and rename the columns of interest.
annot_tab <- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
  select(
    query = V1,
    blast_hit = V3,
    evalue = V13,
    ProteinNames = Protein.names,
    BiologicalProcess = Gene.Ontology..biological.process.,
    GeneOntologyIDs = Gene.Ontology.IDs
  )

head(annot_tab)

# NOTE(review): the output file is prefixed "Apul" although it is written to
# the Ptuh output directory - confirm the intended name before renaming.
write.table(
  annot_tab,
  file = "../output/06-Ptuh-Hisat/Apul-gene-annot-GO.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
system("head ../output/06-Ptuh-Hisat/Apul-gene-annot-GO.tsv")
```