--- title: "13-Hisat" author: "Steven Roberts" date: "`r format(Sys.time(), '%d %B, %Y')`" analyses: github_document: toc: true toc_depth: 3 number_sections: true html_preview: true html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true editor_options: markdown: wrap: sentence --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DT) library(Biostrings) library(tm) library(pheatmap) library(DESeq2) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center", # Align plots to the center comment = "" # Prevents appending '##' to beginning of lines in code analyses ) ``` # Differentially Expressed Genes # Reads ```{r, engine='bash', eval=TRUE} ls /home/shared/8TB_HDD_02/graceac9/data/pycno2021/* ``` ```{r, engine='bash'} /home/shared/FastQC-0.12.1/fastqc \ /home/shared/8TB_HDD_02/graceac9/data/pycno2021/*fq.gz \ -t 36 \ -o ../analyses/13-hisat-deseq2/ ``` ```{r, engine='bash'} eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" conda activate which multiqc multiqc ../analyses/13-hisat-deseq2/ \ -o ../analyses/13-hisat-deseq2/ ``` # Genome ![](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Pycnopodia_helianthoides_genome_assembly_ASM3215829v1_-_NCBI_-_NLM_2024-05-25_15-32-13.png) ```{r, engine='bash'} cd ../data /home/shared/datasets download genome accession GCA_032158295.1 --include gff3,rna,cds,protein,genome,seq-report ``` ```{r, engine='bash'} cd ../data unzip ncbi_dataset.zip ``` ```{r, engine='bash', eval=TRUE} ls ../data/ncbi_dataset/data/GCA_032158295.1 ``` ## Annotation files ```{r, engine='bash', eval=TRUE} head ../analyses/12-fix-gff/mod_augustus.gtf head ../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna ``` # Hisat ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2_extract_exons.py \ ../analyses/12-fix-gff/mod_augustus.gtf \ > ../analyses/13-hisat-deseq2/m_exon.tab ``` ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2_extract_splice_sites.py \ ../analyses/12-fix-gff/mod_augustus.gtf \ > ../analyses/13-hisat-deseq2/m_spice_sites.tab ``` ```{r, engine='bash'} echo "13-hisat-deseq2/GCF*" >> ../analyses/.gitignore echo "13-hisat-deseq2/GCF**fastq" >> ../analyses/.gitignore ``` ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2-build \ ../data/ncbi_dataset/data/GCA_032158295.1/GCA_032158295.1_ASM3215829v1_genomic.fna \ ../analyses/13-hisat-deseq2/GCA_032158295.index \ --exon ../analyses/13-hisat-deseq2/m_exon.tab \ --ss ../analyses/13-hisat-deseq2/m_spice_sites.tab \ -p 20 \ ../analyses/12-fix-gff/mod_augustus.gtf \ 2> ../analyses/13-hisat-deseq2/hisat2-build_stats.txt ``` ```{r, engine='bash'} echo "13-hisat-deseq2/*sam" >> ../analyses/.gitignore ``` ```{r, engine='bash', eval=TRUE} find /home/shared/8TB_HDD_02/graceac9/data/pycno2021/*_R1_001.fastq.gz.fastp-trim.20220810.fq.gz | xargs basename -s _R1_001.fastq.gz.fastp-trim.20220810.fq.gz | xargs -I{} echo {} ``` keeping unmapped reads ```{r, engine='bash'} find /home/shared/8TB_HDD_02/graceac9/data/pycno2021/*_R1_001.fastq.gz.fastp-trim.20220810.fq.gz \ | xargs -I{} basename -s _R1_001.fastq.gz.fastp-trim.20220810.fq.gz {} \ | xargs -I{} sh -c '/home/shared/hisat2-2.2.1/hisat2 \ -x ../analyses/13-hisat-deseq2/GCA_032158295.index \ --dta \ -p 32 \ -1 /home/shared/8TB_HDD_02/graceac9/data/pycno2021/{}_R1_001.fastq.gz.fastp-trim.20220810.fq.gz \ -2 /home/shared/8TB_HDD_02/graceac9/data/pycno2021/{}_R2_001.fastq.gz.fastp-trim.20220810.fq.gz \ -S ../analyses/13-hisat-deseq2/{}_03.sam \ --un-conc ../analyses/13-hisat-deseq2/{}_unmapped_reads.fastq \ > ../analyses/13-hisat-deseq2/{}_hisat03.stdout 2> ../analyses/13-hisat-deseq2/{}_hisat03.stderr' ``` Explanation xargs -I{}: This option allows you to replace {} in the command with the output from the previous command (i.e., basename). It's used twice: first, to strip the suffix from the filenames, and second, to construct and execute the hisat2 command. sh -c: This is used to execute a complex command within xargs. It's necessary because the output redirection (\>, 2\>) is shell functionality, and without sh -c, xargs wouldn't handle it correctly. ```{r, engine='bash'} echo "13-hisat-deseq2/*bam" >> ../analyses/.gitignore echo "13-hisat-deseq2/*bam*" >> ../analyses/.gitignore ``` ```{r, engine='bash'} for samfile in ../analyses/13-hisat-deseq2/*.sam; do bamfile="${samfile%.sam}.bam" sorted_bamfile="${samfile%.sam}.sorted.bam" /home/shared/samtools-1.12/samtools view -bS -@ 20 "$samfile" > "$bamfile" /home/shared/samtools-1.12/samtools sort -@ 20 "$bamfile" -o "$sorted_bamfile" /home/shared/samtools-1.12/samtools index -@ 20 "$sorted_bamfile" done ``` , ```{r, engine='bash'} rm ../analyses/13-hisat-deseq2/*sam ``` ```{r, engine='bash'} ls ../analyses/13-hisat-deseq2/*sorted.bam | wc -l ``` # Stringtie ```{r, engine='bash'} echo "13-hisat-deseq2/*gtf" >> ../analyses/.gitignore ``` ```{bash} /home/shared/gffread-0.12.7.Linux_x86_64/gffread \ ../analyses/12-fix-gff/mod_augustus.gtf \ -T \ -o ../analyses/13-hisat-deseq2/mod_augustus.gff ``` ```{r, engine='bash'} find ../analyses/13-hisat-deseq2/*sorted.bam \ | xargs basename -s .sorted.bam | xargs -I{} \ sh -c '/home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \ -p 36 \ -eB \ -G ../analyses/13-hisat-deseq2/mod_augustus.gff \ -o ../analyses/13-hisat-deseq2/{}.gtf \ ../analyses/13-hisat-deseq2/{}.sorted.bam' ``` ```{r, engine='bash'} eval "$(/opt/anaconda/anaconda3/bin/conda shell.bash hook)" conda activate which multiqc multiqc ../analyses/13-hisat-deseq2/ \ -o ../analyses/13-hisat-deseq2/ ```