--- title: "10-Hisat" author: "Steven Roberts" date: "`r format(Sys.time(), '%d %B, %Y')`" output: github_document: toc: true toc_depth: 3 number_sections: true html_preview: true html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true editor_options: markdown: wrap: sentence --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DT) library(Biostrings) library(tm) library(pheatmap) library(DESeq2) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center", # Align plots to the center comment = "" # Prevents appending '##' to beginning of lines in code output ) ``` # Differentially Expressed Genes Alt splice test run # Reads ```{r, engine='bash', eval=TRUE} ls ../data/reads/* ``` Need to find treatment file... # Genome ![genome](http://gannet.fish.washington.edu/seashell/snaps/Monosnap_Gadus_macrocephalus_genome_assembly_ASM3116895v1_-_NCBI_-_NLM_2024-05-15_09-20-59.png) ```{r, engine='bash'} cd ../data /home/shared/datasets download genome accession GCF_031168955.1 --include gff3,gtf,rna,cds,protein,genome,seq-report ``` ```{r, engine='bash'} cd ../data unzip ncbi_dataset.zip ``` ```{r, engine='bash', eval=TRUE} ls /home/shared/8TB_HDD_03/sr320/github/project-cod-temperature/data/ncbi_dataset/data/GCF_031168955.1 ``` # Hisat ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2_extract_exons.py \ ../data/ncbi_dataset/data/GCF_031168955.1/genomic.gtf \ > ../output/10-hisat-deseq2/m_exon.tab ``` ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2_extract_splice_sites.py \ ../data/ncbi_dataset/data/GCF_031168955.1/genomic.gtf \ > ../output/10-hisat-deseq2/m_spice_sites.tab ``` ```{r, engine='bash'} echo "10-hisat-deseq2/GCF*" >> ../output/.gitignore ``` ```{r, engine='bash'} /home/shared/hisat2-2.2.1/hisat2-build \ ../data/ncbi_dataset/data/GCF_031168955.1/GCF_031168955.1_ASM3116895v1_genomic.fna \ ../output/10-hisat-deseq2/GCF_031168955.1.index \ --exon ../output/10-hisat-deseq2/m_exon.tab \ --ss ../output/10-hisat-deseq2/m_spice_sites.tab \ -p 20 \ ../data/ncbi_dataset/data/GCF_031168955.1/genomic.gtf \ 2> ../output/10-hisat-deseq2/hisat2-build_stats.txt ``` ```{r, engine='bash'} echo "10-hisat-deseq2/*sam" >> ../output/.gitignore ``` ```{r, engine='bash'} find ../data/reads/*.trimmed.R1.fastq.gz \ | xargs basename -s .trimmed.R1.fastq.gz | xargs -I{} \ /home/shared/hisat2-2.2.1/hisat2 \ -x ../output/10-hisat-deseq2/GCF_031168955.1.index \ --dta \ -p 20 \ -1 ../data/reads/{}.trimmed.R1.fastq.gz \ -2 ../data/reads/{}.trimmed.R2.fastq.gz \ -S ../output/10-hisat-deseq2/{}.sam \ 2> ../output/10-hisat-deseq2/hisat.out ``` ```{r, engine='bash'} echo "10-hisat-deseq2/*bam" >> ../output/.gitignore echo "10-hisat-deseq2/*bam*" >> ../output/.gitignore ``` ```{r, engine='bash'} for samfile in ../output/10-hisat-deseq2/*.sam; do bamfile="${samfile%.sam}.bam" sorted_bamfile="${samfile%.sam}.sorted.bam" /home/shared/samtools-1.12/samtools view -bS -@ 20 "$samfile" > "$bamfile" /home/shared/samtools-1.12/samtools sort -@ 20 "$bamfile" -o "$sorted_bamfile" /home/shared/samtools-1.12/samtools index -@ 20 "$sorted_bamfile" done ``` ```{r, engine='bash'} rm ../output/10-hisat-deseq2/*sam ``` ```{r, engine='bash'} ls ../output/10-hisat-deseq2/*sorted.bam | wc -l ``` ```{r, engine='bash', eval=TRUE} tail ../output/10-hisat-deseq2/hisat.out ``` ```{r, engine='bash', eval=TRUE} cat ../output/10-hisat-deseq2/hisat.out \ | grep "overall alignment rate" ``` # Stringtie ```{r, engine='bash'} echo "10-hisat-deseq2/*gtf" >> ../output/.gitignore ``` ```{r, engine='bash'} find ../output/10-hisat-deseq2/*sorted.bam \ | xargs basename -s .sorted.bam | xargs -I{} \ /home/shared/stringtie-2.2.1.Linux_x86_64/stringtie \ -p 36 \ -eB \ -G ../data/ncbi_dataset/data/GCF_031168955.1/genomic.gff \ -o ../output/10-hisat-deseq2/{}.gtf \ ../output/10-hisat-deseq2/{}.sorted.bam ``` # Count Matrix list format ``` RNA-ACR-140 ../output/15-Apul-hisat/RNA-ACR-140.gtf RNA-ACR-145 ../output/15-Apul-hisat/RNA-ACR-145.gtf RNA-ACR-173 ../output/15-Apul-hisat/RNA-ACR-173.gtf RNA-ACR-178 ../output/15-Apul-hisat/RNA-ACR-178.gtf ``` ```{r, engine='bash', eval=TRUE} ls ../output/10-hisat-deseq2/*gtf ``` ```{r, engine='bash', eval=TRUE} head ../data/list01.txt ``` ```{r, engine='bash'} python /home/shared/stringtie-2.2.1.Linux_x86_64/prepDE.py \ -i ../data/list01.txt \ -g ../output/10-hisat-deseq2/gene_count_matrix.csv \ -t ../output/10-hisat-deseq2/transcript_count_matrix.csv ``` ```{r, engine='bash', eval=TRUE} head ../output/10-hisat-deseq2/*matrix.csv ``` # DEseq2 need `conditions.txt` file in this format ``` SampleID Condition RNA.ACR.140 control RNA.ACR.145 control RNA.ACR.173 treated RNA.ACR.178 treated ``` ```{r, engine='bash', eval=TRUE} head ../data/conditions.txt ``` ```{r, eval=TRUE} library(DESeq2) ``` ```{r, eval=TRUE, cache=TRUE} # Load gene(/transcript) count matrix and labels countData <- as.matrix(read.csv("../output/10-hisat-deseq2/gene_count_matrix.csv", row.names="gene_id")) colData <- read.csv("../data/conditions.txt", sep="\t", row.names = 1) # Note: The PHENO_DATA file contains information on each sample, e.g., sex or population. # The exact way to import this depends on the format of the file. # Check all sample IDs in colData are also in CountData and match their orders all(rownames(colData) %in% colnames(countData)) # This should return TRUE countData <- countData[, rownames(colData)] all(rownames(colData) == colnames(countData)) # This should also return TRUE # Create a DESeqDataSet from count matrix and labels dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~ Condition) # Run the default analysis for DESeq2 and generate results table dds <- DESeq(dds) deseq2.res <- results(dds) # Sort by adjusted p-value and display resOrdered <- deseq2.res[order(deseq2.res$padj), ] vsd <- vst(dds, blind = FALSE) plotPCA(vsd, intgroup = "Condition") ``` ```{r, eval=TRUE} # Select top 50 differentially expressed genes res <- results(dds) res_ordered <- res[order(res$padj), ] top_genes <- row.names(res_ordered)[1:50] # Extract counts and normalize counts <- counts(dds, normalized = TRUE) counts_top <- counts[top_genes, ] # Log-transform counts log_counts_top <- log2(counts_top + 1) # Generate heatmap pheatmap(log_counts_top, scale = "row") ``` ```{r, eval=TRUE} # Count number of hits with adjusted p-value less then 0.05 dim(res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` ```{r, eval=TRUE} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG Dessication (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") ``` ```{r} write.table(tmp.sig, "../output/10-hisat-deseq2/DEGlist.tab", sep = '\t', row.names = T) ``` ```{r, eval=TRUE} deglist <- read.csv("../output/10-hisat-deseq2/DEGlist.tab", sep = '\t', header = TRUE) deglist$RowName <- rownames(deglist) deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columns ``` ```{r,eval=TRUE} head(deglist) ```