--- title: "Differential Gene Expression (dge)" author: "Celeste Valdivia" date: "2023-04-14" output: html_document --- ```{r setup, include=FALSE} library(knitr) library(tidyverse) library(kableExtra) library(DT) library(tm) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Setting this to false will allow us to knit the codument without evaluating code chunks warning = FALSE, # Hide warnings message = FALSE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` # Downloading sequence reads With wget's recursive feature we access all 24 fastq files for this data set. ```{r downloading sequence reads, engine= 'bash'} cd ../data wget --no-check-certificate --recursive --no-parent --no-directories \ --accept '*.fastq.gz' \ https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/nopp/ ``` # Downloading the reference This code indexes an rna.fna file and renames it cgigas_roslin_rna.index and puts it in the data directory. ```{r, engine= 'bash'} cd ../data curl --insecure -O https://gannet.fish.washington.edu/seashell/bu-github/nb-2023/Cgigas/data/rna.fna ``` ```{r downloading reference, engine = 'bash'} /home/shared/kallisto/kallisto \ index -i \ ../data/cgigas_roslin_rna.index \ ../data/rna.fna ``` # Create abundance estimates Running the kallisto quant command on each input fastq file (that is found with the command find with using the wildcard '*' in front of the extension fastq.gz). It then writes output files to the sub-directory kallisto_01. ```{r output abundance and sequence run info per sequence read, engine= 'bash'} mkdir ../output/kallisto_01 find ../data/*fastq.gz \ | xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \ quant -i ../data/cgigas_roslin_rna.index \ -o ../output/kallisto_01/{} \ -t 40 \ --single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz ``` Down below we will be creating a gene expression matrix from the kallisto output files using the command abundance_estimates_to_matrix.pl from the Trinity RNA-seq assembly software package. ```{r quantify abundance, engine = 'bash'} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map none \ --out_prefix ../output/kallisto_01 \ --name_sample_by_basedir \ ../output/kallisto_01/D54_S145/abundance.tsv \ ../output/kallisto_01/D56_S136/abundance.tsv \ ../output/kallisto_01/D58_S144/abundance.tsv \ ../output/kallisto_01/M45_S140/abundance.tsv \ ../output/kallisto_01/M48_S137/abundance.tsv \ ../output/kallisto_01/M89_S138/abundance.tsv \ ../output/kallisto_01/D55_S146/abundance.tsv \ ../output/kallisto_01/D57_S143/abundance.tsv \ ../output/kallisto_01/D59_S142/abundance.tsv \ ../output/kallisto_01/M46_S141/abundance.tsv \ ../output/kallisto_01/M49_S139/abundance.tsv \ ../output/kallisto_01/M90_S147/abundance.tsv \ ../output/kallisto_01/N48_S194/abundance.tsv \ ../output/kallisto_01/N50_S187/abundance.tsv \ ../output/kallisto_01/N52_S184/abundance.tsv \ ../output/kallisto_01/N54_S193/abundance.tsv \ ../output/kallisto_01/N56_S192/abundance.tsv \ ../output/kallisto_01/N58_S195/abundance.tsv \ ../output/kallisto_01/N49_S185/abundance.tsv \ ../output/kallisto_01/N51_S186/abundance.tsv \ ../output/kallisto_01/N53_S188/abundance.tsv \ ../output/kallisto_01/N55_S190/abundance.tsv \ ../output/kallisto_01/N57_S191/abundance.tsv \ ../output/kallisto_01/N59_S189/abundance.tsv ``` Next we will install the R package BioManager which will allow us to install DEseq2 for our dge analysis. If you already have this package you can skip this step. ```{r Installing BioManager which will then be able to install DESeq2, echo=FALSE} if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("DESeq2") ``` # Using DESeq2 to conduct a differential expression analysis Below are the required packages needed to conduct a dge in R. ```{r requirements} #requirements library(DESeq2) #running several functions from this package to conduct a differential expression analysis between our two treatments library(tidyverse) library(pheatmap) library(RColorBrewer) library(data.table) ``` We will be reading in our count matrix and formatting it. ```{r read in count matrix} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] head(countmatrix) # let's take a peak at the count matrix we just read in ``` Next we wantt to round up the values in the count matrix ```{r rounding up values} countmatrix <- round(countmatrix, 0) str(countmatrix) ``` Next we will conduct a dge based on the treatments of the data we are evaluating. Here it's control vs desiccation in *C. gigas*. ```{r DGEs based on dessecation} deseq2.colData <- data.frame(condition=factor(c(rep("control", 12), rep("desicated", 12))), type=factor(rep("single-read", 24))) rownames(deseq2.colData) <- colnames(data) deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` ```{r} deseq2.dds <- DESeq(deseq2.dds) deseq2.res <- results(deseq2.dds) deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] head(deseq2.res) ``` ```{r} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` the following code chunk will take the resutls we produced using DESeq2 to make a plot of differentially expressed genes ```{r} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG Dessication (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") ``` ```{r} write.table(tmp.sig, "../output/DEGlist.tab", row.names = T) ```