--- title: "05-DESeq2" format: html editor: visual --- # Running DESeq2 This code performs differential expression analysis to identify deferentially expressed genes (DEGs) between a control condition and a desiccated condition in Pacific oysters. First, it reads in a count matrix of isoform counts generated by `kallisto`, with row names set to the gene/transcript IDs and the first column removed. It then rounds the counts to whole numbers. The `results()` function is used to extract the results table, which is ordered by gene/transcript ID. The code then prints the top few rows of the results table and calculates the number of DEGs with an adjusted p-value less than or equal to 0.05. It plots the log2 fold changes versus the mean normalized counts for all genes, highlighting significant DEGs in red and adding horizontal lines at 2-fold upregulation and downregulation. Finally, it writes the list of significant DEGs to a file called "DEGlist.tab". ## Install Packages ```{r} if ("DESeq2" %in% rownames(installed.packages()) == 'FALSE') BiocManager::install('DESeq2') if ("kableExtra" %in% rownames(installed.packages()) == 'FALSE') install.packages('kableExtra') ``` ### Load Libraries ```{r} library(DESeq2) library(kableExtra) ``` ### Read in count matrix ```{r} countmatrix <- read.delim("../output/kallisto_01.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] head(countmatrix) ``` This count matrix has 54,384 observations (genes) and 8 samples (variables) ### Round integers up to whole numbers for analysis ```{r} countmatrix <- round(countmatrix, 0) head(countmatrix) ``` ### Create Dataframe Here the code creates a `data.frame` named `deseq2.colData` containing information about the experimental conditions (embryos & recruits). It uses the column data dataframe named `deseq2.colData` to create a `DESeqDataSet` object using the `DESeqDataSetFromMatrix` function from the DESeq2 package. ```{r} # make a dataframe deseq2.colData <- data.frame(condition=factor(c(rep("embryos", 4), rep("recruits", 4))), type=factor(rep("single-read", 8))) # set row names to match the column names in the count matrix rownames(deseq2.colData) <- colnames(data) # DESeqDataSet object created using the `DESeqDataSetFromMatrix` function deseq2.dds <- DESeqDataSetFromMatrix(countData = countmatrix, colData = deseq2.colData, design = ~ condition) ``` ### Negative Binomial The `DESeqDataSet` object, named `deseq.dds`, is then passed to the `DESeq()` function to fit a negative binomial model and estimate dispersions. ```{r} deseq2.dds <- DESeq(deseq2.dds) deseq2.res <- results(deseq2.dds) deseq2.res <- deseq2.res[order(rownames(deseq2.res)), ] ``` ```{r} str(deseq2.res) ``` ```{r} dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` # Plotting ## PCA ```{r} vsd <- vst(deseq2.dds, blind = FALSE) plotPCA(vsd, intgroup = "condition") ``` ## Heatmap ### Install Packages ```{r} if ("pheatmap" %in% rownames(installed.packages()) == 'FALSE') install.packages('pheatmap') ``` ### Load Libraries ```{r} library(pheatmap) ``` ### Top 20 Differentially Expressed Genes ```{r} # Select top 50 differentially expressed genes res <- results(deseq2.dds) res_ordered <- res[order(res$padj), ] top_genes <- row.names(res_ordered)[1:20] # Extract counts and normalize counts <- counts(deseq2.dds, normalized = TRUE) counts_top <- counts[top_genes, ] # Log-transform counts log_counts_top <- log2(counts_top + 1) # Generate heatmap pheatmap(log_counts_top, scale = "row") ``` ```{r} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG Coral Early Life History (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") ``` ```{r} tmp <- deseq2.res # The main plot plot(tmp$baseMean, tmp$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG Coral Early Life History (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color tmp.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(tmp.sig$baseMean, tmp.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") # Save plot ggsave(path = "../figs", filename="DEGlog2fold-plot.png", plot = DEGplot) ``` ### Volcano Plot ```{r} # Prepare the data for plotting res_df <- as.data.frame(deseq2.res) res_df$gene <- row.names(res_df) # Create volcano plot volcano_plot <- ggplot(res_df, aes(x = log2FoldChange, y = -log10(padj), color = padj < 0.05)) + geom_point(alpha = 0.6, size = 1.5) + scale_color_manual(values = c("grey", "red")) + labs(title = "Volcano Plot", x = "Log2 Fold Change", y = "-Log10 Adjusted P-value", color = "Significantly\nDifferentially Expressed") + theme_minimal() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.position = "top") # Save plot ggsave(path = "../figs", filename="volcano-plot.png") # Print plot print(volcano_plot) ``` ## Save the list of Differentially Expressed Genes (DEGs)! Write the output to a table ```{r} write.table(tmp.sig, "../output/DEGlist.tab", sep = '\t', row.names = T) ``` Let's look at this list ```{r} deglist <- read.csv("../output/DEGlist.tab", sep = '\t', header = TRUE) deglist$RowName <- rownames(deglist) deglist2 <- deglist[, c("RowName", "pvalue")] # Optionally, reorder the columns head(deglist2) ```