--- title: "18-deseq2" output: html_document date: "2024-09-18" --- Rmd to run `DESeq2` with the 2021 count matrix created using `kallisto` in the code Rmd: [17-kallisto-2021-2022.Rmd](https://github.com/grace-ac/paper-pycno-sswd-2021-2022/blob/main/code/17-kallisto-2021-2022.Rmd). Load packages needed to run code: ```{r} library(DESeq2) library(tidyverse) library(pheatmap) library(RColorBrewer) library(data.table) library(ggplot2) ``` # load rounded count matrix of all 64 coelomocyte RNAseq libraries with genome gene transcript list FASTA alignment: ```{r} countmatrix <- read.delim("../data/2021-2022_kallisto_count_matrix_rounded_20240918.tab", header = TRUE, sep = '\t') head(countmatrix) ``` # 2021 Experiment `DESeq2` Work Subset the libraries from "Experiment B" from 2021 work - stars that were part of the final experiment that were injected with their inoculate (either unfiltered raw tissue homogenate from a wasting adult _Pycnopodia helianthoides_, or a heat-killed raw tissue homogenate from a wasting adult _Pycnopodia helianthoides_). Stars H03 and H05 had two time points as controls, so I picked one of each so that it's a balanced 8 vs 8 comparison. | library_ID | star_ID | previous_exposure | treatment | sample_date | experiment_day | |------------|---------|-----------------------------|-------------------|-------------|----------------| | PSC.56 | E06 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/14/21 | 9 | | PSC.52 | E11 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/14/21 | 9 | | PSC.54 | E18 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/14/21 | 9 | | PSC.61 | E09 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/15/21 | 10 | | PSC.64 | H05 | NA | control (10/5/21) | 10/15/21 | 10 | | PSC.73 | H03 | NA | control (10/5/21) | 10/16/21 | 11 | | PSC.76 | E10 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/17/21 | 12 | | PSC.81 | E07 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/18/21 | 13 | | PSC.59 | H01 | NA | exposed (10/5/21) | 10/14/21 | 9 | | PSC.57 | H06 | NA | exposed (10/5/21) | 10/14/21 | 9 | | PSC.69 | H04 | NA | exposed (10/5/21) | 10/15/21 | 10 | | PSC.67 | H09 | NA | exposed (10/5/21) | 10/15/21 | 10 | | PSC.71 | H18 | NA | exposed (10/5/21) | 10/15/21 | 10 | | PSC.75 | H08 | NA | exposed (10/5/21) | 10/16/21 | 11 | | PSC.78 | H10 | NA | exposed (10/5/21) | 10/17/21 | 12 | | PSC.83 | H07 | NA | exposed (10/5/21) | 10/18/21 | 13 | ```{r} expB <- select(countmatrix, "PSC.56", "PSC.52", "PSC.54", "PSC.61", "PSC.64", "PSC.73", "PSC.76", "PSC.81", "PSC.59", "PSC.57", "PSC.69", "PSC.67", "PSC.71", "PSC.75", "PSC.78", "PSC.83") head(expB) ``` 26,581 rows, 64 16 libraries Make a data frame for the comparison: ```{r} colData <- data.frame(condition=factor(c("control", "control", "control", "control", "control", "control", "control", "control", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed")), type=factor(rep("paired-end",16))) rownames(colData) <- colnames(expB) dds <- DESeqDataSetFromMatrix(countData = expB, colData = colData, design = ~ condition) ``` ```{r} # Check all sample IDs in colData are also in CountData and match their orders all(rownames(colData) %in% colnames(expB)) #should return TRUE ``` ```{r} expB <- expB[, rownames(colData)] all(rownames(colData) == colnames(expB)) # This should also return TRUE ``` ```{r} # Create a DESeqDataSet from count matrix and labels dds <- DESeqDataSetFromMatrix(countData = expB, colData = colData, design = ~ condition) # Run the default analysis for DESeq2 and generate results table dds <- DESeq(dds) deseq2.res <- results(dds) # Sort by adjusted p-value and display resOrdered <- deseq2.res[order(deseq2.res$padj), ] vsd <- vst(dds, blind = FALSE) eBgPCA <- plotPCA(vsd, intgroup = "condition") nudge <- position_nudge(y = 4) eBgPCA + geom_text(aes(label = name), position = nudge) ``` ```{r} # Select top 50 differentially expressed genes res <- results(dds) res_ordered <- res[order(res$padj), ] top_genes <- row.names(res_ordered)[1:50] # Extract counts and normalize counts <- counts(dds, normalized = TRUE) counts_top <- counts[top_genes, ] # Log-transform counts log_counts_top <- log2(counts_top + 1) # Generate heatmap pheatmap(log_counts_top, scale = "row") ``` ```{r} # Count number of hits with adjusted p-value less then 0.05 dim(res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` 7833 degs ```{r} exBg <- deseq2.res # The main plot plot(exBg$baseMean, exBg$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray", main="DEG SSWD Exposed (pval <= 0.05)", xlab="mean of normalized counts", ylab="Log2 Fold Change") # Getting the significant points and plotting them again so they're a different color exBg.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ] points(exBg.sig$baseMean, exBg.sig$log2FoldChange, pch=20, cex=0.45, col="red") # 2 FC lines abline(h=c(-1,1), col="blue") ``` ### `join` the DEG list with BLAST output: the file that is the DEG list is: exBg.sig Write out differentially expressed gene list: ```{r} #write.table(exBg.sig, "../data/expB_DEG_DET_lists/DEGlist_8v8_2021_exposedVcontrol.tab", sep = '\t', row.names = T, quote = F) ``` comment out the code - wrote out 2024-09-18 read in the DEG list: ```{r} deg2021 <- read.table("../data/expB_DEG_DET_lists/DEGlist_8v8_2021_exposedVcontrol.tab") head(deg2021) ``` set rownames as a colum called transcript ID ```{r} library(tibble) deg2021 <- tibble::rownames_to_column(deg2021, "transcriptID") head(deg2021) ``` Read in the BLAST output: ```{r} blast <- read.table("../analyses/16-blast-annotation/blast_out_sep.tab", ) head(blast) ``` rename the first column: rename first column "transcriptID": ```{r} colnames(blast) <- c("transcriptID", "V2", "uniprot_accession_ID", "gene", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14") head(blast) ``` `join` the two tables by "transcriptID: ```{r} blastdeg2021 <- left_join(deg2021, blast, by = "transcriptID") head(blastdeg2021) ``` ```{r} library(tibble) expB <- tibble::rownames_to_column(expB, "transcriptID") head(expB) ``` `join` the above by the count matrix ```{r} blastdeg2021counts <- left_join(blastdeg2021, expB, by = "transcriptID") head(blastdeg2021counts) ``` write out the table: Write out differentially expressed gene list: ```{r} #write.table(blastdeg2021counts, "../data/expB_DEG_DET_lists/DEGlist_8v8_2021_exposedVcontrol_counts_BLAST.tab", sep = '\t', row.names = F, quote = F) ``` comment out the code - wrote out 2024-09-18 # Summer 2022 `DESeq2` Work Want a balanced comparison. Below is a table detailing the samples: | library_ID | star_ID | star_age_class | treatment | experiment_day | total_armdrop | total_armtwist | disease_sign | |------------|---------|----------------|-----------|----------------|---------------|----------------|---------------| | PSC.0186 | 15 | adult | control | 11 | 0 | 0 | healthy | | PSC.0203 | 31 | adult | control | 12 | 0 | 0 | healthy | | PSC.0209 | 9 | adult | control | 13 | 0 | 0 | healthy | | PSC.0177 | 34 | juvenile | control | 11 | 0 | 0 | healthy | | PSC.0219 | 63 | juvenile | control | 14 | 0 | 0 | healthy | | PSC.0230 | 65 | juvenile | control | 15 | 0 | 0 | healthy | | PSC.0174 | 30 | adult | exposed | 10 | 1 | 6 | first_armdrop | | PSC.0190 | 10 | adult | exposed | 11 | 1 | 0 | first_armdrop | | PSC.0231 | 13 | adult | exposed | 15 | 1 | 5 | first_armdrop | | PSC.0187 | 57 | juvenile | exposed | 11 | 1 | 1 | first_armdrop | | PSC.0188 | 38 | juvenile | exposed | 11 | 1 | 0 | first_armdrop | | PSC.0228 | 61 | juvenile | exposed | 14 | 1 | 0 | first_armdrop | subset just the counts for the libraries specified in the table above: ```{r} counts2022 <- select(countmatrix, PSC.0228, PSC.0187, PSC.0188, PSC.0174, PSC.0190, PSC.0231, PSC.0230, PSC.0219, PSC.0177, PSC.0186, PSC.0209, PSC.0203) head(counts2022) ``` ```{r} deseq2.colData <- data.frame(condition=factor(c("exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control", "control", "control")), type=factor(rep("paired-end", 12)), age=factor(c("juvenile", "juvenile", "juvenile", "adult", "adult", "adult", "juvenile", "juvenile", "juvenile", "adult", "adult", "adult"))) rownames(deseq2.colData) <- colnames(counts2022) ``` ```{r} # Check all sample IDs in colData are also in CountData and match their orders all(rownames(deseq2.colData) %in% colnames(counts2022)) #should return TRUE ``` ```{r} counts2022 <- counts2022[, rownames(deseq2.colData)] all(rownames(deseq2.colData) == colnames(counts2022)) # This should also return TRUE ``` ```{r} # Create a DESeqDataSet from count matrix and labels, with the design taking age into account dds <- DESeqDataSetFromMatrix(countData = counts2022, colData = deseq2.colData, design = ~ condition + age) ``` Check levels of age and condition ```{r} levels(dds$age) ``` ```{r} levels(dds$condition) ``` note: `DESeq2` automatically puts the levels in alphabetical order and the first listed level is the reference level for the factor. We want control to be the reference. ```{r} dds$condition = relevel(dds$condition, "control") levels(dds$condition) ``` The following will pull the results from `condition` because that is our variable of interest. This tells us how age contributes to the DEGs ```{r} design(dds) <- formula(~ age + condition) dds <- DESeq(dds) ``` Access results: ```{r} deseq2.res <- results(dds) head(deseq2.res) ``` ```{r} summary(deseq2.res) ``` ```{r} # Count number of hits with adjusted p-value less then 0.05 dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]) ``` 6496 DEGs ```{r} inf <- deseq2.res # The main plot plot(inf$baseMean, inf$log2FoldChange, pch=20, cex=0.45, ylim=c(-15, 15), log="x", col="darkgray", #main="Infection Status (pval