---
title: "18-deseq2"
output: html_document
date: "2024-09-18"
---
Rmd to run `DESeq2` with the 2021 count matrix created using `kallisto` in the code Rmd: [17-kallisto-2021-2022.Rmd](https://github.com/grace-ac/paper-pycno-sswd-2021-2022/blob/main/code/17-kallisto-2021-2022.Rmd).
Load packages needed to run code:
```{r}
library(DESeq2)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(ggplot2)
```
# load rounded count matrix of all 64 coelomocyte RNAseq libraries with genome gene transcript list FASTA alignment:
```{r}
countmatrix <- read.delim("../data/2021-2022_kallisto_count_matrix_rounded_20240918.tab", header = TRUE, sep = '\t')
head(countmatrix)
```
# 2021 Experiment `DESeq2` Work
Subset the libraries from "Experiment B" from 2021 work - stars that were part of the final experiment that were injected with their inoculate (either unfiltered raw tissue homogenate from a wasting adult _Pycnopodia helianthoides_, or a heat-killed raw tissue homogenate from a wasting adult _Pycnopodia helianthoides_). Stars H03 and H05 had two time points as controls, so I picked one of each so that it's a balanced 8 vs 8 comparison.
| library_ID | star_ID | previous_exposure | treatment | sample_date | experiment_day |
|------------|---------|-----------------------------|-------------------|-------------|----------------|
| PSC.56 | E06 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/14/21 | 9 |
| PSC.52 | E11 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/14/21 | 9 |
| PSC.54 | E18 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/14/21 | 9 |
| PSC.61 | E09 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/15/21 | 10 |
| PSC.64 | H05 | NA | control (10/5/21) | 10/15/21 | 10 |
| PSC.73 | H03 | NA | control (10/5/21) | 10/16/21 | 11 |
| PSC.76 | E10 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/17/21 | 12 |
| PSC.81 | E07 | 0.45 live inoculate 9/23/21 | control (10/5/21) | 10/18/21 | 13 |
| PSC.59 | H01 | NA | exposed (10/5/21) | 10/14/21 | 9 |
| PSC.57 | H06 | NA | exposed (10/5/21) | 10/14/21 | 9 |
| PSC.69 | H04 | NA | exposed (10/5/21) | 10/15/21 | 10 |
| PSC.67 | H09 | NA | exposed (10/5/21) | 10/15/21 | 10 |
| PSC.71 | H18 | NA | exposed (10/5/21) | 10/15/21 | 10 |
| PSC.75 | H08 | NA | exposed (10/5/21) | 10/16/21 | 11 |
| PSC.78 | H10 | NA | exposed (10/5/21) | 10/17/21 | 12 |
| PSC.83 | H07 | NA | exposed (10/5/21) | 10/18/21 | 13 |
```{r}
expB <- select(countmatrix, "PSC.56", "PSC.52", "PSC.54", "PSC.61", "PSC.64", "PSC.73", "PSC.76", "PSC.81", "PSC.59", "PSC.57", "PSC.69", "PSC.67", "PSC.71", "PSC.75", "PSC.78", "PSC.83")
head(expB)
```
26,581 rows, 64 16 libraries
Make a data frame for the comparison:
```{r}
colData <- data.frame(condition=factor(c("control", "control", "control", "control", "control", "control", "control", "control", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed")),
type=factor(rep("paired-end",16)))
rownames(colData) <- colnames(expB)
dds <- DESeqDataSetFromMatrix(countData = expB,
colData = colData,
design = ~ condition)
```
```{r}
# Check all sample IDs in colData are also in CountData and match their orders
all(rownames(colData) %in% colnames(expB)) #should return TRUE
```
```{r}
expB <- expB[, rownames(colData)]
all(rownames(colData) == colnames(expB)) # This should also return TRUE
```
```{r}
# Create a DESeqDataSet from count matrix and labels
dds <- DESeqDataSetFromMatrix(countData = expB,
colData = colData, design = ~ condition)
# Run the default analysis for DESeq2 and generate results table
dds <- DESeq(dds)
deseq2.res <- results(dds)
# Sort by adjusted p-value and display
resOrdered <- deseq2.res[order(deseq2.res$padj), ]
vsd <- vst(dds, blind = FALSE)
eBgPCA <- plotPCA(vsd, intgroup = "condition")
nudge <- position_nudge(y = 4)
eBgPCA + geom_text(aes(label = name), position = nudge)
```
```{r}
# Select top 50 differentially expressed genes
res <- results(dds)
res_ordered <- res[order(res$padj), ]
top_genes <- row.names(res_ordered)[1:50]
# Extract counts and normalize
counts <- counts(dds, normalized = TRUE)
counts_top <- counts[top_genes, ]
# Log-transform counts
log_counts_top <- log2(counts_top + 1)
# Generate heatmap
pheatmap(log_counts_top, scale = "row")
```
```{r}
# Count number of hits with adjusted p-value less then 0.05
dim(res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
```
7833 degs
```{r}
exBg <- deseq2.res
# The main plot
plot(exBg$baseMean, exBg$log2FoldChange, pch=20, cex=0.45, ylim=c(-3, 3), log="x", col="darkgray",
main="DEG SSWD Exposed (pval <= 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
exBg.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(exBg.sig$baseMean, exBg.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
```
### `join` the DEG list with BLAST output:
the file that is the DEG list is:
exBg.sig
Write out differentially expressed gene list:
```{r}
#write.table(exBg.sig, "../data/expB_DEG_DET_lists/DEGlist_8v8_2021_exposedVcontrol.tab", sep = '\t', row.names = T, quote = F)
```
comment out the code - wrote out 2024-09-18
read in the DEG list:
```{r}
deg2021 <- read.table("../data/expB_DEG_DET_lists/DEGlist_8v8_2021_exposedVcontrol.tab")
head(deg2021)
```
set rownames as a colum called transcript ID
```{r}
library(tibble)
deg2021 <- tibble::rownames_to_column(deg2021, "transcriptID")
head(deg2021)
```
Read in the BLAST output:
```{r}
blast <- read.table("../analyses/16-blast-annotation/blast_out_sep.tab", )
head(blast)
```
rename the first column:
rename first column "transcriptID":
```{r}
colnames(blast) <- c("transcriptID", "V2", "uniprot_accession_ID", "gene", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14")
head(blast)
```
`join` the two tables by "transcriptID:
```{r}
blastdeg2021 <- left_join(deg2021, blast, by = "transcriptID")
head(blastdeg2021)
```
```{r}
library(tibble)
expB <- tibble::rownames_to_column(expB, "transcriptID")
head(expB)
```
`join` the above by the count matrix
```{r}
blastdeg2021counts <- left_join(blastdeg2021, expB, by = "transcriptID")
head(blastdeg2021counts)
```
write out the table:
Write out differentially expressed gene list:
```{r}
#write.table(blastdeg2021counts, "../data/expB_DEG_DET_lists/DEGlist_8v8_2021_exposedVcontrol_counts_BLAST.tab", sep = '\t', row.names = F, quote = F)
```
comment out the code - wrote out 2024-09-18
# Summer 2022 `DESeq2` Work
Want a balanced comparison.
Below is a table detailing the samples:
| library_ID | star_ID | star_age_class | treatment | experiment_day | total_armdrop | total_armtwist | disease_sign |
|------------|---------|----------------|-----------|----------------|---------------|----------------|---------------|
| PSC.0186 | 15 | adult | control | 11 | 0 | 0 | healthy |
| PSC.0203 | 31 | adult | control | 12 | 0 | 0 | healthy |
| PSC.0209 | 9 | adult | control | 13 | 0 | 0 | healthy |
| PSC.0177 | 34 | juvenile | control | 11 | 0 | 0 | healthy |
| PSC.0219 | 63 | juvenile | control | 14 | 0 | 0 | healthy |
| PSC.0230 | 65 | juvenile | control | 15 | 0 | 0 | healthy |
| PSC.0174 | 30 | adult | exposed | 10 | 1 | 6 | first_armdrop |
| PSC.0190 | 10 | adult | exposed | 11 | 1 | 0 | first_armdrop |
| PSC.0231 | 13 | adult | exposed | 15 | 1 | 5 | first_armdrop |
| PSC.0187 | 57 | juvenile | exposed | 11 | 1 | 1 | first_armdrop |
| PSC.0188 | 38 | juvenile | exposed | 11 | 1 | 0 | first_armdrop |
| PSC.0228 | 61 | juvenile | exposed | 14 | 1 | 0 | first_armdrop |
subset just the counts for the libraries specified in the table above:
```{r}
counts2022 <- select(countmatrix, PSC.0228, PSC.0187, PSC.0188, PSC.0174, PSC.0190, PSC.0231, PSC.0230, PSC.0219, PSC.0177, PSC.0186, PSC.0209, PSC.0203)
head(counts2022)
```
```{r}
deseq2.colData <- data.frame(condition=factor(c("exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control", "control", "control")),
type=factor(rep("paired-end", 12)),
age=factor(c("juvenile", "juvenile", "juvenile", "adult", "adult", "adult", "juvenile", "juvenile", "juvenile", "adult", "adult", "adult")))
rownames(deseq2.colData) <- colnames(counts2022)
```
```{r}
# Check all sample IDs in colData are also in CountData and match their orders
all(rownames(deseq2.colData) %in% colnames(counts2022)) #should return TRUE
```
```{r}
counts2022 <- counts2022[, rownames(deseq2.colData)]
all(rownames(deseq2.colData) == colnames(counts2022)) # This should also return TRUE
```
```{r}
# Create a DESeqDataSet from count matrix and labels, with the design taking age into account
dds <- DESeqDataSetFromMatrix(countData = counts2022,
colData = deseq2.colData,
design = ~ condition + age)
```
Check levels of age and condition
```{r}
levels(dds$age)
```
```{r}
levels(dds$condition)
```
note: `DESeq2` automatically puts the levels in alphabetical order and the first listed level is the reference level for the factor.
We want control to be the reference.
```{r}
dds$condition = relevel(dds$condition, "control")
levels(dds$condition)
```
The following will pull the results from `condition` because that is our variable of interest. This tells us how age contributes to the DEGs
```{r}
design(dds) <- formula(~ age + condition)
dds <- DESeq(dds)
```
Access results:
```{r}
deseq2.res <- results(dds)
head(deseq2.res)
```
```{r}
summary(deseq2.res)
```
```{r}
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ])
```
6496 DEGs
```{r}
inf <- deseq2.res
# The main plot
plot(inf$baseMean, inf$log2FoldChange, pch=20, cex=0.45, ylim=c(-15, 15), log="x", col="darkgray",
#main="Infection Status (pval = 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
inf.sig <- deseq2.res[!is.na(deseq2.res$padj) & deseq2.res$padj <= 0.05, ]
points(inf.sig$baseMean, inf.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
```
Write out differentially expressed gene list:
```{bash}
pwd
```
```{r}
#write.table(inf.sig, "../data/2022_DEG_DET_lists/DEGlist_2022_exposed_v_control_no_contrast.tab", sep = '\t', row.names = T, quote = FALSE)
```
wrote out 2024-09-18
### `join` the DEG list with BLAST output:
read in the DEG list:
```{r}
deg2022noage <- read.delim("../data/2022_DEG_DET_lists/DEGlist_2022_exposed_v_control_no_contrast.tab")
head(deg2022noage)
```
set rownames as a colum called transcript ID
```{r}
library(tibble)
deg2022noage <- tibble::rownames_to_column(deg2022noage, "transcriptID")
head(deg2022noage)
```
`join` the two tables by "transcriptID:
```{r}
blastdeg2022 <- left_join(deg2022noage, blast, by = "transcriptID")
head(blastdeg2022)
```
```{r}
library(tibble)
counts2022 <- tibble::rownames_to_column(counts2022, "transcriptID")
head(counts2022)
```
`join` the above by the count matrix
```{r}
blastdeg2022counts <- left_join(blastdeg2022, counts2022, by = "transcriptID")
head(blastdeg2022counts)
```
write out the table:
Write out differentially expressed gene list:
```{r}
#write.table(blastdeg2022counts, "../data/2022_DEG_DET_lists/DEGlist_2022_noage_contrast_counts_BLAST.tab", sep = '\t', row.names = F, quote = F)
```
comment out the code - wrote out 2024-09-18
Now need to perform a contrast to see if there is a difference between these groups as it relates to age.
In the multifactor section of the `DESeq2` manual:
The contrast argument of the function _results_ needs a character vector of three componenets: the name of the variable (in this case "age"), and the name of the factor level for the numerator of the log2 ratio (juvenile) and the denominator (adult)
A **contrast** is a linear combination of estimated log2 fold changes. Can be used to test if differences between groups are equal to zero.
```{r}
resultsNames(dds)
```
```{r}
deseq2.resage <- results(dds,
contrast = c("age", "juvenile", "adult"))
head(deseq2.resage)
```
```{r}
age <- deseq2.resage
# The main plot
plot(age$baseMean, age$log2FoldChange, pch=20, cex=0.45, ylim=c(-15, 15), log="x", col="darkgray",
#main="Age (pval = 0.05)",
xlab="mean of normalized counts",
ylab="Log2 Fold Change")
# Getting the significant points and plotting them again so they're a different color
age.sig <- deseq2.resage[!is.na(deseq2.resage$padj) & deseq2.resage$padj <= 0.05, ]
points(age.sig$baseMean, age.sig$log2FoldChange, pch=20, cex=0.45, col="red")
# 2 FC lines
abline(h=c(-1,1), col="blue")
```
```{r}
# Count number of hits with adjusted p-value less then 0.05
dim(deseq2.resage[!is.na(deseq2.resage$padj) & deseq2.resage$padj <= 0.05, ])
```
85 DEGs contrasted by age
```{r}
summary(deseq2.resage)
```
write out the DEG age contrast list:
```{r}
#write.table(age.sig, "../data/2022_DEG_DET_lists/DEGlist_2022-contrast_age.tab", sep = "\t", row.names = T, quote = FALSE, col.names = TRUE)
```
wrote out 2024-09-18
annotate the list:
read in the DEg age contrast list:
```{r}
degage <- read.delim("../data/2022_DEG_DET_lists/DEGlist_2022-contrast_age.tab")
head(degage)
```
set rownames as a colum called transcript ID
```{r}
library(tibble)
degage <- tibble::rownames_to_column(degage, "transcriptID")
head(degage)
```
`join` the above with blast output and the count matrix:
`join` the two tables by "transcriptID:
```{r}
blastdegage <- left_join(degage, blast, by = "transcriptID")
head(blastdegage)
```
`join` the above by the count matrix
```{r}
blastdegagecounts <- left_join(blastdegage, counts2022, by = "transcriptID")
head(blastdegagecounts)
```
write out the table:
Write out differentially expressed gene list:
```{r}
#write.table(blastdegagecounts, "../data/2022_DEG_DET_lists/DEGlist_2022_agecontrast_counts_BLAST.tab", sep = '\t', row.names = F, quote = F)
```
comment out the code - wrote out 2024-09-18
# `join` the DEG lists to see where there's overlap:
library(dplyr)
inner_join(df1, df2)
V1
1 id300
2 id5456
3 id45
```{r}
library(dplyr)
matchdegs <- inner_join(blastdeg2021counts, blastdeg2022counts, by = "transcriptID")
head(matchdegs)
```
4386 DEGs match between the two
write out table:
```{r}
#write.table(matchdegs, "../data/DEGlist_2021-2022-matching.tab", sep = '\t', row.names = F, quote = F)
```
comment out the code - wrote out 2024-09-19
# `join` the DEG lists with the uniprot accession IDs from the blast output that was put into uniprot.org ID/Mapping tool to get gene ID and GO IDs
read in the file downloaded from uniprot.org:
```{r}
ungo <- read.delim("../data/genome_uniprot_GOIDs_2024_09_20.tsv")
head(ungo)
```
read in the DEG lists:
```{r}
deg2021 <- read.delim("../data/expB_DEG_DET_lists/DEGlist_8v8_2021_exposedVcontrol_counts_BLAST.tab")
head(deg2021)
```
```{r}
deg2022 <- read.delim("../data/2022_DEG_DET_lists/DEGlist_2022_noage_contrast_counts_BLAST.tab")
head(deg2022)
```
rename the uniprot accession iD column in ungo:
```{r}
colnames(ungo) <- c("From", "uniprot_accession_ID", "Reviewed", "Entry.name", "Protein.names", "Gene.names", "Organism", "Length", "Gene.Ontology.IDs")
head(ungo)
```
`join` both DEG lists with ungo:
### 2021
```{r}
deg2021ungo <- left_join(deg2021, ungo, by = "uniprot_accession_ID")
head(deg2021ungo)
```
write out table:
```{r}
#write.table(deg2021ungo, "../data/expB_DEG_DET_lists/DEGlist_8v8_exVco_GOIDs.tab", sep = "\t", row.names = T, quote = FALSE, col.names = TRUE)
```
wrote out 2024-09-20
### 2022
```{r}
deg2022ungo <- left_join(deg2022, ungo, by = "uniprot_accession_ID")
head(deg2022ungo)
```
write out table:
```{r}
#write.table(deg2021ungo, "../data/2022_DEG_DET_lists/DEGlist_2022_exVco_noagecontrast_GOID.tab", sep = "\t", row.names = T, quote = FALSE, col.names = TRUE)
```
wrote out 2024-09-20