--- title: "04-PCAplots" output: html_document date: "2024-02-01" --- Rmd to create PCA plots to look for any preliminary effects of treatment, etc., on star gene expression in preparation for `DESEq2`. Gene counts were gotten from aligning libraries to the gene list FASTA using `kallisto`. ```{r} sessionInfo() ``` Load packages: ```{r} library(DESeq2) library(tidyverse) library(pheatmap) library(RColorBrewer) library(data.table) library(ggplot2) library(vegan) ``` # Read in rounded count matrix from comparing the 2021 _Pycnopodia helianthoides_ coelomocyte RNAseq libraries with the 2023 _Pycnopodia helianthoides_ genome gene list FASTA: ```{r} countmatrix <- read.delim("../data/kallisto_count_matrix_rounded.tab", header = TRUE, sep = '\t') head(countmatrix) ``` 26581 rows (genes) with 32 columns (libraries). ## QUESTION When done with the 2015 Phel transcriptome, it was 29476 genes... is genome smaller list because it's more fine-tuned to _Pycnopodia helianthoides_ whereas the _de novo_ assembled transcriptome included some incidental non-host transcripts? # Subset all data from what we'll call Experiment B (stars that were incoulated with a treatment October 5, 2024) n = 18 Exposed --> n = 8 (8 samples from 8 stars) Control --> n = 10 (10 samples, from 8 stars ... 2 stars have two sample time points) ```{r} expB <- select(countmatrix, "PSC.56", "PSC.81", "PSC.61", "PSC.76", "PSC.52", "PSC.54", "PSC.63", "PSC.73", "PSC.58", "PSC.64", "PSC.57", "PSC.59", "PSC.67", "PSC.69", "PSC.71", "PSC.75", "PSC.78", "PSC.83") head(expB) ``` ## Make a dataframe for PCA comparison: ```{r} colData <- data.frame(condition=factor(c("control", "control", "control", "control", "control", "control", "control", "control", "control", "control", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed")), type=factor(rep("paired-end",18))) rownames(colData) <- colnames(expB) dds <- DESeqDataSetFromMatrix(countData = expB, colData = colData, design = ~ condition) ``` ```{r} dds <- DESeq(dds) res <- results(dds) res <- res[order(rownames(res)), ] ``` From [Bioconductor `DESeq2` Vignette](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html): ```{r} vsd <- vst(dds, blind=FALSE) rld <- rlog(dds, blind=FALSE) head(assay(vsd), 3) ``` ```{r} plot <- plotPCA(vsd, intgroup=c("condition", "type")) nudge <- position_nudge(y = 5) plot + geom_text(aes(label = name), position = nudge) ``` Create a permanova plot to get statistical significance if any: transpose the vsd data so that the genes are the columns and the rows (n=14) are the libraries ```{r} vsd.transpose <- t(assay(vsd)) head(vsd.transpose) ``` vsd.transpose is currently a matrix, but adonis2 needs it to be a data frame. Make it into a dataframe: ```{r} vsd.transpose.df=as.data.frame(vsd.transpose) head(vsd.transpose.df) ``` make rownames into a column called LibraryID: ```{r} library(tibble) vsd.transpose.df <- tibble::rownames_to_column(vsd.transpose.df, "LibraryID") head(vsd.transpose.df) ``` Add columns to dataframe to explain the libraries: ```{r} vsd.transpose.df$condition <- c("control", "control", "control", "control", "control", "control", "control", "control", "control", "control", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed") head(vsd.transpose.df) ``` run a permanova to get significance: ```{r} permanova.expB <- adonis2(scale(vsd.transpose.df[c(2:26582)]) %>% replace(is.na(.), 0) ~ condition, data = vsd.transpose.df, method = "eu") permanova.expB #Significant influence of all factors ``` # CLEAR R ENVIRONMENT # Compare controls from Experiment B, pre-exposed by 0.45um filtered live inoculate from Experiment A and not pre-exposed n =10 libraries; 8 individual stars (6 pre-exposed, 2 not). The two stars that were not pre-exposed have two sampling time points each. ## Read in rounded count matrix from comparing the 2021 _Pycnopodia helianthoides_ coelomocyte RNAseq libraries with the 2023 _Pycnopodia helianthoides_ genome gene list FASTA: ```{r} countmatrix <- read.delim("../data/kallisto_count_matrix_rounded.tab", header = TRUE, sep = '\t') head(countmatrix) ``` ## subset libraries from Experiment B that are controls: ```{r} expBcontrols <- select(countmatrix, "PSC.56", "PSC.81", "PSC.61", "PSC.76", "PSC.52", "PSC.54", "PSC.63", "PSC.73", "PSC.58", "PSC.64") head(expBcontrols) ``` ## Make a dataframe for PCA comparison: ```{r} colData <- data.frame(condition=factor(c("pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "not-pre-exposed", "not-pre-exposed", "not-pre-exposed", "not-pre-exposed")), type=factor(rep("paired-end",10))) rownames(colData) <- colnames(expBcontrols) dds <- DESeqDataSetFromMatrix(countData = expBcontrols, colData = colData, design = ~ condition) ``` ```{r} dds <- DESeq(dds) res <- results(dds) res <- res[order(rownames(res)), ] ``` From [Bioconductor `DESeq2` Vignette](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html): ```{r} vsd <- vst(dds, blind=FALSE) rld <- rlog(dds, blind=FALSE) head(assay(vsd), 3) ``` ```{r} plot <- plotPCA(vsd, intgroup=c("condition", "type")) nudge <- position_nudge(y = 5) plot + geom_text(aes(label = name), position = nudge) ``` Create a permanova plot to get statistical significance if any: transpose the vsd data so that the genes are the columns and the rows (n=14) are the libraries ```{r} vsd.transpose <- t(assay(vsd)) head(vsd.transpose) ``` vsd.transpose is currently a matrix, but adonis2 needs it to be a data frame. Make it into a dataframe: ```{r} vsd.transpose.df=as.data.frame(vsd.transpose) head(vsd.transpose.df) ``` make rownames into a column called LibraryID: ```{r} library(tibble) vsd.transpose.df <- tibble::rownames_to_column(vsd.transpose.df, "LibraryID") head(vsd.transpose.df) ``` Add columns to dataframe to explain the libraries: ```{r} vsd.transpose.df$condition <- c("pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "not-pre-exposed", "not-pre-exposed", "not-pre-exposed", "not-pre-exposed") head(vsd.transpose.df) ``` run a permanova to get significance: ```{r} permanova.expBcontrols <- adonis2(scale(vsd.transpose.df[c(2:26582)]) %>% replace(is.na(.), 0) ~ condition, data = vsd.transpose.df, method = "eu") permanova.expBcontrols #Significant influence of all factors ``` # CLEAR R ENVIRONMENT # Compare libraries from Experiment A ## Read in rounded count matrix from comparing the 2021 _Pycnopodia helianthoides_ coelomocyte RNAseq libraries with the 2023 _Pycnopodia helianthoides_ genome gene list FASTA: ```{r} countmatrix <- read.delim("../data/kallisto_count_matrix_rounded.tab", header = TRUE, sep = '\t') head(countmatrix) ``` ## Subset libraries from Experiment A as well as Library 19 which was a star that wasted before getting included in experiment A (was exposed through shared tank) ```{r} expA <- select(countmatrix, "PSC.19", "PSC.23", "PSC.24", "PSC.34", "PSC.35", "PSC.36", "PSC.37", "PSC.38", "PSC.39", "PSC.40", "PSC.42", "PSC.43", "PSC.48", "PSC.49") head(expA) ``` ## Make a dataframe for PCA comparison: ```{r} colData <- data.frame(condition=factor(c("exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control", "exposed", "control", "control")), type=factor(rep("paired-end",14))) rownames(colData) <- colnames(expA) dds <- DESeqDataSetFromMatrix(countData = expA, colData = colData, design = ~ condition) dds$condition <- relevel(dds$condition, ref = "control") ``` ```{r} dds <- DESeq(dds) res <- results(dds) res <- res[order(rownames(res)), ] ``` From [Bioconductor `DESeq2` Vignette](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html): ```{r} vsd <- vst(dds, blind=FALSE) rld <- rlog(dds, blind=FALSE) head(assay(vsd), 3) ``` ```{r} plot <- plotPCA(vsd, intgroup=c("condition", "type")) nudge <- position_nudge(y = 5) plot + geom_text(aes(label = name), position = nudge) ``` Create a permanova plot to get statistical significance if any: transpose the vsd data so that the genes are the columns and the rows (n=14) are the libraries ```{r} vsd.transpose <- t(assay(vsd)) head(vsd.transpose) ``` vsd.transpose is currently a matrix, but adonis2 needs it to be a data frame. Make it into a dataframe: ```{r} vsd.transpose.df=as.data.frame(vsd.transpose) head(vsd.transpose.df) ``` make rownames into a column called LibraryID: ```{r} library(tibble) vsd.transpose.df <- tibble::rownames_to_column(vsd.transpose.df, "LibraryID") head(vsd.transpose.df) ``` Add columns to dataframe to explain the libraries: ```{r} vsd.transpose.df$condition <- c("exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control", "exposed", "control", "control") head(vsd.transpose.df) ``` run a permanova to get significance: ```{r} permanova.expB <- adonis2(scale(vsd.transpose.df[c(2:26582)]) %>% replace(is.na(.), 0) ~ condition, data = vsd.transpose.df, method = "eu") permanova.expB #Significant influence of all factors ``` # Compare the arm drop stars from Exp A - inoculated vs tank exposed: ## Read in rounded count matrix from comparing the 2021 _Pycnopodia helianthoides_ coelomocyte RNAseq libraries with the 2023 _Pycnopodia helianthoides_ genome gene list FASTA: ```{r} countmatrix <- read.delim("../data/kallisto_count_matrix_rounded.tab", header = TRUE, sep = '\t') head(countmatrix) ``` ## Subset libraries from Experiment A as well as Library 19 which was a star that wasted before getting included in experiment A (was exposed through shared tank) ```{r} expAad <- select(countmatrix, "PSC.19", "PSC.23", "PSC.24", "PSC.34", "PSC.35", "PSC.36", "PSC.37", "PSC.43") head(expAad) ``` ## Make a dataframe for PCA comparison: in this case - exposed = tank-exposed, and control = inoculated exposed ```{r} colData <- data.frame(condition=factor(c("exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control")), type=factor(rep("paired-end",8))) rownames(colData) <- colnames(expAad) dds <- DESeqDataSetFromMatrix(countData = expAad, colData = colData, design = ~ condition) ``` ```{r} dds <- DESeq(dds) res <- results(dds) res <- res[order(rownames(res)), ] ``` From [Bioconductor `DESeq2` Vignette](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html): ```{r} vsd <- vst(dds, blind=FALSE) rld <- rlog(dds, blind=FALSE) head(assay(vsd), 3) ``` ```{r} plot <- plotPCA(vsd, intgroup=c("condition", "type")) nudge <- position_nudge(y = 5) plot + geom_text(aes(label = name), position = nudge) ``` Create a permanova plot to get statistical significance if any: transpose the vsd data so that the genes are the columns and the rows (n=14) are the libraries ```{r} vsd.transpose <- t(assay(vsd)) head(vsd.transpose) ``` vsd.transpose is currently a matrix, but adonis2 needs it to be a data frame. Make it into a dataframe: ```{r} vsd.transpose.df=as.data.frame(vsd.transpose) head(vsd.transpose.df) ``` make rownames into a column called LibraryID: ```{r} library(tibble) vsd.transpose.df <- tibble::rownames_to_column(vsd.transpose.df, "LibraryID") head(vsd.transpose.df) ``` Add columns to dataframe to explain the libraries: ```{r} vsd.transpose.df$condition <- c("exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control") head(vsd.transpose.df) ``` run a permanova to get significance: ```{r} permanova.expB <- adonis2(scale(vsd.transpose.df[c(2:26582)]) %>% replace(is.na(.), 0) ~ condition, data = vsd.transpose.df, method = "eu") permanova.expB #Significant influence of all factors ```