---
title: "04-PCAplots"
output: html_document
date: "2024-02-01"
---
Rmd to create PCA plots to look for any preliminary effects of treatment, etc., on star gene expression in preparation for `DESEq2`. Gene counts were gotten from aligning libraries to the gene list FASTA using `kallisto`. 


```{r}
sessionInfo()
```

Load packages:
```{r}
library(DESeq2)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(ggplot2)
library(vegan)
```


# Read in rounded count matrix from comparing the 2021 _Pycnopodia helianthoides_ coelomocyte RNAseq libraries with the 2023 _Pycnopodia helianthoides_ genome gene list FASTA:  
```{r}
countmatrix <- read.delim("../data/kallisto_count_matrix_rounded.tab", header = TRUE, sep = '\t')
head(countmatrix)
```
26581 rows (genes) with 32 columns (libraries). 

## QUESTION 
When done with the 2015 Phel transcriptome, it was 29476 genes... is genome smaller list because it's more fine-tuned to _Pycnopodia helianthoides_ whereas the _de novo_ assembled transcriptome included some incidental non-host transcripts? 


# Subset all data from what we'll call Experiment B (stars that were incoulated with a treatment October 5, 2024)
n = 18      
Exposed --> n = 8 (8 samples from 8 stars)    
Control --> n = 10 (10 samples, from 8 stars ... 2 stars have two sample time points)    


```{r}
expB <- select(countmatrix, "PSC.56", "PSC.81", "PSC.61", "PSC.76", "PSC.52", "PSC.54", "PSC.63", "PSC.73", "PSC.58", "PSC.64", "PSC.57", "PSC.59", "PSC.67", "PSC.69", "PSC.71", "PSC.75", "PSC.78", "PSC.83")
head(expB)
```

## Make a dataframe for PCA comparison:
```{r}
colData <- data.frame(condition=factor(c("control", "control", "control", "control", "control", "control", "control", "control", "control", "control", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed")),
                      type=factor(rep("paired-end",18)))
rownames(colData) <- colnames(expB)
dds <- DESeqDataSetFromMatrix(countData = expB,
                              colData = colData,
                              design = ~ condition)
```

```{r}
dds <- DESeq(dds)
res <- results(dds)
res <- res[order(rownames(res)), ]
```

From [Bioconductor `DESeq2` Vignette](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html):     
```{r}
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd), 3)
```

```{r}
plot <- plotPCA(vsd, intgroup=c("condition", "type"))
nudge <- position_nudge(y = 5)
plot + geom_text(aes(label = name), position = nudge)
```
Create a permanova plot to get statistical significance if any:
transpose the vsd data so that the genes are the columns and the rows (n=14) are the libraries 
```{r}
vsd.transpose <- t(assay(vsd))
head(vsd.transpose)
```

vsd.transpose is currently a matrix, but adonis2 needs it to be a data frame. Make it into a dataframe:
```{r}
vsd.transpose.df=as.data.frame(vsd.transpose)
head(vsd.transpose.df)
```
make rownames into a column called LibraryID:
```{r}
library(tibble)
vsd.transpose.df <- tibble::rownames_to_column(vsd.transpose.df, "LibraryID")
head(vsd.transpose.df)
```

Add columns to dataframe to explain the libraries:
```{r}
vsd.transpose.df$condition <- c("control", "control", "control", "control", "control", "control", "control", "control", "control", "control", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed")
head(vsd.transpose.df)
```


run a permanova to get significance:
```{r}
permanova.expB <- adonis2(scale(vsd.transpose.df[c(2:26582)]) %>%
                       replace(is.na(.), 0) ~ condition, data = vsd.transpose.df, method = "eu")  
permanova.expB #Significant influence of all factors
``` 


# CLEAR R ENVIRONMENT

# Compare controls from Experiment B, pre-exposed by 0.45um filtered live inoculate from Experiment A and not pre-exposed
n =10 libraries; 8 individual stars (6 pre-exposed, 2 not). The two stars that were not pre-exposed have two sampling time points each. 

## Read in rounded count matrix from comparing the 2021 _Pycnopodia helianthoides_ coelomocyte RNAseq libraries with the 2023 _Pycnopodia helianthoides_ genome gene list FASTA:  
```{r}
countmatrix <- read.delim("../data/kallisto_count_matrix_rounded.tab", header = TRUE, sep = '\t')
head(countmatrix)
```

## subset libraries from Experiment B that are controls:
```{r}
expBcontrols <- select(countmatrix, "PSC.56", "PSC.81", "PSC.61", "PSC.76", "PSC.52", "PSC.54", "PSC.63", "PSC.73", "PSC.58", "PSC.64")
head(expBcontrols)
```

## Make a dataframe for PCA comparison:
```{r}
colData <- data.frame(condition=factor(c("pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "not-pre-exposed", "not-pre-exposed", "not-pre-exposed", "not-pre-exposed")),
                      type=factor(rep("paired-end",10)))
rownames(colData) <- colnames(expBcontrols)
dds <- DESeqDataSetFromMatrix(countData = expBcontrols,
                              colData = colData,
                              design = ~ condition)
```

```{r}
dds <- DESeq(dds)
res <- results(dds)
res <- res[order(rownames(res)), ]
```

From [Bioconductor `DESeq2` Vignette](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html):     
```{r}
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd), 3)
```

```{r}
plot <- plotPCA(vsd, intgroup=c("condition", "type"))
nudge <- position_nudge(y = 5)
plot + geom_text(aes(label = name), position = nudge)
```

Create a permanova plot to get statistical significance if any:
transpose the vsd data so that the genes are the columns and the rows (n=14) are the libraries 
```{r}
vsd.transpose <- t(assay(vsd))
head(vsd.transpose)
```

vsd.transpose is currently a matrix, but adonis2 needs it to be a data frame. Make it into a dataframe:
```{r}
vsd.transpose.df=as.data.frame(vsd.transpose)
head(vsd.transpose.df)
```
make rownames into a column called LibraryID:
```{r}
library(tibble)
vsd.transpose.df <- tibble::rownames_to_column(vsd.transpose.df, "LibraryID")
head(vsd.transpose.df)
```

Add columns to dataframe to explain the libraries:
```{r}
vsd.transpose.df$condition <- c("pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "pre-exposed", "not-pre-exposed", "not-pre-exposed", "not-pre-exposed", "not-pre-exposed")
head(vsd.transpose.df)
```


run a permanova to get significance:
```{r}
permanova.expBcontrols <- adonis2(scale(vsd.transpose.df[c(2:26582)]) %>%
                       replace(is.na(.), 0) ~ condition, data = vsd.transpose.df, method = "eu")  
permanova.expBcontrols #Significant influence of all factors
``` 

# CLEAR R ENVIRONMENT

# Compare libraries from Experiment A
## Read in rounded count matrix from comparing the 2021 _Pycnopodia helianthoides_ coelomocyte RNAseq libraries with the 2023 _Pycnopodia helianthoides_ genome gene list FASTA:  
```{r}
countmatrix <- read.delim("../data/kallisto_count_matrix_rounded.tab", header = TRUE, sep = '\t')
head(countmatrix)
```

## Subset libraries from Experiment A as well as Library 19 which was a star that wasted before getting included in experiment A (was exposed through shared tank)

```{r}
expA <- select(countmatrix, "PSC.19", "PSC.23", "PSC.24", "PSC.34", "PSC.35", "PSC.36", "PSC.37", "PSC.38", "PSC.39", "PSC.40", "PSC.42", "PSC.43", "PSC.48", "PSC.49")
head(expA)
```

## Make a dataframe for PCA comparison:
```{r}
colData <- data.frame(condition=factor(c("exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control", "exposed", "control", "control")),
                      type=factor(rep("paired-end",14)))
rownames(colData) <- colnames(expA)
dds <- DESeqDataSetFromMatrix(countData = expA,
                              colData = colData,
                              design = ~ condition)
dds$condition <- relevel(dds$condition, ref = "control")
```

```{r}
dds <- DESeq(dds)
res <- results(dds)
res <- res[order(rownames(res)), ]
```

From [Bioconductor `DESeq2` Vignette](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html):     
```{r}
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd), 3)
```

```{r}
plot <- plotPCA(vsd, intgroup=c("condition", "type"))
nudge <- position_nudge(y = 5)
plot + geom_text(aes(label = name), position = nudge)
```

Create a permanova plot to get statistical significance if any:
transpose the vsd data so that the genes are the columns and the rows (n=14) are the libraries 
```{r}
vsd.transpose <- t(assay(vsd))
head(vsd.transpose)
```

vsd.transpose is currently a matrix, but adonis2 needs it to be a data frame. Make it into a dataframe:
```{r}
vsd.transpose.df=as.data.frame(vsd.transpose)
head(vsd.transpose.df)
```
make rownames into a column called LibraryID:
```{r}
library(tibble)
vsd.transpose.df <- tibble::rownames_to_column(vsd.transpose.df, "LibraryID")
head(vsd.transpose.df)
```

Add columns to dataframe to explain the libraries:
```{r}
vsd.transpose.df$condition <- c("exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control", "exposed", "control", "control")
head(vsd.transpose.df)
```


run a permanova to get significance:
```{r}
permanova.expB <- adonis2(scale(vsd.transpose.df[c(2:26582)]) %>%
                       replace(is.na(.), 0) ~ condition, data = vsd.transpose.df, method = "eu")  
permanova.expB #Significant influence of all factors
``` 
# Compare the arm drop stars from Exp A - inoculated vs tank exposed: 

## Read in rounded count matrix from comparing the 2021 _Pycnopodia helianthoides_ coelomocyte RNAseq libraries with the 2023 _Pycnopodia helianthoides_ genome gene list FASTA:  
```{r}
countmatrix <- read.delim("../data/kallisto_count_matrix_rounded.tab", header = TRUE, sep = '\t')
head(countmatrix)
```

## Subset libraries from Experiment A as well as Library 19 which was a star that wasted before getting included in experiment A (was exposed through shared tank)

```{r}
expAad <- select(countmatrix, "PSC.19", "PSC.23", "PSC.24", "PSC.34", "PSC.35", "PSC.36", "PSC.37", "PSC.43")
head(expAad)
```

## Make a dataframe for PCA comparison:
in this case - exposed = tank-exposed, and control = inoculated exposed
```{r}
colData <- data.frame(condition=factor(c("exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control")),
                      type=factor(rep("paired-end",8)))
rownames(colData) <- colnames(expAad)
dds <- DESeqDataSetFromMatrix(countData = expAad,
                              colData = colData,
                              design = ~ condition)
```

```{r}
dds <- DESeq(dds)
res <- results(dds)
res <- res[order(rownames(res)), ]
```

From [Bioconductor `DESeq2` Vignette](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html):     
```{r}
vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
head(assay(vsd), 3)
```

```{r}
plot <- plotPCA(vsd, intgroup=c("condition", "type"))
nudge <- position_nudge(y = 5)
plot + geom_text(aes(label = name), position = nudge)
```

Create a permanova plot to get statistical significance if any:
transpose the vsd data so that the genes are the columns and the rows (n=14) are the libraries 
```{r}
vsd.transpose <- t(assay(vsd))
head(vsd.transpose)
```

vsd.transpose is currently a matrix, but adonis2 needs it to be a data frame. Make it into a dataframe:
```{r}
vsd.transpose.df=as.data.frame(vsd.transpose)
head(vsd.transpose.df)
```
make rownames into a column called LibraryID:
```{r}
library(tibble)
vsd.transpose.df <- tibble::rownames_to_column(vsd.transpose.df, "LibraryID")
head(vsd.transpose.df)
```

Add columns to dataframe to explain the libraries:
```{r}
vsd.transpose.df$condition <- c("exposed", "exposed", "exposed", "exposed", "control", "control", "control", "control")
head(vsd.transpose.df)
```


run a permanova to get significance:
```{r}
permanova.expB <- adonis2(scale(vsd.transpose.df[c(2:26582)]) %>%
                       replace(is.na(.), 0) ~ condition, data = vsd.transpose.df, method = "eu")  
permanova.expB #Significant influence of all factors
```