---
title: "02-SRdiff-exp-analysis"
output: html_document
date: "2023-12-12"
---
```{r load libraries}
library(DESeq2)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(data.table)
```
## Download trimmed files
```{bash}
wget -e robots=off -np --input-file=../data/download.txt -P ../data/SR
```
## get transcriptome
```{r, engine='bash'}
cd ../data
curl -O https://owl.fish.washington.edu/halfshell/genomic-databank/Mtros-hq_transcripts.fasta
```
## Create indices for transcriptome with Kallisto
```{bash}
# Build index
# Execute the Kallisto tool located at the specified path
/home/shared/kallisto_linux-v0.50.1/kallisto \
index \
-t 20 \
-i ../output/Mtros-hq_transcripts.index \
../data/Mtros-hq_transcripts.fasta
```
# Quantify the indices
```{bash}
# Set the paths
DATA_DIRECTORY="../data/SR/"
KALLISTO_INDEX="../output/Mtros-hq_transcripts.index"
OUTPUT_DIRECTORY="../output/kallisto-quant"
# Iterate over all .fq.gz files in the data directory
for FILE in "$DATA_DIRECTORY"/*.fastq.gz; do
# Extract the base name of the file for naming the output folder
BASENAME=$(basename "$FILE" _L099_R1_cmb.trim.fastq.gz)
# Create output directory for this sample
SAMPLE_OUTPUT="$OUTPUT_DIRECTORY/$BASENAME"
mkdir -p "$SAMPLE_OUTPUT"
# Run Kallisto quantification
/home/shared/kallisto_linux-v0.50.1/kallisto quant -i "$KALLISTO_INDEX" -o "$SAMPLE_OUTPUT" \
--single -t 40 -l 65 -s 2 "$FILE"
done
echo "Kallisto quantification complete."
```
altenative code format to above
```
mkdir ../output/kallisto_01
find ../data/*fastq.gz \
| xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \
quant -i ../data/cgigas_roslin_rna.index \
-o ../output/kallisto_01/{} \
-t 4 \
--single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz
```
## Generate count matrices
```{bash}
perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \
--est_method kallisto \
--gene_trans_map 'none' \
--out_prefix ../output/kallisto-quant/abundance-matrices \
--name_sample_by_basedir \
../output/kallisto-quant/*/abundance.tsv
```
## View abundance matrix
```{bash}
head ../output/kallisto-quant/abundance-matrices.isoform.counts.matrix
```
## Make counts matrix
```{r, eval = TRUE}
countmatrix <- read.delim("../output/kallisto-quant/abundance-matrices.isoform.counts.matrix", header = TRUE, sep = '\t')
rownames(countmatrix) <- countmatrix$X
countmatrix <- countmatrix[,-1]
head(countmatrix)
```
## Make count matrix into integers
```{r, eval=TRUE}
countmatrix <- round(countmatrix, 0)
str(countmatrix)
```
## Load in treatment info for DESeq
```{r}
#Load in the metadata
treatmentinfo <- read.csv("../data/PSMFC-mytilus-byssus-pilot-RNA-tagseq_raw.csv", header = TRUE, sep = ",")
treatmentinfo
```
## Make file for sample, treatment ID, day, and tissue (to be plugged into DESeq)
```{r}
# Get the tissue ID from the sample name, extract last character of sample ID
treatmentinfo$tissue <- substr(treatmentinfo$sample.1, nchar(treatmentinfo$sample.1) - 0, nchar(treatmentinfo$sample.1))
#extract last character of treatment ID to get day of treatment
treatmentinfo$day <- substr(treatmentinfo$trt, nchar(treatmentinfo$trt) - 0, nchar(treatmentinfo$trt))
#Extract treatment from treatment column
## Split each entry based on underscores
split_entries <- strsplit(treatmentinfo$trt, "_")
## Extract the middle part (assuming there is a middle part)
middle_parts <- sapply(split_entries, function(x) if(length(x) >= 3) x[2] else NA)
## Assign the extracted middle parts to a new column named "middle"
treatmentinfo$treatment <- middle_parts
#Make subset of sample ID, treatment, tissue harvested, day
treatmentinfo.2 <- treatmentinfo[, c("sample.1","tissue", "treatment", "day")]
#Specify FX samples for removal later
FXsamples <- treatmentinfo.2 %>%
filter(treatmentinfo.2$tissue == "X") %>%
select(sample.1)
#Convert sample ID to character vector
fX_ids <- as.vector(as.character(FXsamples$sample.1))
#Change column names in count matrix to adjust to sample name in treatment info
countmatrix_2 <- countmatrix
new_column_names <- substr(names(countmatrix_2), 4, nchar(names(countmatrix_2)) - 5)
names(countmatrix_2) <- new_column_names
#Remove remaining periods that are at the front of column names
remove_periods <- function(col_name) {
if (grepl("^\\.", col_name)) {
# Remove period from the beginning of the column name
return(sub("^\\.", "", col_name))
} else {
# Keep the column name unchanged
return(col_name)
}
}
# Apply the function to all column names
new_column_names <- sapply(names(countmatrix_2), remove_periods)
# Assign the modified column names to the data frame
names(countmatrix_2) <- new_column_names
#clean up data, get rid of NA's and blank cells, and put samples in order
countmatrix_2 <- countmatrix_2[ , order(names(countmatrix_2))]
treatmentinfo.2 <- treatmentinfo.2[order(treatmentinfo.2),]
treatmentinfo.2 <- treatmentinfo.2[complete.cases(treatmentinfo.2), ]
# Remove sample IDs that matched the FX samples
countmatrix_2 <- countmatrix_2 %>%
select(-all_of(fX_ids))
treatmentinfo.2 <- treatmentinfo.2 %>%
filter(!sample.1 %in% fX_ids)
rownames(treatmentinfo.2) <- treatmentinfo.2$sample.1
#Removing T047 from the treatment info because there isn't data for that, weirdly enough
removal <- which(rownames(treatmentinfo.2) %in% "T047F")
treatmentinfo.2 <- treatmentinfo.2[-removal,]
#Check that columns and row names match
all(rownames(tissuesub) == colnames(countmatrix_2))
```
## DEG in the foot after exposure to ocean acidification
```{r}
# Filter data
footOA <- treatmentinfo.2 %>%
filter(tissue == "F" | treatment == "control" | treatment == "OA" | day == "3") %>%
filter(!(treatment=="OW"| treatment=="DO" | day == "0" | tissue == "G"))
countmatrix_FOA <- subset(countmatrix_2, select=row.names(footOA))
# Calculate DESeq object
dds <- DESeqDataSetFromMatrix(countData = countmatrix_FOA,
colData = footOA,
design = ~ treatment)
dds <- DESeq(dds)
resultsNames(dds) # lists the coefficients
# Filtering: keep genes that have at least 10 counts across 1/3 of the samples - https://support.bioconductor.org/p/110307/
keep <- rowSums(DESeq2::counts(dds) >= 10) >= ncol(countmatrix_FOA)/3
dds <- dds[keep,]
# Generate Contrasts
contrast_list <- c("treatment", "OA", "control") # order is important: factor, treatment group, control
res_table <- results(dds, contrast=contrast_list, alpha = 0.05)
res_table_normal <- lfcShrink(dds,
coef=2,
type="normal") # lfcThreshold = 0.585) # a lfc threshold of 1 = 2-fold change, 0.585 = 1.5-fold change
res_table_apeglm <- lfcShrink(dds,
coef=2,
type="apeglm") # lfcThreshold = 0.585) # a lfc threshold of 1 = 2-fold change, 0.585 = 1.5-fold change
res_table_ashr <- lfcShrink(dds,
coef=2,
type="ashr")
```
## Visualize treatment differences in foot tissue
```{r}
# Filter data
treatmentinfo.F <- treatmentinfo.2 %>% filter(tissue == "F" | day == "3") %>%
filter(!(day == "0" | tissue == "G"))
countmatrix_F <- subset(countmatrix_2, select=row.names(treatmentinfo.F))
# Calculate DESeq object
dds2 <- DESeqDataSetFromMatrix(countData = countmatrix_F,
colData = treatmentinfo.F ,
design = ~ treatment)
dds2 <- DESeq(dds2)
resultsNames(dds2) # lists the coefficients
dds2vst <- vst(dds2, blind = FALSE)
# Plot PCA
vsd <- vst(deseq2.dds, blind = FALSE)
plotPCA(vsd, intgroup = "condition")
p <- pca(assay(dds2vst), metadata = treatmentinfo.F)
screeplot(p, axisLabSize = 18, titleLabSize = 22)
bp0 <- biplot(p, showLoadings = FALSE, colby = 'treatment',
labSize = 5, pointSize = 5, sizeLoadingsNames = 5)
bp0
#Plot biplot
bp1 <- biplot(p, x = "PC1", y = "PC2",
colby = 'treatment',
colkey = c('control' = 'royalblue1' ,
'OA' = 'yellow' ,
'OW' = 'red' ,
'DO' = 'green' ),
# shape = 'tissue',
# shapekey = c('foot' = 19, # circle
# 'gill' = 17), # triangle
pointSize = 4,
showLoadings = FALSE,
lab = NULL,
drawConnectors = FALSE,
ellipse = TRUE,
ellipseLevel = 0.95,
ellipseFill = FALSE,
# ellipseFillKey = c('foot' = 'blue', 'gill' = 'green'),
ellipseAlpha = 1/4,
ellipseLineSize = 1,
# xlim = c(-150,150), ylim = c(-80, 50),
gridlines.major = FALSE,
gridlines.minor = FALSE,
borderWidth = 1,
legendPosition = 'top', legendLabSize = 16, legendIconSize = 6.0) +
theme(axis.text.x = element_text(size=16,color="black"),
axis.text.y = element_text(size=16,color="black"),
axis.ticks.x = element_line(color="black"),
axis.ticks.y = element_line(color="black"))
bp1
```
# Construct DESeq2 dataset
Create a DESeqDataSet design from gene count matrix and labels. Here we set the design to look at the difference in expression between foot and gill tissue
```{r}
#Set DESeq2 design
library(DESeq2)
gdds <- DESeqDataSetFromMatrix(countData = countmatrix_2,
colData = tissuesub,
design = ~tissue)
```
## Visualize gene count data
We’re looking to see if the samples of the same infection status and temperature treatment cluster.
## Log-transform the count data
First we are going to log-transform the data using a variance stabilizing transforamtion (vst). This is only for visualization purposes. Essentially, this is roughly similar to putting the data on the log2 scale. It will deal with the sampling variability of low counts by calculating within-group variability (if blind=FALSE).
```{r}
gvst <- vst(gdds, blind = FALSE)
head (assay(gvst), 3)
#-- note: fitType='parametric', but the dispersion trend was not well captured by the
# function: y = a/x + b, and a local regression fit was automatically substituted.
# specify fitType='local' or 'mean' to avoid this message next time.
```
## Create a heat map
```{r}
gsampleDists <- dist(t(assay(gvst))) #calculate distance matix
gsampleDistMatrix <- as.matrix(gsampleDists) #distance matrix
rownames(gsampleDistMatrix) <- colnames(gvst) #assign row names
colnames(gsampleDistMatrix) <- NULL #assign col names
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) #assign colors
pheatmap(gsampleDistMatrix, #plot matrix
clustering_distance_rows=gsampleDists, #cluster rows
clustering_distance_cols=gsampleDists, #cluster columns
col=colors) #set colors
```
## Principal component plot of samples
```{r}
gPCAdata <- plotPCA(gvst, intgroup = c("tissue"), returnData=TRUE)
percentVar <- round(100*attr(gPCAdata, "percentVar")) #plot PCA of samples with all data
ggplot(gPCAdata, aes(PC1, PC2, color=tissue)) +
geom_point(size=4, alpha = 5/10) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()
```
## Differential Gene Expression Analysis
NOTE: This will take a few minutes to run.
```{r}
DEG.group <- DESeq(gdds, test = "Wald") #run differential expression test by group using Wald (default)
DEG.group.results <- results(DEG.group) #save DE results
resultsNames(DEG.group) #view DE results
```
```{r}
group.results.ordered <- order(DEG.group.results$pvalue) #Order p-values by smallest value first
summary(DEG.group.results) #view results summary
```
```{r}
sig.num <- sum(DEG.group.results$padj < 0.05, na.rm=TRUE) #How many adjusted p-values were less than 0.05?
sig.num
```