--- title: "02-SRdiff-exp-analysis" output: html_document date: "2023-12-12" --- ```{r load libraries} library(DESeq2) library(tidyverse) library(pheatmap) library(RColorBrewer) library(data.table) ``` ## Download trimmed files ```{bash} wget -e robots=off -np --input-file=../data/download.txt -P ../data/SR ``` ## get transcriptome ```{r, engine='bash'} cd ../data curl -O https://owl.fish.washington.edu/halfshell/genomic-databank/Mtros-hq_transcripts.fasta ``` ## Create indices for transcriptome with Kallisto ```{bash} # Build index # Execute the Kallisto tool located at the specified path /home/shared/kallisto_linux-v0.50.1/kallisto \ index \ -t 20 \ -i ../output/Mtros-hq_transcripts.index \ ../data/Mtros-hq_transcripts.fasta ``` # Quantify the indices ```{bash} # Set the paths DATA_DIRECTORY="../data/SR/" KALLISTO_INDEX="../output/Mtros-hq_transcripts.index" OUTPUT_DIRECTORY="../output/kallisto-quant" # Iterate over all .fq.gz files in the data directory for FILE in "$DATA_DIRECTORY"/*.fastq.gz; do # Extract the base name of the file for naming the output folder BASENAME=$(basename "$FILE" _L099_R1_cmb.trim.fastq.gz) # Create output directory for this sample SAMPLE_OUTPUT="$OUTPUT_DIRECTORY/$BASENAME" mkdir -p "$SAMPLE_OUTPUT" # Run Kallisto quantification /home/shared/kallisto_linux-v0.50.1/kallisto quant -i "$KALLISTO_INDEX" -o "$SAMPLE_OUTPUT" \ --single -t 40 -l 65 -s 2 "$FILE" done echo "Kallisto quantification complete." ``` altenative code format to above ``` mkdir ../output/kallisto_01 find ../data/*fastq.gz \ | xargs basename -s _L001_R1_001.fastq.gz | xargs -I{} /home/shared/kallisto/kallisto \ quant -i ../data/cgigas_roslin_rna.index \ -o ../output/kallisto_01/{} \ -t 4 \ --single -l 100 -s 10 ../data/{}_L001_R1_001.fastq.gz ``` ## Generate count matrices ```{bash} perl /home/shared/trinityrnaseq-v2.12.0/util/abundance_estimates_to_matrix.pl \ --est_method kallisto \ --gene_trans_map 'none' \ --out_prefix ../output/kallisto-quant/abundance-matrices \ --name_sample_by_basedir \ ../output/kallisto-quant/*/abundance.tsv ``` ## View abundance matrix ```{bash} head ../output/kallisto-quant/abundance-matrices.isoform.counts.matrix ``` ## Make counts matrix ```{r, eval = TRUE} countmatrix <- read.delim("../output/kallisto-quant/abundance-matrices.isoform.counts.matrix", header = TRUE, sep = '\t') rownames(countmatrix) <- countmatrix$X countmatrix <- countmatrix[,-1] head(countmatrix) ``` ## Make count matrix into integers ```{r, eval=TRUE} countmatrix <- round(countmatrix, 0) str(countmatrix) ``` ## Load in treatment info for DESeq ```{r} #Load in the metadata treatmentinfo <- read.csv("../data/PSMFC-mytilus-byssus-pilot-RNA-tagseq_raw.csv", header = TRUE, sep = ",") treatmentinfo ``` ## Make file for sample, treatment ID, day, and tissue (to be plugged into DESeq) ```{r} # Get the tissue ID from the sample name, extract last character of sample ID treatmentinfo$tissue <- substr(treatmentinfo$sample.1, nchar(treatmentinfo$sample.1) - 0, nchar(treatmentinfo$sample.1)) #extract last character of treatment ID to get day of treatment treatmentinfo$day <- substr(treatmentinfo$trt, nchar(treatmentinfo$trt) - 0, nchar(treatmentinfo$trt)) #Extract treatment from treatment column ## Split each entry based on underscores split_entries <- strsplit(treatmentinfo$trt, "_") ## Extract the middle part (assuming there is a middle part) middle_parts <- sapply(split_entries, function(x) if(length(x) >= 3) x[2] else NA) ## Assign the extracted middle parts to a new column named "middle" treatmentinfo$treatment <- middle_parts #Make subset of sample ID, treatment, tissue harvested, day treatmentinfo.2 <- treatmentinfo[, c("sample.1","tissue", "treatment", "day")] #Specify FX samples for removal later FXsamples <- treatmentinfo.2 %>% filter(treatmentinfo.2$tissue == "X") %>% select(sample.1) #Convert sample ID to character vector fX_ids <- as.vector(as.character(FXsamples$sample.1)) #Change column names in count matrix to adjust to sample name in treatment info countmatrix_2 <- countmatrix new_column_names <- substr(names(countmatrix_2), 4, nchar(names(countmatrix_2)) - 5) names(countmatrix_2) <- new_column_names #Remove remaining periods that are at the front of column names remove_periods <- function(col_name) { if (grepl("^\\.", col_name)) { # Remove period from the beginning of the column name return(sub("^\\.", "", col_name)) } else { # Keep the column name unchanged return(col_name) } } # Apply the function to all column names new_column_names <- sapply(names(countmatrix_2), remove_periods) # Assign the modified column names to the data frame names(countmatrix_2) <- new_column_names #clean up data, get rid of NA's and blank cells, and put samples in order countmatrix_2 <- countmatrix_2[ , order(names(countmatrix_2))] treatmentinfo.2 <- treatmentinfo.2[order(treatmentinfo.2),] treatmentinfo.2 <- treatmentinfo.2[complete.cases(treatmentinfo.2), ] # Remove sample IDs that matched the FX samples countmatrix_2 <- countmatrix_2 %>% select(-all_of(fX_ids)) treatmentinfo.2 <- treatmentinfo.2 %>% filter(!sample.1 %in% fX_ids) rownames(treatmentinfo.2) <- treatmentinfo.2$sample.1 #Removing T047 from the treatment info because there isn't data for that, weirdly enough removal <- which(rownames(treatmentinfo.2) %in% "T047F") treatmentinfo.2 <- treatmentinfo.2[-removal,] #Check that columns and row names match all(rownames(tissuesub) == colnames(countmatrix_2)) ``` ## DEG in the foot after exposure to ocean acidification ```{r} # Filter data footOA <- treatmentinfo.2 %>% filter(tissue == "F" | treatment == "control" | treatment == "OA" | day == "3") %>% filter(!(treatment=="OW"| treatment=="DO" | day == "0" | tissue == "G")) countmatrix_FOA <- subset(countmatrix_2, select=row.names(footOA)) # Calculate DESeq object dds <- DESeqDataSetFromMatrix(countData = countmatrix_FOA, colData = footOA, design = ~ treatment) dds <- DESeq(dds) resultsNames(dds) # lists the coefficients # Filtering: keep genes that have at least 10 counts across 1/3 of the samples - https://support.bioconductor.org/p/110307/ keep <- rowSums(DESeq2::counts(dds) >= 10) >= ncol(countmatrix_FOA)/3 dds <- dds[keep,] # Generate Contrasts contrast_list <- c("treatment", "OA", "control") # order is important: factor, treatment group, control res_table <- results(dds, contrast=contrast_list, alpha = 0.05) res_table_normal <- lfcShrink(dds, coef=2, type="normal") # lfcThreshold = 0.585) # a lfc threshold of 1 = 2-fold change, 0.585 = 1.5-fold change res_table_apeglm <- lfcShrink(dds, coef=2, type="apeglm") # lfcThreshold = 0.585) # a lfc threshold of 1 = 2-fold change, 0.585 = 1.5-fold change res_table_ashr <- lfcShrink(dds, coef=2, type="ashr") ``` ## Visualize treatment differences in foot tissue ```{r} # Filter data treatmentinfo.F <- treatmentinfo.2 %>% filter(tissue == "F" | day == "3") %>% filter(!(day == "0" | tissue == "G")) countmatrix_F <- subset(countmatrix_2, select=row.names(treatmentinfo.F)) # Calculate DESeq object dds2 <- DESeqDataSetFromMatrix(countData = countmatrix_F, colData = treatmentinfo.F , design = ~ treatment) dds2 <- DESeq(dds2) resultsNames(dds2) # lists the coefficients dds2vst <- vst(dds2, blind = FALSE) # Plot PCA vsd <- vst(deseq2.dds, blind = FALSE) plotPCA(vsd, intgroup = "condition") p <- pca(assay(dds2vst), metadata = treatmentinfo.F) screeplot(p, axisLabSize = 18, titleLabSize = 22) bp0 <- biplot(p, showLoadings = FALSE, colby = 'treatment', labSize = 5, pointSize = 5, sizeLoadingsNames = 5) bp0 #Plot biplot bp1 <- biplot(p, x = "PC1", y = "PC2", colby = 'treatment', colkey = c('control' = 'royalblue1' , 'OA' = 'yellow' , 'OW' = 'red' , 'DO' = 'green' ), # shape = 'tissue', # shapekey = c('foot' = 19, # circle # 'gill' = 17), # triangle pointSize = 4, showLoadings = FALSE, lab = NULL, drawConnectors = FALSE, ellipse = TRUE, ellipseLevel = 0.95, ellipseFill = FALSE, # ellipseFillKey = c('foot' = 'blue', 'gill' = 'green'), ellipseAlpha = 1/4, ellipseLineSize = 1, # xlim = c(-150,150), ylim = c(-80, 50), gridlines.major = FALSE, gridlines.minor = FALSE, borderWidth = 1, legendPosition = 'top', legendLabSize = 16, legendIconSize = 6.0) + theme(axis.text.x = element_text(size=16,color="black"), axis.text.y = element_text(size=16,color="black"), axis.ticks.x = element_line(color="black"), axis.ticks.y = element_line(color="black")) bp1 ``` # Construct DESeq2 dataset Create a DESeqDataSet design from gene count matrix and labels. Here we set the design to look at the difference in expression between foot and gill tissue ```{r} #Set DESeq2 design library(DESeq2) gdds <- DESeqDataSetFromMatrix(countData = countmatrix_2, colData = tissuesub, design = ~tissue) ``` ## Visualize gene count data We’re looking to see if the samples of the same infection status and temperature treatment cluster. ## Log-transform the count data First we are going to log-transform the data using a variance stabilizing transforamtion (vst). This is only for visualization purposes. Essentially, this is roughly similar to putting the data on the log2 scale. It will deal with the sampling variability of low counts by calculating within-group variability (if blind=FALSE). ```{r} gvst <- vst(gdds, blind = FALSE) head (assay(gvst), 3) #-- note: fitType='parametric', but the dispersion trend was not well captured by the # function: y = a/x + b, and a local regression fit was automatically substituted. # specify fitType='local' or 'mean' to avoid this message next time. ``` ## Create a heat map ```{r} gsampleDists <- dist(t(assay(gvst))) #calculate distance matix gsampleDistMatrix <- as.matrix(gsampleDists) #distance matrix rownames(gsampleDistMatrix) <- colnames(gvst) #assign row names colnames(gsampleDistMatrix) <- NULL #assign col names colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) #assign colors pheatmap(gsampleDistMatrix, #plot matrix clustering_distance_rows=gsampleDists, #cluster rows clustering_distance_cols=gsampleDists, #cluster columns col=colors) #set colors ``` ## Principal component plot of samples ```{r} gPCAdata <- plotPCA(gvst, intgroup = c("tissue"), returnData=TRUE) percentVar <- round(100*attr(gPCAdata, "percentVar")) #plot PCA of samples with all data ggplot(gPCAdata, aes(PC1, PC2, color=tissue)) + geom_point(size=4, alpha = 5/10) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) + coord_fixed() ``` ## Differential Gene Expression Analysis NOTE: This will take a few minutes to run. ```{r} DEG.group <- DESeq(gdds, test = "Wald") #run differential expression test by group using Wald (default) DEG.group.results <- results(DEG.group) #save DE results resultsNames(DEG.group) #view DE results ``` ```{r} group.results.ordered <- order(DEG.group.results$pvalue) #Order p-values by smallest value first summary(DEG.group.results) #view results summary ``` ```{r} sig.num <- sum(DEG.group.results$padj < 0.05, na.rm=TRUE) #How many adjusted p-values were less than 0.05? sig.num ```