---
title: "03.0-D-Apul-lncRNAseq-expression-DESeq2"
author: "Kathleen Durkin"
date: "2024-05-01"
output: 
  github_document:
    toc: true
    number_sections: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---

Expression summary for *Acropora pulchra* lncRNA data.

### Install and load packages

```{r load_libraries, include = TRUE}
library(tidyverse)
library(ggplot2)
library(reshape2)
library(pheatmap)
library(RColorBrewer)
library(DESeq2)
library(ComplexHeatmap)
```

## Load count data and coldata

Load in the lncRNA count matrix generated in `08-Apul-lncRNA`. Coldata generated in `03.00-D-Apul-RNAseq-gene-expression-DESeq2`

```{r load-lncRNA-counts}
# Read in lncRNA counts data (the first line of the file is skipped; the
# second line holds the column headers)
Apul_counts_lncRNA_data_OG <- read.table(
  "../output/08-Apul-lncRNA/counts.txt",
  header = TRUE, sep = "\t", skip = 1
)
head(Apul_counts_lncRNA_data_OG)

# Read in coldata; the first column becomes the row names (sample IDs)
coldata_OG <- read.csv(
  file = "../output/03.00-D-Apul-RNAseq-gene-expression-DESeq2/DESeq2-coldata.tab",
  row.names = 1, sep = "\t"
)
coldata_OG$time.point <- factor(coldata_OG$time.point)
```

## Count data munging

```{r lncRNA-count-data-munging}
# Work on copies so the objects as originally loaded stay untouched
Apul_counts_lncRNA <- Apul_counts_lncRNA_data_OG
coldata <- coldata_OG

# Remove excess portions of sample column names to just "sample###".
# fixed = TRUE matches the prefix/suffix literally; without it every "." is a
# regex wildcard that matches any character.
colnames(Apul_counts_lncRNA) <- sub(
  "...data.", "", colnames(Apul_counts_lncRNA), fixed = TRUE
)
colnames(Apul_counts_lncRNA) <- sub(
  ".sorted.bam", "", colnames(Apul_counts_lncRNA), fixed = TRUE
)

# Keep just the counts and names, and make the lncRNA IDs our new row names
Apul_counts_lncRNA <- Apul_counts_lncRNA %>%
  select(-Chr, -Start, -End, -Strand, -Length) %>%
  column_to_rownames(var = "Geneid")

# Remove any with 0 counts in all samples
Apul_counts_lncRNA <- Apul_counts_lncRNA[rowSums(Apul_counts_lncRNA) != 0, ]

# Look up colony and timepoint for every count column. A sample that is not
# found in coldata comes back as NA, which would otherwise be pasted silently
# into the sample name ("sample_NA_NA"), so fail loudly instead.
sample_ids <- colnames(Apul_counts_lncRNA)
sample_info <- coldata[sample_ids, c("colony.id", "time.point")]
unmatched_samples <- sample_ids[
  is.na(sample_info$colony.id) | is.na(sample_info$time.point)
]
if (length(unmatched_samples) > 0) {
  stop(
    "No colony/timepoint metadata found for: ",
    paste(unmatched_samples, collapse = ", "),
    call. = FALSE
  )
}

# Append colony and timepoint info to sample names
colnames(Apul_counts_lncRNA) <- paste(
  sample_ids, sample_info$colony.id, sample_info$time.point,
  sep = "_"
)

# Make sure coldata metadata has matching rownames (for DEseq2 formatting)
rownames(coldata) <- paste(
  rownames(coldata), coldata$colony.id, coldata$time.point,
  sep = "_"
)

# Create the output directory if needed so the write below cannot fail on a
# fresh checkout
formatted_counts_file <- "../output/03.20-D-Apul-lncRNA-expression-DESeq2/Apul_lncRNA_ShortStack_counts_formatted.txt"
dir.create(dirname(formatted_counts_file), recursive = TRUE, showWarnings = FALSE)

write.table(
  Apul_counts_lncRNA,
  file = formatted_counts_file,
  sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE
)

head(Apul_counts_lncRNA)
head(coldata)
```

## Expression levels

Plot histograms of the expression levels in each sample

```{r expression-level-histograms}
# Melt the count matrix into long format
Apul_counts_lncRNA_melted <- melt(
  Apul_counts_lncRNA,
  variable.name = "sample", value.name = "counts"
)

# Plot the expression level histograms for each sample
# NOTE(review): with scale_x_log10() the binwidth is presumably interpreted on
# the log10 scale (one bin per order of magnitude) and zero counts are dropped
# -- confirm this is the intended view.
ggplot(Apul_counts_lncRNA_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() + # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(
    title = "Gene Expression Level Histogram for Each Sample",
    x = "Expression Level (Counts)",
    y = "Frequency"
  ) +
  theme_minimal()
```

## Transcript counts

First let's check the total number of transcripts in each sample -- keep in mind this expression data has *not* been normalized yet, so there may be different totals for each sample

```{r transcript-counts-plot}
# Calculate the total number of transcripts for each sample
total_transcripts <- colSums(Apul_counts_lncRNA)

# Create a data frame for plotting
total_transcripts_df <- data.frame(
  sample = names(total_transcripts),
  totals = total_transcripts
)

# Plot the total number of transcripts for each sample
ggplot(total_transcripts_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = totals), vjust = -0.3, size = 3.5) +
  labs(
    title = "Total Number of Transcripts per Sample",
    x = "Sample",
    y = "Total Transcripts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
```

No glaring discrepancies/patterns

Now let's check the number of unique transcripts in each sample -- that is, how many unique lncRNAs are expressed in each sample? This should be pretty much the same across samples, even without normalization.

```{r total-unique-transcripts-plot}
# Calculate the number of unique transcripts (non-zero counts) for each sample
unique_transcripts <- colSums(Apul_counts_lncRNA > 0)

# Create a data frame for plotting
unique_transcripts_df <- data.frame(
  sample = names(unique_transcripts),
  uniques = unique_transcripts
)

# Plot the total number of unique transcripts for each sample
ggplot(unique_transcripts_df, aes(x = reorder(sample, uniques), y = uniques)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = uniques), vjust = -0.3, size = 3.5) +
  labs(
    title = "Total Number of Unique Expressed Transcripts per Sample",
    x = "Sample",
    y = "Unique Transcripts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
```

# ...........

## Normalize counts with DESeq2

### Metadata

DESeq2 requires a metadata data frame as input -- we'll use the coldata we've already formatted

```{r make-lncRNA-metadata-dataframe}
head(coldata)
```

### DESeq object

## Verify rownames match

```{r check-rownames}
# Alphabetize rownames of coldata and colnames of Apul_counts_lncRNA
coldata <- coldata[order(rownames(coldata)), ]
Apul_counts_lncRNA <- Apul_counts_lncRNA[, order(colnames(Apul_counts_lncRNA))]

# Stop (rather than just print FALSE) if metadata rows and count columns do
# not line up one-to-one; identical() also catches a difference in length,
# which `==` would silently recycle over.
samples_match <- identical(rownames(coldata), colnames(Apul_counts_lncRNA))
samples_match
stopifnot(samples_match)
```

# Create DESeq2 data set

```{r create-deseq2-data-set, cache=TRUE}
dds_Apul_lncRNA <- DESeqDataSetFromMatrix(
  countData = Apul_counts_lncRNA,
  colData = coldata,
  design = ~ time.point + colony.id
)
dds_Apul_lncRNA
```

```{r run-deseq2}
# Fix the level order of time point explicitly, then fit the model
dds_Apul_lncRNA$time.point <- factor(
  dds_Apul_lncRNA$time.point,
  levels = c("TP1", "TP2", "TP3", "TP4")
)

dds_Apul_lncRNA <- DESeq(dds_Apul_lncRNA)
```

## Pairwise results tables

```{r deseq2-pairwise-results-tables}
# Define the output directory path (created here if it does not exist yet)
output_dir <- "../output/03.20-D-Apul-lncRNA-expression-DESeq2/"
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

# Set desired false discovery rate threshold (i.e. adjusted p-value, padj)
fdr <- 0.05

# Set log2 fold change threshold (a value of '1' is equal to a fold change of '2')
log2fc <- 1

# Extract the results table for one pair of time points using the thresholds
# above. Per the DESeq2 `results()` documentation the contrast is
# c(factor, numerator, denominator), so "tpA.v.tpB" holds log2(tpA / tpB).
pairwise_timepoint_results <- function(numerator, denominator) {
  results(
    dds_Apul_lncRNA,
    contrast = c("time.point", numerator, denominator),
    alpha = fdr,
    lfcThreshold = log2fc
  )
}

lncRNA_tp1.v.tp2.results <- pairwise_timepoint_results("TP1", "TP2")
lncRNA_tp1.v.tp3.results <- pairwise_timepoint_results("TP1", "TP3")
lncRNA_tp1.v.tp4.results <- pairwise_timepoint_results("TP1", "TP4")
lncRNA_tp2.v.tp3.results <- pairwise_timepoint_results("TP2", "TP3")
lncRNA_tp2.v.tp4.results <- pairwise_timepoint_results("TP2", "TP4")
lncRNA_tp3.v.tp4.results <- pairwise_timepoint_results("TP3", "TP4")

lncRNA_tp2.v.tp4.results

summary(lncRNA_tp2.v.tp4.results)

# Use the same threshold as above instead of a second hard-coded 0.05
table(lncRNA_tp2.v.tp4.results$padj < fdr)
```

Write DDS results tables to CSVs

```{r write-dds-results-csv}
# Create a named list of the results tables; the names become the file names
results_list <- list(
  lncRNA_tp1.v.tp2.results = lncRNA_tp1.v.tp2.results,
  lncRNA_tp1.v.tp3.results = lncRNA_tp1.v.tp3.results,
  lncRNA_tp1.v.tp4.results = lncRNA_tp1.v.tp4.results,
  lncRNA_tp2.v.tp3.results = lncRNA_tp2.v.tp3.results,
  lncRNA_tp2.v.tp4.results = lncRNA_tp2.v.tp4.results,
  lncRNA_tp3.v.tp4.results = lncRNA_tp3.v.tp4.results
)

# Loop through the list and write each table to a CSV file in the specified
# directory. The results objects are converted to plain data frames
# explicitly rather than relying on write.csv's implicit coercion.
for (df_name in names(results_list)) {
  write.csv(
    as.data.frame(results_list[[df_name]]),
    file = paste0(output_dir, df_name, ".table.csv"),
    row.names = TRUE, quote = FALSE
  )
}
```

## Normalizations

It's worth noting here that I'm actually going to be doing two different types of transformation on the counts data, which serve different purposes.

- First is **normalizing** the transcript counts, which adjusts for differences in library size or sequencing depth, but retains count-like properties. Normalized counts are most useful for things like visualizing expression levels and differential expression analysis.

- Second is **variance stabilizing** the counts data, which aims to make the variance of the transformed data approximately independent of the mean, reducing heteroscedasticity (the relationship between variance and mean) and "smoothing" out the variance at low counts. Notably, the transformed data is *no longer on the original count scale*. The transformation makes the variance roughly constant across the range of counts, which makes it easier to interpret patterns in the data visually. Variance stabilized data is most useful for exploratory data analysis, like PCA, clustering, and heatmaps, and is also the transformation we'll want to use before WGCNA.

```{r get-normalized-lncRNA-counts, cache=TRUE}
# Write a matrix/data frame as a tab-separated table, keeping row names
# (lncRNA IDs) and column names (samples)
write_counts_table <- function(x, file) {
  write.table(
    x,
    file = file,
    sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE
  )
}

# extract normalized counts
# (normalization is automatically performed by deseq2)
Apul_counts_lncRNA_norm <- counts(dds_Apul_lncRNA, normalized = TRUE) %>%
  data.frame()

write_counts_table(
  Apul_counts_lncRNA_norm,
  "../output/03.20-D-Apul-lncRNA-expression-DESeq2/Apul_counts_lncRNA_normalized.txt"
)

# variance stabilized data
# NOTE(review): vsd_Apul_lncRNA (blind = TRUE) is what the PCA and sample
# clustering use, while the tables written below come from
# getVarianceStabilizedData(); the two may not be numerically identical --
# confirm that using both is intended.
vsd_Apul_lncRNA <- varianceStabilizingTransformation(dds_Apul_lncRNA, blind = TRUE)
wpn_vsd_Apul_lncRNA <- getVarianceStabilizedData(dds_Apul_lncRNA)

# Per-lncRNA variance across samples; computed once and reused for both
# quantile cut-offs below
rv_wpn_Apul_lncRNA <- rowVars(wpn_vsd_Apul_lncRNA, useNames = TRUE)

Apul_counts_lncRNA_vsd <- data.frame(wpn_vsd_Apul_lncRNA)

write_counts_table(
  Apul_counts_lncRNA_vsd,
  "../output/03.20-D-Apul-lncRNA-expression-DESeq2/Apul_counts_lncRNA_variancestabilized.txt"
)

# 75th quantile variability
q75_wpn_Apul_lncRNA <- quantile(rv_wpn_Apul_lncRNA, .75)
# filter to retain only the most variable genes
# (drop = FALSE keeps a matrix even if a single row passes the filter)
Apul_counts_lncRNA_vsd_q75 <- wpn_vsd_Apul_lncRNA[
  rv_wpn_Apul_lncRNA > q75_wpn_Apul_lncRNA, ,
  drop = FALSE
] %>%
  data.frame()

write_counts_table(
  Apul_counts_lncRNA_vsd_q75,
  "../output/03.20-D-Apul-lncRNA-expression-DESeq2/Apul_counts_lncRNA_variancestabilized_q75.txt"
)

# 95th quantile variability
q95_wpn_Apul_lncRNA <- quantile(rv_wpn_Apul_lncRNA, .95)
# filter to retain only the most variable genes
Apul_counts_lncRNA_vsd_q95 <- wpn_vsd_Apul_lncRNA[
  rv_wpn_Apul_lncRNA > q95_wpn_Apul_lncRNA, ,
  drop = FALSE
] %>%
  data.frame()

write_counts_table(
  Apul_counts_lncRNA_vsd_q95,
  "../output/03.20-D-Apul-lncRNA-expression-DESeq2/Apul_counts_lncRNA_variancestabilized_q95.txt"
)
```

## Plot normalized data

```{r plot-normalized-lncRNA}
# Long format: one row per lncRNA x sample ("name" = sample, "value" = count)
Apul_counts_lncRNA_norm_long <- Apul_counts_lncRNA_norm %>%
  mutate(
    Gene_id = row.names(Apul_counts_lncRNA_norm)
  ) %>%
  pivot_longer(-Gene_id)

Apul_counts_lncRNA_norm_long %>%
  ggplot(., aes(x = name, y = value)) +
  geom_violin() +
  geom_point() +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  ylim(0, NA) +
  labs(
    title = "Normalized Expression",
    x = "Sample",
    y = "Normalized counts"
  )
```

## Plot variance stabilized data

```{r plot-vsd-lncRNA}
# Long format: one row per lncRNA x sample ("name" = sample, "value" = VST)
Apul_counts_lncRNA_vsd_long <- Apul_counts_lncRNA_vsd %>%
  mutate(
    Gene_id = row.names(Apul_counts_lncRNA_vsd)
  ) %>%
  pivot_longer(-Gene_id)

Apul_counts_lncRNA_vsd_long %>%
  ggplot(., aes(x = name, y = value)) +
  geom_violin() +
  geom_point() +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  ylim(0, NA) +
  labs(
    title = "Variance Stabilized Expression",
    x = "Sample",
    y = "Variance stabilized data"
  )
```

## Normalized expression levels

Plot histograms of the normalized expression levels in each sample

```{r norm-expression-level-histograms}
# Melt the count matrix into long format
Apul_counts_norm_melted <- melt(
  Apul_counts_lncRNA_norm,
  variable.name = "sample", value.name = "counts"
)

# Plot the expression level histograms for each sample
ggplot(Apul_counts_norm_melted, aes(x = counts)) +
  geom_histogram(binwidth = 1, fill = "#408EC6", color = "black") +
  scale_x_log10() + # Optional: Log-transform the x-axis for better visualization
  facet_wrap(~sample, scales = "free_y") +
  labs(
    title = "Gene Expression Level Histogram for Each Sample",
    x = "Expression Level (Counts)",
    y = "Frequency"
  ) +
  theme_minimal()
```

## Normalized transcript counts

Check the total number of transcripts in each sample -- now that we've normalized the data these totals should be similar

```{r norm-transcript-counts-plot}
# Calculate the total number of transcripts for each sample
total_transcripts_norm <- colSums(Apul_counts_lncRNA_norm)

# Create a data frame for plotting
total_transcripts_norm_df <- data.frame(
  sample = names(total_transcripts_norm),
  totals = total_transcripts_norm
)

# Plot the total number of transcripts for each sample.
# Normalized totals are not integers, so the bar labels are rounded to keep
# them readable; bar heights still use the exact values.
ggplot(total_transcripts_norm_df, aes(x = reorder(sample, totals), y = totals)) +
  geom_bar(stat = "identity", fill = "#408EC6", color = "black") +
  geom_text(aes(label = round(totals)), vjust = -0.3, size = 3.5) +
  labs(
    title = "Total Number of Transcripts per Sample",
    x = "Sample",
    y = "Total Transcripts"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
```

## PCA of variance stabilized data

```{r PCA}
plotPCA(vsd_Apul_lncRNA, intgroup = "time.point")

plotPCA(vsd_Apul_lncRNA, intgroup = "colony.id")
```

Samples are strongly clustering by colony. Interestingly time point doesn't appear to influence clustering.

## Sample clustering

```{r sample-clustering}
# Euclidean distances between samples on the variance stabilized scale
sample_dists <- dist(t(assay(vsd_Apul_lncRNA)))

# NOTE(review): ComplexHeatmap is attached after pheatmap in the library
# chunk, so a bare pheatmap() call presumably resolves to
# ComplexHeatmap::pheatmap -- confirm which implementation is intended.
pheatmap(
  as.matrix(sample_dists),
  clustering_distance_rows = "euclidean",
  clustering_distance_cols = "euclidean",
  main = "Sample Clustering"
)
```

Samples are strongly clustering by colony.

## Heatmaps

Of most variable variance stabilized lncRNA transcripts

```{r heatmpas}
# Reversed RdYlBu palette. RdYlBu provides at most 11 colours; requesting 12
# makes brewer.pal() warn and fall back to 11, so ask for 11 directly.
heat_colors <- rev(brewer.pal(11, "RdYlBu"))

# 75th quantile
pheatmap(
  Apul_counts_lncRNA_vsd_q75,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  show_rownames = TRUE,
  show_colnames = TRUE,
  color = heat_colors,
  scale = "row"
)

# 95th quantile
pheatmap(
  Apul_counts_lncRNA_vsd_q95,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  show_rownames = TRUE,
  show_colnames = TRUE,
  color = heat_colors,
  scale = "row"
)
```