--- title: "deseq2 with featurecounts" author: "Megan Ewing" date: "2025-05-01" output: html_document --- ```{r} # Load the DESeq2 library # install.packages("BiocManager") # BiocManager::install("DESeq2") library(DESeq2) library(dplyr) ``` ### STAR Count Data provided by Giles ```{r} # Read the FeatureCounts output file into R STAR <- read.csv("../data/STAR_count_data.csv", header=TRUE, row.names=1) head(STAR) #subset from feature counts of just the count data countData <- STAR[, -(31:33)] ``` Now, the STAR_count_data.csv, has not yet corrected for the mislabing of M-C-193 as M-T-193. The following chunks assigns that column to a new position (among the other control samples) and renames it appropriately. ```{r} #confirm location of M-T-193 colnames(countData[16]) ``` ```{r} #we're moving it to first postion to ensure grouping with other control samples and renaming to M-C-193, as it should be countData <- countData[, c(16, 1:15, 17:ncol(countData))] names(countData)[names(countData) == "M.T.193"] <- "M.C.193" head(countData) ``` ```{r} metaData <- data.frame( sample = c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329", "M.C.334", "M.C.337", "M.C.339", "M.C.358", "M.C.360", "M.C.363", "M.C.373", "M.C.482","M.C.488", "M.T.18", "M.T.20", "M.T.22", "M.T.235", "M.T.245", "M.T.29", "M.T.31", "M.T.32", "M.T.399", "M.T.43", "M.T.44", "M.T.500", "M.T.7", "M.T.83", "M.T.8"), treatment = c(rep("control", 15), rep("treatment", 15)) ) ``` ### Running DESeq (initial, no pre-filter) ```{r} # Create a DESeqDataSet object dds <- DESeqDataSetFromMatrix(countData, colData = metaData, design = ~ treatment) # Normalize the data dds <- DESeq(dds) # Perform differential expression analysis res <- results(dds) ``` ### Filter Parameters: Determining Filter Parameters Based of Count Summary Stats ```{r} # Load necessary libraries library(DESeq2) library(ggplot2) library(reshape2) # Assume `dds` is your DESeqDataSet object # Extract the counts matrix counts_matrix <- counts(dds) # Convert the counts matrix to a long format for ggplot2 counts_long <- melt(counts_matrix) # Rename columns for clarity colnames(counts_long) <- c("Gene", "Sample", "Count") # removing 0 counts counts_long <- counts_long[counts_long$Count > 0,] # summary summary(counts_long) # # BOXPLOT # # Rename columns for clarity colnames(counts_long) <- c("Gene", "Sample", "Count") # Plot a single box plot for all counts combined ggplot(counts_long, aes(x = "", y = Count)) + geom_boxplot(fill = "blue", color = "black") + theme_minimal() + scale_y_log10() + labs(title = "Box Plot of Read Counts", x = "", y = "Read Count") # Save the plot ggsave("../images/boxplot_of_all_read_counts.png") # # HISTOGRAM # counts_long <- counts_long[counts_long$Count > 0 & counts_long$Count <= 100, ] # Plot the histogram ggplot(counts_long, aes(x = Count)) + geom_histogram(binwidth = 1, fill = "blue", color = "black") + theme_minimal() + labs(title = "Histogram of Read Counts", x = "Read Count", y = "Frequency") # Save the plot ggsave("../images/histogram_of_read_counts_1to100.png") ``` ### Running DESeq, Seperated Treatment and Control -- INDIVUDALS THRESHOLD, NOT AVERAGE BASED ```{r} # Assume `dds` is your DESeqDataSet object # Pre-filtering step # Calculate the counts per gene in the control and treatment samples counts_control <- counts(dds)[, 1:15] counts_treatment <- counts(dds)[, 16:30] # Function to check if a gene has at least half(n=8) samples in either/both groups with at least median (27) reads has_min_samples <- function(counts, min_samples = 8, min_reads = 27) { rowSums(counts >= min_reads) >= min_samples } # Filter genes that have at least 8 samples with at least 27 reads in either control or treatment filter <- has_min_samples(counts_control) | has_min_samples(counts_treatment) dds2 <- dds[filter, ] # Normalize the data dds2 <- DESeq(dds2) # Perform differential expression analysis res2 <- results(dds2) # Save the results write.table(res2, "../output/DESeq_res-ToC_8ind27r.tab", row.names = TRUE, col.names = TRUE) ``` Getting full counts ```{r} # Extract counts and normalize counts_all <- counts(dds2, normalized = TRUE) # Log-transform counts log_counts_all <- log2(counts_all + 1) # shorten col names short_col <- c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18", "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29" ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43","M.T.44" , "M.T.500", "M.T.7" , "M.T.83" , "M.T.8") colnames(log_counts_all) <- short_col head(log_counts_all) write.csv(log_counts_all, "../output/logcounts_all-8ind27r_ToC.csv") ``` ### Data explore and filtering significant rename input/output files as needed ```{r} allcounts <- read.csv("../output/DESeq_res-ToC_8ind27r.tab", sep="") head(allcounts) # Extract significant results sigresults <- res2[which(res2$padj < 0.05),] # Explore the results head(sigresults) write.table(sigresults,"../output/DEGstats_ToC_8ind27r.tab", row.names = T, col.names = T) # peeking at sigresults sigres_table <- read.csv("../output/DEGstats_ToC_8ind27r.tab", sep="") head(sigres_table) sigDEGname <- row.names(sigres_table) # write.table(sigDEGname, "../output/0807-DEGnames_ToC_8ind27r.tab", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) ``` ### Heatmap w/ LOC we had so few DEGs, makes sense to put all in one heatmap ```{r} library(pheatmap) res_ordered <- res2[order(res2$padj), ] # getting a all (replace 1:# with is total # of DEGs (can glance at stored value sigDEGname)) all_deg <- row.names(res_ordered)[1:48] # Extract counts and normalize counts <- counts(dds2, normalized = TRUE) counts_deg <- counts[all_deg, ] # Log-transform counts log_counts_deg <- log2(counts_deg + 1) # shorten col names short_col <- c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18", "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29" ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43" ,"M.T.44" , "M.T.500", "M.T.7" , "M.T.83" , "M.T.8") colnames(log_counts_deg) <- short_col head(log_counts_deg) # write.csv(log_counts_deg, "../output/1020-STAR-logcounts_allDEG_8ind27r_ToC.csv") # Open a PNG device to save the plot # change naming scheme and cluster_col argument as appropriate. # cluster_cols = F: ****heatmap_unclust.png # cluster_cols = T: ****heatmap_clust.png png("../images/DEG_LOC_heatmap_unclust.png", width = 500, height = 700) # Generate the heatmap pheatmap(log_counts_deg, scale = "row", cluster_cols = F, main = "Heatmap All DEGs (padj < 0.05) clustered") # Close the PNG device dev.off() ``` ### Heatmap w/ Names prep data ```{r} # Order results by adjusted p-value res_ordered <- res2[order(res2$padj), ] # Select differentially expressed genes (DEGs) all_deg <- row.names(res_ordered)[1:48] # Extract and normalize counts counts <- counts(dds2, normalized = TRUE) counts_deg <- counts[all_deg, ] # Log-transform counts log_counts_deg <- log2(counts_deg + 1) ``` prepping descriptive row names ```{r} # create column with row names (LOC), so we can make a heatmap that has gene names instead of LOC# on it STAR2 <- STAR STAR2$LOC <- rownames(STAR) head(STAR2) ``` ```{r} # make dataframe with just LOC and description DEG_names <- data.frame("LOC" = STAR2$LOC, "Description" = STAR2$Desc) head(DEG_names) ``` ```{r} # Ensure the row names of the heatmap data are stored row_names_current <- rownames(log_counts_deg) # Map current row names to new descriptions using DEG_names dataframe new_gene_names <- DEG_names$Description[match(row_names_current, DEG_names$LOC)] # Assign the new names to the heatmap dataframe rownames(log_counts_deg) <- new_gene_names head(log_counts_deg) ``` ```{r} library(pheatmap) # Shorten column names short_col <- c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329", "M.C.334", "M.C.337", "M.C.339", "M.C.358", "M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18", "M.T.20", "M.T.22", "M.T.235", "M.T.245", "M.T.29", "M.T.31", "M.T.32", "M.T.399", "M.T.43", "M.T.44", "M.T.500", "M.T.7", "M.T.83", "M.T.8") colnames(log_counts_deg) <- short_col # Define conditions (Ensure this matches the column order) condition_vector <- c(rep("Control", 15), rep("Primed", 15)) # Create annotation for the heatmap annotation_col <- data.frame(Condition = factor(condition_vector)) rownames(annotation_col) <- short_col # Ensure rownames match column names of data # Define colors for annotation ann_colors <- list(Condition = c(Control = "cadetblue", Primed = "salmon"), Border = "grey") # Save the heatmap png("../images/DEG_named_heatmap_unclust.png", width = 1000, height = 700) pheatmap(log_counts_deg, scale = "row", cluster_cols = FALSE, main = "Heatmap All DEGs (padj < 0.05) Clustered", border_color = "grey", annotation_col = annotation_col, # Add annotation annotation_colors = ann_colors # Define annotation colors ) dev.off() ```