--- title: "deseq2 with featurecounts" author: "Megan Ewing" date: "2024-05-13" output: html_document --- ```{r} # Load the DESeq2 library # install.packages("BiocManager") # BiocManager::install("DESeq2") library(DESeq2) ``` ### Getting Matrix from Feature Counts ```{r} # Read the FeatureCounts output file into R featureCounts <- read.table("../output/featurecounts_gene", header=TRUE, row.names=1) head(featureCounts) #subset from feature counts of just the count data countData <- featureCounts[, -(1:5)] ``` ```{r} metaData <- data.frame( sample = c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329", "M.C.334", "M.C.337", "M.C.339", "M.C.358", "M.C.360", "M.C.363", "M.C.373", "M.C.482","M.C.488", "M.T.18", "M.T.20", "M.T.22", "M.T.235", "M.T.245", "M.T.29", "M.T.31", "M.T.32", "M.T.399", "M.T.43", "M.T.44", "M.T.500", "M.T.7", "M.T.83", "M.T.8"), treatment = c(rep("control", 15), rep("treatment", 15)) ) ``` ### Running DESeq ```{r} # Create a DESeqDataSet object dds <- DESeqDataSetFromMatrix(countData, colData = metaData, design = ~ treatment) # Normalize the data dds <- DESeq(dds) # Perform differential expression analysis res <- results(dds) ``` ### Filtering for Significant results (padj \< 0.05) ```{r} # Extract significant results sig_results <- res[which(res$padj < 0.05),] # Explore the results head(sig_results) write.table(sig_results,"../output/0513-DEG.tab", row.names = T, col.names = T) # full results (including non signifcant results) write.table(res, "../output/0709-DeSeq_AllCounts.tab", row.names = T, col.names = T) # reading in all results incl. non signifcant, just to take a peek restable <- read.csv("../output/0709-DeSeq_AllCounts.tab", sep="") ``` ### Heatmap of Top 25 Sig. DEG (no pre filter) ```{r} library(pheatmap) res_ordered <- res[order(res$padj), ] top_genes <- row.names(res_ordered)[1:25] # Extract counts and normalize counts <- counts(dds, normalized = TRUE) counts_top <- counts[top_genes, ] # Log-transform counts log_counts_top <- log2(counts_top + 1) # shorten col names short_col <- c("M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18", "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29" ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43" ,"M.T.44" , "M.T.500", "M.T.7" , "M.T.83" , "M.T.8") colnames(log_counts_top) <- short_col head(log_counts_top) write.csv(log_counts_top, "../output/log_counts_top.csv") # Open a PNG device to save the plot png("../output/heatmap_top25_DEGs.png") # Generate the heatmap pheatmap(log_counts_top, scale = "row", cluster_cols = FALSE, main = "Heatmap of Log2 Fold Change for Top 25 DEGs") # Close the PNG device dev.off() ``` ### Heatmap of All (54) sig. DEG (no pre filter) ```{r} # ALL 54 # getting a all (54 is total number, left it so there's less for me to edit code wise) all_genes <- row.names(res_ordered)[1:54] # Extract counts and normalize counts3 <- counts(dds, normalized = TRUE) counts_all <- counts3[all_genes, ] # Log-transform counts log_counts_all <- log2(counts_all + 1) # shorten col names short_col <- c("M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18", "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29" ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43" ,"M.T.44" , "M.T.500", "M.T.7" , "M.T.83" , "M.T.8") colnames(log_counts_all) <- short_col head(log_counts_all) write.csv(log_counts_all, "../output/log_counts_all.csv") # Open a PNG device to save the plot png("../output/heatmap_all_DEGs.png") # Generate the heatmap pheatmap(log_counts_all, scale = "row", cluster_cols = FALSE, main = "Heatmap of Log2 Fold Change for Top 25 DEGs") # Close the PNG device dev.off() ``` ![](http://127.0.0.1:10249/chunk_output/D3453FCD6c41c5b0/22487329/cula8vyttjp54/00003d.png) ### Filter Parameters: Determining Filter Parameters Based of Count Summary Stats ```{r} # Load necessary libraries library(DESeq2) library(ggplot2) library(reshape2) # Assume `dds` is your DESeqDataSet object # Extract the counts matrix counts_matrix <- counts(dds) # Convert the counts matrix to a long format for ggplot2 counts_long <- melt(counts_matrix) # Rename columns for clarity colnames(counts_long) <- c("Gene", "Sample", "Count") # removing 0 counts counts_long <- counts_long[counts_long$Count > 0,] # summary summary(counts_long) # # BOXPLOT # # Rename columns for clarity colnames(counts_long) <- c("Gene", "Sample", "Count") # Plot a single box plot for all counts combined ggplot(counts_long, aes(x = "", y = Count)) + geom_boxplot(fill = "blue", color = "black") + theme_minimal() + scale_y_log10() + labs(title = "Box Plot of Read Counts", x = "", y = "Read Count") # Save the plot ggsave("../output/boxplot_of_all_read_counts.png") # # HISTOGRAM # counts_long <- counts_long[counts_long$Count > 0 & counts_long$Count <= 100, ] # Plot the histogram ggplot(counts_long, aes(x = Count)) + geom_histogram(binwidth = 1, fill = "blue", color = "black") + theme_minimal() + labs(title = "Histogram of Read Counts", x = "Read Count", y = "Frequency") # Save the plot ggsave("../output/0807-histogram_of_read_counts_1to100.png") ``` ### Filtered: Running DESeq (hits must average 10+ reads across 3+ samples) ```{r} # filtering so that only DEGs are identified if they average 10 reads across 3+ samples # Pre-filtering step # Calculate the mean read count per gene mean_counts <- rowMeans(counts(dds)) # Filter genes that have at least 10 reads across 3 or more samples filter <- rowSums(counts(dds) >= 10) >= 5 dds2 <- dds[filter, ] # Normalize the data dds2 <- DESeq(dds2) # Perform differential expression analysis res2 <- results(dds2) # save write.table(res2, "../output/0725-DeSeq_AllCounts_filtered.tab", row.names = T, col.names = T) ``` ### Filtered: Running DESeq, Separated Treatment and Control Filtering the treatment and control groups separately so that we can filter by each group averaging X reads rather than all of the samples at once *need to alter chunk so it is filtering based off "at least 10 individuals have 5+ reads" rather than "an average of 5 reads across 10 individuals" – don't want one individual driving up the average* ```{r} # Assume `dds` is your DESeqDataSet object # Pre-filtering step # Calculate the mean read count per gene in the control and treatment samples mean_counts_control <- rowMeans(counts(dds)[, 1:15]) mean_counts_treatment <- rowMeans(counts(dds)[, 16:30]) # Filter genes that have an average of at least 10 reads in either control or treatment samples filter <- (mean_counts_control >= 10) | (mean_counts_treatment >= 10) dds2 <- dds[filter, ] # Normalize the data dds2 <- DESeq(dds2) # Perform differential expression analysis res2 <- results(dds2) # Save the results write.table(res2, "../output/0727-deseqres_filteredbygroup.tab", row.names = TRUE, col.names = TRUE) ``` ### Filtered: Running DESeq, Seperated Treatment and Control – INDIVUDALS THRESHOLD, NOT AVERAGE BASED ```{r} # Assume `dds` is your DESeqDataSet object # Pre-filtering step # Calculate the counts per gene in the control and treatment samples counts_control <- counts(dds)[, 1:15] counts_treatment <- counts(dds)[, 16:30] # Function to check if a gene has at least 8 samples with at least 10 reads has_min_samples <- function(counts, min_samples = 8, min_reads = 23) { rowSums(counts >= min_reads) >= min_samples } # Filter genes that have at least 8 samples with at least 10 reads in either control or treatment filter <- has_min_samples(counts_control) | has_min_samples(counts_treatment) dds2 <- dds[filter, ] # Normalize the data dds2 <- DESeq(dds2) # Perform differential expression analysis res2 <- results(dds2) # Save the results write.table(res2, "../output/0807-deseqres_ToC_8ind23r.tab", row.names = TRUE, col.names = TRUE) ``` ### Filtered: data explore and filtering significant rename input/output files as needed ```{r} allcounts <- read.csv("../output/0807-deseqres_ToC_8ind23r.tab", sep="") head(allcounts) # Extract significant results sigresults <- res2[which(res2$padj < 0.05),] # Explore the results head(sigresults) write.table(sigresults,"../output/0807-DEGstats_ToC_8ind23r.tab", row.names = T, col.names = T) # peeking at sigresults sigres_table <- read.csv("../output/0807-DEGstats_ToC_8ind23r.tab", sep="") head(sigres_table) sigDEGname <- row.names(sigres_table) write.table(sigDEGname, "../output/0807-DEGnames_ToC_8ind23r.tab", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) ``` ### Filtered: Heatmap of Top 25 Sig. DEG ```{r} # just top 25 genes top_genes <- row.names(res_ordered)[1:25] # Extract counts and normalize counts <- counts(dds2, normalized = TRUE) counts_top <- counts[top_genes, ] # Log-transform counts log_counts_top <- log2(counts_top + 1) # shorten col names short_col <- c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18", "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29" ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43" ,"M.T.44" , "M.T.500", "M.T.7" , "M.T.83" , "M.T.8") colnames(log_counts_top) <- short_col head(log_counts_top) write.csv(log_counts_top, "../output/log_counts_top_filtered.csv") # Open a PNG device to save the plot png("../output/heatmap_top25_DEGs_filtered.png") # Generate the heatmap pheatmap(log_counts_top, scale = "row", cluster_cols = FALSE, main = "Heatmap of Log2 Fold Change for Top 25 DEGs") # Close the PNG device dev.off() ``` ### Filtered: Heatmap of All Sig. DEG ```{r} library(pheatmap) res_ordered <- res2[order(res2$padj), ] # getting a all (replace 1:# with is total # of DEGs (can glance at stored value sigDEGname)) all_deg <- row.names(res_ordered)[1:46] # Extract counts and normalize counts <- counts(dds2, normalized = TRUE) counts_deg <- counts[all_deg, ] # Log-transform counts log_counts_deg <- log2(counts_deg + 1) # shorten col names short_col <- c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18", "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29" ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43" ,"M.T.44" , "M.T.500", "M.T.7" , "M.T.83" , "M.T.8") colnames(log_counts_deg) <- short_col head(log_counts_deg) write.csv(log_counts_deg, "../output/0807-logcounts_allDEG_8ind23r_ToC.csv") # Open a PNG device to save the plot png("../output/0807-DEG_heatmap_8ind23r_ToC_clust.png", width = 1000, height = 1000) # Generate the heatmap pheatmap(log_counts_deg, scale = "row", cluster_cols = TRUE, main = "Heatmap All DEGs (padj < 0.05, 23r 8ind ToC clustered") # Close the PNG device dev.off() ``` ### Post Filter Recurrent Genes Heatmap code below is using the 30n ToC as the base since it had the most genes ```{r} # only selecting for LOC that popped up across all 4 pre-filters # vector of genes that recurred occursmost <- c("LOC132713118", "LOC132722155", "LOC132722176", "LOC132723296", "LOC132725056", "LOC132728236", "LOC132729677", "LOC132730403", "LOC132731011", "LOC132732296", "LOC132734395", "LOC132740341", "LOC132741452", "LOC132741637", "LOC132742068", "LOC132743875", "LOC132745166", "LOC132745822", "LOC132746012", "LOC132746399", "LOC132746437", "LOC132746792", "LOC132752133", "LOC132752300", "LOC132758254", "LOC132758583", "LOC132758975", "LOC132759052", "LOC132759372", "LOC132759828") # dataframe of genes that recurred # occursmost <- data.frame("gene" = occursmost) ``` ```{r} # Load the necessary library library(pheatmap) # Order the results by adjusted p-value res_ordered <- res2[order(res2$padj), ] # Get the top differentially expressed genes (all 84 genes) all_deg <- row.names(res_ordered)[1:87] # Extract counts and normalize counts <- counts(dds2, normalized = TRUE) counts_deg <- counts[all_deg, ] # Log-transform the counts log_counts_deg <- log2(counts_deg + 1) # Shorten column names short_col <- c("M.C.193", "M.C.216", "M.C.218", "M.C.226", "M.C.306", "M.C.329" ,"M.C.334", "M.C.337" ,"M.C.339" ,"M.C.358" ,"M.C.360", "M.C.363", "M.C.373", "M.C.482", "M.C.488", "M.T.18", "M.T.20" , "M.T.22" , "M.T.235" ,"M.T.245", "M.T.29" ,"M.T.31" , "M.T.32" , "M.T.399", "M.T.43", "M.T.44" , "M.T.500", "M.T.7" , "M.T.83" , "M.T.8") colnames(log_counts_deg) <- short_col # Display the first few rows to check the transformation head(log_counts_deg) # Assume occursmost is a vector containing the gene names to include # Example: occursmost <- c("Gene1", "Gene2", "Gene3") # Filter log_counts_deg to include only the genes in occursmost log_counts_deg_filtered <- log_counts_deg[rownames(log_counts_deg) %in% occursmost, ] # Ensure all values are finite log_counts_deg_filtered <- log_counts_deg_filtered[apply(log_counts_deg_filtered, 1, function(x) all(is.finite(x))), ] # Save the filtered log-transformed counts to a CSV file write.csv(log_counts_deg_filtered, "../output/0725-logcounts_filtered_occursmost.csv") # Open a PNG device to save the heatmap png("../output/0725-DEG_heatmap_occursmost.png", width = 1000, height = 1000) # Generate the heatmap with filtered genes pheatmap(log_counts_deg_filtered, scale = "row", cluster_cols = FALSE, main = "Heatmap Filtered DEGs (occursmost)") # Close the PNG device dev.off() ``` ### Abandoned Chunks ```{r} # COMMENTING OUT - ABANDONED CHUNK # # # # # prepping DEG table for joining with blast results in next step by adding in the gene long ids # # deg_noname <- read.csv("../output/0513-DEG.tab", sep = " ") # colnames(deg_noname) <- c("LOC", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj") # # #getting the long gene id names that we will join with blast # gene_id <- data.frame(rownames(featureCounts), featureCounts$Chr) # # #merging those IDs back to our DEGs # deg_names <- merge(x = deg_noname, y = gene_id, by.x = "LOC", by.y = "rownames.featureCounts.") # # #writing the output to a file (overwriting previous one) # write.table(deg_names,"../output/0513-DEG.tab", col.names = T) ```