--- title: "65 - Exon Coverage" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: github_document: toc: true toc_depth: 3 number_sections: true html_preview: true --- ```{r setup, include=FALSE} library(tidyverse) library(kableExtra) library(DESeq2) library(pheatmap) library(RColorBrewer) library(data.table) library(DT) library(Biostrings) library(methylKit) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = FALSE, # Evaluate code chunks warning = TRUE, # Hide warnings message = TRUE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` # Define vectors ```{r exon-sum-threshold} exon_sum_threshold <- 10 ``` # Define functions ## Function to calculate log fold change ```{r function-fold-change} calculate_log_fold_change <- function(data) { result <- data %>% dplyr::select(GeneID, Exon1, Exon2, Exon3, Exon4, Exon5, Exon6) %>% mutate(fold1 = log(Exon1 / Exon1)) %>% mutate(fold2 = log(Exon2 / Exon1)) %>% mutate(fold3 = log(Exon3 / Exon1)) %>% mutate(fold4 = log(Exon4 / Exon1)) %>% mutate(fold5 = log(Exon5 / Exon1)) %>% mutate(fold6 = log(Exon6 / Exon1)) return(result) } ``` ```{r assign-variables} all <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M", "S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M") ``` ## Sum of Exons 1-6 for each file. ```{r} # Initialize an empty list to store results for each file sums_list <- list() # Loop through each file in the list for (file in all) { # Read in the data file data <- read.csv(paste0("../output/65-exon-coverage/", file, "_exon_reads.processed_data.csv")) # Calculate the sum of Exons 1-6 sum_result <- data %>% dplyr::select(GeneID, Exon1, Exon2, Exon3, Exon4, Exon5, Exon6) %>% mutate(Sum = rowSums(.[, -1], na.rm = TRUE)) # Count GeneIDs meeting the threshold for the current file geneid_count <- sum_result %>% dplyr::filter(Sum >= exon_sum_threshold) %>% nrow() cat("\nCount of GeneIDs meeting the threshold (", exon_sum_threshold, "summed reads ) in", file, ":", geneid_count, "\n") # Append the result and count to the lists sums_list[[file]] <- sum_result } ``` ## Filter across all samples Join all data frames in the `sums_list` on `GeneID` and then filter rows for any `GeneID` with sums >= set threshold. ```{r} # Rename the Sum column in each data frame in sums_list # Assigns the original filename to the Sum column in each data frame renamed_sums_list <- lapply(seq_along(sums_list), function(i) { colname <- paste0(all[i], "_Sum") col_index <- grep("Sum", colnames(sums_list[[i]])) colnames(sums_list[[i]])[col_index] <- colname dplyr::select(sums_list[[i]], GeneID, colname) }) # Join data frames in renamed_sums_list on GeneID all_geneids_df <- Reduce(function(x, y) dplyr::full_join(x, y, by = "GeneID"), renamed_sums_list) # Filter rows where all values are >= the set threshold exon_threshold_geneids <- all_geneids_df %>% dplyr::filter(if_all(ends_with("_Sum"), ~. >= exon_sum_threshold)) %>% dplyr::select(GeneID) # Print count of GeneIDs meeting the threshold across all files cat("Final count of unique GeneIDs meeting the threshold (", exon_sum_threshold, "summed reads ) across all files:", nrow(exon_threshold_geneids), "\n") # Print the structure of the final data frame str(exon_threshold_geneids) ``` ### Write filtered GeneIDs to file ```{r} exon_threshold_out_file <- paste0("../output/65-exon-coverage/geneids-exon_threshold-", exon_sum_threshold, ".csv", collapse = "") write.csv(exon_threshold_geneids, file = exon_threshold_out_file) ``` # Calculate fold change for all groups Uses `GeneIDs` passing threshold determined above. ```{r} # Create a list to store results for each group results_list <- list() # Loop through each group for (group_name in c("all")) { # Get the corresponding vector current_vector <- eval(parse(text = group_name)) # Create an empty list to store results for each file in the group group_results <- list() # Loop through each file in the group for (file_id in current_vector) { # Read the data frame current_data <- read.csv(paste0("../output/65-exon-coverage/", file_id, "_exon_reads.processed_data.csv")) # Filter current_data based on exon_threshold_geneids current_data <- subset(current_data, GeneID %in% exon_threshold_geneids$GeneID) # Check if any rows are left after filtering if (nrow(current_data) > 0) { # Calculate fold change current_result <- calculate_log_fold_change(current_data) # Replace infinite values within the data frame current_result <- as.data.frame(lapply(current_result, function(x) ifelse(is.infinite(x), NA, x))) # Ensure unique column names colnames(current_result) <- make.unique(as.character(colnames(current_result)), sep = "_") # write out fc <- paste0("../output/65-exon-coverage/", file_id, "_exonsum_", exon_sum_threshold, "_logfold.csv") write.csv(current_result, file = fc) }}} ```