---
title: "65 - Exon Coverage"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`" 
output: 
  github_document:
    toc: true
    toc_depth: 3
    number_sections: true
    html_preview: true
---

```{r setup, include=FALSE}
library(tidyverse)
library(kableExtra)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(data.table)
library(DT)
library(Biostrings)
library(methylKit)
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = FALSE,         # Evaluate code chunks
  warning = TRUE,     # Hide warnings
  message = TRUE,     # Hide messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```

# Define vectors


```{r exon-sum-threshold}
exon_sum_threshold <- 10
```

# Define functions

##  Function to calculate log fold change
```{r function-fold-change}
calculate_log_fold_change <- function(data) {
  result <- data %>%
    dplyr::select(GeneID, Exon1, Exon2, Exon3, Exon4, Exon5, Exon6) %>%
    mutate(fold1 = log(Exon1 / Exon1)) %>%
    mutate(fold2 = log(Exon2 / Exon1)) %>%
    mutate(fold3 = log(Exon3 / Exon1)) %>%
    mutate(fold4 = log(Exon4 / Exon1)) %>%
    mutate(fold5 = log(Exon5 / Exon1)) %>%
    mutate(fold6 = log(Exon6 / Exon1))
  
  return(result)
}
```


```{r assign-variables}
all <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M", "S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M")
```

## Sum of Exons 1-6 for each file.
```{r}
# Initialize an empty list to store results for each file
sums_list <- list()

# Loop through each file in the list
for (file in all) {
  # Read in the data file
  data <- read.csv(paste0("../output/65-exon-coverage/", file, "_exon_reads.processed_data.csv"))

  # Calculate the sum of Exons 1-6
  sum_result <- data %>%
    dplyr::select(GeneID, Exon1, Exon2, Exon3, Exon4, Exon5, Exon6) %>%
    mutate(Sum = rowSums(.[, -1], na.rm = TRUE))

  # Count GeneIDs meeting the threshold for the current file
  geneid_count <- sum_result %>%
    dplyr::filter(Sum >= exon_sum_threshold) %>%
    nrow()
  cat("\nCount of GeneIDs meeting the threshold (", exon_sum_threshold, "summed reads ) in", file, ":", geneid_count, "\n")

  # Append the result and count to the lists
  sums_list[[file]] <- sum_result
}
```
## Filter across all samples

Join all data frames in the `sums_list` on `GeneID` and
then filter rows for any `GeneID` with sums >= set threshold.

```{r}
# Rename the Sum column in each data frame in sums_list
# Assigns the original filename to the Sum column in each data frame
renamed_sums_list <- lapply(seq_along(sums_list), function(i) {
  colname <- paste0(all[i], "_Sum")
  col_index <- grep("Sum", colnames(sums_list[[i]]))
  colnames(sums_list[[i]])[col_index] <- colname
  dplyr::select(sums_list[[i]], GeneID, colname)
})

# Join data frames in renamed_sums_list on GeneID
all_geneids_df <- Reduce(function(x, y) dplyr::full_join(x, y, by = "GeneID"), renamed_sums_list)

# Filter rows where all values are >= the set threshold
exon_threshold_geneids <- all_geneids_df %>%
  dplyr::filter(if_all(ends_with("_Sum"), ~. >= exon_sum_threshold)) %>%
  dplyr::select(GeneID)

# Print count of GeneIDs meeting the threshold across all files
cat("Final count of unique GeneIDs meeting the threshold (", exon_sum_threshold, "summed reads ) across all files:", nrow(exon_threshold_geneids), "\n")

# Print the structure of the final data frame
str(exon_threshold_geneids)
```

### Write filtered GeneIDs to file
```{r}
exon_threshold_out_file <- paste0("../output/65-exon-coverage/geneids-exon_threshold-", exon_sum_threshold, ".csv", collapse = "")
write.csv(exon_threshold_geneids, file = exon_threshold_out_file)
```

# Calculate fold change for all groups

Uses `GeneIDs` passing threshold determined above.
```{r}
# Create a list to store results for each group
results_list <- list()

# Loop through each group
for (group_name in c("all")) {
  # Get the corresponding vector
  current_vector <- eval(parse(text = group_name))
  
  # Create an empty list to store results for each file in the group
  group_results <- list()
  
  # Loop through each file in the group
  for (file_id in current_vector) {
    # Read the data frame
    current_data <- read.csv(paste0("../output/65-exon-coverage/", file_id, "_exon_reads.processed_data.csv"))
    
    # Filter current_data based on exon_threshold_geneids
    current_data <- subset(current_data, GeneID %in% exon_threshold_geneids$GeneID)
    
    # Check if any rows are left after filtering
    if (nrow(current_data) > 0) {
      # Calculate  fold change
      current_result <- calculate_log_fold_change(current_data)
      
      # Replace infinite values within the data frame
      current_result <- as.data.frame(lapply(current_result, function(x) ifelse(is.infinite(x), NA, x)))
      
      # Ensure unique column names
      colnames(current_result) <- make.unique(as.character(colnames(current_result)), sep = "_")
      
        # write out
  
  fc <- paste0("../output/65-exon-coverage/", file_id, "_exonsum_", exon_sum_threshold, "_logfold.csv")
  write.csv(current_result, file = fc)
  
    }}}

```