---
title: "71 fm exon ind"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`" 
output: 
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  # github_document:
  #   toc: true
  #   toc_depth: 3
  #   number_sections: true
  #   html_preview: true
---

```{r setup, include=FALSE}
#library(kableExtra)
# library(DESeq2)
# library(pheatmap)
# library(RColorBrewer)
# library(data.table)
#library(DT)
# library(Biostrings)
#library(methylKit)
library(tidyverse)

# Default options applied to every code chunk in this document.
knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = TRUE,         # Evaluate code chunks
  warning = TRUE,     # Show warnings in the rendered output
  message = TRUE,     # Show messages in the rendered output
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```

We have exon-level fold expression data for all samples.


# Metadata

| Sample.ID | OldSample.ID | Treatment | Sex | TreatmentN | Parent.ID |
|-----------|--------------|-----------|-----|------------|-----------|
| 12M       | S12M         | Exposed   | M   | 3          | EM05      |
| 13M       | S13M         | Control   | M   | 1          | CM04      |
| 16F       | S16F         | Control   | F   | 2          | CF05      |
| 19F       | S19F         | Control   | F   | 2          | CF08      |
| 22F       | S22F         | Exposed   | F   | 4          | EF02      |
| 23M       | S23M         | Exposed   | M   | 3          | EM04      |
| 29F       | S29F         | Exposed   | F   | 4          | EF07      |
| 31M       | S31M         | Exposed   | M   | 3          | EM06      |
| 35F       | S35F         | Exposed   | F   | 4          | EF08      |
| 36F       | S36F         | Exposed   | F   | 4          | EF05      |
| 39F       | S39F         | Control   | F   | 2          | CF06      |
| 3F        | S3F          | Exposed   | F   | 4          | EF06      |
| 41F       | S41F         | Exposed   | F   | 4          | EF03      |
| 44F       | S44F         | Control   | F   | 2          | CF03      |
| 48M       | S48M         | Exposed   | M   | 3          | EM03      |
| 50F       | S50F         | Exposed   | F   | 4          | EF01      |
| 52F       | S52F         | Control   | F   | 2          | CF07      |
| 53F       | S53F         | Control   | F   | 2          | CF02      |
| 54F       | S54F         | Control   | F   | 2          | CF01      |
| 59M       | S59M         | Exposed   | M   | 3          | EM01      |
| 64M       | S64M         | Control   | M   | 1          | CM05      |
| 6M        | S6M          | Control   | M   | 1          | CM02      |
| 76F       | S76F         | Control   | F   | 2          | CF04      |
| 77F       | S77F         | Exposed   | F   | 4          | EF04      |
| 7M        | S7M          | Control   | M   | 1          | CM01      |
| 9M        | S9M          | Exposed   | M   | 3          | EM02      |

```
all <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M", "S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M")
controls <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M")
exposed <- c("S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M")
females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F", "S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F")
males <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M","S13M", "S64M", "S6M", "S7M")
controls_males <- c("S13M", "S64M", "S6M", "S7M")
exposed_males <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M")
controls_females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F")
exposed_females <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F")
```

# Loading log fold


```{r}
# Directory holding one `<sample>_exonsum_10_logfold.csv` file per sample.
logfold_dir <- "../output/65-exon-coverage"

# Sample IDs, in the order their tables are loaded. Each ID is also the name
# of the corresponding element of `logdata_frames`.
logfold_sample_ids <- c(
  "S12M", "S13M", "S16F", "S19F", "S22F", "S23M", "S29F", "S31M", "S35F",
  "S36F", "S39F", "S3F", "S41F", "S44F", "S48M", "S50F", "S52F", "S53F",
  "S54F", "S59M", "S64M", "S6M", "S76F", "S77F", "S7M", "S9M"
)

# Build the paths from the IDs instead of typing each one out, so a path and
# its list name can never disagree.
logfile_paths <- file.path(
  logfold_dir,
  paste0(logfold_sample_ids, "_exonsum_10_logfold.csv")
)

# Fail early, naming every missing input, rather than stopping at the first
# file read.csv() cannot open.
missing_logfiles <- logfile_paths[!file.exists(logfile_paths)]
if (length(missing_logfiles) > 0) {
  stop(
    "Missing log-fold file(s): ",
    paste(missing_logfiles, collapse = ", "),
    call. = FALSE
  )
}

# Read every file into a list of data frames keyed by sample ID.
logdata_frames <- lapply(logfile_paths, read.csv)
names(logdata_frames) <- logfold_sample_ids

# `logdata_frames` is now a named list of data frames; access one by its
# sample ID, for example:
# logdata_frames$S12M  # contents of S12M_exonsum_10_logfold.csv

```

```{r}
# Preview the first rows of the S12M table (exact-name extraction).
head(logdata_frames[["S12M"]])
```


The female samples are split into control and exposed groups as follows:

```
controls_females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F")
exposed_females <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F")
```


```{r}

# Female sample IDs by treatment group. A sample listed in neither vector
# (for example the male samples) gets Exposure_Status = NA below.
controls_females <- c(
  "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F"
)
exposed_females <- c(
  "S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F"
)

# For each sample table keep only the rows of the gene of interest, then
# record the sample the rows came from (`Source`) and that sample's treatment
# group (`Exposure_Status`). imap() passes each list element together with its
# name, so the result carries the same names as `logdata_frames`.
filtered_logdata_frames <- imap(
  logdata_frames,
  function(sample_table, sample_label) {
    sample_table %>%
      filter(GeneID == "LOC111126804") %>%
      mutate(
        Source = sample_label,
        Exposure_Status = case_when(
          Source %in% controls_females ~ "control",
          Source %in% exposed_females ~ "exposed",
          TRUE ~ NA_character_ # Source is in neither group
        )
      )
  }
)

# Stack the per-sample tables into one data frame
merged_dataframe <- bind_rows(filtered_logdata_frames)
```

```{r}
# Reshape `merged_dataframe` from wide to long: every column whose name starts
# with "fold" becomes its own row, with the column name stored in `Fold` and
# the cell content in `Value`.
long_dataframe <- pivot_longer(merged_dataframe,
                               cols = starts_with("fold"),
                               names_to = "Fold",
                               values_to = "Value")

# One line per sample (`Source`), coloured by exposure status.
# `Fold` is a character column, so ggplot would sort the x axis alphabetically
# (a name ending in "10" lands before one ending in "2"). fct_inorder() keeps
# the fold columns in the order they occur in the data; `long_dataframe`
# itself is left unchanged.
ggplot(long_dataframe,
       aes(x = fct_inorder(Fold), y = Value,
           group = Source, color = Exposure_Status)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "GOI",
       x = "Fold",
       y = "Value") +
  scale_color_viridis_d() # Optional: Use a viridis color scale for better visibility

```


```{r}
# Female sample IDs by treatment group. A sample listed in neither vector
# gets Exposure_Status = NA below.
controls_females <- c(
  "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F"
)
exposed_females <- c(
  "S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F"
)

# Genes (LOC IDs) to extract from every sample table
loc_numbers <- c(
  "LOC111099772", "LOC111100989", "LOC111102631", "LOC111104769",
  "LOC111106458", "LOC111108202", "LOC111110841", "LOC111112400",
  "LOC111113283", "LOC111114474", "LOC111117013", "LOC111117739",
  "LOC111119268", "LOC111119697", "LOC111120203", "LOC111121088",
  "LOC111121543", "LOC111121944", "LOC111122118", "LOC111125917",
  "LOC111126146", "LOC111126693", "LOC111127043", "LOC111129029",
  "LOC111129868", "LOC111130078", "LOC111130115", "LOC111130425",
  "LOC111130633", "LOC111130837", "LOC111132656", "LOC111132880",
  "LOC111133439", "LOC111134707", "LOC111136541", "LOC111137437"
)

# Every (sample, gene) combination. expand.grid() varies its first column
# fastest, so the pairs run gene by gene with all samples inside each gene.
sample_gene_pairs <- expand.grid(
  sample_label = names(logdata_frames),
  target_gene = loc_numbers,
  stringsAsFactors = FALSE
)

# For each pair, pull the gene's rows out of the sample's table and tag them
# with the sample (`Source`) and its treatment group (`Exposure_Status`).
filtered_logdata_frames <- map2(
  sample_gene_pairs$sample_label,
  sample_gene_pairs$target_gene,
  function(sample_label, target_gene) {
    logdata_frames[[sample_label]] %>%
      filter(GeneID == target_gene) %>%
      mutate(
        Source = sample_label,
        Exposure_Status = case_when(
          Source %in% controls_females ~ "control",
          Source %in% exposed_females ~ "exposed",
          TRUE ~ NA_character_ # Source is in neither group
        )
      )
  }
)

# Key each table as "<sample>_<gene>"
names(filtered_logdata_frames) <- paste(
  sample_gene_pairs$sample_label,
  sample_gene_pairs$target_gene,
  sep = "_"
)

# Stack all per-sample, per-gene tables into one data frame
merged_dataframe <- bind_rows(filtered_logdata_frames)

# Reshape `merged_dataframe` from wide to long: every column whose name starts
# with "fold" becomes its own row, with the column name stored in `Fold` and
# the cell content in `Value`. The `GeneID` column is carried along and is
# used for faceting below.
long_dataframe <- pivot_longer(merged_dataframe,
                               cols = starts_with("fold"),
                               names_to = "Fold",
                               values_to = "Value")

# One panel per gene, one line per sample (`Source`), coloured by exposure
# status.
# `Fold` is a character column, so ggplot would sort the x axis alphabetically
# (a name ending in "10" lands before one ending in "2"). fct_inorder() keeps
# the fold columns in the order they occur in the data; `long_dataframe`
# itself is left unchanged.
# NOTE(review): x shows the fold column names and y their values; confirm the
# axis labels below describe the data as intended.
ggplot(long_dataframe,
       aes(x = fct_inorder(Fold), y = Value,
           group = Source, color = Exposure_Status)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "Gene Expression Across LOC Numbers",
       x = "Fold Change",
       y = "Expression Value") +
  scale_color_viridis_d() + # Optional: Use a viridis color scale for better visibility
  facet_wrap(~ GeneID, scales = "free_y") # Facet by LOC ID, allowing each plot to have its own y-axis scale
```
```{r}
# Same faceted plot as a six-column grid with smaller text.
# `Fold` is a character column, so ggplot would sort the x axis alphabetically
# (a name ending in "10" lands before one ending in "2"). fct_inorder() keeps
# the fold columns in the order they occur in the data.
ggplot(long_dataframe,
       aes(x = fct_inorder(Fold), y = Value,
           group = Source, color = Exposure_Status)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  theme(text = element_text(size = 8), # Adjust text size down
        axis.title = element_text(size = 10), # Adjust axis titles
        legend.title = element_text(size = 10)) + # Adjust legend title
  labs(title = "Gene Expression Across LOC Numbers",
       x = "Fold Change",
       y = "Expression Value") +
  scale_color_viridis_d() +
  facet_wrap(~ GeneID, scales = "free_y", ncol = 6)

 
```