---
title: "71 fm exon ind"
author: Steven Roberts
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    theme: readable
    highlight: zenburn
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  # github_document:
  #   toc: true
  #   toc_depth: 3
  #   number_sections: true
  #   html_preview: true
---

```{r setup, include=FALSE}
#library(kableExtra)
# library(DESeq2)
# library(pheatmap)
# library(RColorBrewer)
# library(data.table)
#library(DT)
# library(Biostrings)
#library(methylKit)
library(tidyverse)

knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = TRUE,         # Evaluate code chunks
  warning = TRUE,      # Show warnings
  message = TRUE,      # Show messages
  fig.width = 6,       # Set plot width in inches
  fig.height = 4,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)
```

We have fold exon expression for all samples.

# Metadata

| Sample.ID | OldSample.ID | Treatment | Sex | TreatmentN | Parent.ID |
|-----------|--------------|-----------|-----|------------|-----------|
| 12M | S12M | Exposed | M | 3 | EM05 |
| 13M | S13M | Control | M | 1 | CM04 |
| 16F | S16F | Control | F | 2 | CF05 |
| 19F | S19F | Control | F | 2 | CF08 |
| 22F | S22F | Exposed | F | 4 | EF02 |
| 23M | S23M | Exposed | M | 3 | EM04 |
| 29F | S29F | Exposed | F | 4 | EF07 |
| 31M | S31M | Exposed | M | 3 | EM06 |
| 35F | S35F | Exposed | F | 4 | EF08 |
| 36F | S36F | Exposed | F | 4 | EF05 |
| 39F | S39F | Control | F | 2 | CF06 |
| 3F | S3F | Exposed | F | 4 | EF06 |
| 41F | S41F | Exposed | F | 4 | EF03 |
| 44F | S44F | Control | F | 2 | CF03 |
| 48M | S48M | Exposed | M | 3 | EM03 |
| 50F | S50F | Exposed | F | 4 | EF01 |
| 52F | S52F | Control | F | 2 | CF07 |
| 53F | S53F | Control | F | 2 | CF02 |
| 54F | S54F | Control | F | 2 | CF01 |
| 59M | S59M | Exposed | M | 3 | EM01 |
| 64M | S64M | Control | M | 1 | CM05 |
| 6M | S6M | Control | M | 1 | CM02 |
| 76F | S76F | Control | F | 2 | CF04 |
| 77F | S77F | Exposed | F | 4 | EF04 |
| 7M | S7M | Control | M | 1 | CM01 |
| 9M | S9M | Exposed | M | 3 | EM02 |

```
all <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M",
         "S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M")

controls <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M")

exposed <- c("S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M")

females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F",
             "S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F")

males <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M","S13M", "S64M", "S6M", "S7M")

controls_males <- c("S13M", "S64M", "S6M", "S7M")

exposed_males <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M")

controls_females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F")

exposed_females <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F")
```

# Loading log fold

```{r}
# Sample IDs, in the order their log-fold files are loaded.
logfold_sample_ids <- c(
  "S12M", "S13M", "S16F", "S19F", "S22F", "S23M", "S29F", "S31M", "S35F",
  "S36F", "S39F", "S3F", "S41F", "S44F", "S48M", "S50F", "S52F", "S53F",
  "S54F", "S59M", "S64M", "S6M", "S76F", "S77F", "S7M", "S9M"
)

# One file per sample: ../output/65-exon-coverage/<ID>_exonsum_10_logfold.csv
logfile_paths <- file.path(
  "../output/65-exon-coverage",
  paste0(logfold_sample_ids, "_exonsum_10_logfold.csv")
)

# Fail early, listing every missing input, instead of stopping at the first
# unreadable file part-way through the load.
missing_logfiles <- logfile_paths[!file.exists(logfile_paths)]
if (length(missing_logfiles) > 0) {
  stop(
    "Missing log-fold files: ",
    paste(missing_logfiles, collapse = ", "),
    call. = FALSE
  )
}

# Read every file into a list keyed by sample ID.
logdata_frames <- setNames(lapply(logfile_paths, read.csv), logfold_sample_ids)

# `logdata_frames` is a named list of data frames; access one by sample ID,
# e.g. logdata_frames$S12M holds the contents of S12M_exonsum_10_logfold.csv.
```

```{r}
head(logdata_frames$S12M)
```

```{r}
# Female sample groups used to label exposure status in the plots below.
controls_females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F")
exposed_females <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F")

#' Keep one gene's rows from a sample table and tag them with sample info.
#'
#' @param sample_df Data frame for one sample; must contain a `GeneID` column.
#' @param sample_id Sample name, stored in the new `Source` column.
#' @param gene_id `GeneID` value whose rows are kept.
#' @param controls Sample IDs labelled "control".
#' @param exposed Sample IDs labelled "exposed".
#' @return The rows of `sample_df` where `GeneID == gene_id`, with two added
#'   columns: `Source` (the sample ID) and `Exposure_Status` ("control",
#'   "exposed", or `NA` when the sample is in neither group).
label_gene_rows <- function(sample_df, sample_id, gene_id,
                            controls = controls_females,
                            exposed = exposed_females) {
  exposure_status <- if (sample_id %in% controls) {
    "control"
  } else if (sample_id %in% exposed) {
    "exposed"
  } else {
    NA_character_
  }

  # `.env$` forces lookup of the function arguments, so a data column that
  # happens to share one of these names cannot shadow them.
  sample_df %>%
    filter(GeneID == .env$gene_id) %>%
    mutate(
      Source = .env$sample_id,
      Exposure_Status = .env$exposure_status
    )
}

# NOTE(review): `logdata_frames` contains male samples too, but only the female
# groups are classified here, so male samples get Exposure_Status = NA.
# Confirm this is intended.

# One filtered, labelled data frame per sample, keyed by sample ID.
filtered_logdata_frames <- imap(
  logdata_frames,
  function(sample_df, sample_id) {
    label_gene_rows(sample_df, sample_id, "LOC111126804")
  }
)

# Merge all the labelled data frames into a single data frame
merged_dataframe <- bind_rows(filtered_logdata_frames)
```

```{r}
# Reshape the data from wide to long format: one row per sample x fold column
long_dataframe <- pivot_longer(
  merged_dataframe,
  cols = starts_with("fold"),
  names_to = "Fold",
  values_to = "Value"
)

# Plot the data
ggplot(long_dataframe, aes(x = Fold, y = Value, group = Source, color = Exposure_Status)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "GOI", x = "Fold", y = "Value") +
  scale_color_viridis_d() # Optional: Use a viridis color scale for better visibility
```

```{r}
# Genes of interest (LOC identifiers matched against the GeneID column)
loc_numbers <- c(
  "LOC111099772", "LOC111100989", "LOC111102631", "LOC111104769",
  "LOC111106458", "LOC111108202", "LOC111110841", "LOC111112400",
  "LOC111113283", "LOC111114474", "LOC111117013", "LOC111117739",
  "LOC111119268", "LOC111119697", "LOC111120203", "LOC111121088",
  "LOC111121543", "LOC111121944", "LOC111122118", "LOC111125917",
  "LOC111126146", "LOC111126693", "LOC111127043", "LOC111129029",
  "LOC111129868", "LOC111130078", "LOC111130115", "LOC111130425",
  "LOC111130633", "LOC111130837", "LOC111132656", "LOC111132880",
  "LOC111133439", "LOC111134707", "LOC111136541", "LOC111137437"
)

# Every sample x gene combination. expand.grid() varies its first column
# fastest, so rows run through all samples for the first gene, then all
# samples for the second gene, and so on.
gene_sample_grid <- expand.grid(
  sample_id = names(logdata_frames),
  gene_id = loc_numbers,
  stringsAsFactors = FALSE
)

# One filtered, labelled data frame per combination, keyed "<sample>_<gene>".
filtered_logdata_frames <- setNames(
  map2(
    gene_sample_grid$sample_id,
    gene_sample_grid$gene_id,
    function(sample_id, gene_id) {
      label_gene_rows(logdata_frames[[sample_id]], sample_id, gene_id)
    }
  ),
  paste(gene_sample_grid$sample_id, gene_sample_grid$gene_id, sep = "_")
)

# Merge all the labelled data frames into a single data frame
merged_dataframe <- bind_rows(filtered_logdata_frames)

# Reshape the data from wide to long format: one row per sample x gene x fold column
long_dataframe <- pivot_longer(
  merged_dataframe,
  cols = starts_with("fold"),
  names_to = "Fold",
  values_to = "Value"
)

# Plot the data with one facet per gene; `GeneID` is carried through from the
# input tables and identifies the LOC.
ggplot(long_dataframe, aes(x = Fold, y = Value, group = Source, color = Exposure_Status)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "Gene Expression Across LOC Numbers", x = "Fold Change", y = "Expression Value") +
  scale_color_viridis_d() + # Optional: Use a viridis color scale for better visibility
  facet_wrap(~ GeneID, scales = "free_y") # Facet by LOC ID, allowing each plot to have its own y-axis scale
```

```{r}
# Same faceted plot with smaller text and a fixed six-column layout
ggplot(long_dataframe, aes(x = Fold, y = Value, group = Source, color = Exposure_Status)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  theme(text = element_text(size = 8), # Adjust text size down
        axis.title = element_text(size = 10), # Adjust axis titles
        legend.title = element_text(size = 10)) + # Adjust legend title
  labs(title = "Gene Expression Across LOC Numbers", x = "Fold Change", y = "Expression Value") +
  scale_color_viridis_d() +
  facet_wrap(~ GeneID, scales = "free_y", ncol = 6)
```