--- title: "67 exon expression" author: Steven Roberts date: "`r format(Sys.time(), '%d %B, %Y')`" output: html_document: theme: readable highlight: zenburn toc: true toc_float: true number_sections: true code_folding: show code_download: true # github_document: # toc: true # toc_depth: 3 # number_sections: true # html_preview: true --- ```{r setup, include=FALSE} library(kableExtra) # library(DESeq2) # library(pheatmap) # library(RColorBrewer) # library(data.table) library(DT) # library(Biostrings) #library(methylKit) library(tidyverse) knitr::opts_chunk$set( echo = TRUE, # Display code chunks eval = TRUE, # Evaluate code chunks warning = TRUE, # Hide warnings message = TRUE, # Hide messages fig.width = 6, # Set plot width in inches fig.height = 4, # Set plot height in inches fig.align = "center" # Align plots to the center ) ``` We have fold exon expression for all samples ```{r, engine='bash', eval=TRUE} ls ../output/65-exon-coverage/S*fold.csv ls ../output/65-exon-coverage/S*fold.csv | wc -l ``` # Loading exon fold data ```{r} # Define the vector of file paths file_paths <- c( "../output/65-exon-coverage/S12M_exonsum_10_fold.csv", "../output/65-exon-coverage/S13M_exonsum_10_fold.csv", "../output/65-exon-coverage/S16F_exonsum_10_fold.csv", "../output/65-exon-coverage/S19F_exonsum_10_fold.csv", "../output/65-exon-coverage/S22F_exonsum_10_fold.csv", "../output/65-exon-coverage/S23M_exonsum_10_fold.csv", "../output/65-exon-coverage/S29F_exonsum_10_fold.csv", "../output/65-exon-coverage/S31M_exonsum_10_fold.csv", "../output/65-exon-coverage/S35F_exonsum_10_fold.csv", "../output/65-exon-coverage/S36F_exonsum_10_fold.csv", "../output/65-exon-coverage/S39F_exonsum_10_fold.csv", "../output/65-exon-coverage/S3F_exonsum_10_fold.csv", "../output/65-exon-coverage/S41F_exonsum_10_fold.csv", "../output/65-exon-coverage/S44F_exonsum_10_fold.csv", "../output/65-exon-coverage/S48M_exonsum_10_fold.csv", "../output/65-exon-coverage/S50F_exonsum_10_fold.csv", "../output/65-exon-coverage/S52F_exonsum_10_fold.csv", "../output/65-exon-coverage/S53F_exonsum_10_fold.csv", "../output/65-exon-coverage/S54F_exonsum_10_fold.csv", "../output/65-exon-coverage/S59M_exonsum_10_fold.csv", "../output/65-exon-coverage/S64M_exonsum_10_fold.csv", "../output/65-exon-coverage/S6M_exonsum_10_fold.csv", "../output/65-exon-coverage/S76F_exonsum_10_fold.csv", "../output/65-exon-coverage/S77F_exonsum_10_fold.csv", "../output/65-exon-coverage/S7M_exonsum_10_fold.csv", "../output/65-exon-coverage/S9M_exonsum_10_fold.csv" ) # Initialize an empty list to store the data frames data_frames <- list() # Loop through the file paths, read each file, and store it in the list with a named key for (file_path in file_paths) { # Extract a meaningful name from the file path, e.g., S12M, S13M, etc. name <- gsub(".*/|_exonsum_10_fold\\.csv$", "", file_path) # Read the file and assign it to the list with the name as the key data_frames[[name]] <- read.csv(file_path) } # At this point, `data_frames` is a list of data frames. # You can access each data frame by its name, for example: # data_frames$S12M # This will give you the data frame for S12M_exonsum_10_logfold.csv ``` ```{r} datatable(head(data_frames$S12M)) ``` # Metadata | Sample.ID | OldSample.ID | Treatment | Sex | TreatmentN | Parent.ID | |-----------|--------------|-----------|-----|------------|-----------| | 12M | S12M | Exposed | M | 3 | EM05 | | 13M | S13M | Control | M | 1 | CM04 | | 16F | S16F | Control | F | 2 | CF05 | | 19F | S19F | Control | F | 2 | CF08 | | 22F | S22F | Exposed | F | 4 | EF02 | | 23M | S23M | Exposed | M | 3 | EM04 | | 29F | S29F | Exposed | F | 4 | EF07 | | 31M | S31M | Exposed | M | 3 | EM06 | | 35F | S35F | Exposed | F | 4 | EF08 | | 36F | S36F | Exposed | F | 4 | EF05 | | 39F | S39F | Control | F | 2 | CF06 | | 3F | S3F | Exposed | F | 4 | EF06 | | 41F | S41F | Exposed | F | 4 | EF03 | | 44F | S44F | Control | F | 2 | CF03 | | 48M | S48M | Exposed | M | 3 | EM03 | | 50F | S50F | Exposed | F | 4 | EF01 | | 52F | S52F | Control | F | 2 | CF07 | | 53F | S53F | Control | F | 2 | CF02 | | 54F | S54F | Control | F | 2 | CF01 | | 59M | S59M | Exposed | M | 3 | EM01 | | 64M | S64M | Control | M | 1 | CM05 | | 6M | S6M | Control | M | 1 | CM02 | | 76F | S76F | Control | F | 2 | CF04 | | 77F | S77F | Exposed | F | 4 | EF04 | | 7M | S7M | Control | M | 1 | CM01 | | 9M | S9M | Exposed | M | 3 | EM02 | ```{r assign-variables} all <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M", "S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M") controls <- c("S13M", "S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S64M", "S6M", "S76F", "S7M") exposed <- c("S12M", "S22F", "S23M", "S29F", "S31M", "S35F", "S36F", "S3F", "S41F", "S48M", "S50F", "S59M", "S77F", "S9M") females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F", "S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F") males <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M","S13M", "S64M", "S6M", "S7M") controls_males <- c("S13M", "S64M", "S6M", "S7M") exposed_males <- c("S12M", "S23M", "S31M", "S48M", "S59M", "S9M") controls_females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F") exposed_females <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F") ``` # female controls ``` controls_females <- c("S16F", "S19F", "S39F", "S44F", "S52F", "S53F", "S54F", "S76F") ``` ```{r} fc <- data_frames$S16F %>% left_join(data_frames$S19F, by = "GeneID") %>% left_join(data_frames$S39F, by = "GeneID") %>% left_join(data_frames$S44F, by = "GeneID") %>% left_join(data_frames$S52F, by = "GeneID") %>% left_join(data_frames$S53F, by = "GeneID") %>% left_join(data_frames$S54F, by = "GeneID") %>% left_join(data_frames$S76F, by = "GeneID") %>% dplyr::select(GeneID, starts_with("f")) %>% mutate(fold2_avg = rowMeans(select(.,starts_with("fold2")), na.rm = TRUE)) %>% mutate(fold3_avg = rowMeans(select(.,starts_with("fold3")), na.rm = TRUE)) %>% mutate(fold4_avg = rowMeans(select(.,starts_with("fold4")), na.rm = TRUE)) %>% mutate(fold5_avg = rowMeans(select(.,starts_with("fold5")), na.rm = TRUE)) %>% mutate(fold6_avg = rowMeans(select(.,starts_with("fold6")), na.rm = TRUE)) %>% select(GeneID, ends_with("avg")) ``` ```{r} # Function to calculate slope and R^2 for a row calculate_slope_r2 <- function(row) { # Define x-values (fold numbers) x <- 2:6 # Select y-values and remove rows with NA in them for both x and y y <- as.numeric(row) valid_indices <- !is.na(y) x <- x[valid_indices] y <- y[valid_indices] # Check if we have enough data points after removing NAs if (length(y) < 2) { return(c(NA, NA)) # Return NA if not enough data points to calculate slope and R^2 } # Perform linear regression model <- lm(y ~ x) # Extract slope and R^2 slope <- coef(model)[["x"]] r_squared <- summary(model)$r.squared # Return slope and R^2 c(slope = slope, r_squared = r_squared) } # Apply the function to each row of df and create new columns for slope and R^2 fc <- fc %>% rowwise() %>% mutate(results = list(calculate_slope_r2(c_across(fold2_avg:fold6_avg)))) %>% ungroup() %>% mutate(slope = map_dbl(results, 1), r_squared = map_dbl(results, 2)) %>% select(-results) # View the updated data frame print(df) ``` ```{r} fc <- na.omit(fc) ``` ```{r} ggplot(fc, aes(x = slope)) + geom_histogram(binwidth = .002, fill = "blue", color = "black") + labs(title = "Histogram of slope", x = "slope", y = "Frequency") + theme_minimal() ``` ```{r} ggplot(fc, aes(x = r_squared)) + geom_histogram(binwidth = .02, fill = "blue", color = "black") + labs(title = "Histogram of r_squared", x = "r_squared", y = "Frequency") + theme_minimal() ``` # Loadding log fold Lets also get log fold change data and see if that impacts r2 and slope ```{r} # Define the vector of file paths logfile_paths <- c( "../output/65-exon-coverage/S12M_exonsum_10_logfold.csv", "../output/65-exon-coverage/S13M_exonsum_10_logfold.csv", "../output/65-exon-coverage/S16F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S19F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S22F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S23M_exonsum_10_logfold.csv", "../output/65-exon-coverage/S29F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S31M_exonsum_10_logfold.csv", "../output/65-exon-coverage/S35F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S36F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S39F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S3F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S41F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S44F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S48M_exonsum_10_logfold.csv", "../output/65-exon-coverage/S50F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S52F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S53F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S54F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S59M_exonsum_10_logfold.csv", "../output/65-exon-coverage/S64M_exonsum_10_logfold.csv", "../output/65-exon-coverage/S6M_exonsum_10_logfold.csv", "../output/65-exon-coverage/S76F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S77F_exonsum_10_logfold.csv", "../output/65-exon-coverage/S7M_exonsum_10_logfold.csv", "../output/65-exon-coverage/S9M_exonsum_10_logfold.csv" ) # Initialize an empty list to store the data frames logdata_frames <- list() # Loop through the file paths, read each file, and store it in the list with a named key for (file_path in logfile_paths) { # Extract a meaningful name from the file path, e.g., S12M, S13M, etc. name <- gsub(".*/|_exonsum_10_logfold\\.csv$", "", file_path) # Read the file and assign it to the list with the name as the key logdata_frames[[name]] <- read.csv(file_path) } # At this point, `data_frames` is a list of data frames. # You can access each data frame by its name, for example: # data_frames$S12M # This will give you the data frame for S12M_exonsum_10_fold.csv ``` ```{r} datatable(head(logdata_frames$S12M)) ``` # log female controls ```{r} logfc <- logdata_frames$S16F %>% left_join(logdata_frames$S19F, by = "GeneID") %>% left_join(logdata_frames$S39F, by = "GeneID") %>% left_join(logdata_frames$S44F, by = "GeneID") %>% left_join(logdata_frames$S52F, by = "GeneID") %>% left_join(logdata_frames$S53F, by = "GeneID") %>% left_join(logdata_frames$S54F, by = "GeneID") %>% left_join(logdata_frames$S76F, by = "GeneID") %>% dplyr::select(GeneID, starts_with("f")) %>% mutate(fold2_avg = rowMeans(select(.,starts_with("fold2")), na.rm = TRUE)) %>% mutate(fold3_avg = rowMeans(select(.,starts_with("fold3")), na.rm = TRUE)) %>% mutate(fold4_avg = rowMeans(select(.,starts_with("fold4")), na.rm = TRUE)) %>% mutate(fold5_avg = rowMeans(select(.,starts_with("fold5")), na.rm = TRUE)) %>% mutate(fold6_avg = rowMeans(select(.,starts_with("fold6")), na.rm = TRUE)) %>% dplyr::select(GeneID, ends_with("avg")) ``` ```{r} datatable(head(logfc)) ``` calculating slope R2 ```{r} # Function to calculate slope and R^2 for a row calculate_slope_r2 <- function(row) { # Define x-values (fold numbers) x <- 2:6 # Select y-values and remove rows with NA in them for both x and y y <- as.numeric(row) valid_indices <- !is.na(y) x <- x[valid_indices] y <- y[valid_indices] # Check if we have enough data points after removing NAs if (length(y) < 2) { return(c(NA, NA)) # Return NA if not enough data points to calculate slope and R^2 } # Perform linear regression model <- lm(y ~ x) # Extract slope and R^2 slope <- coef(model)[["x"]] r_squared <- summary(model)$r.squared # Return slope and R^2 c(slope = slope, r_squared = r_squared) } # Apply the function to each row of df and create new columns for slope and R^2 logfc <- logfc %>% rowwise() %>% mutate(results = list(calculate_slope_r2(c_across(fold2_avg:fold6_avg)))) %>% ungroup() %>% mutate(slope = map_dbl(results, 1), r_squared = map_dbl(results, 2)) %>% select(-results) # View the updated data frame print(df) ``` ```{r} datatable(head(logfc)) ``` ```{r} logfc <- na.omit(logfc) ``` ```{r} datatable(head(logfc)) ``` ```{r} ggplot(logfc, aes(x = slope)) + geom_histogram(binwidth = .05, fill = "blue", color = "black") + labs(title = "Histogram of column_name", x = "slope", y = "Frequency") + theme_minimal() ``` ```{r} ggplot(logfc, aes(x = r_squared)) + geom_histogram(binwidth = .02, fill = "blue", color = "black") + labs(title = "Histogram of r_squared", x = "r_squared", y = "Frequency") + theme_minimal() ``` ```{r} # Assuming `df` is your dataframe with 13,000 rows and 6 or more columns # Function to fit a polynomial model to specific columns of a row and return coefficients fit_poly <- function(row) { # Select only columns 2 through 6 from the row row_selected <- row[2:6] # Create a simple sequence of x values for the selected columns x <- 1:5 # Fit a polynomial model to the selected part of the row fit <- lm(row_selected ~ poly(x, 2, raw = TRUE)) # Return the coefficients of the model return(coef(fit)) } # Apply this function to each row of the dataframe results <- t(apply(logfc, 1, fit_poly)) # Convert the results to a dataframe for easier manipulation results_df <- as.data.frame(results) ``` ```{r} datatable(head(results_df)) ``` # log female exposed ``` exposed_females <- c("S22F", "S29F", "S35F", "S36F", "S3F", "S41F", "S50F", "S77F") ``` ```{r} logfe <- logdata_frames$S22F %>% left_join(logdata_frames$S29F, by = "GeneID") %>% left_join(logdata_frames$S35F, by = "GeneID") %>% left_join(logdata_frames$S36F, by = "GeneID") %>% left_join(logdata_frames$S3F, by = "GeneID") %>% left_join(logdata_frames$S41F, by = "GeneID") %>% left_join(logdata_frames$S50F, by = "GeneID") %>% left_join(logdata_frames$S77F, by = "GeneID") %>% dplyr::select(GeneID, starts_with("f")) %>% mutate(fold2_avg = rowMeans(select(.,starts_with("fold2")), na.rm = TRUE)) %>% mutate(fold3_avg = rowMeans(select(.,starts_with("fold3")), na.rm = TRUE)) %>% mutate(fold4_avg = rowMeans(select(.,starts_with("fold4")), na.rm = TRUE)) %>% mutate(fold5_avg = rowMeans(select(.,starts_with("fold5")), na.rm = TRUE)) %>% mutate(fold6_avg = rowMeans(select(.,starts_with("fold6")), na.rm = TRUE)) %>% dplyr::select(GeneID, ends_with("avg")) ``` ```{r} datatable(head(logfe)) ``` calculating slope R2 ```{r} # Function to calculate slope and R^2 for a row calculate_slope_r2 <- function(row) { # Define x-values (fold numbers) x <- 2:6 # Select y-values and remove rows with NA in them for both x and y y <- as.numeric(row) valid_indices <- !is.na(y) x <- x[valid_indices] y <- y[valid_indices] # Check if we have enough data points after removing NAs if (length(y) < 2) { return(c(NA, NA)) # Return NA if not enough data points to calculate slope and R^2 } # Perform linear regression model <- lm(y ~ x) # Extract slope and R^2 slope <- coef(model)[["x"]] r_squared <- summary(model)$r.squared # Return slope and R^2 c(slope = slope, r_squared = r_squared) } # Apply the function to each row of df and create new columns for slope and R^2 logfe <- logfe %>% rowwise() %>% mutate(results = list(calculate_slope_r2(c_across(fold2_avg:fold6_avg)))) %>% ungroup() %>% mutate(slope = map_dbl(results, 1), r_squared = map_dbl(results, 2)) %>% select(-results) # View the updated data frame print(df) ``` ```{r} datatable(head(logfe)) ``` ```{r} logfe <- na.omit(logfe) ``` ```{r} datatable(head(logfe)) ``` ```{r} ggplot(logfe, aes(x = slope)) + geom_histogram(binwidth = .05, fill = "blue", color = "black") + labs(title = "Histogram of column_name", x = "slope", y = "Frequency") + theme_minimal() ``` ```{r} ggplot(logfe, aes(x = r_squared)) + geom_histogram(binwidth = .02, fill = "blue", color = "black") + labs(title = "Histogram of r_squared", x = "r_squared", y = "Frequency") + theme_minimal() ``` # joining female con exposed ```{r} female <- logfe %>% left_join(logfc, by = "GeneID") %>% mutate(slope_diff = abs(slope.x-slope.y)) ``` ```{r} datatable(head(female)) ```