--- title: "exon expression" output: github_document date: "2023-11-16" --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{bash} head -5 ../output/19-exon-expression/S12M-exon_expression.tab > ../output/19-exon-expression/HEAD5-S12M-exon_expression.tab ``` ```{r} library(readr) library(dplyr) library(tidyr) # Load the data data <- read_delim("../output/19-exon-expression/S12M-exon_expression.tab", "\t", escape_double = FALSE, col_types = cols(), trim_ws = TRUE) # Reshape data to a long format for easier manipulation data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression") # Calculate the mean expression for each gene mean_expressions <- data_long %>% group_by(gene_name) %>% summarize(mean_expression = mean(expression, na.rm = TRUE)) # Join the mean expressions back to the original data data_long <- data_long %>% left_join(mean_expressions, by = "gene_name") # Normalize the expression values data_long <- data_long %>% mutate(normalized_expression = expression / mean_expression) # Reshape data back to a wide format (optional) data_normalized <- data_long %>% select(-expression, -mean_expression) %>% pivot_wider(names_from = exon, values_from = normalized_expression) # Save the normalized data write_csv(data_normalized, "../output/19-exon-expression/normalized_exon_expression.tab") # (Optional) Print the first few rows of the normalized data print(head(data_normalized)) ``` ```{r} library(readr) library(dplyr) library(broom) library(tidyr) library(purrr) # Load the data data <- read_delim("../output/19-exon-expression/S12M-exon_expression.tab", "\t", escape_double = FALSE, col_types = cols(), trim_ws = TRUE) # Reshape data for linear regression analysis data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression") data_long <- data_long %>% filter(!is.na(expression)) # Function to calculate the summary statistics for each gene calculate_statistics <- function(df) { model <- lm(expression ~ as.numeric(gsub("exon_", "", exon)), data = df) slope <- coef(model)[2] r_squared <- summary(model)$r.squared predictions <- predict(model, interval = "confidence") df$within_ci <- df$expression >= predictions[, "lwr"] & df$expression <= predictions[, "upr"] percentage_within_ci <- mean(df$within_ci) * 100 return(tibble( slope_best_fit = slope, r_squared = r_squared, percentage_within_95_CI = percentage_within_ci )) } # Apply function to each gene and calculate additional statistics results <- data_long %>% group_by(gene_name) %>% nest() %>% mutate( statistics = map(data, calculate_statistics), num_exons = map_dbl(data, ~ nrow(.x)), mean_exon_expression = map_dbl(data, ~ mean(.x$expression, na.rm = TRUE)), std_dev_exon_expression = map_dbl(data, ~ sd(.x$expression, na.rm = TRUE)), sum_exon_expression = map_dbl(data, ~ sum(.x$expression, na.rm = TRUE)) ) %>% select(-data) %>% unnest(statistics) # Save the results write_csv(results, "../output/64-exon-expression/exon-summary.csv") # Print the results print(results) ``` ```{bash} head ../output/19-exon-expression/normalized_exon_expression.tab ``` ```{r} library(readr) library(dplyr) library(tidyr) library(ggplot2) # Load the data data <- read_delim("../output/19-exon-expression/normalized_exon_expression.tab", ",", escape_double = FALSE, col_types = cols(), trim_ws = TRUE) # Reshape the data to a long format data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression") # Filter out groups with all NA values data_long <- data_long %>% group_by(gene_name) %>% filter(sum(!is.na(expression)) > 0) %>% ungroup() # Fit linear models for each gene models <- data_long %>% group_by(gene_name) %>% do(model = lm(expression ~ as.numeric(gsub("exon_", "", .$exon)), data = .)) # Create a function to predict values using the model predict_values <- function(model, data) { if (is.null(model)) return(rep(NA, nrow(data))) exon_numbers <- as.numeric(gsub("exon_", "", data$exon)) predict(model, newdata = data.frame(exon = exon_numbers)) } # Add predictions to the original data data_long <- data_long %>% group_by(gene_name) %>% mutate(prediction = predict_values(first(models$model), cur_data())) # Plotting ggplot(data_long, aes(x = as.numeric(gsub("exon_", "", exon)), y = expression, group = gene_name)) + geom_point(aes(color = gene_name)) + geom_line(aes(y = prediction, color = gene_name)) + theme_minimal() + labs(title = "Exon Expression with Best-Fit Lines for Each Gene", x = "Exon Number", y = "Expression") + theme(legend.position = "none") # Adjust or remove legend as needed ``` ```{r} library(readr) library(dplyr) library(tidyr) library(ggplot2) # Load the data data <- read_delim("../output/19-exon-expression/normalized_exon_expression.tab", ",", escape_double = FALSE, col_types = cols(), trim_ws = TRUE) # Reshape the data to a long format data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression") # Convert exon names to numeric and filter for the first 10 exons data_long <- data_long %>% mutate(exon_number = as.numeric(gsub("exon_", "", exon))) %>% filter(exon_number <= 10) # Filter out groups with all NA values data_long <- data_long %>% group_by(gene_name) %>% filter(sum(!is.na(expression)) > 0) %>% ungroup() # Fit linear models for each gene models <- data_long %>% group_by(gene_name) %>% do(model = lm(expression ~ exon_number, data = .)) # Create a function to predict values using the model predict_values <- function(model, data) { if (is.null(model)) return(rep(NA, nrow(data))) predict(model, newdata = data.frame(exon_number = data$exon_number)) } # Add predictions to the original data data_long <- data_long %>% group_by(gene_name) %>% mutate(prediction = predict_values(first(models$model), cur_data())) # Plotting with jitter ggplot(data_long, aes(x = exon_number, y = expression, group = gene_name)) + geom_jitter(aes(color = gene_name), width = 0.2, height = 0) + # Add jitter here geom_line(aes(y = prediction, color = gene_name)) + theme_minimal() + labs(title = "Exon Expression with Best-Fit Lines for Each Gene (First 10 Exons)", x = "Exon Number", y = "Expression") + theme(legend.position = "none") # Adjust or remove legend as needed ``` ```{r} library(readr) library(dplyr) library(tidyr) library(ggplot2) library(broom) # Load the data data <- read_delim("../output/19-exon-expression/normalized_exon_expression.tab", ",", escape_double = FALSE, col_types = cols(), trim_ws = TRUE) # Reshape the data to a long format data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression") # Convert exon names to numeric and filter for the first 10 exons and expression range data_long <- data_long %>% mutate(exon_number = as.numeric(gsub("exon_", "", exon))) %>% filter(exon_number <= 10, expression >= 0, expression <= 10) # Filter out groups with all NA values data_long <- data_long %>% group_by(gene_name) %>% filter(sum(!is.na(expression)) > 0) %>% ungroup() # Calculate the slope for each gene slopes <- data_long %>% group_by(gene_name) %>% do(tidy(lm(expression ~ exon_number, data = .))) %>% filter(term == "exon_number") %>% mutate(slope_color = ifelse(estimate > 0, "blue", "red")) # Join the slope information back to the data data_long <- data_long %>% left_join(slopes, by = "gene_name") # Plotting with colored lines based on slope ggplot(data_long, aes(x = exon_number, y = expression, group = gene_name)) + geom_point(aes(color = gene_name), alpha = 0.02) + geom_smooth(aes(color = slope_color), method = "lm", se = FALSE, size = 0.5, linetype = "solid", alpha = 0.5) + theme_minimal() + labs(title = "Exon Expression with Best-Fit Lines for Each Gene (Exon 0-10, Expression 0-10)", x = "Exon Number", y = "Expression") + theme(legend.position = "none") # Adjust or remove legend as needed ``` ```{r} library(readr) library(dplyr) library(tidyr) library(ggplot2) library(purrr) # Load the data data <- read_delim("../output/19-exon-expression/normalized_exon_expression.tab", ",", escape_double = FALSE, col_types = cols(), trim_ws = TRUE) # Reshape data for analysis data_long <- pivot_longer(data, cols = ~gene_name, names_to = "exon", values_to = "norm_expression") data_long <- data_long %>% filter(!is.na(expression), expression < 4000, as.numeric(gsub("exon_", "", exon)) < 100) # Calculate mean exon expression for each gene mean_expressions <- data_long %>% group_by(gene_name) %>% summarize(mean_expression = mean(expression, na.rm = TRUE)) %>% filter(mean_expression >= 10) # Filter out genes with mean expression less than 10 data_long_filtered <- data_long %>% semi_join(mean_expressions, by = "gene_name") # Function to get regression line data get_regression_line <- function(df) { model <- lm(expression ~ as.numeric(gsub("exon_", "", exon)), data = df) tibble( exon = as.numeric(gsub("exon_", "", df$exon)), predicted_expression = predict(model, newdata = df) ) } # Calculate regression lines for each gene regression_lines <- data_long_filtered %>% group_by(gene_name) %>% nest() %>% mutate(regression_data = map(data, get_regression_line)) %>% select(-data) %>% unnest(regression_data) # Plotting ggplot() + geom_point(data = data_long_filtered, aes(x = as.numeric(gsub("exon_", "", exon)), y = expression, color = gene_name), alpha = 0.5) + geom_line(data = regression_lines, aes(x = exon, y = predicted_expression, color = gene_name), size = 1) + theme_minimal() + labs(title = "Exon Expression with Best-Fit Lines for Each Gene (Filtered Data)", x = "Exon Number", y = "Expression") + theme(legend.position = "none") # Remove legend if too many genes ``` ```{r} # Specify the file location file_location <- "../output/64-exon-expression/exon-summary.csv" # Read data from the specified file data <- read.csv(file_location, header = TRUE, sep = ",") # Check the structure of the data str(data) # Create histograms for numeric variables par(mfrow = c(2, 3)) # Arrange histograms in a 2x3 grid hist(data$slope_best_fit, main = "Slope Best Fit", xlab = "Value", xlim = c(-200, 200), breaks = 2000) hist(data$r_squared, main = "R-squared", xlab = "Value") hist(data$num_exons, main = "Number of Exons", xlab = "Value", breaks = 100) hist(data$mean_exon_expression, main = "Mean Exon Expression", xlab = "Value", breaks = 100, xlim = c(0, 1000)) hist(data$std_dev_exon_expression, main = "Std Dev Exon Expression", xlab = "Value", breaks = 100, xlim = c(0, 1000)) hist(data$sum_exon_expression, main = "Sum Exon Expression", xlab = "Value", breaks = 1000, xlim = c(0, 1000)) # Reset the plotting layout par(mfrow = c(1, 1)) ``` ```{r} # Specify the file location file_location <- "../output/64-exon-expression/exon-summary.csv" # Read data from the specified file data <- read.csv(file_location, header = TRUE, sep = ",") # Create a scatter plot for slope_best_fit with custom colors plot(data$slope_best_fit, type = "p", pch = 16, xlab = "Index", ylab = "Slope Best Fit", main = "Scatter Plot of Slope Best Fit", col = ifelse(data$slope_best_fit > 1000 | data$slope_best_fit < -1000, "red", "blue")) # Add a legend for the custom colors legend("topright", legend = c("<= -1000 or >= 1000", "> -1000 and < 1000"), col = c("red", "blue"), pch = 16) ``` ```{r} # Specify the file location file_location <- "../output/64-exon-expression/exon-summary.csv" # Read data from the specified file data <- read.csv(file_location, header = TRUE, sep = ",") # Filter the data based on conditions filtered_data <- subset(data, r_squared > 0.6 & num_exons > 4) # Now, filtered_data contains only the rows that meet the criteria. ```