---
title: "exon expression"
output: github_document
date: "2023-11-16"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{bash}
head -5 ../output/19-exon-expression/S12M-exon_expression.tab > ../output/19-exon-expression/HEAD5-S12M-exon_expression.tab
```

```{r}
library(readr)
library(dplyr)
library(tidyr)

# Load the data
data <- read_delim("../output/19-exon-expression/S12M-exon_expression.tab", "\t", escape_double = FALSE, col_types = cols(), trim_ws = TRUE)

# Reshape data to a long format for easier manipulation
data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression")

# Calculate the mean expression for each gene
mean_expressions <- data_long %>%
  group_by(gene_name) %>%
  summarize(mean_expression = mean(expression, na.rm = TRUE))

# Join the mean expressions back to the original data
data_long <- data_long %>%
  left_join(mean_expressions, by = "gene_name")

# Normalize the expression values
data_long <- data_long %>%
  mutate(normalized_expression = expression / mean_expression)

# Reshape data back to a wide format (optional)
data_normalized <- data_long %>%
  select(-expression, -mean_expression) %>%
  pivot_wider(names_from = exon, values_from = normalized_expression)

# Save the normalized data
write_csv(data_normalized, "../output/19-exon-expression/normalized_exon_expression.tab")

# (Optional) Print the first few rows of the normalized data
print(head(data_normalized))

```


```{r}
library(readr)
library(dplyr)
library(broom)
library(tidyr)
library(purrr)

# Load the data
data <- read_delim("../output/19-exon-expression/S12M-exon_expression.tab", "\t", escape_double = FALSE, col_types = cols(), trim_ws = TRUE)

# Reshape data for linear regression analysis
data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression")
data_long <- data_long %>% filter(!is.na(expression))

# Function to calculate the summary statistics for each gene
calculate_statistics <- function(df) {
  model <- lm(expression ~ as.numeric(gsub("exon_", "", exon)), data = df)
  slope <- coef(model)[2]
  r_squared <- summary(model)$r.squared
  predictions <- predict(model, interval = "confidence")
  df$within_ci <- df$expression >= predictions[, "lwr"] & df$expression <= predictions[, "upr"]
  percentage_within_ci <- mean(df$within_ci) * 100
  
  return(tibble(
    slope_best_fit = slope,
    r_squared = r_squared,
    percentage_within_95_CI = percentage_within_ci
  ))
}

# Apply function to each gene and calculate additional statistics
results <- data_long %>%
  group_by(gene_name) %>%
  nest() %>%
  mutate(
    statistics = map(data, calculate_statistics),
    num_exons = map_dbl(data, ~ nrow(.x)),
    mean_exon_expression = map_dbl(data, ~ mean(.x$expression, na.rm = TRUE)),
    std_dev_exon_expression = map_dbl(data, ~ sd(.x$expression, na.rm = TRUE)),
    sum_exon_expression = map_dbl(data, ~ sum(.x$expression, na.rm = TRUE))
  ) %>%
  select(-data) %>%
  unnest(statistics)

# Save the results
write_csv(results, "../output/64-exon-expression/exon-summary.csv")

# Print the results
print(results)

```

```{bash}
head ../output/19-exon-expression/normalized_exon_expression.tab
```

```{r}
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)

# Load the data
data <- read_delim("../output/19-exon-expression/normalized_exon_expression.tab", ",", escape_double = FALSE, col_types = cols(), trim_ws = TRUE)

# Reshape the data to a long format
data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression")

# Filter out groups with all NA values
data_long <- data_long %>% 
  group_by(gene_name) %>% 
  filter(sum(!is.na(expression)) > 0) %>% 
  ungroup()

# Fit linear models for each gene
models <- data_long %>% 
  group_by(gene_name) %>% 
  do(model = lm(expression ~ as.numeric(gsub("exon_", "", .$exon)), data = .))

# Create a function to predict values using the model
predict_values <- function(model, data) {
  if (is.null(model)) return(rep(NA, nrow(data)))
  exon_numbers <- as.numeric(gsub("exon_", "", data$exon))
  predict(model, newdata = data.frame(exon = exon_numbers))
}

# Add predictions to the original data
data_long <- data_long %>% 
  group_by(gene_name) %>% 
  mutate(prediction = predict_values(first(models$model), cur_data()))

# Plotting
ggplot(data_long, aes(x = as.numeric(gsub("exon_", "", exon)), y = expression, group = gene_name)) +
  geom_point(aes(color = gene_name)) +
  geom_line(aes(y = prediction, color = gene_name)) +
  theme_minimal() +
  labs(title = "Exon Expression with Best-Fit Lines for Each Gene",
       x = "Exon Number",
       y = "Expression") +
  theme(legend.position = "none") # Adjust or remove legend as needed
```


```{r}
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
# Load the data
data <- read_delim("../output/19-exon-expression/normalized_exon_expression.tab", ",", escape_double = FALSE, col_types = cols(), trim_ws = TRUE)

# Reshape the data to a long format
data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression")

# Convert exon names to numeric and filter for the first 10 exons
data_long <- data_long %>%
  mutate(exon_number = as.numeric(gsub("exon_", "", exon))) %>%
  filter(exon_number <= 10)

# Filter out groups with all NA values
data_long <- data_long %>% 
  group_by(gene_name) %>% 
  filter(sum(!is.na(expression)) > 0) %>% 
  ungroup()

# Fit linear models for each gene
models <- data_long %>% 
  group_by(gene_name) %>% 
  do(model = lm(expression ~ exon_number, data = .))

# Create a function to predict values using the model
predict_values <- function(model, data) {
  if (is.null(model)) return(rep(NA, nrow(data)))
  predict(model, newdata = data.frame(exon_number = data$exon_number))
}

# Add predictions to the original data
data_long <- data_long %>% 
  group_by(gene_name) %>% 
  mutate(prediction = predict_values(first(models$model), cur_data()))

# Plotting with jitter
ggplot(data_long, aes(x = exon_number, y = expression, group = gene_name)) +
  geom_jitter(aes(color = gene_name), width = 0.2, height = 0) +  # Add jitter here
  geom_line(aes(y = prediction, color = gene_name)) +
  theme_minimal() +
  labs(title = "Exon Expression with Best-Fit Lines for Each Gene (First 10 Exons)",
       x = "Exon Number",
       y = "Expression") +
  theme(legend.position = "none") # Adjust or remove legend as needed
```


```{r}
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(broom)

# Load the data
data <- read_delim("../output/19-exon-expression/normalized_exon_expression.tab", ",", escape_double = FALSE, col_types = cols(), trim_ws = TRUE)


# Reshape the data to a long format
data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression")

# Convert exon names to numeric and filter for the first 10 exons and expression range
data_long <- data_long %>%
  mutate(exon_number = as.numeric(gsub("exon_", "", exon))) %>%
  filter(exon_number <= 10, expression >= 0, expression <= 10)

# Filter out groups with all NA values
data_long <- data_long %>% 
  group_by(gene_name) %>% 
  filter(sum(!is.na(expression)) > 0) %>% 
  ungroup()

# Calculate the slope for each gene
slopes <- data_long %>% 
  group_by(gene_name) %>% 
  do(tidy(lm(expression ~ exon_number, data = .))) %>%
  filter(term == "exon_number") %>%
  mutate(slope_color = ifelse(estimate > 0, "blue", "red"))

# Join the slope information back to the data
data_long <- data_long %>%
  left_join(slopes, by = "gene_name")

# Plotting with colored lines based on slope
ggplot(data_long, aes(x = exon_number, y = expression, group = gene_name)) +
  geom_point(aes(color = gene_name), alpha = 0.02) + 
  geom_smooth(aes(color = slope_color), method = "lm", se = FALSE, size = 0.5, linetype = "solid", alpha = 0.5) +
  theme_minimal() +
  labs(title = "Exon Expression with Best-Fit Lines for Each Gene (Exon 0-10, Expression 0-10)",
       x = "Exon Number",
       y = "Expression") +
  theme(legend.position = "none") # Adjust or remove legend as needed
```

```{r}
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(purrr)

# Load the data
data <- read_delim("../output/19-exon-expression/normalized_exon_expression.tab", ",", escape_double = FALSE, col_types = cols(), trim_ws = TRUE)

# Reshape data for analysis
data_long <- pivot_longer(data, cols = ~gene_name, names_to = "exon", values_to = "norm_expression")
data_long <- data_long %>% 
  filter(!is.na(expression), expression < 4000, as.numeric(gsub("exon_", "", exon)) < 100)

# Calculate mean exon expression for each gene
mean_expressions <- data_long %>%
  group_by(gene_name) %>%
  summarize(mean_expression = mean(expression, na.rm = TRUE)) %>%
  filter(mean_expression >= 10)

# Filter out genes with mean expression less than 10
data_long_filtered <- data_long %>% 
  semi_join(mean_expressions, by = "gene_name")

# Function to get regression line data
get_regression_line <- function(df) {
  model <- lm(expression ~ as.numeric(gsub("exon_", "", exon)), data = df)
  tibble(
    exon = as.numeric(gsub("exon_", "", df$exon)),
    predicted_expression = predict(model, newdata = df)
  )
}

# Calculate regression lines for each gene
regression_lines <- data_long_filtered %>%
  group_by(gene_name) %>%
  nest() %>%
  mutate(regression_data = map(data, get_regression_line)) %>%
  select(-data) %>%
  unnest(regression_data)

# Plotting
ggplot() +
  geom_point(data = data_long_filtered, aes(x = as.numeric(gsub("exon_", "", exon)), y = expression, color = gene_name), alpha = 0.5) +
  geom_line(data = regression_lines, aes(x = exon, y = predicted_expression, color = gene_name), size = 1) +
  theme_minimal() +
  labs(title = "Exon Expression with Best-Fit Lines for Each Gene (Filtered Data)",
       x = "Exon Number",
       y = "Expression") +
  theme(legend.position = "none")  # Remove legend if too many genes

```


```{r}
# Specify the file location
file_location <- "../output/64-exon-expression/exon-summary.csv"

# Read data from the specified file
data <- read.csv(file_location, header = TRUE, sep = ",")

# Check the structure of the data
str(data)

# Create histograms for numeric variables
par(mfrow = c(2, 3))  # Arrange histograms in a 2x3 grid
hist(data$slope_best_fit, main = "Slope Best Fit", xlab = "Value", xlim = c(-200, 200), breaks = 2000)
hist(data$r_squared, main = "R-squared", xlab = "Value")
hist(data$num_exons, main = "Number of Exons", xlab = "Value", breaks = 100)
hist(data$mean_exon_expression, main = "Mean Exon Expression", xlab = "Value", breaks = 100, xlim = c(0, 1000))
hist(data$std_dev_exon_expression, main = "Std Dev Exon Expression", xlab = "Value", breaks = 100, xlim = c(0, 1000))
hist(data$sum_exon_expression, main = "Sum Exon Expression", xlab = "Value", breaks = 1000, xlim = c(0, 1000))

# Reset the plotting layout
par(mfrow = c(1, 1))

```

```{r}
# Specify the file location
file_location <- "../output/64-exon-expression/exon-summary.csv"
# Read data from the specified file
data <- read.csv(file_location, header = TRUE, sep = ",")

# Create a scatter plot for slope_best_fit with custom colors
plot(data$slope_best_fit, type = "p", pch = 16, 
     xlab = "Index", ylab = "Slope Best Fit", 
     main = "Scatter Plot of Slope Best Fit",
     col = ifelse(data$slope_best_fit > 1000 | data$slope_best_fit < -1000, "red", "blue"))

# Add a legend for the custom colors
legend("topright", legend = c("<= -1000 or >= 1000", "> -1000 and < 1000"), 
       col = c("red", "blue"), pch = 16)

```
```{r}
# Specify the file location
file_location <- "../output/64-exon-expression/exon-summary.csv"

# Read data from the specified file
data <- read.csv(file_location, header = TRUE, sep = ",")

# Filter the data based on conditions
filtered_data <- subset(data, r_squared > 0.6 & num_exons > 4)

# Now, filtered_data contains only the rows that meet the criteria.
```