exon expression

2023-11-16

head -5 ../output/19-exon-expression/S12M-exon_expression.tab > ../output/19-exon-expression/HEAD5-S12M-exon_expression.tab

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(broom)
library(tidyr)
library(purrr)

# Load the data
data <- read_delim("../output/19-exon-expression/S12M-exon_expression.tab", "\t", escape_double = FALSE, col_types = cols(), trim_ws = TRUE)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# Reshape data for linear regression analysis
data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression")
data_long <- data_long %>% filter(!is.na(expression))

# Function to calculate the summary statistics for each gene
calculate_statistics <- function(df) {
  model <- lm(expression ~ as.numeric(gsub("exon_", "", exon)), data = df)
  slope <- coef(model)[2]
  r_squared <- summary(model)$r.squared
  predictions <- predict(model, interval = "confidence")
  df$within_ci <- df$expression >= predictions[, "lwr"] & df$expression <= predictions[, "upr"]
  percentage_within_ci <- mean(df$within_ci) * 100
  
  return(tibble(
    slope_best_fit = slope,
    r_squared = r_squared,
    percentage_within_95_CI = percentage_within_ci
  ))
}

# Apply function to each gene and calculate additional statistics
results <- data_long %>%
  group_by(gene_name) %>%
  nest() %>%
  mutate(
    statistics = map(data, calculate_statistics),
    num_exons = map_dbl(data, ~ nrow(.x)),
    mean_exon_expression = map_dbl(data, ~ mean(.x$expression, na.rm = TRUE)),
    std_dev_exon_expression = map_dbl(data, ~ sd(.x$expression, na.rm = TRUE)),
    sum_exon_expression = map_dbl(data, ~ sum(.x$expression, na.rm = TRUE))
  ) %>%
  select(-data) %>%
  unnest(statistics)

## Warning: There were 6151 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `statistics = map(data, calculate_statistics)`.
## ℹ In group 1: `gene_name = "gene-ATP6"`.
## Caused by warning in `qt()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 6150 remaining warnings.

# Save the results
write_csv(results, "../output/64-exon-expression/exon-summary.csv")

# Print the results
print(results)

## # A tibble: 38,838 × 8
## # Groups:   gene_name [38,838]
##    gene_name         slope_best_fit r_squared percentage_within_95_CI num_exons
##    <chr>                      <dbl>     <dbl>                   <dbl>     <dbl>
##  1 gene-LOC111116054         -60.5    0.907                     100           3
##  2 gene-LOC111126949          -2.00   0.00230                   100           5
##  3 gene-LOC111110729         -82.1    0.759                     100           4
##  4 gene-LOC111112434          49.5    0.868                     100           3
##  5 gene-LOC111120752           1.4    0.00586                   100           5
##  6 gene-LOC111128944          NA      0                          NA           1
##  7 gene-LOC111128953         -87.0    1                          NA           2
##  8 gene-LOC111105691          10.1    0.0825                     87.5         8
##  9 gene-LOC111105685         150      1                          NA           2
## 10 gene-LOC111105702          -0.2    0.100                     100           4
## # ℹ 38,828 more rows
## # ℹ 3 more variables: mean_exon_expression <dbl>,
## #   std_dev_exon_expression <dbl>, sum_exon_expression <dbl>

library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(purrr)

# Load the data
data <- read_delim("../output/19-exon-expression/S12M-exon_expression.tab", "\t", escape_double = FALSE, col_types = cols(), trim_ws = TRUE)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# Reshape data for analysis
data_long <- pivot_longer(data, cols = -gene_name, names_to = "exon", values_to = "expression")
data_long <- data_long %>% 
  filter(!is.na(expression), expression < 4000, as.numeric(gsub("exon_", "", exon)) < 100)

# Calculate mean exon expression for each gene
mean_expressions <- data_long %>%
  group_by(gene_name) %>%
  summarize(mean_expression = mean(expression, na.rm = TRUE)) %>%
  filter(mean_expression >= 10)

# Filter out genes with mean expression less than 10
data_long_filtered <- data_long %>% 
  semi_join(mean_expressions, by = "gene_name")

# Function to get regression line data
get_regression_line <- function(df) {
  model <- lm(expression ~ as.numeric(gsub("exon_", "", exon)), data = df)
  tibble(
    exon = as.numeric(gsub("exon_", "", df$exon)),
    predicted_expression = predict(model, newdata = df)
  )
}

# Calculate regression lines for each gene
regression_lines <- data_long_filtered %>%
  group_by(gene_name) %>%
  nest() %>%
  mutate(regression_data = map(data, get_regression_line)) %>%
  select(-data) %>%
  unnest(regression_data)

## Warning: There were 1966 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `regression_data = map(data, get_regression_line)`.
## ℹ In group 1: `gene_name = "gene-ATP6"`.
## Caused by warning in `predict.lm()`:
## ! prediction from a rank-deficient fit may be misleading
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1965 remaining warnings.

# Plotting
ggplot() +
  geom_point(data = data_long_filtered, aes(x = as.numeric(gsub("exon_", "", exon)), y = expression, color = gene_name), alpha = 0.5) +
  geom_line(data = regression_lines, aes(x = exon, y = predicted_expression, color = gene_name), size = 1) +
  theme_minimal() +
  labs(title = "Exon Expression with Best-Fit Lines for Each Gene (Filtered Data)",
       x = "Exon Number",
       y = "Expression") +
  theme(legend.position = "none")  # Remove legend if too many genes

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Specify the file location
file_location <- "../output/64-exon-expression/exon-summary.csv"

# Read data from the specified file
data <- read.csv(file_location, header = TRUE, sep = ",")

# Check the structure of the data
str(data)

## 'data.frame':    38838 obs. of  8 variables:
##  $ gene_name              : chr  "gene-LOC111116054" "gene-LOC111126949" "gene-LOC111110729" "gene-LOC111112434" ...
##  $ slope_best_fit         : num  -60.5 -2 -82.1 49.5 1.4 ...
##  $ r_squared              : num  0.90728 0.0023 0.75891 0.86755 0.00586 ...
##  $ percentage_within_95_CI: num  100 100 100 100 100 NA NA 87.5 NA 100 ...
##  $ num_exons              : int  3 5 4 3 5 1 2 8 2 4 ...
##  $ mean_exon_expression   : num  80.7 78.8 102.8 62.3 60 ...
##  $ std_dev_exon_expression: num  63.5 65.9 121.7 53.1 28.9 ...
##  $ sum_exon_expression    : int  242 394 411 187 300 416 299 493 196 8 ...

# Create histograms for numeric variables
par(mfrow = c(2, 3))  # Arrange histograms in a 2x3 grid
hist(data$slope_best_fit, main = "Slope Best Fit", xlab = "Value", xlim = c(-200, 200), breaks = 2000)
hist(data$r_squared, main = "R-squared", xlab = "Value")
hist(data$num_exons, main = "Number of Exons", xlab = "Value", breaks = 100)
hist(data$mean_exon_expression, main = "Mean Exon Expression", xlab = "Value", breaks = 100, xlim = c(0, 1000))
hist(data$std_dev_exon_expression, main = "Std Dev Exon Expression", xlab = "Value", breaks = 100, xlim = c(0, 1000))
hist(data$sum_exon_expression, main = "Sum Exon Expression", xlab = "Value", breaks = 1000, xlim = c(0, 1000))

# Reset the plotting layout
par(mfrow = c(1, 1))

# Specify the file location
file_location <- "../output/64-exon-expression/exon-summary.csv"
# Read data from the specified file
data <- read.csv(file_location, header = TRUE, sep = ",")

# Create a scatter plot for slope_best_fit with custom colors
plot(data$slope_best_fit, type = "p", pch = 16, 
     xlab = "Index", ylab = "Slope Best Fit", 
     main = "Scatter Plot of Slope Best Fit",
     col = ifelse(data$slope_best_fit > 1000 | data$slope_best_fit < -1000, "red", "blue"))

# Add a legend for the custom colors
legend("topright", legend = c("<= -1000 or >= 1000", "> -1000 and < 1000"), 
       col = c("red", "blue"), pch = 16)

# Specify the file location
file_location <- "../output/64-exon-expression/exon-summary.csv"

# Read data from the specified file
data <- read.csv(file_location, header = TRUE, sep = ",")

# Filter the data based on conditions
filtered_data <- subset(data, r_squared > 0.6 & num_exons > 4)

# Now, filtered_data contains only the rows that meet the criteria.