--- title: "41-exon-gbm" author: "Auto-generated" date: "`r Sys.Date()`" output: html_document: toc: true toc_float: true number_sections: true code_folding: show --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, fig.width = 10, fig.height = 7) library(dplyr) library(readr) library(tidyr) library(ggplot2) library(here) ``` # Overview This notebook plots the coefficient of variation (CV) of exon-level expression against gene body methylation (GBM) for each of three coral species: - **Acropora pulchra** (Apul) - **Porites evermanni** (Peve) - **Pocillopora tuahiniensis** (Ptua) The analysis explores the relationship between transcriptional variability at the exon level and epigenetic regulation via DNA methylation. # Data Paths ```{r paths} # Exon summary files (from 40-exon-count-matrix.Rmd) exon_summary_dir <- here("M-multi-species/output/40-exon-count-matrix") apul_exon_summary_path <- file.path(exon_summary_dir, "apul-gene_summary_by_ortholog.csv") peve_exon_summary_path <- file.path(exon_summary_dir, "peve-gene_summary_by_ortholog.csv") ptua_exon_summary_path <- file.path(exon_summary_dir, "ptua-gene_summary_by_ortholog.csv") # Gene body methylation files (75% coverage threshold) apul_gbm_path <- here("D-Apul/output/40-Apul-Gene-Methylation/Apul-gene-methylation_75pct.tsv") peve_gbm_path <- here("E-Peve/output/15-Peve-Gene-Methylation/Peve-gene-methylation_75pct.tsv") ptua_gbm_path <- here("F-Ptua/output/09-Ptua-Gene-Methylation/Ptua-gene-methylation_75pct.tsv") # Output directory output_dir <- here("M-multi-species/output/41-exon-gbm") if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) ``` # Load Data ## Exon expression summaries ```{r load-exon-summaries} apul_exon_summary <- read_csv(apul_exon_summary_path, show_col_types = FALSE) peve_exon_summary <- read_csv(peve_exon_summary_path, show_col_types = FALSE) ptua_exon_summary <- read_csv(ptua_exon_summary_path, show_col_types = FALSE) cat("Apul gene summary:", nrow(apul_exon_summary), "genes\n") cat("Peve gene summary:", nrow(peve_exon_summary), "genes\n") cat("Ptua gene summary:", nrow(ptua_exon_summary), "genes\n") ``` ## Gene body methylation ```{r load-gbm} apul_gbm <- read_tsv(apul_gbm_path, show_col_types = FALSE) peve_gbm <- read_tsv(peve_gbm_path, show_col_types = FALSE) ptua_gbm <- read_tsv(ptua_gbm_path, show_col_types = FALSE) cat("Apul GBM:", nrow(apul_gbm), "genes\n") cat("Peve GBM:", nrow(peve_gbm), "genes\n") cat("Ptua GBM:", nrow(ptua_gbm), "genes\n") ``` # Compute Mean GBM per Gene Average gene body methylation across all samples for each gene. ```{r compute-mean-gbm} compute_mean_gbm <- function(gbm_df) { # First column is gene_id, rest are sample columns sample_cols <- setdiff(names(gbm_df), "gene_id") gbm_df %>% rowwise() %>% mutate(mean_gbm = mean(c_across(all_of(sample_cols)), na.rm = TRUE)) %>% ungroup() %>% select(gene_id, mean_gbm) } apul_mean_gbm <- compute_mean_gbm(apul_gbm) peve_mean_gbm <- compute_mean_gbm(peve_gbm) ptua_mean_gbm <- compute_mean_gbm(ptua_gbm) head(apul_mean_gbm) ``` # Merge CV and GBM Data Join exon expression CV with gene body methylation for each species. Filter to include only genes with more than 5 exons. ```{r merge-data} # Apul: gene_id in exon summary is like "FUN_000185", GBM has same format apul_merged <- apul_exon_summary %>% filter(n_exons > 5) %>% inner_join(apul_mean_gbm, by = "gene_id") %>% filter(!is.na(cv_expr_gene), !is.na(mean_gbm)) %>% mutate(species = "Acropora pulchra") # Peve: gene_id in exon summary is like "gene-Peve_00000039", GBM has same format peve_merged <- peve_exon_summary %>% filter(n_exons > 5) %>% inner_join(peve_mean_gbm, by = "gene_id") %>% filter(!is.na(cv_expr_gene), !is.na(mean_gbm)) %>% mutate(species = "Porites evermanni") # Ptua: gene_id in exon summary is like "gene-Pocillopora_meandrina_HIv1___RNAseq.g20902.t1", GBM has same format ptua_merged <- ptua_exon_summary %>% filter(n_exons > 5) %>% inner_join(ptua_mean_gbm, by = "gene_id") %>% filter(!is.na(cv_expr_gene), !is.na(mean_gbm)) %>% mutate(species = "Pocillopora tuahiniensis") cat("Apul merged:", nrow(apul_merged), "genes with >5 exons, CV, and GBM\n") cat("Peve merged:", nrow(peve_merged), "genes with >5 exons, CV, and GBM\n") cat("Ptua merged:", nrow(ptua_merged), "genes with >5 exons, CV, and GBM\n") ``` # Combined Dataset ```{r combine-data} all_species <- bind_rows(apul_merged, peve_merged, ptua_merged) # Summary all_species %>% group_by(species) %>% summarise( n_genes = n(), mean_cv = mean(cv_expr_gene, na.rm = TRUE), median_cv = median(cv_expr_gene, na.rm = TRUE), mean_gbm = mean(mean_gbm, na.rm = TRUE), median_gbm = median(mean_gbm, na.rm = TRUE) ) ``` # Plots ## Exon CV vs Gene Body Methylation - All Species ```{r plot-cv-gbm-all, fig.width=12, fig.height=5} ggplot(all_species, aes(x = mean_gbm, y = cv_expr_gene)) + geom_point(alpha = 0.3, size = 0.8) + geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) + facet_wrap(~species, scales = "free") + labs( x = "Mean Gene Body Methylation (%)", y = "Coefficient of Variation (Exon Expression)", title = "Exon Expression CV vs Gene Body Methylation", subtitle = "Each point represents one gene" ) + theme_minimal() + theme( strip.text = element_text(face = "bold", size = 12), axis.title = element_text(size = 11) ) ggsave(file.path(output_dir, "cv_vs_gbm_all_species.png"), width = 12, height = 5, dpi = 300) ``` ## Individual Species Plots with Density ### Acropora pulchra ```{r plot-apul, fig.width=8, fig.height=6} ggplot(apul_merged, aes(x = mean_gbm, y = cv_expr_gene)) + geom_hex(bins = 50) + scale_fill_viridis_c(option = "plasma", trans = "log10") + geom_smooth(method = "lm", se = TRUE, color = "white", linewidth = 1.2) + labs( x = "Mean Gene Body Methylation (%)", y = "Coefficient of Variation (Exon Expression)", title = "Acropora pulchra: Exon CV vs Gene Body Methylation", fill = "Gene\nCount" ) + theme_minimal() + theme( plot.title = element_text(face = "bold", size = 14), legend.position = "right" ) ggsave(file.path(output_dir, "cv_vs_gbm_apul.png"), width = 8, height = 6, dpi = 300) ``` ### Porites evermanni ```{r plot-peve, fig.width=8, fig.height=6} ggplot(peve_merged, aes(x = mean_gbm, y = cv_expr_gene)) + geom_hex(bins = 50) + scale_fill_viridis_c(option = "plasma", trans = "log10") + geom_smooth(method = "lm", se = TRUE, color = "white", linewidth = 1.2) + labs( x = "Mean Gene Body Methylation (%)", y = "Coefficient of Variation (Exon Expression)", title = "Porites evermanni: Exon CV vs Gene Body Methylation", fill = "Gene\nCount" ) + theme_minimal() + theme( plot.title = element_text(face = "bold", size = 14), legend.position = "right" ) ggsave(file.path(output_dir, "cv_vs_gbm_peve.png"), width = 8, height = 6, dpi = 300) ``` ### Pocillopora tuahiniensis ```{r plot-ptua, fig.width=8, fig.height=6} ggplot(ptua_merged, aes(x = mean_gbm, y = cv_expr_gene)) + geom_hex(bins = 50) + scale_fill_viridis_c(option = "plasma", trans = "log10") + geom_smooth(method = "lm", se = TRUE, color = "white", linewidth = 1.2) + labs( x = "Mean Gene Body Methylation (%)", y = "Coefficient of Variation (Exon Expression)", title = "Pocillopora tuahiniensis: Exon CV vs Gene Body Methylation", fill = "Gene\nCount" ) + theme_minimal() + theme( plot.title = element_text(face = "bold", size = 14), legend.position = "right" ) ggsave(file.path(output_dir, "cv_vs_gbm_ptua.png"), width = 8, height = 6, dpi = 300) ``` ## Combined Overlay Plot ```{r plot-overlay, fig.width=10, fig.height=7} species_colors <- c( "Acropora pulchra" = "#E69F00", "Porites evermanni" = "#56B4E9", "Pocillopora tuahiniensis" = "#009E73" ) ggplot(all_species, aes(x = mean_gbm, y = cv_expr_gene, color = species)) + geom_point(alpha = 0.2, size = 0.5) + geom_smooth(method = "lm", se = TRUE, linewidth = 1.5) + scale_color_manual(values = species_colors) + labs( x = "Mean Gene Body Methylation (%)", y = "Coefficient of Variation (Exon Expression)", title = "Exon Expression CV vs Gene Body Methylation", subtitle = "Comparison across three coral species", color = "Species" ) + theme_minimal() + theme( plot.title = element_text(face = "bold", size = 14), legend.position = "bottom", legend.text = element_text(face = "italic") ) ggsave(file.path(output_dir, "cv_vs_gbm_overlay.png"), width = 10, height = 7, dpi = 300) ``` # Statistical Analysis ## Correlation Tests ```{r correlation-tests} # Pearson correlation for each species cat("=== Pearson Correlation: CV vs GBM ===\n\n") apul_cor <- cor.test(apul_merged$cv_expr_gene, apul_merged$mean_gbm) cat("Acropora pulchra:\n") cat(" r =", round(apul_cor$estimate, 4), "\n") cat(" p-value =", format(apul_cor$p.value, scientific = TRUE, digits = 3), "\n") cat(" n =", nrow(apul_merged), "\n\n") peve_cor <- cor.test(peve_merged$cv_expr_gene, peve_merged$mean_gbm) cat("Porites evermanni:\n") cat(" r =", round(peve_cor$estimate, 4), "\n") cat(" p-value =", format(peve_cor$p.value, scientific = TRUE, digits = 3), "\n") cat(" n =", nrow(peve_merged), "\n\n") ptua_cor <- cor.test(ptua_merged$cv_expr_gene, ptua_merged$mean_gbm) cat("Pocillopora tuahiniensis:\n") cat(" r =", round(ptua_cor$estimate, 4), "\n") cat(" p-value =", format(ptua_cor$p.value, scientific = TRUE, digits = 3), "\n") cat(" n =", nrow(ptua_merged), "\n") ``` ## Linear Models ```{r linear-models} cat("=== Linear Regression: CV ~ GBM ===\n\n") apul_lm <- lm(cv_expr_gene ~ mean_gbm, data = apul_merged) cat("Acropora pulchra:\n") print(summary(apul_lm)$coefficients) cat(" R² =", round(summary(apul_lm)$r.squared, 4), "\n\n") peve_lm <- lm(cv_expr_gene ~ mean_gbm, data = peve_merged) cat("Porites evermanni:\n") print(summary(peve_lm)$coefficients) cat(" R² =", round(summary(peve_lm)$r.squared, 4), "\n\n") ptua_lm <- lm(cv_expr_gene ~ mean_gbm, data = ptua_merged) cat("Pocillopora tuahiniensis:\n") print(summary(ptua_lm)$coefficients) cat(" R² =", round(summary(ptua_lm)$r.squared, 4), "\n") ``` # Export Combined Data ```{r export} write_csv(all_species, file.path(output_dir, "exon_cv_gbm_all_species.csv")) write_csv(apul_merged, file.path(output_dir, "exon_cv_gbm_apul.csv")) write_csv(peve_merged, file.path(output_dir, "exon_cv_gbm_peve.csv")) write_csv(ptua_merged, file.path(output_dir, "exon_cv_gbm_ptua.csv")) cat("\nExported data to:", output_dir, "\n") ``` # Session Info ```{r session-info} sessionInfo() ```