--- title: "45-exon-mCpG" author: "Auto-generated" date: "`r Sys.Date()`" output: html_document: toc: true toc_float: true number_sections: true code_folding: show --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = TRUE, warning = TRUE) library(dplyr) library(readr) library(tidyr) library(stringr) library(ggplot2) library(here) ``` # Overview This notebook examines the relationship between local CpG methylation and exon-level expression. For each exon, we identify CpG sites that fall within the exon coordinates and compute correlations between methylation levels and exon expression across samples. ## Data Sources ### Exon Expression Matrices - Apul: `https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries-molecular-calcification/M-multi-species/output/40-exon-count-matrix/apul-exon_gene_count_matrix.csv` - Peve: `https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries-molecular-calcification/M-multi-species/output/40-exon-count-matrix/peve-exon_gene_count_matrix.csv` - Ptua: `https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries-molecular-calcification/M-multi-species/output/40-exon-count-matrix/ptua-exon_gene_count_matrix.csv` ### mCpG Methylation Data - Apul: `https://gannet.fish.washington.edu/metacarcinus/E5/20250903_meth_Apul/merged-WGBS-CpG-counts_filtered_n20.csv` - Peve: `https://gannet.fish.washington.edu/metacarcinus/E5/Pevermanni/20250821_meth_Peve/merged-WGBS-CpG-counts_filtered_n20.csv` - Ptua: `https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250821_meth_Ptua/merged-WGBS-CpG-counts_filtered_n20.csv` ## CpG ID Parsing CpG IDs encode chromosome and position: - **Apul**: `CpG_ntLink_0_90500` → chr = `ntLink_0`, pos = `90500` - **Peve**: `CpG_Porites_evermani_scaffold_1000_100536` → chr = `Porites_evermani_scaffold_1000`, pos = `100536` - **Ptua**: `CpG_Pocillopora_meandrina_HIv1___Sc0000000_1000005` → chr = `Pocillopora_meandrina_HIv1___Sc0000000`, pos = `1000005` ```{r output-dir} output_dir <- here("M-multi-species/output/45-exon-mCpG") if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) ``` # Helper Functions ## Parse CpG ID to extract chromosome and position ```{r helper-parse-cpg} # Parse CpG IDs to extract chromosome and position # Pattern: CpG_{chromosome}_{position} where position is the last numeric segment parse_cpg_id <- function(cpg_id) { # Remove "CpG_" prefix stripped <- str_remove(cpg_id, "^CpG_") # The position is the last underscore-separated segment (always numeric) # The chromosome is everything before that parts <- str_match(stripped, "^(.+)_([0-9]+)$") tibble( cpg_id = cpg_id, cpg_chr = parts[, 2], cpg_pos = as.numeric(parts[, 3]) ) } # Vectorized version for efficiency parse_cpg_ids <- function(cpg_ids) { stripped <- str_remove(cpg_ids, "^CpG_") parts <- str_match(stripped, "^(.+)_([0-9]+)$") tibble( cpg_id = cpg_ids, cpg_chr = parts[, 2], cpg_pos = as.numeric(parts[, 3]) ) } ``` ## Find CpGs within exon coordinates ```{r helper-find-cpgs-in-exons} # For each exon, find CpG sites that fall within its coordinates # Returns a tibble mapping exon_id to cpg_ids find_cpgs_in_exons <- function(exon_coords, cpg_coords) { # exon_coords: tibble with gene_id, e_id, chr, start, end # cpg_coords: tibble with cpg_id, cpg_chr, cpg_pos # Join on chromosome and filter by position within exon exon_cpg_map <- exon_coords %>% inner_join(cpg_coords, by = c("chr" = "cpg_chr")) %>% filter(cpg_pos >= start & cpg_pos <= end) %>% select(gene_id, e_id, chr, start, end, cpg_id, cpg_pos) exon_cpg_map } ``` ## Compute correlation between exon expression and CpG methylation ```{r helper-compute-correlation} # Compute correlation between exon expression and CpG methylation # for matched samples compute_exon_cpg_correlation <- function(exon_expr, cpg_meth, exon_cpg_map, sample_cols) { # exon_expr: exon expression matrix with sample columns # cpg_meth: CpG methylation matrix with sample columns # exon_cpg_map: mapping of exons to CpGs within them # sample_cols: vector of sample column names present in both datasets results <- list() # Get unique exon-CpG pairs pairs <- exon_cpg_map %>% distinct(gene_id, e_id, cpg_id, chr, start, end, cpg_pos) if (nrow(pairs) == 0) { return(tibble()) } # For each pair, compute correlation for (i in seq_len(nrow(pairs))) { pair <- pairs[i, ] # Get exon expression values exon_row <- exon_expr %>% filter(gene_id == pair$gene_id, e_id == pair$e_id) if (nrow(exon_row) == 0) next # Get CpG methylation values cpg_row <- cpg_meth %>% filter(CpG == pair$cpg_id) if (nrow(cpg_row) == 0) next # Extract values for common samples exon_vals <- as.numeric(exon_row[1, sample_cols]) cpg_vals <- as.numeric(cpg_row[1, sample_cols]) # Remove pairs with NA valid_idx <- !is.na(exon_vals) & !is.na(cpg_vals) exon_vals <- exon_vals[valid_idx] cpg_vals <- cpg_vals[valid_idx] # Need at least 5 samples for meaningful correlation if (length(exon_vals) < 5) next # Compute correlation cor_result <- tryCatch({ cor.test(exon_vals, cpg_vals, method = "spearman") }, error = function(e) NULL) if (!is.null(cor_result)) { results[[length(results) + 1]] <- tibble( gene_id = pair$gene_id, e_id = pair$e_id, chr = pair$chr, exon_start = pair$start, exon_end = pair$end, cpg_id = pair$cpg_id, cpg_pos = pair$cpg_pos, n_samples = length(exon_vals), correlation = cor_result$estimate, p_value = cor_result$p.value ) } } if (length(results) == 0) { return(tibble()) } bind_rows(results) } ``` ## Faster vectorized correlation computation ```{r helper-compute-correlation-fast} # Faster version using matrix operations compute_exon_cpg_correlation_fast <- function(exon_expr, cpg_meth, exon_cpg_map, sample_cols) { # Get unique exon-CpG pairs pairs <- exon_cpg_map %>% distinct(gene_id, e_id, cpg_id, chr, start, end, cpg_pos) if (nrow(pairs) == 0) { return(tibble()) } # Create unique exon key pairs <- pairs %>% mutate(exon_key = paste(gene_id, e_id, sep = "_")) exon_expr <- exon_expr %>% mutate(exon_key = paste(gene_id, e_id, sep = "_")) results <- vector("list", nrow(pairs)) for (i in seq_len(nrow(pairs))) { pair <- pairs[i, ] # Get exon expression values exon_row <- exon_expr %>% filter(exon_key == pair$exon_key) if (nrow(exon_row) == 0) next # Get CpG methylation values cpg_row <- cpg_meth %>% filter(CpG == pair$cpg_id) if (nrow(cpg_row) == 0) next # Extract values for common samples exon_vals <- as.numeric(exon_row[1, sample_cols]) cpg_vals <- as.numeric(cpg_row[1, sample_cols]) # Remove pairs with NA valid_idx <- !is.na(exon_vals) & !is.na(cpg_vals) n_valid <- sum(valid_idx) # Need at least 5 samples for meaningful correlation if (n_valid < 5) next exon_vals_clean <- exon_vals[valid_idx] cpg_vals_clean <- cpg_vals[valid_idx] # Compute Spearman correlation cor_result <- tryCatch({ cor.test(exon_vals_clean, cpg_vals_clean, method = "spearman") }, error = function(e) NULL) if (!is.null(cor_result)) { results[[i]] <- tibble( gene_id = pair$gene_id, e_id = pair$e_id, chr = pair$chr, exon_start = pair$start, exon_end = pair$end, cpg_id = pair$cpg_id, cpg_pos = pair$cpg_pos, n_samples = n_valid, correlation = as.numeric(cor_result$estimate), p_value = cor_result$p.value ) } } bind_rows(results) } ``` # Acropora pulchra (Apul) ## Load Data ```{r load-apul-data} # Load exon expression matrix apul_exon <- read_csv( "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries-molecular-calcification/M-multi-species/output/40-exon-count-matrix/apul-exon_gene_count_matrix.csv", show_col_types = FALSE ) # Load CpG methylation data apul_cpg <- read_csv( "https://gannet.fish.washington.edu/metacarcinus/E5/20250903_meth_Apul/merged-WGBS-CpG-counts_filtered_n20.csv", show_col_types = FALSE ) cat("Apul exon matrix:", nrow(apul_exon), "exons x", ncol(apul_exon) - 6, "samples\n") cat("Apul CpG matrix:", nrow(apul_cpg), "CpGs x", ncol(apul_cpg) - 1, "samples\n") ``` ## Parse CpG Coordinates ```{r parse-apul-cpg} # Parse CpG IDs to get chromosome and position apul_cpg_coords <- parse_cpg_ids(apul_cpg$CpG) # Show distribution by chromosome apul_cpg_coords %>% count(cpg_chr, name = "n_cpgs") %>% arrange(desc(n_cpgs)) %>% head(20) ``` ## Find CpGs Within Exons ```{r find-apul-cpgs-in-exons} # Extract exon coordinates apul_exon_coords <- apul_exon %>% select(gene_id, e_id, chr, strand, start, end) # Find CpGs that fall within exons apul_exon_cpg_map <- find_cpgs_in_exons(apul_exon_coords, apul_cpg_coords) cat("Found", nrow(apul_exon_cpg_map), "exon-CpG pairs\n") cat("Unique exons with CpGs:", n_distinct(paste(apul_exon_cpg_map$gene_id, apul_exon_cpg_map$e_id)), "\n") cat("Unique CpGs in exons:", n_distinct(apul_exon_cpg_map$cpg_id), "\n") # Summary: CpGs per exon apul_exon_cpg_map %>% count(gene_id, e_id, name = "n_cpgs") %>% summarise( mean_cpgs_per_exon = mean(n_cpgs), median_cpgs_per_exon = median(n_cpgs), max_cpgs_per_exon = max(n_cpgs) ) ``` ## Identify Common Samples ```{r apul-common-samples} # Get sample columns from exon data (columns 7 onwards) apul_exon_samples <- colnames(apul_exon)[7:ncol(apul_exon)] # Get sample columns from CpG data (columns 2 onwards) apul_cpg_samples <- colnames(apul_cpg)[2:ncol(apul_cpg)] # Find common samples apul_common_samples <- intersect(apul_exon_samples, apul_cpg_samples) cat("Exon samples:", length(apul_exon_samples), "\n") cat("CpG samples:", length(apul_cpg_samples), "\n") cat("Common samples:", length(apul_common_samples), "\n") ``` ## Compute Correlations ```{r compute-apul-correlations} # Compute correlations between exon expression and CpG methylation apul_correlations <- compute_exon_cpg_correlation_fast( exon_expr = apul_exon, cpg_meth = apul_cpg, exon_cpg_map = apul_exon_cpg_map, sample_cols = apul_common_samples ) cat("Computed", nrow(apul_correlations), "correlations\n") # Add adjusted p-values if (nrow(apul_correlations) > 0) { apul_correlations <- apul_correlations %>% mutate(p_adj = p.adjust(p_value, method = "BH")) } ``` ## Summarize Results ```{r summarize-apul-results} if (nrow(apul_correlations) > 0) { # Summary statistics cat("\n=== Apul Correlation Summary ===\n") cat("Total exon-CpG pairs tested:", nrow(apul_correlations), "\n") cat("Significant (p < 0.05):", sum(apul_correlations$p_value < 0.05), "\n") cat("Significant (FDR < 0.1):", sum(apul_correlations$p_adj < 0.1), "\n") # Direction of correlations cat("\nCorrelation direction:\n") cat("Positive correlations:", sum(apul_correlations$correlation > 0), "\n") cat("Negative correlations:", sum(apul_correlations$correlation < 0), "\n") # Significant correlations breakdown sig <- apul_correlations %>% filter(p_adj < 0.1) if (nrow(sig) > 0) { cat("\nSignificant correlations (FDR < 0.1):\n") cat("Positive:", sum(sig$correlation > 0), "\n") cat("Negative:", sum(sig$correlation < 0), "\n") } # Distribution of correlation coefficients print(summary(apul_correlations$correlation)) # Save results write_csv(apul_correlations, file.path(output_dir, "apul_exon_cpg_correlations.csv")) } ``` ## Visualization ```{r plot-apul-correlations, fig.width=10, fig.height=6} if (nrow(apul_correlations) > 0) { # Histogram of correlations p1 <- ggplot(apul_correlations, aes(x = correlation)) + geom_histogram(bins = 50, fill = "steelblue", color = "white") + geom_vline(xintercept = 0, linetype = "dashed", color = "red") + labs( title = "Apul: Distribution of Exon-CpG Correlations", x = "Spearman Correlation", y = "Count" ) + theme_minimal() print(p1) # Volcano plot p2 <- ggplot(apul_correlations, aes(x = correlation, y = -log10(p_value))) + geom_point(aes(color = p_adj < 0.1), alpha = 0.5, size = 1) + scale_color_manual(values = c("grey50", "red"), name = "FDR < 0.1") + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "blue") + labs( title = "Apul: Volcano Plot of Exon-CpG Correlations", x = "Spearman Correlation", y = "-log10(p-value)" ) + theme_minimal() print(p2) } ``` ## Top Correlations ```{r apul-top-correlations} if (nrow(apul_correlations) > 0) { # Top positive correlations cat("\n=== Top 20 Positive Correlations (Apul) ===\n") apul_correlations %>% filter(correlation > 0) %>% arrange(p_adj, desc(correlation)) %>% head(20) %>% print() # Top negative correlations cat("\n=== Top 20 Negative Correlations (Apul) ===\n") apul_correlations %>% filter(correlation < 0) %>% arrange(p_adj, correlation) %>% head(20) %>% print() } ``` # Porites evermanni (Peve) ## Load Data ```{r load-peve-data} # Load exon expression matrix peve_exon <- read_csv( "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries-molecular-calcification/M-multi-species/output/40-exon-count-matrix/peve-exon_gene_count_matrix.csv", show_col_types = FALSE ) # Load CpG methylation data peve_cpg <- read_csv( "https://gannet.fish.washington.edu/metacarcinus/E5/Pevermanni/20250821_meth_Peve/merged-WGBS-CpG-counts_filtered_n20.csv", show_col_types = FALSE ) cat("Peve exon matrix:", nrow(peve_exon), "exons x", ncol(peve_exon) - 6, "samples\n") cat("Peve CpG matrix:", nrow(peve_cpg), "CpGs x", ncol(peve_cpg) - 1, "samples\n") ``` ## Parse CpG Coordinates ```{r parse-peve-cpg} # Parse CpG IDs to get chromosome and position peve_cpg_coords <- parse_cpg_ids(peve_cpg$CpG) # Show distribution by chromosome peve_cpg_coords %>% count(cpg_chr, name = "n_cpgs") %>% arrange(desc(n_cpgs)) %>% head(20) ``` ## Find CpGs Within Exons ```{r find-peve-cpgs-in-exons} # Extract exon coordinates peve_exon_coords <- peve_exon %>% select(gene_id, e_id, chr, strand, start, end) # Find CpGs that fall within exons peve_exon_cpg_map <- find_cpgs_in_exons(peve_exon_coords, peve_cpg_coords) cat("Found", nrow(peve_exon_cpg_map), "exon-CpG pairs\n") cat("Unique exons with CpGs:", n_distinct(paste(peve_exon_cpg_map$gene_id, peve_exon_cpg_map$e_id)), "\n") cat("Unique CpGs in exons:", n_distinct(peve_exon_cpg_map$cpg_id), "\n") # Summary: CpGs per exon peve_exon_cpg_map %>% count(gene_id, e_id, name = "n_cpgs") %>% summarise( mean_cpgs_per_exon = mean(n_cpgs), median_cpgs_per_exon = median(n_cpgs), max_cpgs_per_exon = max(n_cpgs) ) ``` ## Identify Common Samples ```{r peve-common-samples} # Get sample columns from exon data (columns 7 onwards) peve_exon_samples <- colnames(peve_exon)[7:ncol(peve_exon)] # Get sample columns from CpG data (columns 2 onwards) peve_cpg_samples <- colnames(peve_cpg)[2:ncol(peve_cpg)] # Find common samples peve_common_samples <- intersect(peve_exon_samples, peve_cpg_samples) cat("Exon samples:", length(peve_exon_samples), "\n") cat("CpG samples:", length(peve_cpg_samples), "\n") cat("Common samples:", length(peve_common_samples), "\n") ``` ## Compute Correlations ```{r compute-peve-correlations} # Compute correlations between exon expression and CpG methylation peve_correlations <- compute_exon_cpg_correlation_fast( exon_expr = peve_exon, cpg_meth = peve_cpg, exon_cpg_map = peve_exon_cpg_map, sample_cols = peve_common_samples ) cat("Computed", nrow(peve_correlations), "correlations\n") # Add adjusted p-values if (nrow(peve_correlations) > 0) { peve_correlations <- peve_correlations %>% mutate(p_adj = p.adjust(p_value, method = "BH")) } ``` ## Summarize Results ```{r summarize-peve-results} if (nrow(peve_correlations) > 0) { # Summary statistics cat("\n=== Peve Correlation Summary ===\n") cat("Total exon-CpG pairs tested:", nrow(peve_correlations), "\n") cat("Significant (p < 0.05):", sum(peve_correlations$p_value < 0.05), "\n") cat("Significant (FDR < 0.1):", sum(peve_correlations$p_adj < 0.1), "\n") # Direction of correlations cat("\nCorrelation direction:\n") cat("Positive correlations:", sum(peve_correlations$correlation > 0), "\n") cat("Negative correlations:", sum(peve_correlations$correlation < 0), "\n") # Significant correlations breakdown sig <- peve_correlations %>% filter(p_adj < 0.1) if (nrow(sig) > 0) { cat("\nSignificant correlations (FDR < 0.1):\n") cat("Positive:", sum(sig$correlation > 0), "\n") cat("Negative:", sum(sig$correlation < 0), "\n") } # Distribution of correlation coefficients print(summary(peve_correlations$correlation)) # Save results write_csv(peve_correlations, file.path(output_dir, "peve_exon_cpg_correlations.csv")) } ``` ## Visualization ```{r plot-peve-correlations, fig.width=10, fig.height=6} if (nrow(peve_correlations) > 0) { # Histogram of correlations p1 <- ggplot(peve_correlations, aes(x = correlation)) + geom_histogram(bins = 50, fill = "coral", color = "white") + geom_vline(xintercept = 0, linetype = "dashed", color = "red") + labs( title = "Peve: Distribution of Exon-CpG Correlations", x = "Spearman Correlation", y = "Count" ) + theme_minimal() print(p1) # Volcano plot p2 <- ggplot(peve_correlations, aes(x = correlation, y = -log10(p_value))) + geom_point(aes(color = p_adj < 0.1), alpha = 0.5, size = 1) + scale_color_manual(values = c("grey50", "red"), name = "FDR < 0.1") + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "blue") + labs( title = "Peve: Volcano Plot of Exon-CpG Correlations", x = "Spearman Correlation", y = "-log10(p-value)" ) + theme_minimal() print(p2) } ``` ## Top Correlations ```{r peve-top-correlations} if (nrow(peve_correlations) > 0) { # Top positive correlations cat("\n=== Top 20 Positive Correlations (Peve) ===\n") peve_correlations %>% filter(correlation > 0) %>% arrange(p_adj, desc(correlation)) %>% head(20) %>% print() # Top negative correlations cat("\n=== Top 20 Negative Correlations (Peve) ===\n") peve_correlations %>% filter(correlation < 0) %>% arrange(p_adj, correlation) %>% head(20) %>% print() } ``` # Pocillopora tuahiniensis (Ptua) ## Load Data ```{r load-ptua-data} # Load exon expression matrix ptua_exon <- read_csv( "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries-molecular-calcification/M-multi-species/output/40-exon-count-matrix/ptua-exon_gene_count_matrix.csv", show_col_types = FALSE ) # Load CpG methylation data ptua_cpg <- read_csv( "https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250821_meth_Ptua/merged-WGBS-CpG-counts_filtered_n20.csv", show_col_types = FALSE ) cat("Ptua exon matrix:", nrow(ptua_exon), "exons x", ncol(ptua_exon) - 6, "samples\n") cat("Ptua CpG matrix:", nrow(ptua_cpg), "CpGs x", ncol(ptua_cpg) - 1, "samples\n") ``` ## Parse CpG Coordinates ```{r parse-ptua-cpg} # Parse CpG IDs to get chromosome and position ptua_cpg_coords <- parse_cpg_ids(ptua_cpg$CpG) # Show distribution by chromosome ptua_cpg_coords %>% count(cpg_chr, name = "n_cpgs") %>% arrange(desc(n_cpgs)) %>% head(20) ``` ## Find CpGs Within Exons ```{r find-ptua-cpgs-in-exons} # Extract exon coordinates ptua_exon_coords <- ptua_exon %>% select(gene_id, e_id, chr, strand, start, end) # Find CpGs that fall within exons ptua_exon_cpg_map <- find_cpgs_in_exons(ptua_exon_coords, ptua_cpg_coords) cat("Found", nrow(ptua_exon_cpg_map), "exon-CpG pairs\n") cat("Unique exons with CpGs:", n_distinct(paste(ptua_exon_cpg_map$gene_id, ptua_exon_cpg_map$e_id)), "\n") cat("Unique CpGs in exons:", n_distinct(ptua_exon_cpg_map$cpg_id), "\n") # Summary: CpGs per exon ptua_exon_cpg_map %>% count(gene_id, e_id, name = "n_cpgs") %>% summarise( mean_cpgs_per_exon = mean(n_cpgs), median_cpgs_per_exon = median(n_cpgs), max_cpgs_per_exon = max(n_cpgs) ) ``` ## Identify Common Samples ```{r ptua-common-samples} # Get sample columns from exon data (columns 7 onwards) ptua_exon_samples <- colnames(ptua_exon)[7:ncol(ptua_exon)] # Get sample columns from CpG data (columns 2 onwards) ptua_cpg_samples <- colnames(ptua_cpg)[2:ncol(ptua_cpg)] # Find common samples ptua_common_samples <- intersect(ptua_exon_samples, ptua_cpg_samples) cat("Exon samples:", length(ptua_exon_samples), "\n") cat("CpG samples:", length(ptua_cpg_samples), "\n") cat("Common samples:", length(ptua_common_samples), "\n") ``` ## Compute Correlations ```{r compute-ptua-correlations} # Compute correlations between exon expression and CpG methylation ptua_correlations <- compute_exon_cpg_correlation_fast( exon_expr = ptua_exon, cpg_meth = ptua_cpg, exon_cpg_map = ptua_exon_cpg_map, sample_cols = ptua_common_samples ) cat("Computed", nrow(ptua_correlations), "correlations\n") # Add adjusted p-values if (nrow(ptua_correlations) > 0) { ptua_correlations <- ptua_correlations %>% mutate(p_adj = p.adjust(p_value, method = "BH")) } ``` ## Summarize Results ```{r summarize-ptua-results} if (nrow(ptua_correlations) > 0) { # Summary statistics cat("\n=== Ptua Correlation Summary ===\n") cat("Total exon-CpG pairs tested:", nrow(ptua_correlations), "\n") cat("Significant (p < 0.05):", sum(ptua_correlations$p_value < 0.05), "\n") cat("Significant (FDR < 0.1):", sum(ptua_correlations$p_adj < 0.1), "\n") # Direction of correlations cat("\nCorrelation direction:\n") cat("Positive correlations:", sum(ptua_correlations$correlation > 0), "\n") cat("Negative correlations:", sum(ptua_correlations$correlation < 0), "\n") # Significant correlations breakdown sig <- ptua_correlations %>% filter(p_adj < 0.1) if (nrow(sig) > 0) { cat("\nSignificant correlations (FDR < 0.1):\n") cat("Positive:", sum(sig$correlation > 0), "\n") cat("Negative:", sum(sig$correlation < 0), "\n") } # Distribution of correlation coefficients print(summary(ptua_correlations$correlation)) # Save results write_csv(ptua_correlations, file.path(output_dir, "ptua_exon_cpg_correlations.csv")) } ``` ## Visualization ```{r plot-ptua-correlations, fig.width=10, fig.height=6} if (nrow(ptua_correlations) > 0) { # Histogram of correlations p1 <- ggplot(ptua_correlations, aes(x = correlation)) + geom_histogram(bins = 50, fill = "seagreen", color = "white") + geom_vline(xintercept = 0, linetype = "dashed", color = "red") + labs( title = "Ptua: Distribution of Exon-CpG Correlations", x = "Spearman Correlation", y = "Count" ) + theme_minimal() print(p1) # Volcano plot p2 <- ggplot(ptua_correlations, aes(x = correlation, y = -log10(p_value))) + geom_point(aes(color = p_adj < 0.1), alpha = 0.5, size = 1) + scale_color_manual(values = c("grey50", "red"), name = "FDR < 0.1") + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "blue") + labs( title = "Ptua: Volcano Plot of Exon-CpG Correlations", x = "Spearman Correlation", y = "-log10(p-value)" ) + theme_minimal() print(p2) } ``` ## Top Correlations ```{r ptua-top-correlations} if (nrow(ptua_correlations) > 0) { # Top positive correlations cat("\n=== Top 20 Positive Correlations (Ptua) ===\n") ptua_correlations %>% filter(correlation > 0) %>% arrange(p_adj, desc(correlation)) %>% head(20) %>% print() # Top negative correlations cat("\n=== Top 20 Negative Correlations (Ptua) ===\n") ptua_correlations %>% filter(correlation < 0) %>% arrange(p_adj, correlation) %>% head(20) %>% print() } ``` # Cross-Species Comparison ## Summary Table ```{r cross-species-summary} # Combine summaries from all three species species_summary <- tibble( species = c("Apul", "Peve", "Ptua"), total_exons = c(nrow(apul_exon), nrow(peve_exon), nrow(ptua_exon)), total_cpgs = c(nrow(apul_cpg), nrow(peve_cpg), nrow(ptua_cpg)), exon_cpg_pairs = c( nrow(apul_exon_cpg_map), nrow(peve_exon_cpg_map), nrow(ptua_exon_cpg_map) ), correlations_tested = c( nrow(apul_correlations), nrow(peve_correlations), nrow(ptua_correlations) ), sig_p05 = c( sum(apul_correlations$p_value < 0.05, na.rm = TRUE), sum(peve_correlations$p_value < 0.05, na.rm = TRUE), sum(ptua_correlations$p_value < 0.05, na.rm = TRUE) ), sig_fdr10 = c( sum(apul_correlations$p_adj < 0.1, na.rm = TRUE), sum(peve_correlations$p_adj < 0.1, na.rm = TRUE), sum(ptua_correlations$p_adj < 0.1, na.rm = TRUE) ), mean_correlation = c( mean(apul_correlations$correlation, na.rm = TRUE), mean(peve_correlations$correlation, na.rm = TRUE), mean(ptua_correlations$correlation, na.rm = TRUE) ) ) print(species_summary) # Save summary write_csv(species_summary, file.path(output_dir, "cross_species_summary.csv")) ``` ## Correlation Distribution Comparison ```{r cross-species-correlation-plot, fig.width=12, fig.height=5} # Combine all correlations all_correlations <- bind_rows( apul_correlations %>% mutate(species = "Apul"), peve_correlations %>% mutate(species = "Peve"), ptua_correlations %>% mutate(species = "Ptua") ) if (nrow(all_correlations) > 0) { # Density plot comparison p <- ggplot(all_correlations, aes(x = correlation, fill = species)) + geom_density(alpha = 0.5) + geom_vline(xintercept = 0, linetype = "dashed", color = "black") + scale_fill_manual(values = c("Apul" = "steelblue", "Peve" = "coral", "Ptua" = "seagreen")) + labs( title = "Distribution of Exon-CpG Correlations Across Species", x = "Spearman Correlation", y = "Density", fill = "Species" ) + theme_minimal() print(p) # Save combined results write_csv(all_correlations, file.path(output_dir, "all_species_exon_cpg_correlations.csv")) } ``` # Gene-Level Summary Aggregate exon-level correlations to gene level to identify genes with consistent methylation-expression relationships. ```{r gene-level-summary} # Function to summarize at gene level summarize_gene_correlations <- function(corr_df, species_name) { if (nrow(corr_df) == 0) return(tibble()) corr_df %>% group_by(gene_id) %>% summarise( n_exons = n_distinct(e_id), n_cpgs = n_distinct(cpg_id), n_pairs = n(), mean_correlation = mean(correlation, na.rm = TRUE), median_correlation = median(correlation, na.rm = TRUE), sd_correlation = sd(correlation, na.rm = TRUE), n_sig_p05 = sum(p_value < 0.05, na.rm = TRUE), n_sig_fdr10 = sum(p_adj < 0.1, na.rm = TRUE), pct_positive = mean(correlation > 0, na.rm = TRUE) * 100, min_p_adj = min(p_adj, na.rm = TRUE), .groups = "drop" ) %>% arrange(min_p_adj) %>% mutate(species = species_name) } # Summarize for each species apul_gene_summary <- summarize_gene_correlations(apul_correlations, "Apul") peve_gene_summary <- summarize_gene_correlations(peve_correlations, "Peve") ptua_gene_summary <- summarize_gene_correlations(ptua_correlations, "Ptua") # Save gene-level summaries if (nrow(apul_gene_summary) > 0) { write_csv(apul_gene_summary, file.path(output_dir, "apul_gene_level_correlation_summary.csv")) cat("\nApul: Top 10 genes by significance:\n") print(head(apul_gene_summary, 10)) } if (nrow(peve_gene_summary) > 0) { write_csv(peve_gene_summary, file.path(output_dir, "peve_gene_level_correlation_summary.csv")) cat("\nPeve: Top 10 genes by significance:\n") print(head(peve_gene_summary, 10)) } if (nrow(ptua_gene_summary) > 0) { write_csv(ptua_gene_summary, file.path(output_dir, "ptua_gene_level_correlation_summary.csv")) cat("\nPtua: Top 10 genes by significance:\n") print(head(ptua_gene_summary, 10)) } ``` # Session Info ```{r session-info} sessionInfo() ```