Hypothesis A-Mod: Calcification Gene Activity and Gene Body Methylation

Hypothesis Overview

This analysis tests three hypotheses regarding calcification gene activity and gene body methylation (GBM) in three coral species across four timepoints:

Hypothesis 1: Genes that change in activity in T2 and T3 compared to T1 and T4 will have low gene body methylation.

Hypothesis 2: Genes that are highly expressed in T2 and T3 with high GBM overall will have higher GBM at T2 and T3.

Hypothesis 3: Genes that have high gene expression variability based on exon level expression at T2 and T3 will have differential GBM.

Study Design

Species: Acropora pulchra, Porites evermanni, Pocillopora tuahiniensis
Timepoints: TP1 (baseline), TP2 (warmest/stress), TP3 (recovery), TP4 (control)
Comparisons: T1→T2, T1→T3, T4→T2, T4→T3

Setup and Libraries

library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(ggpubr)
library(cowplot)
library(broom)
library(scales)

# Set theme
theme_set(theme_minimal(base_size = 12))

# Species lookup
species_lookup <- c(
  apul = "Acropora pulchra",
  peve = "Porites evermanni",
  ptua = "Pocillopora tuahiniensis"
)

# Color palette for species
species_colors <- c(
  "Acropora pulchra" = "#E64B35",
  "Porites evermanni" = "#4DBBD5",
  "Pocillopora tuahiniensis" = "#00A087"
)

# Timepoint colors
tp_colors <- c("TP1" = "#2166AC", "TP2" = "#D6604D", "TP3" = "#4393C3", "TP4" = "#92C5DE")

# Comparison colors
comparison_colors <- c(
  "T1_to_T2" = "#E64B35",
  "T1_to_T3" = "#F39B7F",
  "T4_to_T2" = "#4DBBD5",
  "T4_to_T3" = "#91D1C2"
)

output_dir <- "../output/52-HypA-mod-calcification"
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

Load Data

Expression Matrices for Calcification Genes

# Load biomineralization gene expression counts
apul_biomin <- read_csv("https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/apul_biomin_counts.csv",
                        show_col_types = FALSE)

peve_biomin <- read_csv("https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/peve_biomin_counts.csv",
                        show_col_types = FALSE)

ptua_biomin <- read_csv("https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/ptua_biomin_counts.csv",
                        show_col_types = FALSE)

cat("Biomin genes loaded:\n")

## Biomin genes loaded:

cat("  Apul:", nrow(apul_biomin), "genes\n")

##   Apul: 519 genes

cat("  Peve:", nrow(peve_biomin), "genes\n")

##   Peve: 565 genes

cat("  Ptua:", nrow(ptua_biomin), "genes\n")

##   Ptua: 483 genes

Gene Body Methylation Data

# Acropora pulchra methylation
apul_meth <- read_tsv("https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/D-Apul/output/40-Apul-Gene-Methylation/Apul-gene-methylation_75pct.tsv",
                      show_col_types = FALSE)

# Porites evermanni methylation
peve_meth <- read_tsv("https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/E-Peve/output/15-Peve-Gene-Methylation/Peve-gene-methylation_75pct.tsv",
                      show_col_types = FALSE)

# Pocillopora tuahiniensis methylation
ptua_meth <- read_tsv("https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/F-Ptua/output/09-Ptua-Gene-Methylation/Ptua-gene-methylation_75pct.tsv",
                      show_col_types = FALSE)

cat("Methylation data loaded:\n")

## Methylation data loaded:

cat("  Apul:", nrow(apul_meth), "genes\n")

##   Apul: 21621 genes

cat("  Peve:", nrow(peve_meth), "genes\n")

##   Peve: 20824 genes

cat("  Ptua:", nrow(ptua_meth), "genes\n")

##   Ptua: 23369 genes

Annotation File

ortholog_annot <- read_csv("../output/12-ortho-annot/ortholog_groups_annotated.csv",
                           show_col_types = FALSE)

Exon-Level Expression Data

# Exon count matrices for expression variability analysis
apul_exon_counts <- read_csv("../output/40-exon-count-matrix/apul-exon_gene_count_matrix.csv",
                              show_col_types = FALSE)

peve_exon_counts <- read_csv("../output/40-exon-count-matrix/peve-exon_gene_count_matrix.csv",
                              show_col_types = FALSE)

ptua_exon_counts <- read_csv("../output/40-exon-count-matrix/ptua-exon_gene_count_matrix.csv",
                              show_col_types = FALSE)

cat("Exon data loaded:\n")

## Exon data loaded:

cat("  Apul:", nrow(apul_exon_counts), "exons\n")

##   Apul: 209537 exons

cat("  Peve:", nrow(peve_exon_counts), "exons\n")

##   Peve: 235943 exons

cat("  Ptua:", nrow(ptua_exon_counts), "exons\n")

##   Ptua: 208535 exons

Data Wrangling

Reshape Expression Data

# Function to reshape expression data to long format
reshape_expression <- function(df, species_code) {
  df %>%
    pivot_longer(
      cols = -c(group_id, gene_id),
      names_to = "sample_id",
      values_to = "count"
    ) %>%
    mutate(
      species_code = species_code,
      colony_id = str_remove(sample_id, "-TP\\d+$"),
      timepoint = str_extract(sample_id, "TP\\d+"),
      timepoint_num = parse_number(timepoint),
      log_count = log2(count + 1)
    )
}

# Reshape all species
expr_long <- bind_rows(
  reshape_expression(apul_biomin, "apul"),
  reshape_expression(peve_biomin, "peve"),
  reshape_expression(ptua_biomin, "ptua")
) %>%
  mutate(species = species_lookup[species_code])

cat("Expression data reshaped:", nrow(expr_long), "observations\n")

## Expression data reshaped: 61067 observations

Reshape Methylation Data

# Function to reshape methylation data
reshape_methylation <- function(df, species_code) {
  df %>%
    pivot_longer(
      cols = -gene_id,
      names_to = "sample_id",
      values_to = "methylation"
    ) %>%
    mutate(
      species_code = species_code,
      gene_id_clean = str_remove(gene_id, "^gene-"),
      colony_id = str_remove(sample_id, "-TP\\d+$"),
      timepoint = str_extract(sample_id, "TP\\d+"),
      timepoint_num = parse_number(timepoint)
    )
}

# Reshape all species methylation
meth_long <- bind_rows(
  reshape_methylation(apul_meth, "apul"),
  reshape_methylation(peve_meth, "peve"),
  reshape_methylation(ptua_meth, "ptua")
) %>%
  mutate(species = species_lookup[species_code])

cat("Methylation data reshaped:", nrow(meth_long), "observations\n")

## Methylation data reshaped: 2383136 observations

Identify Biomineralization Genes

# Get unique biomineralization gene IDs for each species
biomin_genes <- list(
  apul = unique(apul_biomin$gene_id),
  peve = unique(peve_biomin$gene_id),
  ptua = unique(ptua_biomin$gene_id)
)

biomin_orthogroups <- unique(c(apul_biomin$group_id, peve_biomin$group_id, ptua_biomin$group_id))

cat("Biomineralization genes per species:\n")

## Biomineralization genes per species:

sapply(biomin_genes, length) %>% print()

## apul peve ptua 
##  505  565  483

cat("\nTotal unique orthogroups:", length(biomin_orthogroups), "\n")

## 
## Total unique orthogroups: 657

Filter Methylation to Biomin Genes

# Filter methylation data to biomineralization genes
biomin_meth <- meth_long %>%
  filter(
    (species_code == "apul" & gene_id_clean %in% biomin_genes$apul) |
    (species_code == "peve" & gene_id_clean %in% biomin_genes$peve) |
    (species_code == "ptua" & gene_id_clean %in% biomin_genes$ptua)
  )

cat("Biomin genes with methylation data:\n")

## Biomin genes with methylation data:

biomin_meth %>%
  group_by(species_code) %>%
  summarise(n_genes = n_distinct(gene_id_clean)) %>%
  print()

## # A tibble: 3 × 2
##   species_code n_genes
##   <chr>          <int>
## 1 apul             485
## 2 peve             483
## 3 ptua             459

Analysis 1: Gene Expression Changes Across Timepoints

Calculate Mean Expression by Timepoint

# Calculate mean expression per gene per timepoint
expr_by_tp <- expr_long %>%
  group_by(species_code, species, group_id, gene_id, timepoint) %>%
  summarise(
    mean_count = mean(count, na.rm = TRUE),
    mean_log = mean(log_count, na.rm = TRUE),
    sd_count = sd(count, na.rm = TRUE),
    n_samples = n(),
    .groups = "drop"
  )

head(expr_by_tp)

## # A tibble: 6 × 9
##   species_code species   group_id gene_id timepoint mean_count mean_log sd_count
##   <chr>        <chr>     <chr>    <chr>   <chr>          <dbl>    <dbl>    <dbl>
## 1 apul         Acropora… OG_00030 FUN_00… TP1             126.     6.30    126. 
## 2 apul         Acropora… OG_00030 FUN_00… TP2             140      7.09     44.6
## 3 apul         Acropora… OG_00030 FUN_00… TP3             160      7.12     77.5
## 4 apul         Acropora… OG_00030 FUN_00… TP4             112.     6.62     65.1
## 5 apul         Acropora… OG_00171 FUN_00… TP1             958.     9.75    517. 
## 6 apul         Acropora… OG_00171 FUN_00… TP2            2723.    11.4     752. 
## # ℹ 1 more variable: n_samples <int>

Calculate Expression Changes (Fold Changes)

# Pivot to wide format for comparison calculations
expr_wide <- expr_by_tp %>%
  select(species_code, species, group_id, gene_id, timepoint, mean_log) %>%
  pivot_wider(names_from = timepoint, values_from = mean_log, names_prefix = "")

# Calculate fold changes for each comparison
expr_fc <- expr_wide %>%
  mutate(
    # Fold changes (log2 difference = log2 fold change)
    fc_T1_to_T2 = TP2 - TP1,
    fc_T1_to_T3 = TP3 - TP1,
    fc_T4_to_T2 = TP2 - TP4,
    fc_T4_to_T3 = TP3 - TP4,
    # Mean expression
    mean_expr = (TP1 + TP2 + TP3 + TP4) / 4,
    # Stress response (T2+T3 vs T1+T4)
    stress_response = ((TP2 + TP3) / 2) - ((TP1 + TP4) / 2)
  ) %>%
  filter(!is.na(fc_T1_to_T2))

# Summary of fold changes
cat("Fold change statistics (log2):\n")

## Fold change statistics (log2):

expr_fc %>%
  group_by(species_code) %>%
  summarise(
    across(starts_with("fc_"), list(mean = ~mean(., na.rm = TRUE), 
                                    sd = ~sd(., na.rm = TRUE)),
           .names = "{.col}_{.fn}")
  ) %>%
  print()

## # A tibble: 3 × 9
##   species_code fc_T1_to_T2_mean fc_T1_to_T2_sd fc_T1_to_T3_mean fc_T1_to_T3_sd
##   <chr>                   <dbl>          <dbl>            <dbl>          <dbl>
## 1 apul                    0.479          1.38             0.378          1.07 
## 2 peve                   -0.413          0.546           -0.630          0.665
## 3 ptua                    0.238          0.575           -0.198          0.619
## # ℹ 4 more variables: fc_T4_to_T2_mean <dbl>, fc_T4_to_T2_sd <dbl>,
## #   fc_T4_to_T3_mean <dbl>, fc_T4_to_T3_sd <dbl>

Identify Significantly Changed Genes

# Define threshold for "changed" genes (|log2FC| > 1 = 2-fold change)
fc_threshold <- 1

# Identify genes with significant changes
changed_genes <- expr_fc %>%
  mutate(
    changed_T1_T2 = abs(fc_T1_to_T2) > fc_threshold,
    changed_T1_T3 = abs(fc_T1_to_T3) > fc_threshold,
    changed_T4_T2 = abs(fc_T4_to_T2) > fc_threshold,
    changed_T4_T3 = abs(fc_T4_to_T3) > fc_threshold,
    any_stress_change = changed_T1_T2 | changed_T1_T3 | changed_T4_T2 | changed_T4_T3,
    # Direction of change (up in stress timepoints)
    up_in_stress = stress_response > fc_threshold,
    down_in_stress = stress_response < -fc_threshold
  )

# Summary of changed genes
cat("\nGenes with |log2FC| > 1 per comparison:\n")

## 
## Genes with |log2FC| > 1 per comparison:

changed_genes %>%
  group_by(species_code, species) %>%
  summarise(
    T1_to_T2 = sum(changed_T1_T2, na.rm = TRUE),
    T1_to_T3 = sum(changed_T1_T3, na.rm = TRUE),
    T4_to_T2 = sum(changed_T4_T2, na.rm = TRUE),
    T4_to_T3 = sum(changed_T4_T3, na.rm = TRUE),
    any_change = sum(any_stress_change, na.rm = TRUE),
    total_genes = n(),
    .groups = "drop"
  ) %>%
  print()

## # A tibble: 3 × 8
##   species_code species            T1_to_T2 T1_to_T3 T4_to_T2 T4_to_T3 any_change
##   <chr>        <chr>                 <int>    <int>    <int>    <int>      <int>
## 1 apul         Acropora pulchra        270      205      241      176        360
## 2 peve         Porites evermanni        70      169       67       73        240
## 3 ptua         Pocillopora tuahi…       52       60       73       56        160
## # ℹ 1 more variable: total_genes <int>

Visualization: Expression Fold Changes

# Prepare data for heatmap
fc_matrix_data <- expr_fc %>%
  select(species_code, gene_id, fc_T1_to_T2, fc_T1_to_T3, fc_T4_to_T2, fc_T4_to_T3) %>%
  pivot_longer(cols = starts_with("fc_"), names_to = "comparison", values_to = "log2fc")

# Boxplot of fold changes by species and comparison
p_fc <- ggplot(fc_matrix_data, aes(x = comparison, y = log2fc, fill = species_lookup[species_code])) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.8)) +
  geom_hline(yintercept = c(-1, 0, 1), linetype = c("dashed", "solid", "dashed"), 
             color = c("red", "gray50", "red"), alpha = 0.5) +
  scale_fill_manual(values = species_colors, name = "Species") +
  scale_x_discrete(labels = c("T1→T2", "T1→T3", "T4→T2", "T4→T3")) +
  labs(
    title = "Expression Fold Changes in Calcification Genes",
    subtitle = "Comparing baseline (T1, T4) to stress/recovery (T2, T3)",
    x = "Comparison",
    y = "Log2 Fold Change"
  ) +
  theme(legend.text = element_text(face = "italic"),
        axis.text.x = element_text(angle = 45, hjust = 1))

p_fc

ggsave(file.path(output_dir, "expr_fold_changes.png"), p_fc, width = 12, height = 8, dpi = 300)

Analysis 2: Exon-Level Expression Variability

Calculate Exon Variability by Gene and Timepoint

# Load exon summary files to get gene_id -> group_id mapping
apul_exon_mapping <- read_csv("../output/40-exon-count-matrix/apul-exon_summary_by_ortholog.csv",
                              show_col_types = FALSE) %>%
  select(gene_id, group_id) %>%
  distinct() %>%
  mutate(gene_id_clean = str_remove(gene_id, "^gene-"))

peve_exon_mapping <- read_csv("../output/40-exon-count-matrix/peve-exon_summary_by_ortholog.csv",
                              show_col_types = FALSE) %>%
  select(gene_id, group_id) %>%
  distinct() %>%
  mutate(gene_id_clean = str_remove(gene_id, "^gene-"))

ptua_exon_mapping <- read_csv("../output/40-exon-count-matrix/ptua-exon_summary_by_ortholog.csv",
                              show_col_types = FALSE) %>%
  select(gene_id, group_id) %>%
  distinct() %>%
  mutate(gene_id_clean = str_remove(gene_id, "^gene-"))

# Function to calculate exon variability by timepoint using group_id for matching
calculate_exon_variability_v2 <- function(exon_df, biomin_df, exon_mapping, species_code) {
  # Get biomin ortholog groups
  biomin_groups <- unique(biomin_df$group_id)
  
  # Clean gene IDs in exon data
  exon_df <- exon_df %>%
    mutate(gene_id_clean = str_remove(gene_id, "^gene-"))
  
  # Add group_id mapping to exon data
  exon_df <- exon_df %>%
    left_join(exon_mapping %>% select(gene_id_clean, group_id), by = "gene_id_clean")
  
  # Filter to biomin genes using ortholog groups
  exon_biomin <- exon_df %>%
    filter(group_id %in% biomin_groups)
  
  cat(sprintf("  %s: %d exons matched via %d ortholog groups\n", 
              species_code, nrow(exon_biomin), n_distinct(exon_biomin$group_id)))
  
  if(nrow(exon_biomin) == 0) {
    cat(sprintf("    WARNING: No matches found for %s\n", species_code))
    return(tibble())
  }
  
  # Get sample columns
  sample_cols <- names(exon_df)[!names(exon_df) %in% c("gene_id", "gene_id_clean", "group_id", "e_id", "chr", "strand", "start", "end")]
  
  # Reshape to long format
  exon_long <- exon_biomin %>%
    select(gene_id_clean, group_id, e_id, all_of(sample_cols)) %>%
    pivot_longer(cols = all_of(sample_cols), names_to = "sample_id", values_to = "count") %>%
    mutate(
      timepoint = str_extract(sample_id, "TP\\d+"),
      log_count = log2(count + 1)
    )
  
  # Calculate CV of exon expression within each gene per timepoint
  exon_cv_by_tp <- exon_long %>%
    group_by(gene_id = gene_id_clean, group_id, timepoint, sample_id) %>%
    summarise(
      n_exons = n(),
      mean_exon_expr = mean(log_count, na.rm = TRUE),
      sd_exon_expr = sd(log_count, na.rm = TRUE),
      cv_exon = sd_exon_expr / mean_exon_expr,
      .groups = "drop"
    ) %>%
    filter(!is.na(cv_exon) & is.finite(cv_exon) & n_exons >= 2)
  
  # Average CV across samples within each timepoint
  exon_cv_summary <- exon_cv_by_tp %>%
    group_by(gene_id, group_id, timepoint) %>%
    summarise(
      mean_cv_exon = mean(cv_exon, na.rm = TRUE),
      sd_cv_exon = sd(cv_exon, na.rm = TRUE),
      n_samples = n(),
      .groups = "drop"
    ) %>%
    mutate(species_code = species_code)
  
  return(exon_cv_summary)
}

# Calculate for each species using ortholog group matching
cat("Calculating exon variability by species (using ortholog groups):\n")

## Calculating exon variability by species (using ortholog groups):

apul_exon_cv <- calculate_exon_variability_v2(apul_exon_counts, apul_biomin, apul_exon_mapping, "apul")

##   apul: 8039 exons matched via 519 ortholog groups

peve_exon_cv <- calculate_exon_variability_v2(peve_exon_counts, peve_biomin, peve_exon_mapping, "peve")

##   peve: 8701 exons matched via 565 ortholog groups

ptua_exon_cv <- calculate_exon_variability_v2(ptua_exon_counts, ptua_biomin, ptua_exon_mapping, "ptua")

##   ptua: 8136 exons matched via 483 ortholog groups

exon_cv_all <- bind_rows(apul_exon_cv, peve_exon_cv, ptua_exon_cv) %>%
  mutate(species = species_lookup[species_code])

cat("\nExon variability summary by species:\n")

## 
## Exon variability summary by species:

exon_cv_all %>%
  group_by(species_code) %>%
  summarise(n_genes = n_distinct(gene_id), n_groups = n_distinct(group_id), n_records = n()) %>%
  print()

## # A tibble: 3 × 4
##   species_code n_genes n_groups n_records
##   <chr>          <int>    <int>     <int>
## 1 apul             497      511      2043
## 2 peve             539      539      2128
## 3 ptua             459      459      1834

Calculate Exon Variability Change

# Pivot to wide format for comparison
exon_cv_wide <- exon_cv_all %>%
  select(species_code, species, gene_id, group_id, timepoint, mean_cv_exon) %>%
  pivot_wider(names_from = timepoint, values_from = mean_cv_exon, names_prefix = "cv_")

# Calculate variability changes
exon_var_change <- exon_cv_wide %>%
  mutate(
    delta_cv_T1_T2 = cv_TP2 - cv_TP1,
    delta_cv_T1_T3 = cv_TP3 - cv_TP1,
    delta_cv_T4_T2 = cv_TP2 - cv_TP4,
    delta_cv_T4_T3 = cv_TP3 - cv_TP4,
    # Overall variability
    mean_cv = (cv_TP1 + cv_TP2 + cv_TP3 + cv_TP4) / 4,
    # Stress vs baseline variability
    stress_variability = ((cv_TP2 + cv_TP3) / 2) - ((cv_TP1 + cv_TP4) / 2)
  ) %>%
  filter(!is.na(delta_cv_T1_T2))

# Summary
cat("\nExon variability change summary:\n")

## 
## Exon variability change summary:

exon_var_change %>%
  group_by(species_code) %>%
  summarise(
    mean_stress_var_change = mean(stress_variability, na.rm = TRUE),
    sd_stress_var_change = sd(stress_variability, na.rm = TRUE),
    n_genes = n(),
    .groups = "drop"
  ) %>%
  print()

## # A tibble: 3 × 4
##   species_code mean_stress_var_change sd_stress_var_change n_genes
##   <chr>                         <dbl>                <dbl>   <int>
## 1 apul                      -0.237                   0.305     510
## 2 peve                       0.0177                  0.212     529
## 3 ptua                       0.000621                0.147     458

Visualization: Exon Variability by Timepoint

# Boxplot of exon variability by timepoint and species
p_exon_var <- ggplot(exon_cv_all, aes(x = timepoint, y = mean_cv_exon, fill = species)) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.8)) +
  scale_fill_manual(values = species_colors) +
  facet_wrap(~species, scales = "free_y") +
  labs(
    title = "Exon-Level Expression Variability by Timepoint",
    subtitle = "CV of exon expression within genes (proxy for alternative splicing)",
    x = "Timepoint",
    y = "Mean CV (Exon Expression)"
  ) +
  theme(legend.position = "none",
        strip.text = element_text(face = "italic"))

p_exon_var

ggsave(file.path(output_dir, "exon_variability_by_tp.png"), p_exon_var, width = 12, height = 8, dpi = 300)

Analysis 3: Gene Body Methylation by Timepoint

Calculate GBM by Timepoint

# First, add group_id to biomin_meth by joining with biomin counts
# Create gene_id to group_id mapping from biomin counts
biomin_gene_group_map <- bind_rows(
  apul_biomin %>% select(gene_id, group_id) %>% mutate(species_code = "apul"),
  peve_biomin %>% select(gene_id, group_id) %>% mutate(species_code = "peve"),
  ptua_biomin %>% select(gene_id, group_id) %>% mutate(species_code = "ptua")
)

# Add group_id to biomin_meth
biomin_meth_grouped <- biomin_meth %>%
  left_join(biomin_gene_group_map, by = c("species_code", "gene_id_clean" = "gene_id"))

cat("Biomin meth with group_id:\n")

## Biomin meth with group_id:

biomin_meth_grouped %>%
  group_by(species_code) %>%
  summarise(n_with_group = sum(!is.na(group_id)), n_total = n()) %>%
  print()

## # A tibble: 3 × 3
##   species_code n_with_group n_total
##   <chr>               <int>   <int>
## 1 apul                19960   19960
## 2 peve                17871   17871
## 3 ptua                14688   14688

# Calculate mean GBM per gene per timepoint
gbm_by_tp <- biomin_meth_grouped %>%
  filter(!is.na(methylation)) %>%
  group_by(species_code, species, gene_id_clean, group_id, timepoint) %>%
  summarise(
    mean_meth = mean(methylation, na.rm = TRUE),
    sd_meth = sd(methylation, na.rm = TRUE),
    n_samples = n(),
    .groups = "drop"
  )

# Pivot to wide format
gbm_wide <- gbm_by_tp %>%
  select(species_code, species, gene_id_clean, group_id, timepoint, mean_meth) %>%
  pivot_wider(names_from = timepoint, values_from = mean_meth, names_prefix = "meth_")

# Calculate GBM changes and overall levels
gbm_analysis <- gbm_wide %>%
  mutate(
    # GBM changes
    delta_meth_T1_T2 = meth_TP2 - meth_TP1,
    delta_meth_T1_T3 = meth_TP3 - meth_TP1,
    delta_meth_T4_T2 = meth_TP2 - meth_TP4,
    delta_meth_T4_T3 = meth_TP3 - meth_TP4,
    # Overall GBM level
    mean_gbm = (meth_TP1 + meth_TP2 + meth_TP3 + meth_TP4) / 4,
    # Stress timepoint GBM
    stress_gbm = (meth_TP2 + meth_TP3) / 2,
    baseline_gbm = (meth_TP1 + meth_TP4) / 2,
    # GBM change in stress vs baseline
    gbm_stress_change = stress_gbm - baseline_gbm
  ) %>%
  filter(!is.na(mean_gbm))

cat("GBM analysis genes per species:\n")

## GBM analysis genes per species:

gbm_analysis %>%
  group_by(species_code) %>%
  summarise(n_genes = n(), n_with_group = sum(!is.na(group_id))) %>%
  print()

## # A tibble: 3 × 3
##   species_code n_genes n_with_group
##   <chr>          <int>        <int>
## 1 apul             499          499
## 2 peve             483          483
## 3 ptua             459          459

Define High vs Low GBM Genes

# Classify genes by overall GBM level (median split)
gbm_classified <- gbm_analysis %>%
  group_by(species_code) %>%
  mutate(
    gbm_median = median(mean_gbm, na.rm = TRUE),
    gbm_category = ifelse(mean_gbm >= gbm_median, "High GBM", "Low GBM")
  ) %>%
  ungroup()

cat("\nGBM classification summary:\n")

## 
## GBM classification summary:

gbm_classified %>%
  group_by(species_code, gbm_category) %>%
  summarise(
    n_genes = n(),
    mean_gbm = mean(mean_gbm, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  print()

## # A tibble: 6 × 4
##   species_code gbm_category n_genes mean_gbm
##   <chr>        <chr>          <int>    <dbl>
## 1 apul         High GBM         250   13.4  
## 2 apul         Low GBM          249    0.579
## 3 peve         High GBM         242   12.8  
## 4 peve         Low GBM          241    0.598
## 5 ptua         High GBM         230    7.02 
## 6 ptua         Low GBM          229    0.428

Hypothesis 1: Changed Genes Have Low GBM

Merge Expression Changes with GBM Data

# Merge expression fold change data with GBM data
expr_gbm_merged <- changed_genes %>%
  inner_join(gbm_classified, 
             by = c("species_code", "gene_id" = "gene_id_clean"))

cat("Merged expression-GBM data:\n")

## Merged expression-GBM data:

expr_gbm_merged %>%
  group_by(species_code) %>%
  summarise(n_genes = n()) %>%
  print()

## # A tibble: 3 × 2
##   species_code n_genes
##   <chr>          <int>
## 1 apul             527
## 2 peve             483
## 3 ptua             459

Test H1: Compare GBM Between Changed and Unchanged Genes

# Compare mean GBM between changed and unchanged genes
h1_comparison <- expr_gbm_merged %>%
  group_by(species_code, any_stress_change) %>%
  summarise(
    n_genes = n(),
    mean_gbm = mean(mean_gbm, na.rm = TRUE),
    sd_gbm = sd(mean_gbm, na.rm = TRUE),
    median_gbm = median(mean_gbm, na.rm = TRUE),
    .groups = "drop"
  )

cat("\n=== HYPOTHESIS 1 RESULTS ===\n")

## 
## === HYPOTHESIS 1 RESULTS ===

cat("Mean GBM by gene change status:\n")

## Mean GBM by gene change status:

print(h1_comparison)

## # A tibble: 6 × 6
##   species_code any_stress_change n_genes mean_gbm sd_gbm median_gbm
##   <chr>        <lgl>               <int>    <dbl>  <dbl>      <dbl>
## 1 apul         FALSE                 159     6.59     NA       6.59
## 2 apul         TRUE                  368     7.12     NA       7.12
## 3 peve         FALSE                 263     9.01     NA       9.01
## 4 peve         TRUE                  220     3.96     NA       3.96
## 5 ptua         FALSE                 308     4.63     NA       4.63
## 6 ptua         TRUE                  151     1.90     NA       1.90

# Statistical tests by species
h1_tests <- expr_gbm_merged %>%
  group_by(species_code) %>%
  summarise(
    wilcox_p = wilcox.test(mean_gbm ~ any_stress_change)$p.value,
    changed_mean_gbm = mean(mean_gbm[any_stress_change], na.rm = TRUE),
    unchanged_mean_gbm = mean(mean_gbm[!any_stress_change], na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    h1_supported = changed_mean_gbm < unchanged_mean_gbm,
    interpretation = ifelse(h1_supported, "SUPPORTED: Changed genes have lower GBM",
                           "NOT SUPPORTED: Changed genes have similar/higher GBM")
  )

cat("\nStatistical tests:\n")

## 
## Statistical tests:

print(h1_tests)

## # A tibble: 3 × 6
##   species_code wilcox_p changed_mean_gbm unchanged_mean_gbm h1_supported
##   <chr>           <dbl>            <dbl>              <dbl> <lgl>       
## 1 apul         0.0104               7.12               6.59 FALSE       
## 2 peve         0.000173             3.96               9.01 TRUE        
## 3 ptua         0.000236             1.90               4.63 TRUE        
## # ℹ 1 more variable: interpretation <chr>

Visualization: H1 - GBM in Changed vs Unchanged Genes

p_h1 <- ggplot(expr_gbm_merged, aes(x = any_stress_change, y = mean_gbm, fill = species.x)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 1) +
  scale_fill_manual(values = species_colors, name = "Species") +
  scale_x_discrete(labels = c("FALSE" = "Unchanged", "TRUE" = "Changed")) +
  facet_wrap(~species.x, scales = "free_y") +
  labs(
    title = "Hypothesis 1: Gene Body Methylation in Changed vs Unchanged Genes",
    subtitle = "Changed = |log2FC| > 1 in any stress comparison (T1→T2, T1→T3, T4→T2, T4→T3)",
    x = "Gene Expression Changed",
    y = "Mean Gene Body Methylation (%)"
  ) +
  theme(legend.position = "none",
        strip.text = element_text(face = "italic"))

p_h1

ggsave(file.path(output_dir, "h1_gbm_changed_vs_unchanged.png"), p_h1, width = 12, height = 8, dpi = 300)

Hypothesis 2: High Expression + High GBM = Higher GBM at Stress Timepoints

Identify High Expression Genes at Stress Timepoints

# Identify genes with high expression at stress timepoints (T2, T3)
high_expr_stress <- expr_fc %>%
  group_by(species_code) %>%
  mutate(
    stress_expr = (TP2 + TP3) / 2,
    stress_expr_median = median(stress_expr, na.rm = TRUE),
    high_stress_expr = stress_expr >= stress_expr_median
  ) %>%
  ungroup()

cat("High stress expression genes per species:\n")

## High stress expression genes per species:

high_expr_stress %>%
  group_by(species_code, high_stress_expr) %>%
  summarise(n = n(), .groups = "drop") %>%
  print()

## # A tibble: 6 × 3
##   species_code high_stress_expr     n
##   <chr>        <lgl>            <int>
## 1 apul         FALSE              259
## 2 apul         TRUE               260
## 3 peve         FALSE              282
## 4 peve         TRUE               283
## 5 ptua         FALSE              241
## 6 ptua         TRUE               242

Test H2: High Expr + High GBM → Higher Stress GBM?

# Merge with GBM data
h2_data <- high_expr_stress %>%
  inner_join(gbm_classified, by = c("species_code", "gene_id" = "gene_id_clean")) %>%
  mutate(
    category = case_when(
      high_stress_expr & gbm_category == "High GBM" ~ "High Expr + High GBM",
      high_stress_expr & gbm_category == "Low GBM" ~ "High Expr + Low GBM",
      !high_stress_expr & gbm_category == "High GBM" ~ "Low Expr + High GBM",
      TRUE ~ "Low Expr + Low GBM"
    )
  )

# Compare stress GBM by category
h2_comparison <- h2_data %>%
  group_by(species_code, category) %>%
  summarise(
    n_genes = n(),
    mean_stress_gbm = mean(stress_gbm, na.rm = TRUE),
    mean_baseline_gbm = mean(baseline_gbm, na.rm = TRUE),
    mean_gbm_change = mean(gbm_stress_change, na.rm = TRUE),
    .groups = "drop"
  )

cat("\n=== HYPOTHESIS 2 RESULTS ===\n")

## 
## === HYPOTHESIS 2 RESULTS ===

cat("GBM at stress timepoints by expression and overall GBM category:\n")

## GBM at stress timepoints by expression and overall GBM category:

print(h2_comparison)

## # A tibble: 12 × 6
##    species_code category             n_genes mean_stress_gbm mean_baseline_gbm
##    <chr>        <chr>                  <int>           <dbl>             <dbl>
##  1 apul         High Expr + High GBM     180          17.3              16.2  
##  2 apul         High Expr + Low GBM       89           0.619             0.657
##  3 apul         Low Expr + High GBM       84           6.07              5.68 
##  4 apul         Low Expr + Low GBM       174           0.536             0.561
##  5 peve         High Expr + High GBM     168          14.2              14.8  
##  6 peve         High Expr + Low GBM       96           0.658             0.624
##  7 peve         Low Expr + High GBM       74           8.69              9.02 
##  8 peve         Low Expr + Low GBM       145           0.568             0.571
##  9 ptua         High Expr + High GBM     157           7.85              7.95 
## 10 ptua         High Expr + Low GBM       84           0.444             0.428
## 11 ptua         Low Expr + High GBM       73           5.12              5.16 
## 12 ptua         Low Expr + Low GBM       145           0.435             0.412
## # ℹ 1 more variable: mean_gbm_change <dbl>

# Focus on High Expr + High GBM genes
h2_focus <- h2_data %>%
  filter(category == "High Expr + High GBM") %>%
  group_by(species_code) %>%
  summarise(
    n_genes = n(),
    mean_stress_gbm = mean(stress_gbm, na.rm = TRUE),
    mean_baseline_gbm = mean(baseline_gbm, na.rm = TRUE),
    gbm_increase = mean(gbm_stress_change, na.rm = TRUE),
    pct_increase = gbm_increase / mean_baseline_gbm * 100,
    t_test_p = t.test(stress_gbm, baseline_gbm, paired = TRUE)$p.value,
    .groups = "drop"
  ) %>%
  mutate(
    h2_supported = gbm_increase > 0,
    interpretation = ifelse(h2_supported, 
                           "SUPPORTED: Higher GBM at stress timepoints",
                           "NOT SUPPORTED: No increase in GBM at stress timepoints")
  )

cat("\nHigh Expr + High GBM genes - stress vs baseline GBM:\n")

## 
## High Expr + High GBM genes - stress vs baseline GBM:

print(h2_focus)

## # A tibble: 3 × 9
##   species_code n_genes mean_stress_gbm mean_baseline_gbm gbm_increase
##   <chr>          <int>           <dbl>             <dbl>        <dbl>
## 1 apul             180           17.3              16.2        1.10  
## 2 peve             168           14.2              14.8       -0.565 
## 3 ptua             157            7.85              7.95      -0.0987
## # ℹ 4 more variables: pct_increase <dbl>, t_test_p <dbl>, h2_supported <lgl>,
## #   interpretation <chr>

Visualization: H2

# Reshape for paired comparison visualization
h2_plot_data <- h2_data %>%
  filter(category == "High Expr + High GBM") %>%
  select(species_code, gene_id, stress_gbm, baseline_gbm) %>%
  pivot_longer(cols = c(stress_gbm, baseline_gbm), 
               names_to = "period", values_to = "gbm") %>%
  mutate(
    period = ifelse(period == "stress_gbm", "Stress (T2+T3)", "Baseline (T1+T4)"),
    species = species_lookup[species_code]
  )

p_h2 <- ggplot(h2_plot_data, aes(x = period, y = gbm, fill = period)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 1) +
  scale_fill_manual(values = c("Baseline (T1+T4)" = "#4393C3", "Stress (T2+T3)" = "#D6604D")) +
  facet_wrap(~species, scales = "free_y") +
  labs(
    title = "Hypothesis 2: GBM in High Expression + High GBM Genes",
    subtitle = "Comparing baseline (T1+T4) to stress (T2+T3) timepoints",
    x = "Period",
    y = "Mean Gene Body Methylation (%)"
  ) +
  theme(legend.position = "none",
        strip.text = element_text(face = "italic"))

p_h2

ggsave(file.path(output_dir, "h2_high_expr_high_gbm.png"), p_h2, width = 14, height = 10, dpi = 300)

Hypothesis 3: High Exon Variability at Stress → Differential GBM

Identify High Variability Genes at Stress Timepoints

# Calculate stress timepoint exon variability (using group_id for matching)
stress_exon_var <- exon_cv_all %>%
  filter(timepoint %in% c("TP2", "TP3")) %>%
  group_by(species_code, gene_id, group_id) %>%
  summarise(
    mean_stress_cv = mean(mean_cv_exon, na.rm = TRUE),
    .groups = "drop"
  )

# Identify high variability genes (top quartile)
high_var_genes <- stress_exon_var %>%
  group_by(species_code) %>%
  mutate(
    cv_q75 = quantile(mean_stress_cv, 0.75, na.rm = TRUE),
    high_var = mean_stress_cv >= cv_q75
  ) %>%
  ungroup()

cat("High exon variability genes at stress timepoints:\n")

## High exon variability genes at stress timepoints:

high_var_genes %>%
  group_by(species_code, high_var) %>%
  summarise(n = n(), .groups = "drop") %>%
  print()

## # A tibble: 6 × 3
##   species_code high_var     n
##   <chr>        <lgl>    <int>
## 1 apul         FALSE      383
## 2 apul         TRUE       128
## 3 peve         FALSE      401
## 4 peve         TRUE       134
## 5 ptua         FALSE      344
## 6 ptua         TRUE       115

Test H3: High Variability → Differential GBM

# Merge with GBM change data using group_id (ortholog group)
h3_data <- high_var_genes %>%
  filter(!is.na(group_id)) %>%
  inner_join(
    gbm_analysis %>% filter(!is.na(group_id)), 
    by = c("species_code", "group_id")
  ) %>%
  mutate(
    abs_gbm_change = abs(gbm_stress_change),
    species = species_lookup[species_code]
  )

cat("H3 data available by species:\n")

## H3 data available by species:

h3_data %>%
  group_by(species_code) %>%
  summarise(n_genes = n(), n_high_var = sum(high_var), n_low_var = sum(!high_var)) %>%
  print()

## # A tibble: 3 × 4
##   species_code n_genes n_high_var n_low_var
##   <chr>          <int>      <int>     <int>
## 1 apul             492        117       375
## 2 peve             475        121       354
## 3 ptua             441        110       331

# Compare GBM change magnitude between high and low variability genes
h3_comparison <- h3_data %>%
  group_by(species_code, species, high_var) %>%
  summarise(
    n_genes = n(),
    mean_abs_gbm_change = mean(abs_gbm_change, na.rm = TRUE),
    sd_abs_gbm_change = sd(abs_gbm_change, na.rm = TRUE),
    mean_gbm_change = mean(gbm_stress_change, na.rm = TRUE),
    .groups = "drop"
  )

cat("\n=== HYPOTHESIS 3 RESULTS ===\n")

## 
## === HYPOTHESIS 3 RESULTS ===

cat("GBM change magnitude by exon variability category:\n")

## GBM change magnitude by exon variability category:

print(h3_comparison)

## # A tibble: 6 × 7
##   species_code species    high_var n_genes mean_abs_gbm_change sd_abs_gbm_change
##   <chr>        <chr>      <lgl>      <int>               <dbl>             <dbl>
## 1 apul         Acropora … FALSE        375              0.691              1.06 
## 2 apul         Acropora … TRUE         117              0.198              0.315
## 3 peve         Porites e… FALSE        354              0.605              1.18 
## 4 peve         Porites e… TRUE         121              0.191              0.352
## 5 ptua         Pocillopo… FALSE        331              0.169              0.253
## 6 ptua         Pocillopo… TRUE         110              0.0745             0.122
## # ℹ 1 more variable: mean_gbm_change <dbl>

# Statistical tests - handle each species separately to avoid errors
run_h3_test <- function(data, sp_code) {
  sp_data <- data %>% filter(species_code == sp_code)
  
  if(nrow(sp_data) == 0 || sum(sp_data$high_var) < 3 || sum(!sp_data$high_var) < 3) {
    return(tibble(
      species_code = sp_code,
      wilcox_p = NA_real_,
      high_var_mean_change = NA_real_,
      low_var_mean_change = NA_real_,
      h3_supported = NA,
      interpretation = "INSUFFICIENT DATA: Not enough genes with exon variability data"
    ))
  }
  
  test_result <- tryCatch({
    wilcox.test(abs_gbm_change ~ high_var, data = sp_data)$p.value
  }, error = function(e) NA_real_)
  
  tibble(
    species_code = sp_code,
    wilcox_p = test_result,
    high_var_mean_change = mean(sp_data$abs_gbm_change[sp_data$high_var], na.rm = TRUE),
    low_var_mean_change = mean(sp_data$abs_gbm_change[!sp_data$high_var], na.rm = TRUE)
  ) %>%
    mutate(
      h3_supported = high_var_mean_change > low_var_mean_change,
      interpretation = case_when(
        is.na(h3_supported) ~ "INSUFFICIENT DATA",
        h3_supported ~ "SUPPORTED: High variability genes show more differential GBM",
        TRUE ~ "NOT SUPPORTED: No difference in GBM changes"
      )
    )
}

# Run tests for all species (even if some have no data)
h3_tests <- bind_rows(
  run_h3_test(h3_data, "apul"),
  run_h3_test(h3_data, "peve"),
  run_h3_test(h3_data, "ptua")
)

cat("\nStatistical tests:\n")

## 
## Statistical tests:

print(h3_tests)

## # A tibble: 3 × 6
##   species_code    wilcox_p high_var_mean_change low_var_mean_change h3_supported
##   <chr>              <dbl>                <dbl>               <dbl> <lgl>       
## 1 apul         0.00000622                0.198                0.691 FALSE       
## 2 peve         0.00151                   0.191                0.605 FALSE       
## 3 ptua         0.000000528               0.0745               0.169 FALSE       
## # ℹ 1 more variable: interpretation <chr>

# Show why some species might be missing
cat("\nNote: Species may lack H3 data due to gene ID mismatches between exon and biomin datasets.\n")

## 
## Note: Species may lack H3 data due to gene ID mismatches between exon and biomin datasets.

Visualization: H3

p_h3 <- ggplot(h3_data, aes(x = high_var, y = abs_gbm_change, fill = species)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 1) +
  scale_fill_manual(values = species_colors) +
  scale_x_discrete(labels = c("FALSE" = "Low Variability", "TRUE" = "High Variability")) +
  facet_wrap(~species, scales = "free_y") +
  labs(
    title = "Hypothesis 3: GBM Change in High vs Low Exon Variability Genes",
    subtitle = "Absolute GBM change (stress vs baseline) by exon expression variability at T2+T3",
    x = "Exon Expression Variability (at T2+T3)",
    y = "|Δ Gene Body Methylation| (%)"
  ) +
  theme(legend.position = "none",
        strip.text = element_text(face = "italic"))

p_h3

ggsave(file.path(output_dir, "h3_exon_var_gbm_change.png"), p_h3, width = 12, height = 8, dpi = 300)

Correlation: Exon Variability vs GBM Change

# Scatter plot of exon variability vs GBM change
p_h3_cor <- ggplot(h3_data, aes(x = mean_stress_cv, y = abs_gbm_change, color = species)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
  scale_color_manual(values = species_colors) +
  facet_wrap(~species, scales = "free") +
  labs(
    title = "Exon Expression Variability vs GBM Change Magnitude",
    subtitle = "Testing correlation between regulatory flexibility and epigenetic dynamics",
    x = "Mean Exon CV (at T2+T3)",
    y = "|Δ Gene Body Methylation| (%)"
  ) +
  theme(legend.position = "none",
        strip.text = element_text(face = "italic"))

p_h3_cor

ggsave(file.path(output_dir, "h3_exon_var_gbm_correlation.png"), p_h3_cor, width = 12, height = 8, dpi = 300)

# Calculate correlations
h3_correlations <- h3_data %>%
  group_by(species_code, species) %>%
  summarise(
    cor_spearman = cor(mean_stress_cv, abs_gbm_change, method = "spearman", use = "complete.obs"),
    cor_p = cor.test(mean_stress_cv, abs_gbm_change, method = "spearman")$p.value,
    n = n(),
    .groups = "drop"
  )

cat("\nCorrelation between exon variability and GBM change:\n")

## 
## Correlation between exon variability and GBM change:

print(h3_correlations)

## # A tibble: 3 × 5
##   species_code species                  cor_spearman    cor_p     n
##   <chr>        <chr>                           <dbl>    <dbl> <int>
## 1 apul         Acropora pulchra               -0.305 4.75e-12   492
## 2 peve         Porites evermanni              -0.235 2.31e- 7   475
## 3 ptua         Pocillopora tuahiniensis       -0.373 4.64e-16   441

Summary and Conclusions

Compile All Results

cat("\n" %+% strrep("=", 60) %+% "\n")
cat("           HYPOTHESIS TESTING SUMMARY\n")

##            HYPOTHESIS TESTING SUMMARY

cat(strrep("=", 60) %+% "\n\n")

# Hypothesis 1 Summary
cat("HYPOTHESIS 1: Genes changing in activity at T2/T3 have low GBM\n")

## HYPOTHESIS 1: Genes changing in activity at T2/T3 have low GBM

cat(strrep("-", 50) %+% "\n")
for(i in 1:nrow(h1_tests)) {
  sp <- h1_tests$species_code[i]
  cat(sprintf("  %s: %s (p = %.4f)\n", 
              species_lookup[sp], 
              h1_tests$interpretation[i],
              h1_tests$wilcox_p[i]))
}

##   Acropora pulchra: NOT SUPPORTED: Changed genes have similar/higher GBM (p = 0.0104)
##   Porites evermanni: SUPPORTED: Changed genes have lower GBM (p = 0.0002)
##   Pocillopora tuahiniensis: SUPPORTED: Changed genes have lower GBM (p = 0.0002)

cat("\n")

# Hypothesis 2 Summary
cat("HYPOTHESIS 2: High Expr + High GBM genes have higher GBM at stress\n")

## HYPOTHESIS 2: High Expr + High GBM genes have higher GBM at stress

cat(strrep("-", 50) %+% "\n")
for(i in 1:nrow(h2_focus)) {
  sp <- h2_focus$species_code[i]
  cat(sprintf("  %s: %s (change = %.2f%%, p = %.4f)\n", 
              species_lookup[sp], 
              h2_focus$interpretation[i],
              h2_focus$pct_increase[i],
              h2_focus$t_test_p[i]))
}

##   Acropora pulchra: SUPPORTED: Higher GBM at stress timepoints (change = 6.78%, p = 0.0000)
##   Porites evermanni: NOT SUPPORTED: No increase in GBM at stress timepoints (change = -3.82%, p = 0.0000)
##   Pocillopora tuahiniensis: NOT SUPPORTED: No increase in GBM at stress timepoints (change = -1.24%, p = 0.0001)

cat("\n")

# Hypothesis 3 Summary
cat("HYPOTHESIS 3: High exon variability genes have differential GBM\n")

## HYPOTHESIS 3: High exon variability genes have differential GBM

cat(strrep("-", 50) %+% "\n")
for(i in 1:nrow(h3_tests)) {
  sp <- h3_tests$species_code[i]
  cat(sprintf("  %s: %s (p = %.4f)\n", 
              species_lookup[sp], 
              h3_tests$interpretation[i],
              h3_tests$wilcox_p[i]))
}

##   Acropora pulchra: NOT SUPPORTED: No difference in GBM changes (p = 0.0000)
##   Porites evermanni: NOT SUPPORTED: No difference in GBM changes (p = 0.0015)
##   Pocillopora tuahiniensis: NOT SUPPORTED: No difference in GBM changes (p = 0.0000)

cat("\n")

Export Results Tables

# Combine all hypothesis results
h1_export <- h1_tests %>%
  mutate(hypothesis = "H1", 
         species = species_lookup[species_code]) %>%
  select(hypothesis, species, changed_mean_gbm, unchanged_mean_gbm, 
         wilcox_p, h1_supported, interpretation)

h2_export <- h2_focus %>%
  mutate(hypothesis = "H2",
         species = species_lookup[species_code]) %>%
  select(hypothesis, species, n_genes, mean_stress_gbm, mean_baseline_gbm,
         gbm_increase, pct_increase, t_test_p, h2_supported, interpretation)

h3_export <- h3_tests %>%
  mutate(hypothesis = "H3",
         species = species_lookup[species_code]) %>%
  select(hypothesis, species, high_var_mean_change, low_var_mean_change,
         wilcox_p, h3_supported, interpretation)

# Export
write_csv(h1_export, file.path(output_dir, "h1_results.csv"))
write_csv(h2_export, file.path(output_dir, "h2_results.csv"))
write_csv(h3_export, file.path(output_dir, "h3_results.csv"))

# Export full merged datasets
write_csv(expr_gbm_merged, file.path(output_dir, "expression_gbm_merged.csv"))
write_csv(h2_data, file.path(output_dir, "h2_expression_gbm_categories.csv"))
write_csv(h3_data, file.path(output_dir, "h3_exon_var_gbm.csv"))

cat("Results exported to:", output_dir, "\n")

## Results exported to: ../output/52-HypA-mod-calcification

Combined Visualization

# Create combined summary figure
combined_plot <- plot_grid(
  p_h1 + theme(plot.title = element_text(size = 11)),
  p_h2 + theme(plot.title = element_text(size = 11)),
  p_h3 + theme(plot.title = element_text(size = 11)),
  p_h3_cor + theme(plot.title = element_text(size = 11)),
  ncol = 2,
  labels = c("A", "B", "C", "D")
)

combined_plot

ggsave(file.path(output_dir, "combined_hypothesis_figure.png"), 
       combined_plot, width = 16, height = 14, dpi = 300)

Species-Specific Deep Dive

Comparison Across Species

# Create summary data for species comparison using safer join approach
# Prepare H1 results
h1_summary <- h1_tests %>%
  mutate(
    hypothesis = "H1",
    species = species_lookup[species_code],
    p_value = wilcox_p,
    supported = h1_supported
  ) %>%
  select(species, hypothesis, p_value, supported)

# Prepare H2 results
h2_summary <- h2_focus %>%
  mutate(
    hypothesis = "H2",
    species = species_lookup[species_code],
    p_value = t_test_p,
    supported = h2_supported
  ) %>%
  select(species, hypothesis, p_value, supported)

# Prepare H3 results
h3_summary <- h3_tests %>%
  mutate(
    hypothesis = "H3",
    species = species_lookup[species_code],
    p_value = wilcox_p,
    supported = h3_supported
  ) %>%
  select(species, hypothesis, p_value, supported)

# Combine all results
species_summary <- bind_rows(h1_summary, h2_summary, h3_summary) %>%
  mutate(
    significance = case_when(
      is.na(p_value) ~ "NA",
      p_value < 0.001 ~ "***",
      p_value < 0.01 ~ "**",
      p_value < 0.05 ~ "*",
      TRUE ~ "ns"
    ),
    result = case_when(
      is.na(supported) ~ "No Data",
      supported ~ "Supported",
      TRUE ~ "Not Supported"
    )
  )

cat("Species summary:\n")

## Species summary:

print(species_summary)

## # A tibble: 9 × 6
##   species                  hypothesis  p_value supported significance result    
##   <chr>                    <chr>         <dbl> <lgl>     <chr>        <chr>     
## 1 Acropora pulchra         H1         1.04e- 2 FALSE     *            Not Suppo…
## 2 Porites evermanni        H1         1.73e- 4 TRUE      ***          Supported 
## 3 Pocillopora tuahiniensis H1         2.36e- 4 TRUE      ***          Supported 
## 4 Acropora pulchra         H2         2.36e-21 TRUE      ***          Supported 
## 5 Porites evermanni        H2         2.42e- 5 FALSE     ***          Not Suppo…
## 6 Pocillopora tuahiniensis H2         9.76e- 5 FALSE     ***          Not Suppo…
## 7 Acropora pulchra         H3         6.22e- 6 FALSE     ***          Not Suppo…
## 8 Porites evermanni        H3         1.51e- 3 FALSE     **           Not Suppo…
## 9 Pocillopora tuahiniensis H3         5.28e- 7 FALSE     ***          Not Suppo…

# Heatmap-style visualization
p_summary <- ggplot(species_summary, aes(x = hypothesis, y = species, fill = result)) +
  geom_tile(color = "white", size = 1) +
  geom_text(aes(label = significance), size = 6, fontface = "bold") +
  scale_fill_manual(values = c("Supported" = "#00A087", 
                               "Not Supported" = "#E64B35",
                               "No Data" = "gray80"),
                    name = "Hypothesis") +
  labs(
    title = "Summary: Hypothesis Support Across Species",
    subtitle = "* p<0.05, ** p<0.01, *** p<0.001, ns = not significant",
    x = "Hypothesis",
    y = "Species"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.y = element_text(face = "italic"),
    panel.grid = element_blank()
  )

p_summary

ggsave(file.path(output_dir, "hypothesis_summary_heatmap.png"), p_summary, width = 10, height = 6, dpi = 300)

Session Info

sessionInfo()

## R version 4.2.3 (2023-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 24.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.4.0       broom_1.0.9        cowplot_1.2.0      ggpubr_0.6.1      
##  [5] viridis_0.6.5      viridisLite_0.4.2  RColorBrewer_1.1-3 pheatmap_1.0.13   
##  [9] lubridate_1.9.4    forcats_1.0.0      stringr_1.5.1      dplyr_1.1.4       
## [13] purrr_1.1.0        readr_2.1.5        tidyr_1.3.1        tibble_3.3.0      
## [17] ggplot2_4.0.0      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.22-7    digest_0.6.37     utf8_1.2.6        R6_2.6.1         
##  [5] backports_1.5.0   evaluate_1.0.4    pillar_1.11.0     rlang_1.1.6      
##  [9] curl_5.2.3        rstudioapi_0.17.1 car_3.1-3         jquerylib_0.1.4  
## [13] Matrix_1.5-3      rmarkdown_2.29    splines_4.2.3     textshaping_1.0.3
## [17] labeling_0.4.3    S7_0.2.0          bit_4.6.0         compiler_4.2.3   
## [21] xfun_0.52         systemfonts_1.2.3 pkgconfig_2.0.3   mgcv_1.9-3       
## [25] htmltools_0.5.8.1 tidyselect_1.2.1  gridExtra_2.3     crayon_1.5.3     
## [29] tzdb_0.5.0        withr_3.0.2       grid_4.2.3        nlme_3.1-168     
## [33] jsonlite_2.0.0    gtable_0.3.6      lifecycle_1.0.4   magrittr_2.0.4   
## [37] cli_3.6.5         stringi_1.8.7     vroom_1.6.5       cachem_1.1.0     
## [41] carData_3.0-5     farver_2.1.2      ggsignif_0.6.4    bslib_0.9.0      
## [45] ragg_1.4.0        generics_0.1.4    vctrs_0.6.5       Formula_1.2-5    
## [49] tools_4.2.3       bit64_4.6.0-1     glue_1.8.0        hms_1.1.3        
## [53] abind_1.4-8       parallel_4.2.3    fastmap_1.2.0     yaml_2.3.10      
## [57] timechange_0.3.0  rstatix_0.7.2     knitr_1.50        sass_0.4.10