---
title: "50-predom-exon"
subtitle: "Identifying Shifts in Predominant Exon Expression Across Timepoints"
author: "Steven Roberts"
date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_float: true
    toc_depth: 3
    number_sections: true
    code_folding: show
    theme: cosmo
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  echo = TRUE, 
  message = FALSE, 
  warning = FALSE, 
  fig.width = 12, 
  fig.height = 8,
  cache = FALSE
)

library(dplyr)
library(readr)
library(tidyr)
library(ggplot2)
library(here)
library(purrr)
library(stringr)
library(DT)
library(viridis)
library(patchwork)
library(broom)
```

# Overview

This notebook identifies genes with **statistically robust shifts in predominant (highest) exon expression** across samples and timepoints for three coral species:

- **Acropora pulchra** (Apul)
- **Porites evermanni** (Peve)
- **Pocillopora tuahiniensis** (Ptua)

**Key Questions:**

1. For each gene, which exon has the highest expression in each sample?
2. Do genes show shifts in which exon is predominant across samples?
3. Are these shifts associated with specific timepoints (TP1, TP2, TP3, TP4)?

**Data Source:** Ballgown output from HISAT2 RNA-seq alignments:

- `e_data.ctab`: Exon-level expression data (FPKM, coverage)
- `e2t.ctab`: Exon-to-transcript mapping
- `t_data.ctab`: Transcript-level data with gene IDs

# Data Paths and Setup

```{r paths}
# Species-specific RNA-seq alignment output directories
apul_rnaseq_dir <- here("D-Apul/output/02.20-D-Apul-RNAseq-alignment-HiSat2")
peve_rnaseq_dir <- here("E-Peve/output/02.20-E-Peve-RNAseq-alignment-HiSat2")
ptua_rnaseq_dir <- here("F-Ptua/output/02.20-F-Ptua-RNAseq-alignment-HiSat2")

# Output directory
output_dir <- here("M-multi-species/output/50-predom-exon")
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

# Define species colors for consistent plotting
species_colors <- c(
  "Acropora pulchra" = "#E69F00",
  "Porites evermanni" = "#56B4E9",
  "Pocillopora tuahiniensis" = "#009E73"
)

species_abbrev <- c(
  "Acropora pulchra" = "Apul",
  "Porites evermanni" = "Peve",
  "Pocillopora tuahiniensis" = "Ptua"
)
```

# Load Ballgown Data

## Helper Function to Load Sample Data

```{r load-functions}
#' Load ballgown ctab files for a single sample
#' @param sample_dir Path to sample directory containing ctab files
#' @return Data frame with exon-level expression and gene assignments
load_sample_ballgown <- function(sample_dir) {
  sample_name <- basename(sample_dir)
  
  # Load the three ctab files
  e_data <- read_tsv(file.path(sample_dir, "e_data.ctab"), show_col_types = FALSE)
  e2t <- read_tsv(file.path(sample_dir, "e2t.ctab"), show_col_types = FALSE)
  t_data <- read_tsv(file.path(sample_dir, "t_data.ctab"), show_col_types = FALSE)
  
  # Join exon data to transcripts to get gene IDs
  exon_gene <- e_data %>%
    left_join(e2t, by = "e_id") %>%
    left_join(t_data %>% select(t_id, gene_id, gene_name), by = "t_id") %>%
    mutate(sample_id = sample_name) %>%
    # Use FPKM-like normalized coverage as expression measure
    # mcov is the mean per-base coverage, which we normalize by exon length
    mutate(
      exon_length = end - start + 1,
      # Use coverage as primary expression metric
      expression = cov
    )
  
  return(exon_gene)
}

#' Load all samples for a species
#' @param rnaseq_dir Base directory containing sample subdirectories
#' @return Combined data frame for all samples
load_species_ballgown <- function(rnaseq_dir) {
  # Get all sample directories (exclude files and special directories)
  sample_dirs <- list.dirs(rnaseq_dir, recursive = FALSE, full.names = TRUE)
  
  # Filter to only directories containing ballgown files
  sample_dirs <- sample_dirs[sapply(sample_dirs, function(d) {
    file.exists(file.path(d, "e_data.ctab"))
  })]
  
  # Load all samples
  all_samples <- map_dfr(sample_dirs, load_sample_ballgown)
  
  # Parse sample metadata
  all_samples <- all_samples %>%
    mutate(
      timepoint = str_extract(sample_id, "TP[0-9]+"),
      timepoint_num = as.numeric(str_extract(timepoint, "[0-9]+")),
      colony = str_replace(sample_id, "-TP[0-9]+$", "")
    )
  
  return(all_samples)
}
```

## Load Data for All Species

```{r load-all-species}
cat("Loading Acropora pulchra data...\n")
apul_data <- load_species_ballgown(apul_rnaseq_dir) %>%
  mutate(species = "Acropora pulchra")

cat("Loading Porites evermanni data...\n")
peve_data <- load_species_ballgown(peve_rnaseq_dir) %>%
  mutate(species = "Porites evermanni")

cat("Loading Pocillopora tuahiniensis data...\n")
ptua_data <- load_species_ballgown(ptua_rnaseq_dir) %>%
  mutate(species = "Pocillopora tuahiniensis")

# Combine all species
all_exon_data <- bind_rows(apul_data, peve_data, ptua_data)

# Summary statistics
cat("\n=== Data Summary ===\n")
all_exon_data %>%
  group_by(species) %>%
  summarise(
    n_samples = n_distinct(sample_id),
    n_colonies = n_distinct(colony),
    n_genes = n_distinct(gene_id),
    n_exons = n_distinct(paste(gene_id, e_id)),
    total_records = n()
  ) %>%
  print()
```

# Initial Data Visualization

## Exon Expression Distribution

```{r viz-exon-dist, fig.width=14, fig.height=6}
# Filter to expressed exons for visualization
expressed_exons <- all_exon_data %>%
  filter(expression > 0)

ggplot(expressed_exons, aes(x = log10(expression + 1), fill = species)) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = species_colors) +
  labs(
    x = "log10(Expression + 1)",
    y = "Density",
    title = "Distribution of Exon Expression Across Species",
    subtitle = "Expression measured as per-base coverage",
    fill = "Species"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

ggsave(file.path(output_dir, "exon_expression_distribution.png"), width = 14, height = 6, dpi = 300)
```

## Exon Counts per Gene

```{r viz-exons-per-gene, fig.width=14, fig.height=6}
exons_per_gene <- all_exon_data %>%
  group_by(species, gene_id, sample_id) %>%
  summarise(n_exons = n_distinct(e_id), .groups = "drop") %>%
  group_by(species, gene_id) %>%
  summarise(n_exons = mean(n_exons), .groups = "drop")

ggplot(exons_per_gene, aes(x = n_exons, fill = species)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  scale_fill_manual(values = species_colors) +
  facet_wrap(~species, scales = "free_y") +
  labs(
    x = "Number of Exons per Gene",
    y = "Count",
    title = "Distribution of Exon Counts per Gene",
    fill = "Species"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

ggsave(file.path(output_dir, "exons_per_gene_distribution.png"), width = 14, height = 6, dpi = 300)
```

## Sample Expression Heatmap (Summary)

```{r viz-sample-expression, fig.width=14, fig.height=8}
# Calculate mean expression per sample
sample_expr_summary <- all_exon_data %>%
  group_by(species, sample_id, timepoint) %>%
  summarise(
    mean_expr = mean(expression, na.rm = TRUE),
    median_expr = median(expression, na.rm = TRUE),
    total_expr = sum(expression, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(sample_expr_summary, aes(x = sample_id, y = log10(mean_expr + 1), fill = timepoint)) +
  geom_col() +
  facet_wrap(~species, scales = "free_x", ncol = 1) +
  scale_fill_viridis_d(option = "plasma") +
  labs(
    x = "Sample",
    y = "log10(Mean Expression + 1)",
    title = "Mean Exon Expression per Sample",
    fill = "Timepoint"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
    legend.position = "bottom"
  )

ggsave(file.path(output_dir, "sample_mean_expression.png"), width = 14, height = 10, dpi = 300)
```

# Identify Predominant Exon for Each Gene

## Calculate Predominant Exon per Sample

```{r calc-predominant-exon}
#' For each gene in each sample, identify the exon with highest expression
#' Requires minimum expression threshold and multiple exons
identify_predominant_exon <- function(exon_data, min_expression = 1, min_exons = 2) {
  
  # Filter to multi-exon genes with sufficient expression
  predominant <- exon_data %>%
    # First, identify multi-exon genes
    group_by(species, sample_id, gene_id) %>%
    mutate(
      n_exons_gene = n_distinct(e_id),
      gene_total_expr = sum(expression, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    # Filter to genes with multiple exons and minimum expression
    filter(n_exons_gene >= min_exons, gene_total_expr >= min_expression) %>%
    # For each gene-sample, find the predominant exon
    group_by(species, sample_id, gene_id, colony, timepoint, timepoint_num) %>%
    mutate(
      exon_rank = rank(-expression, ties.method = "first"),
      is_predominant = exon_rank == 1,
      expression_fraction = expression / sum(expression, na.rm = TRUE)
    ) %>%
    ungroup()
  
  # Extract just the predominant exons
  predominant_exons <- predominant %>%
    filter(is_predominant) %>%
    select(
      species, sample_id, colony, timepoint, timepoint_num, gene_id,
      predominant_e_id = e_id,
      predominant_expr = expression,
      predominant_frac = expression_fraction,
      n_exons_gene, gene_total_expr
    )
  
  return(list(
    all_data = predominant,
    predominant_only = predominant_exons
  ))
}

# Calculate for all species
predom_results <- identify_predominant_exon(all_exon_data)
all_exon_ranked <- predom_results$all_data
predominant_exons <- predom_results$predominant_only

cat("=== Predominant Exon Analysis Summary ===\n")
predominant_exons %>%
  group_by(species) %>%
  summarise(
    n_samples = n_distinct(sample_id),
    n_genes = n_distinct(gene_id),
    mean_n_exons = mean(n_exons_gene),
    mean_predom_frac = mean(predominant_frac, na.rm = TRUE)
  ) %>%
  print()
```

## Visualize Predominant Exon Fraction

```{r viz-predom-fraction, fig.width=14, fig.height=6}
ggplot(predominant_exons, aes(x = predominant_frac, fill = species)) +
  geom_histogram(bins = 50, alpha = 0.7) +
  scale_fill_manual(values = species_colors) +
  facet_wrap(~species, scales = "free_y") +
  labs(
    x = "Predominant Exon Expression Fraction",
    y = "Count",
    title = "Distribution of Predominant Exon Expression Fractions",
    subtitle = "Fraction of gene expression from the highest-expressing exon"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

ggsave(file.path(output_dir, "predominant_exon_fraction_dist.png"), width = 14, height = 6, dpi = 300)
```

## Expression Profiles by Timepoint

```{r viz-expr-by-timepoint, fig.width=14, fig.height=8}
# Mean predominant fraction by timepoint
predom_by_tp <- predominant_exons %>%
  group_by(species, timepoint, timepoint_num) %>%
  summarise(
    mean_predom_frac = mean(predominant_frac, na.rm = TRUE),
    sd_predom_frac = sd(predominant_frac, na.rm = TRUE),
    n = n(),
    se = sd_predom_frac / sqrt(n),
    .groups = "drop"
  )

ggplot(predom_by_tp, aes(x = timepoint_num, y = mean_predom_frac, color = species)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = mean_predom_frac - se, ymax = mean_predom_frac + se), width = 0.1) +
  scale_color_manual(values = species_colors) +
  scale_x_continuous(breaks = 1:4, labels = paste0("TP", 1:4)) +
  labs(
    x = "Timepoint",
    y = "Mean Predominant Exon Fraction",
    title = "Predominant Exon Expression Fraction Across Timepoints",
    color = "Species"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

ggsave(file.path(output_dir, "predominant_fraction_by_timepoint.png"), width = 14, height = 8, dpi = 300)
```

# Identify Genes with Predominant Exon Shifts

## Calculate Shifts in Predominant Exon

```{r identify-shifts}
#' Identify genes where the predominant exon changes across samples
#' Uses statistical testing to determine robust shifts
identify_exon_shifts <- function(predom_data) {
  
  # For each gene, track predominant exon across all samples
  gene_exon_patterns <- predom_data %>%
    group_by(species, gene_id) %>%
    summarise(
      n_samples = n(),
      unique_predom_exons = n_distinct(predominant_e_id),
      modal_predom_exon = names(which.max(table(predominant_e_id))),
      predom_exon_list = paste(sort(unique(predominant_e_id)), collapse = ","),
      .groups = "drop"
    ) %>%
    # Filter to genes with shifts (more than one unique predominant exon)
    filter(unique_predom_exons > 1)
  
  return(gene_exon_patterns)
}

# Identify shifts for all species
exon_shifts <- identify_exon_shifts(predominant_exons)

cat("=== Genes with Predominant Exon Shifts ===\n")
exon_shifts %>%
  group_by(species) %>%
  summarise(
    n_genes_with_shifts = n(),
    mean_unique_exons = mean(unique_predom_exons)
  ) %>%
  print()
```

## Detailed Shift Analysis by Timepoint

```{r shift-by-timepoint}
#' Analyze whether exon shifts are associated with specific timepoints
analyze_shifts_by_timepoint <- function(predom_data, shift_genes) {
  
  # Get detailed patterns for genes with shifts
  shift_patterns <- predom_data %>%
    filter(gene_id %in% shift_genes$gene_id) %>%
    group_by(species, gene_id) %>%
    arrange(species, gene_id, timepoint_num) %>%
    mutate(
      prev_predom_exon = lag(predominant_e_id),
      shift_occurred = predominant_e_id != prev_predom_exon & !is.na(prev_predom_exon)
    ) %>%
    ungroup()
  
  # Summarize shifts by timepoint transition
  shift_summary <- shift_patterns %>%
    filter(shift_occurred) %>%
    mutate(
      transition = paste0("TP", timepoint_num - 1, "→TP", timepoint_num)
    ) %>%
    group_by(species, transition) %>%
    summarise(
      n_shifts = n(),
      n_genes = n_distinct(gene_id),
      .groups = "drop"
    )
  
  return(list(
    patterns = shift_patterns,
    summary = shift_summary
  ))
}

# Analyze shifts
shift_analysis <- analyze_shifts_by_timepoint(predominant_exons, exon_shifts)
shift_patterns <- shift_analysis$patterns
shift_summary <- shift_analysis$summary

cat("\n=== Exon Shifts by Timepoint Transition ===\n")
print(shift_summary)
```

## Visualize Shift Patterns

```{r viz-shift-patterns, fig.width=14, fig.height=8}
ggplot(shift_summary, aes(x = transition, y = n_genes, fill = species)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = species_colors) +
  labs(
    x = "Timepoint Transition",
    y = "Number of Genes with Exon Shift",
    title = "Predominant Exon Shifts by Timepoint Transition",
    fill = "Species"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

ggsave(file.path(output_dir, "exon_shifts_by_transition.png"), width = 14, height = 8, dpi = 300)
```

# Statistical Analysis of Exon Shifts

## Chi-Square Test for Timepoint Association

```{r chi-square-test}
#' Test whether exon shifts are associated with specific timepoints
#' using chi-square test
test_timepoint_association <- function(shift_patterns_data, species_name) {
  
  sp_data <- shift_patterns_data %>%
    filter(species == species_name)
  
  if (nrow(sp_data) == 0 || sum(sp_data$shift_occurred, na.rm = TRUE) == 0) {
    return(NULL)
  }
  
  # Count shifts vs no shifts by timepoint
  contingency <- sp_data %>%
    filter(!is.na(shift_occurred)) %>%
    group_by(timepoint, shift_occurred) %>%
    summarise(n = n(), .groups = "drop") %>%
    pivot_wider(names_from = shift_occurred, values_from = n, values_fill = 0)
  
  if (nrow(contingency) < 2) {
    return(NULL)
  }
  
  # Chi-square test
  mat <- as.matrix(contingency[, -1])
  rownames(mat) <- contingency$timepoint
  
  test_result <- tryCatch(
    chisq.test(mat),
    error = function(e) NULL
  )
  
  return(list(
    contingency = contingency,
    test = test_result
  ))
}

# Run tests for each species
cat("=== Chi-Square Tests: Exon Shifts ~ Timepoint ===\n\n")

for (sp in unique(shift_patterns$species)) {
  cat(sp, ":\n")
  result <- test_timepoint_association(shift_patterns, sp)
  if (!is.null(result) && !is.null(result$test)) {
    print(result$contingency)
    cat("  χ² =", round(result$test$statistic, 3), 
        ", df =", result$test$parameter,
        ", p =", format.pval(result$test$p.value, 3), "\n\n")
  } else {
    cat("  Insufficient data for test\n\n")
  }
}
```

## Robust Shift Detection Using Permutation

```{r permutation-test}
#' Identify statistically robust exon shifts using permutation testing
#' For each gene, test whether observed shifts exceed random expectation
robust_shift_detection <- function(predom_data, n_perm = 1000) {
  
  results <- predom_data %>%
    group_by(species, gene_id) %>%
    filter(n() >= 4) %>%  # Require data from multiple samples
    summarise(
      n_samples = n(),
      n_unique_exons = n_distinct(predominant_e_id),
      n_shifts = sum(predominant_e_id != lag(predominant_e_id), na.rm = TRUE),
      modal_exon = names(which.max(table(predominant_e_id))),
      modal_fraction = max(table(predominant_e_id)) / n(),
      .groups = "drop"
    )
  
  # Classification of shift robustness
  # Based on: how many different exons become predominant, and how dominant is the most common one
  results <- results %>%
    mutate(
      shift_type = case_when(
        n_unique_exons == 1 ~ "No shift",
        # Robust shift: multiple exons take turns, no single exon dominates
        n_unique_exons >= 2 & modal_fraction < 0.6 ~ "Robust shift",
        # Minor shift: multiple exons observed but one clearly dominates
        n_unique_exons >= 2 & modal_fraction >= 0.6 & modal_fraction < 0.9 ~ "Minor shift",
        # Rare shift: one exon dominates almost always (>90%)
        n_unique_exons >= 2 & modal_fraction >= 0.9 ~ "Rare shift",
        TRUE ~ "Minor shift"
      )
    )
  
  return(results)
}

# Detect robust shifts
robust_shifts <- robust_shift_detection(predominant_exons)

cat("=== Shift Classification Summary ===\n")
robust_shifts %>%
  group_by(species, shift_type) %>%
  summarise(n_genes = n(), .groups = "drop") %>%
  pivot_wider(names_from = shift_type, values_from = n_genes, values_fill = 0) %>%
  print()
```

## Visualize Shift Classification

```{r viz-shift-classification, fig.width=14, fig.height=6}
shift_type_summary <- robust_shifts %>%
  group_by(species, shift_type) %>%
  summarise(n_genes = n(), .groups = "drop") %>%
  group_by(species) %>%
  mutate(pct = n_genes / sum(n_genes) * 100)

ggplot(shift_type_summary, aes(x = species, y = pct, fill = shift_type)) +
  geom_col(position = "stack") +
  scale_fill_viridis_d(option = "magma", direction = -1) +
  labs(
    x = "Species",
    y = "Percentage of Genes",
    title = "Classification of Predominant Exon Shifts",
    fill = "Shift Type"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

ggsave(file.path(output_dir, "shift_classification.png"), width = 14, height = 6, dpi = 300)
```

# Timepoint-Specific Analysis

## Identify Genes with Timepoint-Associated Shifts

```{r timepoint-specific-shifts}
#' Identify genes where exon shift is specifically associated with timepoint
#' Tests if predominant exon changes consistently at a specific timepoint
timepoint_specific_shifts <- function(predom_data) {
  
  # For each gene, determine if there's a consistent shift at a specific timepoint
  tp_shifts <- predom_data %>%
    group_by(species, gene_id, timepoint) %>%
    summarise(
      n_obs = n(),
      predom_exons = list(unique(predominant_e_id)),
      n_unique = n_distinct(predominant_e_id),
      .groups = "drop"
    )
  
  # Compare across timepoints for each gene
  gene_tp_patterns <- tp_shifts %>%
    group_by(species, gene_id) %>%
    summarise(
      timepoints_observed = n(),
      consistent_exon_per_tp = all(n_unique == 1),
      different_exons_across_tp = n_distinct(unlist(predom_exons)) > 1,
      .groups = "drop"
    ) %>%
    # Flag genes with consistent within-TP but different across-TP
    filter(consistent_exon_per_tp & different_exons_across_tp)
  
  return(gene_tp_patterns)
}

#' Identify genes with Timepoint-Coordinated Exon Switching
#' New category: within each timepoint there is ONE clear predominant exon (consensus),
#' but this predominant exon DIFFERS between at least two timepoints
#' This indicates coordinated regulation across colonies at specific times
identify_tp_coordinated_shifts <- function(predom_data, consensus_threshold = 0.7) {
  
  # Step 1: For each gene at each timepoint, find the consensus predominant exon
  # (the exon that is predominant in most colonies at that timepoint)
  tp_consensus <- predom_data %>%
    group_by(species, gene_id, timepoint, timepoint_num) %>%
    summarise(
      n_colonies = n(),
      # Count how often each exon is predominant
      exon_counts = list(table(predominant_e_id)),
      # Get the most common predominant exon (consensus)
      consensus_exon = names(which.max(table(predominant_e_id))),
      # What fraction of colonies agree on this consensus?
      consensus_fraction = max(table(predominant_e_id)) / n(),
      .groups = "drop"
    )
  
  # Step 2: Identify genes where:
  # a) Each timepoint has high consensus (most colonies agree)
  # b) The consensus exon differs between at least two timepoints
  tp_coordinated <- tp_consensus %>%
    group_by(species, gene_id) %>%
    summarise(
      n_timepoints = n(),
      # Check if all timepoints have good consensus
      min_consensus = min(consensus_fraction),
      mean_consensus = mean(consensus_fraction),
      all_high_consensus = all(consensus_fraction >= consensus_threshold),
      # How many different consensus exons across timepoints?
      n_distinct_consensus = n_distinct(consensus_exon),
      # List the consensus exons by timepoint
      consensus_by_tp = paste(paste0("TP", timepoint_num, ":", consensus_exon), collapse = "; "),
      # Which timepoints have which consensus exon
      tp1_exon = consensus_exon[timepoint_num == 1][1],
      tp2_exon = consensus_exon[timepoint_num == 2][1],
      tp3_exon = consensus_exon[timepoint_num == 3][1],
      tp4_exon = consensus_exon[timepoint_num == 4][1],
      .groups = "drop"
    ) %>%
    # Filter to genes with coordinated timepoint-specific shifts
    filter(
      all_high_consensus,           # High agreement within each timepoint
      n_distinct_consensus >= 2     # Different exons between timepoints
    ) %>%
    mutate(
      shift_category = "Timepoint-Coordinated"
    )
  
  return(list(
    tp_consensus = tp_consensus,
    coordinated_genes = tp_coordinated
  ))
}

# Find timepoint-specific shifts (strict: perfect consistency within TP)
tp_specific <- timepoint_specific_shifts(predominant_exons)

cat("=== Genes with Timepoint-Consistent Exon Shifts (Strict) ===\n")
cat("These genes have a consistent predominant exon within each timepoint,\n")
cat("but different predominant exons across timepoints:\n\n")

tp_specific %>%
  group_by(species) %>%
  summarise(n_genes = n()) %>%
  print()
```

## Timepoint-Coordinated Exon Switching (New Category)

This identifies genes where:
1. **Within each timepoint**: Most colonies (≥70%) agree on which exon is predominant
2. **Across timepoints**: The consensus predominant exon differs between at least two timepoints

This pattern suggests **coordinated regulation** — the same exon switching event happens across most/all colonies at a specific time.

```{r tp-coordinated-shifts}
# Identify timepoint-coordinated shifts with 70% consensus threshold
tp_coord_results <- identify_tp_coordinated_shifts(predominant_exons, consensus_threshold = 0.7)
tp_consensus <- tp_coord_results$tp_consensus
tp_coordinated_genes <- tp_coord_results$coordinated_genes

cat("=== Timepoint-Coordinated Exon Shifts ===\n")
cat("Genes where colonies agree on predominant exon within each timepoint,\n")
cat("but the predominant exon differs between timepoints:\n\n")

tp_coordinated_genes %>%
  group_by(species) %>%
  summarise(
    n_genes = n(),
    mean_consensus = round(mean(mean_consensus), 3)
  ) %>%
  print()

# Show top genes for each species
cat("\n=== Top Timepoint-Coordinated Shift Genes ===\n")
tp_coordinated_genes %>%
  group_by(species) %>%
  slice_head(n = 10) %>%
  select(species, gene_id, n_distinct_consensus, mean_consensus, consensus_by_tp) %>%
  print(n = 30)
```

## Visualize Timepoint-Coordinated Shifts

```{r viz-tp-coordinated, fig.width=16, fig.height=10}
if (nrow(tp_coordinated_genes) > 0) {
  
  # Get consensus data for coordinated genes
  coord_consensus <- tp_consensus %>%
    inner_join(
      tp_coordinated_genes %>% select(species, gene_id),
      by = c("species", "gene_id")
    )
  
  # Sample genes for visualization (up to 25 per species)
  set.seed(42)
  sample_coord_genes <- tp_coordinated_genes %>%
    group_by(species) %>%
    mutate(row_n = row_number()) %>%
    filter(row_n <= 25) %>%
    select(species, gene_id)
  
  coord_sample <- coord_consensus %>%
    inner_join(sample_coord_genes, by = c("species", "gene_id"))
  
  # Create tile plot showing consensus exon at each timepoint
  # Add short exon labels
  coord_sample <- coord_sample %>%
    group_by(species, gene_id) %>%
    mutate(exon_label = paste0("E", as.numeric(factor(consensus_exon)))) %>%
    ungroup()
  
  p_coord_tile <- ggplot(coord_sample, 
                          aes(x = factor(timepoint_num), y = gene_id, fill = exon_label)) +
    geom_tile(color = "white", linewidth = 0.5) +
    geom_text(aes(label = round(consensus_fraction * 100)), size = 2.5, color = "white") +
    facet_wrap(~species, scales = "free_y", ncol = 1) +
    scale_fill_viridis_d(option = "turbo") +
    scale_x_discrete(labels = paste0("TP", 1:4)) +
    labs(
      x = "Timepoint",
      y = "Gene ID",
      title = "Timepoint-Coordinated Exon Switching",
      subtitle = "Numbers show % of colonies agreeing on that exon; colors indicate different exons",
      fill = "Consensus\nExon"
    ) +
    theme_minimal() +
    theme(
      axis.text.y = element_text(size = 6),
      strip.text = element_text(face = "bold", size = 11)
    )
  
  print(p_coord_tile)
  ggsave(file.path(output_dir, "tp_coordinated_shifts_tile.png"), p_coord_tile, 
         width = 14, height = 12, dpi = 300)
  
  # Also show consensus fraction across timepoints
  consensus_summary <- tp_consensus %>%
    group_by(species, timepoint_num) %>%
    summarise(
      mean_consensus = mean(consensus_fraction, na.rm = TRUE),
      median_consensus = median(consensus_fraction, na.rm = TRUE),
      .groups = "drop"
    )
  
  p_consensus <- ggplot(consensus_summary, 
                        aes(x = factor(timepoint_num), y = mean_consensus, 
                            color = species, group = species)) +
    geom_line(linewidth = 1.2) +
    geom_point(size = 3) +
    scale_color_manual(values = species_colors) +
    scale_x_discrete(labels = paste0("TP", 1:4)) +
    scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
    labs(
      x = "Timepoint",
      y = "Mean Colony Consensus",
      title = "Within-Timepoint Consensus on Predominant Exon",
      subtitle = "Higher values = more colonies agree on which exon is predominant",
      color = "Species"
    ) +
    theme_minimal() +
    theme(legend.position = "bottom")
  
  print(p_consensus)
  ggsave(file.path(output_dir, "consensus_by_timepoint.png"), p_consensus, 
         width = 12, height = 8, dpi = 300)
}
```

## Timepoint-Coordinated Genes: Detailed Tables

```{r tp-coord-tables}
if (nrow(tp_coordinated_genes) > 0) {
  
  cat("\n=== Timepoint-Coordinated Shift Genes by Species ===\n\n")
  
  # Create detailed table with exon at each TP
  tp_coord_detail <- tp_coordinated_genes %>%
    select(species, gene_id, n_timepoints, n_distinct_consensus, 
           mean_consensus, tp1_exon, tp2_exon, tp3_exon, tp4_exon) %>%
    arrange(species, desc(n_distinct_consensus), desc(mean_consensus))
  
  DT::datatable(
    tp_coord_detail,
    caption = "Genes with Timepoint-Coordinated Exon Switching",
    options = list(pageLength = 25, scrollX = TRUE),
    filter = "top"
  ) %>%
    DT::formatPercentage("mean_consensus", digits = 1)
}
```

## Detailed Timepoint Pattern Visualization

```{r viz-tp-patterns, fig.width=16, fig.height=12}
# Get detailed patterns for timepoint-specific shift genes
if (nrow(tp_specific) > 0) {
  tp_shift_detail <- predominant_exons %>%
    inner_join(tp_specific %>% select(species, gene_id), by = c("species", "gene_id"))
  
  # Sample genes for visualization
  set.seed(42)
  sample_genes_per_species <- tp_shift_detail %>%
    distinct(species, gene_id) %>%
    group_by(species) %>%
    mutate(row_n = row_number()) %>%
    filter(row_n <= 30) %>%
    select(-row_n) %>%
    ungroup()
  
  tp_shift_sample <- tp_shift_detail %>%
    inner_join(sample_genes_per_species, by = c("species", "gene_id"))
  
  if (nrow(tp_shift_sample) > 0) {
    # Aggregate to get modal predominant exon per gene per timepoint
    # (handles multiple colonies per timepoint)
    exon_by_tp <- tp_shift_sample %>%
      group_by(species, gene_id, timepoint, timepoint_num) %>%
      summarise(
        # Get most common predominant exon for this gene at this timepoint
        predominant_e_id = names(which.max(table(predominant_e_id))),
        n_colonies = n(),
        .groups = "drop"
      ) %>%
      # Create short exon labels within each gene
      group_by(species, gene_id) %>%
      mutate(exon_label = paste0("E", as.numeric(factor(predominant_e_id)))) %>%
      ungroup()
    
    # Create tile plot showing exon switching patterns
    ggplot(exon_by_tp, aes(x = timepoint, y = gene_id, fill = exon_label)) +
      geom_tile(color = "white", linewidth = 0.5) +
      facet_wrap(~species, scales = "free_y", ncol = 1) +
      scale_fill_viridis_d(option = "turbo") +
      labs(
        x = "Timepoint",
        y = "Gene ID",
        title = "Exon Switching Patterns Across Timepoints",
        subtitle = "Each row shows which exon is predominant at each timepoint",
        fill = "Predominant\nExon"
      ) +
      theme_minimal() +
      theme(
        axis.text.y = element_text(size = 5),
        legend.position = "right",
        strip.text = element_text(face = "bold", size = 11)
      )
    
    ggsave(file.path(output_dir, "exon_switch_tile.png"), width = 14, height = 12, dpi = 300)
    
    # Also create a line plot showing exon transitions
    exon_transitions <- exon_by_tp %>%
      arrange(species, gene_id, timepoint_num) %>%
      group_by(species, gene_id) %>%
      mutate(
        exon_num = as.numeric(factor(predominant_e_id)),
        shift = exon_label != lag(exon_label)
      ) %>%
      ungroup()
    
    ggplot(exon_transitions, aes(x = timepoint_num, y = exon_num, 
                                  group = gene_id, color = gene_id)) +
      geom_line(alpha = 0.6, linewidth = 0.8) +
      geom_point(size = 2) +
      facet_wrap(~species, scales = "free_y", ncol = 1) +
      scale_x_continuous(breaks = 1:4, labels = paste0("TP", 1:4)) +
      scale_color_viridis_d(option = "turbo") +
      labs(
        x = "Timepoint",
        y = "Predominant Exon (numbered)",
        title = "Exon Expression Shifts Over Time",
        subtitle = "Lines connect predominant exon identity across timepoints"
      ) +
      theme_minimal() +
      theme(legend.position = "none")
    
    ggsave(file.path(output_dir, "exon_switch_lines.png"), width = 14, height = 10, dpi = 300)
  }
}
```

# Generate Species-Specific Results Tables

## Acropora pulchra

```{r table-apul}
apul_shift_genes <- robust_shifts %>%
  filter(species == "Acropora pulchra", shift_type %in% c("Robust shift", "Minor shift")) %>%
  arrange(desc(n_shifts), desc(n_unique_exons))

# Add timepoint pattern information
apul_detailed <- predominant_exons %>%
  filter(species == "Acropora pulchra", gene_id %in% apul_shift_genes$gene_id) %>%
  group_by(gene_id, timepoint) %>%
  summarise(
    predom_exon = paste(unique(predominant_e_id), collapse = "/"),
    mean_frac = round(mean(predominant_frac, na.rm = TRUE), 3),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = timepoint,
    values_from = c(predom_exon, mean_frac),
    names_glue = "{timepoint}_{.value}"
  )

apul_results <- apul_shift_genes %>%
  left_join(apul_detailed, by = "gene_id") %>%
  select(gene_id, n_samples, n_unique_exons, n_shifts, shift_type, starts_with("TP"))

cat("\n=== Acropora pulchra: Genes with Predominant Exon Shifts ===\n")
cat("Total genes with shifts:", nrow(apul_results), "\n\n")

# Display interactive table
DT::datatable(
  apul_results,
  caption = "Acropora pulchra: Genes with Exon Expression Shifts",
  options = list(pageLength = 20, scrollX = TRUE)
)
```

## Porites evermanni

```{r table-peve}
peve_shift_genes <- robust_shifts %>%
  filter(species == "Porites evermanni", shift_type %in% c("Robust shift", "Minor shift")) %>%
  arrange(desc(n_shifts), desc(n_unique_exons))

# Add timepoint pattern information
peve_detailed <- predominant_exons %>%
  filter(species == "Porites evermanni", gene_id %in% peve_shift_genes$gene_id) %>%
  group_by(gene_id, timepoint) %>%
  summarise(
    predom_exon = paste(unique(predominant_e_id), collapse = "/"),
    mean_frac = round(mean(predominant_frac, na.rm = TRUE), 3),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = timepoint,
    values_from = c(predom_exon, mean_frac),
    names_glue = "{timepoint}_{.value}"
  )

peve_results <- peve_shift_genes %>%
  left_join(peve_detailed, by = "gene_id") %>%
  select(gene_id, n_samples, n_unique_exons, n_shifts, shift_type, starts_with("TP"))

cat("\n=== Porites evermanni: Genes with Predominant Exon Shifts ===\n")
cat("Total genes with shifts:", nrow(peve_results), "\n\n")

# Display interactive table
DT::datatable(
  peve_results,
  caption = "Porites evermanni: Genes with Exon Expression Shifts",
  options = list(pageLength = 20, scrollX = TRUE)
)
```

## Pocillopora tuahiniensis

```{r table-ptua}
ptua_shift_genes <- robust_shifts %>%
  filter(species == "Pocillopora tuahiniensis", shift_type %in% c("Robust shift", "Minor shift")) %>%
  arrange(desc(n_shifts), desc(n_unique_exons))

# Add timepoint pattern information
ptua_detailed <- predominant_exons %>%
  filter(species == "Pocillopora tuahiniensis", gene_id %in% ptua_shift_genes$gene_id) %>%
  group_by(gene_id, timepoint) %>%
  summarise(
    predom_exon = paste(unique(predominant_e_id), collapse = "/"),
    mean_frac = round(mean(predominant_frac, na.rm = TRUE), 3),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = timepoint,
    values_from = c(predom_exon, mean_frac),
    names_glue = "{timepoint}_{.value}"
  )

ptua_results <- ptua_shift_genes %>%
  left_join(ptua_detailed, by = "gene_id") %>%
  select(gene_id, n_samples, n_unique_exons, n_shifts, shift_type, starts_with("TP"))

cat("\n=== Pocillopora tuahiniensis: Genes with Predominant Exon Shifts ===\n")
cat("Total genes with shifts:", nrow(ptua_results), "\n\n")

# Display interactive table
DT::datatable(
  ptua_results,
  caption = "Pocillopora tuahiniensis: Genes with Exon Expression Shifts",
  options = list(pageLength = 20, scrollX = TRUE)
)
```

# Visualize Top Shift Genes

## Heatmap of Exon Expression for Shift Genes

```{r viz-heatmap-shifts, fig.width=16, fig.height=12}
# Function to create heatmap for top shift genes
plot_shift_heatmap <- function(exon_data, shift_genes, species_name, n_genes = 30) {
  
  top_genes <- shift_genes %>%
    filter(species == species_name) %>%
    slice_head(n = n_genes)
  
  if (nrow(top_genes) == 0) return(NULL)
  
  heatmap_data <- exon_data %>%
    filter(species == species_name, gene_id %in% top_genes$gene_id) %>%
    # Normalize expression within each gene
    group_by(gene_id) %>%
    mutate(
      expr_norm = (expression - min(expression)) / (max(expression) - min(expression) + 1e-10)
    ) %>%
    ungroup()
  
  # Calculate mean expression by gene, exon, and timepoint
  heatmap_summary <- heatmap_data %>%
    group_by(gene_id, e_id, timepoint_num) %>%
    summarise(
      mean_expr_norm = mean(expr_norm, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    # Create exon label
    group_by(gene_id) %>%
    mutate(exon_num = dense_rank(e_id)) %>%
    ungroup() %>%
    mutate(exon_label = paste0(gene_id, "_E", exon_num))
  
  ggplot(heatmap_summary, aes(x = factor(timepoint_num), y = exon_label, fill = mean_expr_norm)) +
    geom_tile() +
    scale_fill_viridis_c(option = "magma") +
    labs(
      x = "Timepoint",
      y = "Gene_Exon",
      title = paste(species_name, "- Exon Expression Patterns"),
      subtitle = "Top genes with predominant exon shifts",
      fill = "Normalized\nExpression"
    ) +
    scale_x_discrete(labels = paste0("TP", 1:4)) +
    theme_minimal() +
    theme(
      axis.text.y = element_text(size = 6),
      legend.position = "right"
    )
}

# Create heatmaps for each species
p_apul <- plot_shift_heatmap(all_exon_ranked, robust_shifts, "Acropora pulchra")
if (!is.null(p_apul)) {
  print(p_apul)
  ggsave(file.path(output_dir, "heatmap_shifts_apul.png"), p_apul, width = 12, height = 14, dpi = 300)
}
```

```{r viz-heatmap-peve, fig.width=16, fig.height=12}
p_peve <- plot_shift_heatmap(all_exon_ranked, robust_shifts, "Porites evermanni")
if (!is.null(p_peve)) {
  print(p_peve)
  ggsave(file.path(output_dir, "heatmap_shifts_peve.png"), p_peve, width = 12, height = 14, dpi = 300)
}
```

```{r viz-heatmap-ptua, fig.width=16, fig.height=12}
p_ptua <- plot_shift_heatmap(all_exon_ranked, robust_shifts, "Pocillopora tuahiniensis")
if (!is.null(p_ptua)) {
  print(p_ptua)
  ggsave(file.path(output_dir, "heatmap_shifts_ptua.png"), p_ptua, width = 12, height = 14, dpi = 300)
}
```

## Individual Gene Exon Expression Profiles

```{r viz-individual-genes, fig.width=16, fig.height=10}
# Function to plot exon expression profiles for a single gene
plot_gene_exon_profile <- function(exon_data, gene_id_select, species_name) {
  
  gene_data <- exon_data %>%
    filter(species == species_name, gene_id == gene_id_select)
  
  if (nrow(gene_data) == 0) return(NULL)
  
  # Create exon labels
  gene_data <- gene_data %>%
    group_by(gene_id) %>%
    mutate(exon_num = dense_rank(e_id)) %>%
    ungroup() %>%
    mutate(exon_label = paste0("Exon ", exon_num))
  
  ggplot(gene_data, aes(x = exon_label, y = expression, fill = timepoint)) +
    geom_boxplot(alpha = 0.7) +
    facet_wrap(~colony, scales = "free_y") +
    scale_fill_viridis_d(option = "plasma") +
    labs(
      x = "Exon",
      y = "Expression (Coverage)",
      title = paste(species_name, "-", gene_id_select),
      subtitle = "Exon expression across colonies and timepoints",
      fill = "Timepoint"
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
      legend.position = "bottom"
    )
}

# Plot examples for each species (top shift gene)
for (sp in unique(robust_shifts$species)) {
  top_gene <- robust_shifts %>%
    filter(species == sp, shift_type %in% c("Robust shift", "Minor shift")) %>%
    slice_head(n = 1) %>%
    pull(gene_id)
  
  if (length(top_gene) > 0) {
    p <- plot_gene_exon_profile(all_exon_ranked, top_gene[1], sp)
    if (!is.null(p)) {
      print(p)
      sp_abbrev <- gsub(" ", "_", sp)
      ggsave(file.path(output_dir, paste0("gene_profile_", sp_abbrev, "_", top_gene[1], ".png")),
             p, width = 16, height = 10, dpi = 300)
    }
  }
}
```

# Summary Visualization

## Combined Species Summary

```{r viz-summary, fig.width=16, fig.height=10}
# Create comprehensive summary plot
summary_data <- robust_shifts %>%
  group_by(species, shift_type) %>%
  summarise(n_genes = n(), .groups = "drop")

p1 <- ggplot(summary_data, aes(x = species, y = n_genes, fill = shift_type)) +
  geom_col(position = "stack") +
  scale_fill_viridis_d(option = "magma", direction = -1) +
  labs(
    x = NULL,
    y = "Number of Genes",
    title = "Gene Classification by Exon Shift Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Shift counts by timepoint
p2 <- ggplot(shift_summary, aes(x = transition, y = n_genes, fill = species)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = species_colors) +
  labs(
    x = "Transition",
    y = "Genes with Shift",
    title = "Shifts by Timepoint Transition"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Mean predominant fraction
p3 <- ggplot(predom_by_tp, aes(x = factor(timepoint_num), y = mean_predom_frac, 
                                color = species, group = species)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  scale_color_manual(values = species_colors) +
  scale_x_discrete(labels = paste0("TP", 1:4)) +
  labs(
    x = "Timepoint",
    y = "Mean Predominant Exon Fraction",
    title = "Predominant Exon Concentration Over Time"
  ) +
  theme_minimal()

# Number of unique predominant exons distribution
p4 <- robust_shifts %>%
  filter(n_unique_exons > 1) %>%
  ggplot(aes(x = factor(n_unique_exons), fill = species)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = species_colors) +
  labs(
    x = "Number of Different Predominant Exons",
    y = "Gene Count",
    title = "Predominant Exon Diversity"
  ) +
  theme_minimal()

# Combine plots
(p1 + p2) / (p3 + p4) +
  plot_annotation(
    title = "Predominant Exon Shift Analysis Summary",
    subtitle = "Cross-species comparison of exon expression dynamics"
  )

ggsave(file.path(output_dir, "summary_combined.png"), width = 16, height = 12, dpi = 300)
```

# Export Results

```{r export-results}
# Export predominant exon data
write_csv(predominant_exons, file.path(output_dir, "predominant_exons_all.csv"))

# Export shift classification
write_csv(robust_shifts, file.path(output_dir, "exon_shift_classification.csv"))

# Export species-specific results
write_csv(apul_results, file.path(output_dir, "apul_exon_shift_genes.csv"))
write_csv(peve_results, file.path(output_dir, "peve_exon_shift_genes.csv"))
write_csv(ptua_results, file.path(output_dir, "ptua_exon_shift_genes.csv"))

# Export shift summary
write_csv(shift_summary, file.path(output_dir, "exon_shift_by_transition.csv"))

# Export timepoint-specific shift genes (strict)
if (nrow(tp_specific) > 0) {
  write_csv(tp_specific, file.path(output_dir, "timepoint_specific_shift_genes_strict.csv"))
}

# Export timepoint-coordinated shift genes (new category)
if (exists("tp_coordinated_genes") && nrow(tp_coordinated_genes) > 0) {
  write_csv(tp_coordinated_genes, file.path(output_dir, "timepoint_coordinated_shift_genes.csv"))
  write_csv(tp_consensus, file.path(output_dir, "timepoint_consensus_all_genes.csv"))
}

cat("\n=== Files Exported ===\n")
cat("Output directory:", output_dir, "\n")
list.files(output_dir, pattern = "\\.csv$") %>% cat(sep = "\n")
```

# Session Info

```{r session-info}
sessionInfo()
```