---
title: "34-biomin-pathway-compare"
output: html_document
date: "2025-11-29"
---

This notebook joins the biomineralization orthogroup count matrices that were produced for each species in script 33. The goal is to compare expression support for each biomineralization orthogroup across *Acropora pulchra* (apul), *Porites evermanni* (peve), and *Pocillopora tuahiniensis* (ptua).

## Setup

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

library(tidyverse)
library(glue)
```

```{r paths}
count_dir <- "../output/33-biomin-pathway-counts"
output_dir <- "../output/34-biomin-pathway-compare"
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

species_lookup <- c(
  apul = "Acropora pulchra",
  peve = "Porites evermanni",
  ptua = "Pocillopora tuahiniensis"
)
```

## Available count matrices

```{r input-files}
count_files <- tibble(
  file = list.files(count_dir, pattern = "_biomin_counts\\.csv$", full.names = TRUE)
) %>%
  mutate(
    filename = basename(file),
    species_code = str_extract(filename, "^[a-z]+"),
    species = species_lookup[species_code]
  )

count_files
```

## Join counts by orthogroup

```{r load-and-join}
counts_long <- count_files %>%
  mutate(data = map(file, ~ read_csv(.x, show_col_types = FALSE))) %>%
  mutate(data = map2(data, species_code, ~ .x %>%
                       pivot_longer(
                         cols = -c(group_id, gene_id),
                         names_to = "sample_id",
                         values_to = "count"
                       ) %>%
                       mutate(species_code = .y))) %>%
  select(data) %>%
  unnest(data) %>%
  mutate(
    species = species_lookup[species_code],
    colony_id = str_remove(sample_id, "-TP\\d+$"),
    timepoint = parse_number(str_extract(sample_id, "TP\\d+$")),
    timepoint_label = factor(glue("TP{timepoint}"), levels = glue("TP{1:4}")),
    gene_label = glue("{group_id} | {gene_id}"),
    log10_count = log10(count + 1)
  )

counts_long
```

The long table above contains every sample-level count paired with its orthogroup and species label, enabling direct comparisons across species.

## Species-level summaries

```{r species-summary}
species_summary <- counts_long %>%
  group_by(group_id, species_code, species) %>%
  summarise(
    mean_count = mean(count),
    median_count = median(count),
    sd_count = sd(count),
    n_samples = n(),
    .groups = "drop"
  )

species_wide <- species_summary %>%
  select(group_id, species_code, mean_count) %>%
  pivot_wider(
    names_from = species_code,
    values_from = mean_count,
    names_glue = "{species_code}_mean"
  )

spread_summary <- species_wide %>%
  rowwise() %>%
  mutate(
    mean_spread = max(c_across(ends_with("_mean")), na.rm = TRUE) -
      min(c_across(ends_with("_mean")), na.rm = TRUE)
  ) %>%
  ungroup() %>%
  left_join(
    species_summary %>%
      group_by(group_id) %>%
      slice_max(order_by = mean_count, n = 1, with_ties = FALSE) %>%
      select(group_id, top_species = species, top_mean = mean_count),
    by = "group_id"
  ) %>%
  arrange(desc(mean_spread))

species_summary
```

```{r write-output}
write_csv(species_summary, file.path(output_dir, "biomin_counts_species_summary_long.csv"))
write_csv(species_wide, file.path(output_dir, "biomin_counts_species_summary_wide.csv"))
write_csv(spread_summary, file.path(output_dir, "biomin_counts_species_max_spread.csv"))
```

## Timepoint dynamics

```{r timepoint-summary}
timepoint_summary <- counts_long %>%
  filter(!is.na(timepoint)) %>%
  group_by(group_id, species, timepoint, timepoint_label) %>%
  summarise(
    mean_count = mean(count),
    median_count = median(count),
    sd_count = sd(count),
    n_samples = n(),
    se_count = sd_count / sqrt(n_samples),
    .groups = "drop"
  )

timepoint_summary
```

```{r timepoint-plot, fig.width=11, fig.height=7}
timepoint_plot_groups <- spread_summary %>%
  semi_join(
    timepoint_summary %>%
      distinct(group_id, species) %>%
      count(group_id) %>%
      filter(n == length(species_lookup)),
    by = "group_id"
  ) %>%
  slice_head(n = 12) %>%
  pull(group_id)

timepoint_summary %>%
  filter(group_id %in% timepoint_plot_groups) %>%
  mutate(group_id = fct_reorder(group_id, mean_count, .fun = max, .desc = TRUE)) %>%
  ggplot(aes(x = timepoint_label, y = mean_count, color = species, group = species)) +
  geom_errorbar(aes(ymin = pmax(mean_count - se_count, 0),
                    ymax = mean_count + se_count),
                width = 0.15, linewidth = 0.4, alpha = 0.6) +
  geom_line(linewidth = 0.7) +
  geom_point(size = 2) +
  facet_wrap(~ group_id, scales = "free_y") +
  labs(
    x = "Timepoint",
    y = "Mean count",
    title = "Timepoint trends for orthogroups with largest inter-species spread",
    subtitle = "Lines trace species-level mean counts across sequential sampling points"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")
```

```{r timepoint-distribution, fig.width=10, fig.height=6}
counts_long %>%
  filter(!is.na(timepoint_label)) %>%
  ggplot(aes(x = timepoint_label, y = log10_count, fill = species)) +
  geom_boxplot(alpha = 0.55, outlier_shape = NA) +
  labs(
    x = "Timepoint",
    y = "log10(count + 1)",
    title = "Sample-level distribution of biomineralization counts by timepoint"
  ) +
  theme_minimal(base_size = 12)
```

## Largest inter-species differences

```{r top-differences}
spread_summary %>%
  select(group_id, mean_spread, top_species, top_mean) %>%
  slice_head(n = 15)
```

```{r diff-plot, fig.width=10, fig.height=7}
top_groups <- spread_summary %>%
  slice_head(n = 20) %>%
  pull(group_id)

plot_data <- species_summary %>%
  filter(group_id %in% top_groups)

plot_data %>%
  mutate(group_id = fct_reorder(group_id, mean_count, .fun = max, .desc = TRUE)) %>%
  ggplot(aes(x = species, y = group_id, fill = mean_count)) +
  geom_tile(color = "grey30") +
  scale_fill_viridis_c(trans = "log10", option = "plasma", name = "mean counts") +
  labs(
    x = "Species",
    y = "Orthogroup",
    title = "Orthogroups with largest mean-count spread across species"
  ) +
  theme_minimal(base_size = 12)
```

## Sample-level distributions

```{r density}
counts_long %>%
  ggplot(aes(x = log10_count, fill = species)) +
  geom_density(alpha = 0.4) +
  labs(
    x = "log10(count + 1)",
    y = "Density",
    title = "Distribution of biomineralization counts across species"
  ) +
  theme_minimal(base_size = 12)
```

## Gene-level overview across species and timepoints

```{r gene-heatmap, fig.width=12, fig.height=16}
gene_timepoint_means <- counts_long %>%
  filter(!is.na(timepoint_label)) %>%
  group_by(group_id, species, species_code, timepoint_label) %>%
  summarise(
    mean_log10 = mean(log10_count),
    .groups = "drop"
  ) %>%
  group_by(group_id, species) %>%
  filter(n_distinct(timepoint_label) == 4) %>%
  ungroup() %>%
  group_by(group_id) %>%
  filter(n_distinct(species) == length(species_lookup)) %>%
  ungroup() %>%
  mutate(
    gene_axis = fct_reorder(group_id, parse_number(group_id))
  )

gene_timepoint_means %>%
  ggplot(aes(x = timepoint_label, y = gene_axis, fill = mean_log10)) +
  geom_tile(color = "grey25", linewidth = 0.05) +
  facet_wrap(~ species, scales = "free_y") +
  scale_fill_viridis_c(option = "inferno", name = "mean log10(count)", na.value = "grey90") +
  labs(
    x = "Timepoint",
    y = "group_id",
    title = "Gene-level biomineralization expression",
    subtitle = "Every tile is a gene within an orthogroup; shared group_ids align across species panels"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    strip.text = element_text(face = "bold"),
    axis.text.y = element_text(size = 5)
  )
```

## Clustered species-pattern heatmap

```{r species-cluster-heatmap, fig.width=14, fig.height=10}
species_time_means <- counts_long %>%
  filter(!is.na(timepoint_label)) %>%
  group_by(group_id, species, timepoint_label) %>%
  summarise(mean_log10 = mean(log10_count), .groups = "drop") %>%
  group_by(group_id, species) %>%
  filter(n_distinct(timepoint_label) == 4) %>%
  ungroup() %>%
  group_by(group_id) %>%
  filter(n_distinct(species) == length(species_lookup)) %>%
  ungroup() %>%
  mutate(col_key = glue("{species} | {timepoint_label}"))

col_levels <- species_time_means %>%
  distinct(species, timepoint_label, col_key) %>%
  arrange(species, timepoint_label) %>%
  pull(col_key)

heatmap_wide <- species_time_means %>%
  select(group_id, col_key, mean_log10) %>%
  pivot_wider(names_from = col_key, values_from = mean_log10)

heatmap_matrix <- heatmap_wide %>%
  select(-group_id) %>%
  as.matrix()

row_means <- rowMeans(heatmap_matrix, na.rm = TRUE)
row_means[is.na(row_means)] <- 0
na_idx <- which(is.na(heatmap_matrix), arr.ind = TRUE)
if (length(na_idx) > 0) {
  heatmap_matrix[na_idx] <- row_means[na_idx[, 1]]
}

row_order <- hclust(dist(heatmap_matrix))$order
ordered_groups <- heatmap_wide$group_id[row_order]

species_time_means %>%
  mutate(
    col_key = factor(col_key, levels = col_levels),
    group_id = factor(group_id, levels = ordered_groups)
  ) %>%
  ggplot(aes(x = col_key, y = group_id, fill = mean_log10)) +
  geom_tile(color = "grey15", linewidth = 0.05) +
  scale_fill_viridis_c(option = "magma", name = "mean log10(count)", na.value = "grey90") +
  labs(
    x = "Species | Timepoint",
    y = "group_id (clustered by species pattern)",
    title = "Clustered biomineralization patterns across species and timepoints",
    subtitle = "Rows ordered via hierarchical clustering; columns keep species-timepoint blocks together"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    axis.text.x = element_text(angle = 55, hjust = 1),
    panel.grid = element_blank()
  )
```

## Z-scored temporal-pattern heatmap

```{r zscore-cluster-heatmap, fig.width=14, fig.height=10}
zscore_matrix <- species_time_means %>%
  select(group_id, species, timepoint_label, mean_log10) %>%
  mutate(col_key = glue("{species} | {timepoint_label}")) %>%
  select(group_id, col_key, mean_log10) %>%
  pivot_wider(names_from = col_key, values_from = mean_log10)

z_mat <- zscore_matrix %>%
  select(-group_id) %>%
  as.matrix()

row_means <- rowMeans(z_mat, na.rm = TRUE)
row_sds <- apply(z_mat, 1, sd, na.rm = TRUE)
row_sds[row_sds == 0] = 1
z_mat <- sweep(z_mat, 1, row_means, FUN = "-")
z_mat <- sweep(z_mat, 1, row_sds, FUN = "/")

row_order_z <- hclust(dist(z_mat))$order
ordered_groups_z <- zscore_matrix$group_id[row_order_z]

zscore_long <- as_tibble(z_mat) %>%
  mutate(group_id = zscore_matrix$group_id) %>%
  pivot_longer(-group_id, names_to = "col_key", values_to = "zscore") %>%
  mutate(col_key = factor(col_key, levels = col_levels))

zscore_long %>%
  mutate(group_id = factor(group_id, levels = ordered_groups_z)) %>%
  ggplot(aes(x = col_key, y = group_id, fill = zscore)) +
  geom_tile(color = "grey20", linewidth = 0.05) +
  scale_fill_distiller(palette = "RdBu", direction = -1, name = "z-score") +
  labs(
    x = "Species | Timepoint",
    y = "group_id (clustered on z-scored profiles)",
    title = "Z-scored biomineralization patterns across species and timepoints",
    subtitle = "Each orthogroup row normalized to zero mean, unit variance before clustering"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    axis.text.x = element_text(angle = 55, hjust = 1),
    panel.grid = element_blank()
  )
```

The exported CSVs in `../output/34-biomin-pathway-compare/` capture the joined counts and summary tables so downstream analyses can focus on orthogroups that show the strongest inter-species differences.