---
title: "WGCNA analysis of ortholog expression"
author: "Ariana S Huffmyer, E5 RoL Team"
date: "2025"
output: 
  html_document:
    code_folding: hide
    toc: yes
    toc_depth: 6
    toc_float: yes
editor_options: 
  chunk_output_type: console
--- 

# Set up 

```{r}
library(vegan)
library(mixOmics)
library(ggplot2)
library(tidyverse)
library(DESeq2)
library(factoextra)
library(WGCNA)
library(ComplexHeatmap)
```

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

# The following setting is important, do not omit.
options(stringsAsFactors = FALSE) #Set Strings to character
```

# Load data 

Read in data generated for ortholog expression 
```{r}
orthos<-read_csv("M-multi-species/output/14-pca-orthologs/vst_counts_matrix.csv")
str(orthos)

metadata<-read_csv("master-molecular-metadata.csv")%>%dplyr::select(colony, timepoint, site, species)
str(metadata)
```

# WGCNA 

```{r}
## =========================
## WGCNA on ortholog matrix
## =========================

# Packages
suppressPackageStartupMessages({
  library(tidyverse)
  library(WGCNA)
})

# Speed / reproducibility
options(stringsAsFactors = FALSE)
allowWGCNAThreads()   # uses multi-core where possible

## -------------------------
## 1) Prepare input matrices
## -------------------------

# Expect: orthos has first column 'group_id' then sample columns
stopifnot("group_id" %in% names(orthos))

# Keep a clean copy
orthos_df <- orthos %>% as.data.frame()

# Pull sample columns (everything except group_id)
sample_cols <- setdiff(names(orthos_df), "group_id")
```

```{r}
# --- Build a clean metadata with string cols and an explicit sample_id ---

metadata <- metadata %>%
  as.data.frame() %>%                        # drop tibble/S4 wrappers
  mutate(across(everything(), as.character)) %>% 
  mutate(sample_id = paste(colony, timepoint, sep = "-"))

# Check for duplicates in sample_id (these would break ordering)
dup_ids <- metadata$sample_id[duplicated(metadata$sample_id)]
if (length(dup_ids) > 0) {
  stop("Duplicate sample_id(s) in metadata: ", paste(unique(dup_ids), collapse = ", "))
}

# --- Order metadata to match the expression columns using a left_join ---
# This preserves the order of sample_cols and is robust to Rle types.
metadata <- tibble(sample_id = setdiff(names(orthos), "group_id")) %>%
  left_join(metadata, by = "sample_id")

# Sanity check for missing joins
missing_meta <- metadata %>% filter(is.na(colony) | is.na(timepoint))
if (nrow(missing_meta) > 0) {
  stop(
    "These expression samples are missing from metadata (no join): ",
    paste(missing_meta$sample_id, collapse = ", ")
  )
}
```

```{r}
# Rebuild the expression matrix to match this order
expr_mat <- orthos %>%
  as.data.frame() %>%
  column_to_rownames("group_id") %>%
  .[, metadata$sample_id, drop = FALSE] %>%  # force column order to match metadata
  t() %>%
  as.data.frame()

identical(rownames(expr_mat), metadata$sample_id)

# Sanity checks
nrow(expr_mat) == nrow(metadata)
identical(rownames(expr_mat), metadata$sample_id)
```

```{r}
dim(expr_mat)
# Optional: remove genes with too much missingness / low variance
# WGCNA will also check, but a pre-filter can help speed.
# e.g., keep genes with variance > 0 across samples
gene_var <- apply(expr_mat, 2, var, na.rm = TRUE)
expr_mat <- expr_mat[, gene_var > 0]

dim(expr_mat)
#removed only 4 genes, most have variance >0

# Let WGCNA check for good samples/genes
gsg <- goodSamplesGenes(expr_mat, verbose = 3)
if (!gsg$allOK) {
  if (sum(!gsg$goodGenes) > 0)
    message("Removing genes: ", paste(colnames(expr_mat)[!gsg$goodGenes], collapse = ", "))
  if (sum(!gsg$goodSamples) > 0)
    message("Removing samples: ", paste(rownames(expr_mat)[!gsg$goodSamples], collapse = ", "))
  expr_mat <- expr_mat[gsg$goodSamples, gsg$goodGenes]
  metadata  <- metadata[gsg$goodSamples, , drop = FALSE]
}
```


```{r}

## -------------------------
## 2) Build trait matrix
## -------------------------
# We’ll correlate module eigengenes with timepoint and site (categorical).
# Use one-hot encoding (model.matrix) to get numeric contrasts.
# We'll drop intercept to avoid collinearity.

traits_timepoint <- model.matrix(~ 0 + factor(metadata$timepoint))
colnames(traits_timepoint) <- gsub("^factor\\(metadata\\$timepoint\\)", "", colnames(traits_timepoint))

traits_site <- model.matrix(~ 0 + factor(metadata$site))
colnames(traits_site) <- gsub("^factor\\(metadata\\$site\\)", "", colnames(traits_site))

traits_species <- model.matrix(~ 0 + factor(metadata$species))
colnames(traits_species) <- gsub("^factor\\(metadata\\$species\\)", "", colnames(traits_species))

# Combine into one trait matrix
trait_mat <- cbind(traits_timepoint, traits_site, traits_species)
head(trait_mat)
rownames(trait_mat) <- metadata$sample_id
head(trait_mat)
```

Add physiology into the trait matrix
```{r}
phys<-read_csv("M-multi-species/data/phys_master_timeseries.csv")
head(phys)

phys<-phys%>%
  dplyr::select(sample_id, Host_AFDW.mg.cm2, Sym_AFDW.mg.cm2, Am, AQY, Rd, Ik, Ic, calc.umol.cm2.hr, cells.mgAFDW, prot_mg.mgafdw, cre.umol.mgafdw, Ratio_AFDW.mg.cm2, Total_Chl, Total_Chl_cell)%>%
  mutate(sample_id=str_replace(sample_id, "_", "-"))

str(trait_mat)
str(phys)
```

```{r}
# 1) Make 'phys' row-indexable by sample_id and drop the id column
phys_by_id <- phys %>%
  distinct(sample_id, .keep_all = TRUE) %>%   # just in case there are duplicates
  column_to_rownames("sample_id") %>%
  as.data.frame()

# 2) Reorder 'phys' rows to match trait_mat rownames (keeps NAs for any missing)
phys_aligned <- phys_by_id[rownames(trait_mat), , drop = FALSE]

# 3) Bind to the right of trait_mat, preserving matrix type
#    (columns in phys are numeric per your str(), so this stays numeric)
trait_mat_plus <- cbind(trait_mat, as.matrix(phys_aligned))

# 4) (Optional) Make sure column names are unique if there’s any overlap
colnames(trait_mat_plus) <- make.unique(colnames(trait_mat_plus))

# ---- quick sanity checks (optional) ----
# any trait_mat rows missing in phys?
setdiff(rownames(trait_mat), rownames(phys_by_id))
# any phys rows not in trait_mat?
setdiff(rownames(phys_aligned), rownames(trait_mat))
```


```{r}

## --------------------------------------
## 3) Pick soft-thresholding power (beta)
## --------------------------------------
powers <- c(seq(1, 10), seq(12, 20, by = 2))
sft <- pickSoftThreshold(expr_mat,
                         powerVector = powers,
                         networkType = "signed",
                         verbose = 5)

# Choose the smallest power achieving scale-free topology fit ~0.8 (or your preference).
# If none reach 0.8, pick the elbow / max fit.
fitIdx <- which.max(sft$fitIndices[, "SFT.R.sq"] >= 0.8)
if (length(fitIdx) == 0) {
  # fallback: best overall fit
  fitIdx <- which.max(sft$fitIndices[, "SFT.R.sq"])
}
softPower <- sft$fitIndices[fitIdx, "Power"]
message("Chosen soft-threshold power: ", softPower)

## --------------------------------------
## Plot scale-free topology and connectivity
## --------------------------------------
par(mfrow = c(1, 2))
cex1 <- 0.9

# 1. Scale-free topology fit index as a function of power
plot(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit, signed R^2",
     type = "n", main = paste("Scale independence"))
text(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "blue")
abline(h = 0.8, col = "red", lty = 2)
abline(v = softPower, col = "red", lty = 3)

# 2. Mean connectivity as a function of power
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity",
     type = "n", main = paste("Mean connectivity"))
text(sft$fitIndices[, 1], sft$fitIndices[, 5],
     labels = powers, cex = cex1, col = "blue")
abline(v = softPower, col = "red", lty = 3)

par(mfrow = c(1, 1))

dev.off()
```

Soft power = 16 

```{r}
## -----------------------------------
## 4) Build network and detect modules
## -----------------------------------
# Use blockwiseModules for convenience and speed.
net <- blockwiseModules(
  expr_mat,
  power = softPower,
  networkType = "signed",
  TOMType = "signed",
  corType = "pearson",
  minModuleSize = 30,           # adjust based on your goals
  mergeCutHeight = 0.25,        # ~cor 0.75 between MEs
  numericLabels = TRUE,
  pamRespectsDendro = TRUE,
  saveTOMs = FALSE,
  verbose = 3
)

# Extract module labels and colors
module_labels <- net$colors
module_colors <- labels2colors(net$colors)
MEs           <- net$MEs
# Order eigengenes by hierarchical clustering
MEs           <- orderMEs(MEs)
```

```{r}
## --------------------------------------------
## 5) Correlate module eigengenes with the traits
## --------------------------------------------
# Ensure row order matches
identical(rownames(MEs), rownames(trait_mat_plus))

mod_trait_cor  <- cor(MEs, trait_mat_plus, use = "p")
mod_trait_pval <- corPvalueStudent(mod_trait_cor, nSamples = nrow(expr_mat))

# --- Tidy module–trait correlation table (with BH-adjusted p per trait) ---

# Correlations
mod_trait_cor_df <- as.data.frame(mod_trait_cor) %>%
  tibble::rownames_to_column(var = "module") %>%
  tidyr::pivot_longer(-module, names_to = "trait", values_to = "cor")

# P-values
mod_trait_pval_df <- as.data.frame(mod_trait_pval) %>%
  tibble::rownames_to_column(var = "module") %>%
  tidyr::pivot_longer(-module, names_to = "trait", values_to = "p")

# Join + tidy
mod_trait_tbl <- mod_trait_cor_df %>%
  dplyr::left_join(mod_trait_pval_df, by = c("module", "trait")) %>%
  dplyr::mutate(
    cor = as.numeric(cor),
    p   = as.numeric(p)
  ) %>%
  dplyr::group_by(trait) %>%
  dplyr::mutate(p_adj = p.adjust(p, method = "BH")) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(trait, dplyr::desc(abs(cor)))

print(
  mod_trait_tbl %>%
    dplyr::group_by(trait) %>%
    dplyr::slice_max(order_by = abs(cor), n = 5) %>%
    dplyr::ungroup()
)
```


```{r}
## ----------------------------------------------------
## 6) Map genes (orthogroups) to modules & save outputs
## ----------------------------------------------------
# Gene (orthogroup) IDs
gene_ids <- colnames(expr_mat)

gene_module_df <- tibble(
  group_id = gene_ids,
  module_label = module_labels,
  module_color = labels2colors(module_labels)
)

# Example: save gene lists per module color
write_csv(gene_module_df, "M-multi-species/output/18-ortholog-wgcna/gene_module_assignments.csv")

#summarize number of genes in each module

module_counts <- gene_module_df %>%
  as.data.frame() %>%
  dplyr::count(module_label, name = "n_genes") %>%
  arrange(desc(n_genes))

print(module_counts)
```

```{r}
## --------------------------------------
## 8) Optional: Module membership / kME
## --------------------------------------
# Correlate each gene with its module eigengene (kME) and attach p-values.
kME <- as.data.frame(cor(expr_mat, MEs, use = "p"))
kME_p <- as.data.frame(corPvalueStudent(as.matrix(kME), nSamples = nrow(expr_mat)))

kME_tbl <- kME %>%
  rownames_to_column(var = "group_id") %>%
  as_tibble() %>%
  left_join(gene_module_df, by = "group_id")

# Example: list top hub-like orthogroups within a module (by |kME|)
top_hubs <- kME_tbl %>%
  mutate(across(starts_with("ME"), abs)) %>%
  rowwise() %>%
  mutate(kME_in_own = {
    me_col <- paste0("ME", module_color)
    if (me_col %in% colnames(kME_tbl)) get(me_col) else NA_real_
  }) %>%
  ungroup() %>%
  arrange(module_color, desc(kME_in_own)) %>%
  group_by(module_color) %>%
  slice_head(n = 20) %>%
  ungroup()

print(top_hubs)
```

```{r}
## --------------------------------------
## 9) Heatmap with compact row dendrogram + show text only if p<0.05
## --------------------------------------

# --- Build text and significance mask ---
text_matrix <- matrix("", nrow = nrow(mod_trait_cor), ncol = ncol(mod_trait_cor),
                      dimnames = dimnames(mod_trait_cor))
sig_mask <- mod_trait_pval < 0.05

for (i in seq_len(nrow(mod_trait_cor))) {
  for (j in seq_len(ncol(mod_trait_cor))) {
    if (isTRUE(sig_mask[i, j])) {
      r <- signif(mod_trait_cor[i, j], 2)
      p <- signif(mod_trait_pval[i, j], 1)
      text_matrix[i, j] <- paste0(r, "\n(", p, ")")
    } else {
      text_matrix[i, j] <- ""  # hide non-significant
    }
  }
}

# --- Cluster rows (modules) ---
row_cor <- cor(t(mod_trait_cor), use = "pairwise.complete.obs")
row_d   <- as.dist(1 - row_cor)
row_hc  <- hclust(row_d, method = "average")
rord    <- row_hc$order

mat_re      <- mod_trait_cor[rord, , drop = FALSE]
text_re     <- text_matrix[rord, , drop = FALSE]
sig_mask_re <- sig_mask[rord, , drop = FALSE]
yLabs_re    <- rownames(mod_trait_cor)[rord]
xLabs_re    <- colnames(trait_mat_plus)

# --- Compact layout: narrower dendrogram panel ---
op <- par(no.readonly = TRUE)
layout(matrix(c(1, 2), nrow = 1), widths = c(0.6, 5))

## (1) Row dendrogram
par(mar = c(10, 0, 3, 0))
plot(as.dendrogram(row_hc), horiz = TRUE, axes = FALSE, yaxs = "i", xaxs = "i",
     xlab = "", ylab = "", main = "",
     xlim = c(max(row_hc$height) * 1.05, 0))

## (2) Heatmap with reduced left margin
par(mar = c(10, 5, 3, 3))
WGCNA::labeledHeatmap(
  Matrix         = mat_re,
  xLabels        = xLabs_re,
  yLabels        = yLabs_re,
  ySymbols       = yLabs_re,
  colorLabels    = FALSE,
  colors         = WGCNA::blueWhiteRed(50),
  textMatrix     = text_re,         # already blank where p>0.05
  setStdMargins  = FALSE,
  cex.text       = 0.6,
  zlim           = c(-1, 1),
  main           = "Module–Trait Correlations\n(cor with p-values, p<0.05 shown only)"
)

# Overlay bold text for significant cells
nr <- nrow(mat_re)
nc <- ncol(mat_re)
for (i in seq_len(nr)) {
  for (j in seq_len(nc)) {
    if (isTRUE(sig_mask_re[i, j])) {
      text(x = j, y = nr - i + 1, labels = text_re[i, j],
           cex = 0.6, font = 2)  # bold
    }
  }
}

# reset
par(op); layout(1)
```

```{r}
## --------------------------------------
## 10) Export per-module gene lists to CSV
## --------------------------------------

# Output locations
out_base <- "M-multi-species/output/18-ortholog-wgcna"
out_dir  <- file.path(out_base, "module_gene_lists")
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# Ensure we have gene -> module mapping
if (!exists("gene_module_df")) {
  gene_ids <- colnames(expr_mat)
  gene_module_df <- tibble(
    group_id     = gene_ids,
    module_label = module_labels,
    module_color = labels2colors(module_labels)
  )
}

# Ensure we have kME table (gene-to-ME correlations)
if (!exists("kME_tbl")) {
  kME   <- as.data.frame(cor(expr_mat, MEs, use = "p"))
  kME_tbl <- kME %>%
    rownames_to_column(var = "group_id") %>%
    as_tibble() %>%
    left_join(gene_module_df, by = "group_id")
}

# Ensure a per-gene kME for its own module ("kME_in_own")
if (!"kME_in_own" %in% names(kME_tbl)) {
  kME_tbl <- kME_tbl %>%
    mutate(across(starts_with("ME"), as.numeric)) %>%
    rowwise() %>%
    mutate(kME_in_own = {
      me_col <- paste0("ME", module_color)
      if (me_col %in% colnames(cur_data_all())) cur_data_all()[[me_col]] else NA_real_
    }) %>%
    ungroup()
}

# 1) One CSV per module with its gene list ordered by hub-likeness
module_levels <- sort(unique(gene_module_df$module_color))
for (col in module_levels) {
  df <- kME_tbl %>%
    filter(module_color == col) %>%
    arrange(desc(kME_in_own)) %>%
    dplyr::select(group_id, module_label, module_color, kME_in_own)

  safe_col <- gsub("[^A-Za-z0-9]+", "-", col)
  outfile  <- file.path(out_dir, paste0("genes_in_", safe_col, "_module.csv"))
  write_csv(df, outfile)
}

# 2) A single combined CSV of all genes with assignments and kMEs
combined <- kME_tbl %>%
  arrange(module_color, desc(kME_in_own)) %>%
  dplyr::select(group_id, module_label)%>%
  dplyr::rename(wgcna_module=module_label)

write_csv(combined, file.path(out_base, "wgcna_ortholog_module_assignments.csv"))
```

# Plot eigengenes 

Plot eigengene expression across factors of interest

```{r}
## --------------------------------------
## 11) Plot MEs with significance stars
##    (stars denote p < 0.05 in ME~trait correlation)
## --------------------------------------

suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(broom))

# --- Prepare long-form ME table with metadata & phys -------------------------
me_long <- MEs %>%
  as.data.frame() %>%
  rownames_to_column("sample_id") %>%
  pivot_longer(starts_with("ME"), names_to = "ME", values_to = "eigengene") %>%
  mutate(module_color = sub("^ME", "", ME))

meta_min <- metadata %>%
  dplyr::select(sample_id, species, timepoint, site) %>%
  mutate(
    species   = as.factor(species),
    timepoint = as.factor(timepoint),
    site      = as.factor(site)
  )

phys_df <- phys_aligned %>%
  as.data.frame() %>%
  rownames_to_column("sample_id")

me_annot <- me_long %>%
  left_join(meta_min, by = "sample_id") %>%
  left_join(phys_df, by = "sample_id")

# Identify numeric phys columns available (for scatter panels & sig lookup)
phys_numeric_cols <- names(phys_df)[names(phys_df) != "sample_id"]
phys_numeric_cols <- phys_numeric_cols[sapply(phys_df[phys_numeric_cols], is.numeric)]

# --- Build a tidy p-value table from mod_trait_pval for significance lookups --
pval_tbl <- as.data.frame(mod_trait_pval) %>%
  rownames_to_column("ME") %>%
  pivot_longer(-ME, names_to = "trait", values_to = "p") %>%
  mutate(module_color = sub("^ME", "", ME),
         sig = p < 0.05)

# Factor-level sets (as created earlier)
timepoint_cols <- colnames(traits_timepoint)
site_cols      <- colnames(traits_site)
species_cols   <- colnames(traits_species)

# Significance by factor (panel-level)
sig_timepoint <- pval_tbl %>%
  filter(trait %in% timepoint_cols) %>%
  group_by(module_color) %>%
  summarise(sig = any(sig, na.rm = TRUE), .groups = "drop")

sig_site <- pval_tbl %>%
  filter(trait %in% site_cols) %>%
  group_by(module_color) %>%
  summarise(sig = any(sig, na.rm = TRUE), .groups = "drop")

sig_species <- pval_tbl %>%
  filter(trait %in% species_cols) %>%
  group_by(module_color) %>%
  summarise(sig = any(sig, na.rm = TRUE), .groups = "drop")

# Significance for phys metrics: specific (metric, module_color) pairs
sig_phys <- pval_tbl %>%
  filter(trait %in% phys_numeric_cols) %>%
  transmute(metric = trait, module_color, sig)

# --- y-positions for star placement (top-left of each panel) -----------------
panel_y_species <- me_annot %>%
  group_by(module_color) %>%
  summarise(y = max(eigengene, na.rm = TRUE), .groups = "drop") %>%
  left_join(sig_species, by = "module_color") %>%
  filter(sig == TRUE) %>%
  mutate(label = "★")

panel_y_time <- me_annot %>%
  group_by(module_color) %>%
  summarise(y = max(eigengene, na.rm = TRUE), .groups = "drop") %>%
  left_join(sig_timepoint, by = "module_color") %>%
  filter(sig == TRUE) %>%
  mutate(label = "★")

panel_y_site <- me_annot %>%
  group_by(module_color) %>%
  summarise(y = max(eigengene, na.rm = TRUE), .groups = "drop") %>%
  left_join(sig_site, by = "module_color") %>%
  filter(sig == TRUE) %>%
  mutate(label = "★")

panel_y_phys <- me_annot %>%
  pivot_longer(cols = any_of(phys_numeric_cols),
               names_to = "metric", values_to = "metric_value") %>%
  group_by(metric, module_color) %>%
  summarise(y = max(eigengene, na.rm = TRUE), .groups = "drop") %>%
  left_join(sig_phys, by = c("metric", "module_color")) %>%
  filter(sig == TRUE) %>%
  mutate(label = "★")

# A small, clean theme
theme_me <- theme_bw(base_size = 11) +
  theme(
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey95", color = "grey85"),
    legend.position = "none", 
    axis.text.x = element_text(angle=45, hjust=1)
  )

# -------- Categorical factors: species / timepoint / site --------------------

# Species
p_species <- ggplot(me_annot, aes(x = species, y = eigengene, fill = species)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.15, size = 0.8, alpha = 0.6) +
  facet_wrap(~ module_color, scales = "free_y") +
  geom_text(
    data = panel_y_species,
    aes(x = -Inf, y = y, label = label),
    inherit.aes = FALSE, hjust = -0.2, vjust = 1.1, size = 6, color="red"
  ) +
  labs(title = "Module eigengenes by species (★ p<0.05)",
       x = "Species", y = "Module eigengene") +
  theme_me
print(p_species)

# Save plot
out_plot_dir <- file.path(out_base, "module_eigengene_factor_panels")
dir.create(out_plot_dir, recursive = TRUE, showWarnings = FALSE)

ggsave(
  filename = file.path(out_plot_dir, "MEs_by_species.png"),
  plot = p_species,
  width = 10, height = 6, dpi = 300
)

# Timepoint
p_time <- ggplot(me_annot, aes(x = timepoint, y = eigengene, fill = timepoint)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.15, size = 0.8, alpha = 0.6) +
  facet_wrap(~ module_color, scales = "free_y") +
  geom_text(
    data = panel_y_time,
    aes(x = -Inf, y = y, label = label),
    inherit.aes = FALSE, hjust = -0.2, vjust = 1.1, size = 6, color="red"
  ) +
  labs(title = "Module eigengenes by timepoint (★ p<0.05)",
       x = "Timepoint", y = "Module eigengene") +
  theme_me
print(p_time)

ggsave(
  filename = file.path(out_plot_dir, "MEs_by_time.png"),
  plot = p_time,
  width = 10, height = 6, dpi = 300
)

# Site
p_site <- ggplot(me_annot, aes(x = site, y = eigengene, fill = site)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.15, size = 0.8, alpha = 0.6) +
  facet_wrap(~ module_color, scales = "free_y") +
  geom_text(
    data = panel_y_site,
    aes(x = -Inf, y = y, label = label),
    inherit.aes = FALSE, hjust = -0.2, vjust = 1.1, size = 6, color="red"
  ) +
  labs(title = "Module eigengenes by site (★ p<0.05)",
       x = "Site", y = "Module eigengene") +
  theme_me
print(p_site)

ggsave(
  filename = file.path(out_plot_dir, "MEs_by_site.png"),
  plot = p_site,
  width = 10, height = 6, dpi = 300
)

# -------------------- Physiological metrics (continuous) ---------------------
# One plot per metric, faceted by module_color, with significance stars

# Long-form data
me_phys_long <- me_annot %>%
  pivot_longer(
    cols = any_of(phys_numeric_cols),
    names_to  = "metric",
    values_to = "metric_value"
  )

# Significance table (TRUE if ME~metric correlation p<0.05)
# Uses previously built pval_tbl and phys_numeric_cols
sig_phys <- pval_tbl %>%
  filter(trait %in% phys_numeric_cols) %>%
  transmute(metric = trait, module_color, sig)

# Helper: build one plot for a given metric
plot_metric_panel <- function(metric_name) {
  d <- me_phys_long %>% filter(metric == metric_name)

  # Per-(metric,module) star y-position (top + padding) and x at left edge
  star_df <- d %>%
    group_by(module_color) %>%
    summarise(
      y_star = max(eigengene, na.rm = TRUE) +
               0.05 * (diff(range(eigengene, na.rm = TRUE)) + 1e-8),
      x_star = suppressWarnings(min(metric_value, na.rm = TRUE)),
      .groups = "drop"
    ) %>%
    left_join(sig_phys %>% filter(metric == metric_name),
              by = "module_color") %>%
    filter(!is.na(sig) & sig) %>%
    mutate(label = "★",
           x_star = ifelse(is.finite(x_star), x_star, 0))

  ggplot(d, aes(x = metric_value, y = eigengene)) +
    geom_point(alpha = 0.6, size = 1) +
    geom_smooth(method = "lm", se = TRUE) +
    facet_wrap(~ module_color, scales = "free_y") +
    geom_text(
      data = star_df,
      aes(x = x_star, y = y_star, label = label),
      inherit.aes = FALSE, size = 5, vjust = 1, color="red"
    ) +
    labs(
      title = paste0("Module eigengenes vs. ", metric_name, "  (★ p<0.05)"),
      x = metric_name, y = "Module eigengene"
    ) +
    coord_cartesian(clip = "off") +
    theme_me
}

# Build all metric plots as a named list and print each
metric_plots <- setNames(
  lapply(phys_numeric_cols, plot_metric_panel),
  phys_numeric_cols
)

# Print each plot in the document (one panel per metric)
invisible(lapply(metric_plots, print))

# -----------------------------------------
# Save each panel as a PNG in out_base dir
# -----------------------------------------
out_plot_dir <- file.path(out_base, "module_eigengene_metric_panels")
dir.create(out_plot_dir, recursive = TRUE, showWarnings = FALSE)

for (metric_name in names(metric_plots)) {
  outfile <- file.path(out_plot_dir, paste0("MEs_vs_", metric_name, ".png"))
  message("Saving: ", outfile)
  ggsave(
    filename = outfile,
    plot = metric_plots[[metric_name]],
    width = 8, height = 6, dpi = 300
  )
}
```