---
title: "WGCNA analysis of ortholog expression"
author: "Ariana S Huffmyer, E5 RoL Team"
date: "2025"
output: 
  html_document:
    code_folding: hide
    toc: yes
    toc_depth: 6
    toc_float: yes
editor_options: 
  chunk_output_type: console
---

# Set up

```{r}
library(vegan)
library(mixOmics)
library(ggplot2)
library(tidyverse)
library(DESeq2)
library(factoextra)
library(WGCNA)
library(ComplexHeatmap)
```

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

# The following setting is important, do not omit.
options(stringsAsFactors = FALSE) # keep strings as character
```

# Load data

Read in data generated for ortholog expression.

```{r}
orthos <- read_csv("M-multi-species/output/14-pca-orthologs/vst_counts_matrix.csv")
str(orthos)

metadata <- read_csv("master-molecular-metadata.csv") %>% select(colony, timepoint, site, species)
str(metadata)
```

# WGCNA

```{r}
## =========================
## WGCNA on ortholog matrix
## =========================

# Packages
suppressPackageStartupMessages({
  library(tidyverse)
  library(WGCNA)
})

# Speed / reproducibility
options(stringsAsFactors = FALSE)
allowWGCNAThreads()  # uses multi-core where possible

## -------------------------
## 1) Prepare input matrices
## -------------------------

# Expect: orthos has first column 'group_id' then sample columns
stopifnot("group_id" %in% names(orthos))

# Keep a clean copy
orthos_df <- orthos %>% as.data.frame()

# Pull sample columns (everything except group_id)
sample_cols <- setdiff(names(orthos_df), "group_id")
```

```{r}
# --- Build a clean metadata with string cols and an explicit sample_id ---
metadata <- metadata %>%
  as.data.frame() %>%                          # drop tibble/S4 wrappers
  mutate(across(everything(), as.character)) %>%
  mutate(sample_id = paste(colony, timepoint, sep = "-"))

# Check for duplicates in sample_id (these would break ordering)
dup_ids <- metadata$sample_id[duplicated(metadata$sample_id)]
if (length(dup_ids) > 0) {
  stop("Duplicate sample_id(s) in metadata: ", paste(unique(dup_ids), collapse = ", "))
}

# --- Order metadata to match the expression columns using a left_join ---
# This preserves the order of sample_cols and is robust to Rle types.
metadata <- tibble(sample_id = setdiff(names(orthos), "group_id")) %>%
  left_join(metadata, by = "sample_id")

# Sanity check for missing joins
missing_meta <- metadata %>% filter(is.na(colony) | is.na(timepoint))
if (nrow(missing_meta) > 0) {
  stop(
    "These expression samples are missing from metadata (no join): ",
    paste(missing_meta$sample_id, collapse = ", ")
  )
}
```

```{r}
# Rebuild the expression matrix to match this order
expr_mat <- orthos %>%
  as.data.frame() %>%
  column_to_rownames("group_id") %>%
  .[, metadata$sample_id, drop = FALSE] %>%   # force column order to match metadata
  t() %>%
  as.data.frame()

# Sanity checks
nrow(expr_mat) == nrow(metadata)
identical(rownames(expr_mat), metadata$sample_id)
```
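If these checks should halt the workflow rather than just print `TRUE`/`FALSE`, they can be wrapped in `stopifnot()`. A minimal, optional sketch using only the objects created above:

```{r}
# Optional hard guard: error immediately if the expression rows and metadata
# rows ever fall out of alignment, instead of relying on a printed TRUE/FALSE.
stopifnot(
  nrow(expr_mat) == nrow(metadata),
  identical(rownames(expr_mat), metadata$sample_id)
)
```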
```{r}
dim(expr_mat)

# Optional: remove genes with too much missingness / low variance.
# WGCNA will also check, but a pre-filter can help speed.
# e.g., keep genes with variance > 0 across samples
gene_var <- apply(expr_mat, 2, var, na.rm = TRUE)
expr_mat <- expr_mat[, gene_var > 0]
dim(expr_mat) # removed only 4 genes, most have variance > 0

# Let WGCNA check for good samples/genes
gsg <- goodSamplesGenes(expr_mat, verbose = 3)
if (!gsg$allOK) {
  if (sum(!gsg$goodGenes) > 0)
    message("Removing genes: ", paste(colnames(expr_mat)[!gsg$goodGenes], collapse = ", "))
  if (sum(!gsg$goodSamples) > 0)
    message("Removing samples: ", paste(rownames(expr_mat)[!gsg$goodSamples], collapse = ", "))
  expr_mat <- expr_mat[gsg$goodSamples, gsg$goodGenes]
  metadata <- metadata[gsg$goodSamples, , drop = FALSE]
}
```

```{r}
## -------------------------
## 2) Build trait matrix
## -------------------------

# We'll correlate module eigengenes with timepoint, site, and species (categorical).
# Use one-hot encoding (model.matrix) to get numeric contrasts.
# We'll drop the intercept to avoid collinearity.

traits_timepoint <- model.matrix(~ 0 + factor(metadata$timepoint))
colnames(traits_timepoint) <- gsub("^factor\\(metadata\\$timepoint\\)", "", colnames(traits_timepoint))

traits_site <- model.matrix(~ 0 + factor(metadata$site))
colnames(traits_site) <- gsub("^factor\\(metadata\\$site\\)", "", colnames(traits_site))

traits_species <- model.matrix(~ 0 + factor(metadata$species))
colnames(traits_species) <- gsub("^factor\\(metadata\\$species\\)", "", colnames(traits_species))

# Combine into one trait matrix
trait_mat <- cbind(traits_timepoint, traits_site, traits_species)
rownames(trait_mat) <- metadata$sample_id
head(trait_mat)
```

```{r}
## --------------------------------------
## 3) Pick soft-thresholding power (beta)
## --------------------------------------
powers <- c(seq(1, 10), seq(12, 20, by = 2))

sft <- pickSoftThreshold(expr_mat,
                         powerVector = powers,
                         networkType = "signed",
                         verbose = 5)

# Choose the smallest power achieving scale-free topology fit ~0.8 (or your preference).
# If none reach 0.8, pick the elbow / max fit.
fit_ok <- sft$fitIndices[, "SFT.R.sq"] >= 0.8
if (any(fit_ok)) {
  fitIdx <- which(fit_ok)[1]
} else {
  # fallback: best overall fit
  fitIdx <- which.max(sft$fitIndices[, "SFT.R.sq"])
}

softPower <- sft$fitIndices[fitIdx, "Power"]
message("Chosen soft-threshold power: ", softPower)

## --------------------------------------
## Plot scale-free topology and connectivity
## --------------------------------------
par(mfrow = c(1, 2))
cex1 <- 0.9

# 1. Scale-free topology fit index as a function of power
plot(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit, signed R^2",
     type = "n",
     main = paste("Scale independence"))
text(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "blue")
abline(h = 0.8, col = "red", lty = 2)
abline(v = softPower, col = "red", lty = 3)

# 2. Mean connectivity as a function of power
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity",
     type = "n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[, 1], sft$fitIndices[, 5],
     labels = powers, cex = cex1, col = "blue")
abline(v = softPower, col = "red", lty = 3)

par(mfrow = c(1, 1))
```

Soft power = 16
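The run documented here selected a soft power of 16. As an optional step, that value can be pinned explicitly so the network construction below stays reproducible if `pickSoftThreshold()` is re-run on a differently filtered matrix; skip this chunk to keep the automated choice.

```{r}
# Optional: pin the recorded soft-thresholding power (16). Comment this out
# to use the automated pick from the previous chunk instead.
softPower <- 16
softPower
```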
```{r}
## -----------------------------------
## 4) Build network and detect modules
## -----------------------------------
# Use blockwiseModules for convenience and speed.
net <- blockwiseModules(
  expr_mat,
  power = softPower,
  networkType = "signed",
  TOMType = "signed",
  corType = "pearson",
  minModuleSize = 30,      # adjust based on your goals
  mergeCutHeight = 0.25,   # ~cor 0.75 between MEs
  numericLabels = TRUE,
  pamRespectsDendro = TRUE,
  saveTOMs = FALSE,
  verbose = 3
)

# Extract module labels and colors
module_labels <- net$colors
module_colors <- labels2colors(net$colors)
MEs <- net$MEs

# Order eigengenes by hierarchical clustering
MEs <- orderMEs(MEs)
```

```{r}
## --------------------------------------------
## 5) Correlate module eigengenes with the traits
## --------------------------------------------

# Ensure row order matches
identical(rownames(MEs), rownames(trait_mat))

mod_trait_cor <- cor(MEs, trait_mat, use = "p")
mod_trait_pval <- corPvalueStudent(mod_trait_cor, nSamples = nrow(expr_mat))

# --- Tidy module–trait correlation table (with BH-adjusted p per trait) ---

# Correlations
mod_trait_cor_df <- as.data.frame(mod_trait_cor) %>%
  tibble::rownames_to_column(var = "module") %>%
  tidyr::pivot_longer(-module, names_to = "trait", values_to = "cor")

# P-values
mod_trait_pval_df <- as.data.frame(mod_trait_pval) %>%
  tibble::rownames_to_column(var = "module") %>%
  tidyr::pivot_longer(-module, names_to = "trait", values_to = "p")

# Join + tidy
mod_trait_tbl <- mod_trait_cor_df %>%
  dplyr::left_join(mod_trait_pval_df, by = c("module", "trait")) %>%
  dplyr::mutate(
    cor = as.numeric(cor),
    p   = as.numeric(p)
  ) %>%
  dplyr::group_by(trait) %>%
  dplyr::mutate(p_adj = p.adjust(p, method = "BH")) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(trait, dplyr::desc(abs(cor)))

print(
  mod_trait_tbl %>%
    dplyr::group_by(trait) %>%
    dplyr::slice_max(order_by = abs(cor), n = 5) %>%
    dplyr::ungroup()
)
```

```{r}
## ----------------------------------------------------
## 6) Map genes (orthogroups) to modules & save outputs
## ----------------------------------------------------

# Gene (orthogroup) IDs
gene_ids <- colnames(expr_mat)

gene_module_df <- tibble(
  group_id     = gene_ids,
  module_label = module_labels,
  module_color = labels2colors(module_labels)
)

# Save the full gene–module assignment table
write_csv(gene_module_df, "M-multi-species/output/18-ortholog-wgcna/gene_module_assignments.csv")

# Summarize the number of genes in each module
module_counts <- gene_module_df %>%
  as.data.frame() %>%
  dplyr::count(module_label, name = "n_genes") %>%
  arrange(desc(n_genes))

print(module_counts)
```
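Module eigengenes are useful for downstream plotting and for joining to physiological data, so it can help to write them out alongside the sample metadata. A small optional sketch; the file name `module_eigengenes.csv` is a placeholder, and the directory is assumed to be the same output folder used above.

```{r}
# Optional: export module eigengenes joined to sample metadata.
# MEs rows are in the same sample order as expr_mat, so attach IDs from there.
tibble(sample_id = rownames(expr_mat)) %>%
  bind_cols(MEs) %>%
  left_join(metadata, by = "sample_id") %>%
  write_csv("M-multi-species/output/18-ortholog-wgcna/module_eigengenes.csv")
```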
```{r}
## --------------------------------------
## 7) Optional: Module membership / kME
## --------------------------------------
# Correlate each gene with its module eigengene (kME) and attach p-values.
kME <- as.data.frame(cor(expr_mat, MEs, use = "p"))
kME_p <- as.data.frame(corPvalueStudent(as.matrix(kME), nSamples = nrow(expr_mat)))

kME_tbl <- kME %>%
  rownames_to_column(var = "group_id") %>%
  as_tibble() %>%
  left_join(gene_module_df, by = "group_id")

# Example: list top hub-like orthogroups within a module (by |kME|)
# Note: modules were labelled numerically (numericLabels = TRUE), so the ME
# columns are named by label (e.g., "ME1"), not by color.
top_hubs <- kME_tbl %>%
  mutate(across(starts_with("ME"), abs)) %>%
  rowwise() %>%
  mutate(kME_in_own = {
    me_col <- paste0("ME", module_label)
    if (me_col %in% colnames(kME_tbl)) get(me_col) else NA_real_
  }) %>%
  ungroup() %>%
  arrange(module_color, desc(kME_in_own)) %>%
  group_by(module_color) %>%
  slice_head(n = 20) %>%
  ungroup()

print(top_hubs)
```

```{r}
## --------------------------------------
## 8) Heatmap with compact row dendrogram; show text only where p < 0.05
## --------------------------------------

# --- Build text and significance mask ---
text_matrix <- matrix("",
                      nrow = nrow(mod_trait_cor),
                      ncol = ncol(mod_trait_cor),
                      dimnames = dimnames(mod_trait_cor))

sig_mask <- mod_trait_pval < 0.05

for (i in seq_len(nrow(mod_trait_cor))) {
  for (j in seq_len(ncol(mod_trait_cor))) {
    if (isTRUE(sig_mask[i, j])) {
      r <- signif(mod_trait_cor[i, j], 2)
      p <- signif(mod_trait_pval[i, j], 1)
      text_matrix[i, j] <- paste0(r, "\n(", p, ")")
    } else {
      text_matrix[i, j] <- ""   # hide non-significant
    }
  }
}

# --- Cluster rows (modules) ---
row_cor <- cor(t(mod_trait_cor), use = "pairwise.complete.obs")
row_d   <- as.dist(1 - row_cor)
row_hc  <- hclust(row_d, method = "average")
rord    <- row_hc$order

mat_re      <- mod_trait_cor[rord, , drop = FALSE]
text_re     <- text_matrix[rord, , drop = FALSE]
sig_mask_re <- sig_mask[rord, , drop = FALSE]
yLabs_re    <- rownames(mod_trait_cor)[rord]
xLabs_re    <- colnames(trait_mat)

# --- Compact layout: narrower dendrogram panel ---
op <- par(no.readonly = TRUE)
layout(matrix(c(1, 2), nrow = 1), widths = c(0.6, 5))

## (1) Row dendrogram
par(mar = c(10, 0, 3, 0))
plot(as.dendrogram(row_hc),
     horiz = TRUE,
     axes = FALSE,
     yaxs = "i", xaxs = "i",
     xlab = "", ylab = "", main = "",
     xlim = c(max(row_hc$height) * 1.05, 0))

## (2) Heatmap with reduced left margin
par(mar = c(10, 5, 3, 3))
WGCNA::labeledHeatmap(
  Matrix = mat_re,
  xLabels = xLabs_re,
  yLabels = yLabs_re,
  ySymbols = yLabs_re,
  colorLabels = FALSE,
  colors = WGCNA::blueWhiteRed(50),
  textMatrix = text_re,   # already blank where p > 0.05
  setStdMargins = FALSE,
  cex.text = 0.6,
  zlim = c(-1, 1),
  main = "Module–Trait Correlations\n(cor with p-values, p<0.05 shown only)"
)

# Overlay bold text for significant cells
nr <- nrow(mat_re)
nc <- ncol(mat_re)
for (i in seq_len(nr)) {
  for (j in seq_len(nc)) {
    if (isTRUE(sig_mask_re[i, j])) {
      text(x = j, y = nr - i + 1,
           labels = text_re[i, j],
           cex = 0.6, font = 2)   # bold
    }
  }
}

# reset graphics state
par(op)
layout(1)
```
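As a complementary view to the module–trait heatmap, the eigengenes can be plotted directly against the metadata. A rough optional sketch (ggplot2 is already loaded above; assumes `MEs` rows are in the same sample order as `expr_mat`, as constructed above):

```{r}
# Optional: plot module eigengenes by timepoint, coloured by species.
me_long <- tibble(sample_id = rownames(expr_mat)) %>%
  bind_cols(MEs) %>%
  left_join(metadata, by = "sample_id") %>%
  pivot_longer(starts_with("ME"), names_to = "module", values_to = "eigengene")

ggplot(me_long, aes(x = timepoint, y = eigengene, colour = species)) +
  geom_boxplot() +
  facet_wrap(~ module) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```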