---
title: "WGCNA analysis of ortholog expression"
author: "Ariana S Huffmyer, E5 RoL Team"
date: "2025"
output: 
  html_document:
    code_folding: hide
    toc: yes
    toc_depth: 6
    toc_float: yes
editor_options: 
  chunk_output_type: console
---

# Set up

```{r}
library(vegan)
library(mixOmics)
library(ggplot2)
library(tidyverse)
library(DESeq2)
library(factoextra)
library(WGCNA)
library(ComplexHeatmap)
```

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

# The following setting is important, do not omit.
options(stringsAsFactors = FALSE) # keep strings as character
```

# Load data

Read in data generated for ortholog expression.

```{r}
orthos <- read_csv("M-multi-species/output/14-pca-orthologs/vst_counts_matrix.csv")
str(orthos)

metadata <- read_csv("master-molecular-metadata.csv") %>% select(colony, timepoint, site, species)
str(metadata)
```

# WGCNA

```{r}
## =========================
## WGCNA on ortholog matrix
## =========================

# Packages
suppressPackageStartupMessages({
  library(tidyverse)
  library(WGCNA)
})

# Speed / reproducibility
options(stringsAsFactors = FALSE)
allowWGCNAThreads()  # uses multi-core where possible

## -------------------------
## 1) Prepare input matrices
## -------------------------

# Expect: orthos has first column 'group_id' then sample columns
stopifnot("group_id" %in% names(orthos))

# Keep a clean copy
orthos_df <- orthos %>% as.data.frame()

# Pull sample columns (everything except group_id)
sample_cols <- setdiff(names(orthos_df), "group_id")
```

```{r}
# --- Build a clean metadata with string cols and an explicit sample_id ---
metadata <- metadata %>%
  as.data.frame() %>%                          # drop tibble/S4 wrappers
  mutate(across(everything(), as.character)) %>%
  mutate(sample_id = paste(colony, timepoint, sep = "-"))

# Check for duplicates in sample_id (these would break ordering)
dup_ids <- metadata$sample_id[duplicated(metadata$sample_id)]
if (length(dup_ids) > 0) {
  stop("Duplicate sample_id(s) in metadata: ", paste(unique(dup_ids), collapse = ", "))
}

# --- Order metadata to match the expression columns using a left_join ---
# This preserves the order of sample_cols and is robust to Rle types.
metadata <- tibble(sample_id = setdiff(names(orthos), "group_id")) %>%
  left_join(metadata, by = "sample_id")

# Sanity check for missing joins
missing_meta <- metadata %>% filter(is.na(colony) | is.na(timepoint))
if (nrow(missing_meta) > 0) {
  stop(
    "These expression samples are missing from metadata (no join): ",
    paste(missing_meta$sample_id, collapse = ", ")
  )
}
```

```{r}
# Rebuild the expression matrix to match this order
expr_mat <- orthos %>%
  as.data.frame() %>%
  column_to_rownames("group_id") %>%
  .[, metadata$sample_id, drop = FALSE] %>%   # force column order to match metadata
  t() %>%
  as.data.frame()

# Sanity checks
nrow(expr_mat) == nrow(metadata)
identical(rownames(expr_mat), metadata$sample_id)
```
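If these checks should halt the workflow rather than just print `TRUE`/`FALSE`, they can be wrapped in `stopifnot()`. A minimal, optional sketch using only the objects created above:

```{r}
# Optional hard guard: error immediately if the expression rows and metadata
# rows ever fall out of alignment, instead of relying on a printed TRUE/FALSE.
stopifnot(
  nrow(expr_mat) == nrow(metadata),
  identical(rownames(expr_mat), metadata$sample_id)
)
```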
```{r}
dim(expr_mat)

# Optional: remove genes with too much missingness / low variance.
# WGCNA will also check, but a pre-filter can help speed.
# e.g., keep genes with variance > 0 across samples
gene_var <- apply(expr_mat, 2, var, na.rm = TRUE)
expr_mat <- expr_mat[, gene_var > 0]
dim(expr_mat) # removed only 4 genes, most have variance > 0

# Let WGCNA check for good samples/genes
gsg <- goodSamplesGenes(expr_mat, verbose = 3)
if (!gsg$allOK) {
  if (sum(!gsg$goodGenes) > 0)
    message("Removing genes: ", paste(colnames(expr_mat)[!gsg$goodGenes], collapse = ", "))
  if (sum(!gsg$goodSamples) > 0)
    message("Removing samples: ", paste(rownames(expr_mat)[!gsg$goodSamples], collapse = ", "))
  expr_mat <- expr_mat[gsg$goodSamples, gsg$goodGenes]
  metadata <- metadata[gsg$goodSamples, , drop = FALSE]
}
```

```{r}
## -------------------------
## 2) Build trait matrix
## -------------------------

# We'll correlate module eigengenes with timepoint, site, and species (categorical).
# Use one-hot encoding (model.matrix) to get numeric contrasts.
# We'll drop the intercept to avoid collinearity.

traits_timepoint <- model.matrix(~ 0 + factor(metadata$timepoint))
colnames(traits_timepoint) <- gsub("^factor\\(metadata\\$timepoint\\)", "", colnames(traits_timepoint))

traits_site <- model.matrix(~ 0 + factor(metadata$site))
colnames(traits_site) <- gsub("^factor\\(metadata\\$site\\)", "", colnames(traits_site))

traits_species <- model.matrix(~ 0 + factor(metadata$species))
colnames(traits_species) <- gsub("^factor\\(metadata\\$species\\)", "", colnames(traits_species))

# Combine into one trait matrix
trait_mat <- cbind(traits_timepoint, traits_site, traits_species)
rownames(trait_mat) <- metadata$sample_id
head(trait_mat)
```

```{r}
## --------------------------------------
## 3) Pick soft-thresholding power (beta)
## --------------------------------------
powers <- c(seq(1, 10), seq(12, 20, by = 2))

sft <- pickSoftThreshold(expr_mat,
                         powerVector = powers,
                         networkType = "signed",
                         verbose = 5)

# Choose the smallest power achieving scale-free topology fit ~0.8 (or your preference).
# If none reach 0.8, pick the elbow / max fit.
fit_ok <- sft$fitIndices[, "SFT.R.sq"] >= 0.8
if (any(fit_ok)) {
  fitIdx <- which(fit_ok)[1]
} else {
  # fallback: best overall fit
  fitIdx <- which.max(sft$fitIndices[, "SFT.R.sq"])
}

softPower <- sft$fitIndices[fitIdx, "Power"]
message("Chosen soft-threshold power: ", softPower)

## --------------------------------------
## Plot scale-free topology and connectivity
## --------------------------------------
par(mfrow = c(1, 2))
cex1 <- 0.9

# 1. Scale-free topology fit index as a function of power
plot(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit, signed R^2",
     type = "n",
     main = paste("Scale independence"))
text(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "blue")
abline(h = 0.8, col = "red", lty = 2)
abline(v = softPower, col = "red", lty = 3)

# 2. Mean connectivity as a function of power
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity",
     type = "n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[, 1], sft$fitIndices[, 5],
     labels = powers, cex = cex1, col = "blue")
abline(v = softPower, col = "red", lty = 3)

par(mfrow = c(1, 1))
```

Soft power = 16
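The run documented here selected a soft power of 16. As an optional step, that value can be pinned explicitly so the network construction below stays reproducible if `pickSoftThreshold()` is re-run on a differently filtered matrix; skip this chunk to keep the automated choice.

```{r}
# Optional: pin the recorded soft-thresholding power (16). Comment this out
# to use the automated pick from the previous chunk instead.
softPower <- 16
softPower
```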
```{r}
## -----------------------------------
## 4) Build network and detect modules
## -----------------------------------
# Use blockwiseModules for convenience and speed.
net <- blockwiseModules(
  expr_mat,
  power = softPower,
  networkType = "signed",
  TOMType = "signed",
  corType = "pearson",
  minModuleSize = 30,      # adjust based on your goals
  mergeCutHeight = 0.25,   # ~cor 0.75 between MEs
  numericLabels = TRUE,
  pamRespectsDendro = TRUE,
  saveTOMs = FALSE,
  verbose = 3
)

# Extract module labels and colors
module_labels <- net$colors
module_colors <- labels2colors(net$colors)
MEs <- net$MEs

# Order eigengenes by hierarchical clustering
MEs <- orderMEs(MEs)
```

```{r}
## --------------------------------------------
## 5) Correlate module eigengenes with the traits
## --------------------------------------------

# Ensure row order matches
identical(rownames(MEs), rownames(trait_mat))

mod_trait_cor <- cor(MEs, trait_mat, use = "p")
mod_trait_pval <- corPvalueStudent(mod_trait_cor, nSamples = nrow(expr_mat))

# --- Tidy module–trait correlation table (with BH-adjusted p per trait) ---

# Correlations
mod_trait_cor_df <- as.data.frame(mod_trait_cor) %>%
  tibble::rownames_to_column(var = "module") %>%
  tidyr::pivot_longer(-module, names_to = "trait", values_to = "cor")

# P-values
mod_trait_pval_df <- as.data.frame(mod_trait_pval) %>%
  tibble::rownames_to_column(var = "module") %>%
  tidyr::pivot_longer(-module, names_to = "trait", values_to = "p")

# Join + tidy
mod_trait_tbl <- mod_trait_cor_df %>%
  dplyr::left_join(mod_trait_pval_df, by = c("module", "trait")) %>%
  dplyr::mutate(
    cor = as.numeric(cor),
    p   = as.numeric(p)
  ) %>%
  dplyr::group_by(trait) %>%
  dplyr::mutate(p_adj = p.adjust(p, method = "BH")) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(trait, dplyr::desc(abs(cor)))

print(
  mod_trait_tbl %>%
    dplyr::group_by(trait) %>%
    dplyr::slice_max(order_by = abs(cor), n = 5) %>%
    dplyr::ungroup()
)
```

```{r}
## ----------------------------------------------------
## 6) Map genes (orthogroups) to modules & save outputs
## ----------------------------------------------------

# Gene (orthogroup) IDs
gene_ids <- colnames(expr_mat)

gene_module_df <- tibble(
  group_id     = gene_ids,
  module_label = module_labels,
  module_color = labels2colors(module_labels)
)

# Save the full gene–module assignment table
write_csv(gene_module_df, "M-multi-species/output/18-ortholog-wgcna/gene_module_assignments.csv")

# Summarize the number of genes in each module
module_counts <- gene_module_df %>%
  as.data.frame() %>%
  dplyr::count(module_label, name = "n_genes") %>%
  arrange(desc(n_genes))

print(module_counts)
```
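Module eigengenes are useful for downstream plotting and for joining to physiological data, so it can help to write them out alongside the sample metadata. A small optional sketch; the file name `module_eigengenes.csv` is a placeholder, and the directory is assumed to be the same output folder used above.

```{r}
# Optional: export module eigengenes joined to sample metadata.
# MEs rows are in the same sample order as expr_mat, so attach IDs from there.
tibble(sample_id = rownames(expr_mat)) %>%
  bind_cols(MEs) %>%
  left_join(metadata, by = "sample_id") %>%
  write_csv("M-multi-species/output/18-ortholog-wgcna/module_eigengenes.csv")
```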
```{r}
## --------------------------------------
## 7) Optional: Module membership / kME
## --------------------------------------
# Correlate each gene with its module eigengene (kME) and attach p-values.
kME <- as.data.frame(cor(expr_mat, MEs, use = "p"))
kME_p <- as.data.frame(corPvalueStudent(as.matrix(kME), nSamples = nrow(expr_mat)))

kME_tbl <- kME %>%
  rownames_to_column(var = "group_id") %>%
  as_tibble() %>%
  left_join(gene_module_df, by = "group_id")

# Example: list top hub-like orthogroups within a module (by |kME|)
# Note: modules were labelled numerically (numericLabels = TRUE), so the ME
# columns are named by label (e.g., "ME1"), not by color.
top_hubs <- kME_tbl %>%
  mutate(across(starts_with("ME"), abs)) %>%
  rowwise() %>%
  mutate(kME_in_own = {
    me_col <- paste0("ME", module_label)
    if (me_col %in% colnames(kME_tbl)) get(me_col) else NA_real_
  }) %>%
  ungroup() %>%
  arrange(module_color, desc(kME_in_own)) %>%
  group_by(module_color) %>%
  slice_head(n = 20) %>%
  ungroup()

print(top_hubs)
```

```{r}
## --------------------------------------
## 8) Heatmap with compact row dendrogram; show text only where p < 0.05
## --------------------------------------

# --- Build text and significance mask ---
text_matrix <- matrix("",
                      nrow = nrow(mod_trait_cor),
                      ncol = ncol(mod_trait_cor),
                      dimnames = dimnames(mod_trait_cor))

sig_mask <- mod_trait_pval < 0.05

for (i in seq_len(nrow(mod_trait_cor))) {
  for (j in seq_len(ncol(mod_trait_cor))) {
    if (isTRUE(sig_mask[i, j])) {
      r <- signif(mod_trait_cor[i, j], 2)
      p <- signif(mod_trait_pval[i, j], 1)
      text_matrix[i, j] <- paste0(r, "\n(", p, ")")
    } else {
      text_matrix[i, j] <- ""   # hide non-significant
    }
  }
}

# --- Cluster rows (modules) ---
row_cor <- cor(t(mod_trait_cor), use = "pairwise.complete.obs")
row_d   <- as.dist(1 - row_cor)
row_hc  <- hclust(row_d, method = "average")
rord    <- row_hc$order

mat_re      <- mod_trait_cor[rord, , drop = FALSE]
text_re     <- text_matrix[rord, , drop = FALSE]
sig_mask_re <- sig_mask[rord, , drop = FALSE]
yLabs_re    <- rownames(mod_trait_cor)[rord]
xLabs_re    <- colnames(trait_mat)

# --- Compact layout: narrower dendrogram panel ---
op <- par(no.readonly = TRUE)
layout(matrix(c(1, 2), nrow = 1), widths = c(0.6, 5))

## (1) Row dendrogram
par(mar = c(10, 0, 3, 0))
plot(as.dendrogram(row_hc),
     horiz = TRUE,
     axes = FALSE,
     yaxs = "i", xaxs = "i",
     xlab = "", ylab = "", main = "",
     xlim = c(max(row_hc$height) * 1.05, 0))

## (2) Heatmap with reduced left margin
par(mar = c(10, 5, 3, 3))
WGCNA::labeledHeatmap(
  Matrix = mat_re,
  xLabels = xLabs_re,
  yLabels = yLabs_re,
  ySymbols = yLabs_re,
  colorLabels = FALSE,
  colors = WGCNA::blueWhiteRed(50),
  textMatrix = text_re,   # already blank where p > 0.05
  setStdMargins = FALSE,
  cex.text = 0.6,
  zlim = c(-1, 1),
  main = "Module–Trait Correlations\n(cor with p-values, p<0.05 shown only)"
)

# Overlay bold text for significant cells
nr <- nrow(mat_re)
nc <- ncol(mat_re)
for (i in seq_len(nr)) {
  for (j in seq_len(nc)) {
    if (isTRUE(sig_mask_re[i, j])) {
      text(x = j, y = nr - i + 1,
           labels = text_re[i, j],
           cex = 0.6, font = 2)   # bold
    }
  }
}

# reset graphics state
par(op)
layout(1)
```
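As a complementary view to the module–trait heatmap, the eigengenes can be plotted directly against the metadata. A rough optional sketch (ggplot2 is already loaded above; assumes `MEs` rows are in the same sample order as `expr_mat`, as constructed above):

```{r}
# Optional: plot module eigengenes by timepoint, coloured by species.
me_long <- tibble(sample_id = rownames(expr_mat)) %>%
  bind_cols(MEs) %>%
  left_join(metadata, by = "sample_id") %>%
  pivot_longer(starts_with("ME"), names_to = "module", values_to = "eigengene")

ggplot(me_long, aes(x = timepoint, y = eigengene, colour = species)) +
  geom_boxplot() +
  facet_wrap(~ module) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```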