--- title: "Metabolomic WGCNA" author: "Jill Ashey" date: "2025-08-11" output: html_document --- # Set Up ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE) ``` Load libraries. ```{r} library(WGCNA) library(tidyverse) library(dpylr) library(ggplot2) library(readxl) library(vegan) library(factoextra) library(ggfortify) library(ComplexHeatmap) library(viridis) library(lme4) library(lmerTest) library(emmeans) library(broom.mixed) library(broom) ``` # Read and load in data ## Load data. ```{r} metab_quant<-read_xlsx("../data/metabolomics/2025-04-15_Roberts-98.xlsx", sheet="Relative Quant Data")%>% rename(compound=`Current MS Compounds`) ``` Format NAs. ```{r} metab_quant[metab_quant == "N/A"] <- NA str(metab_quant) #remove kegg columns metab_quant<-metab_quant%>% dplyr::select(!"HMDB ID")%>% dplyr::select(!"KEGG ID") ``` Values missing or not quantified are now NA. Convert to numeric format. ```{r} # Keep the "sample" column unchanged, convert all other columns to numeric metab_quant_wide <- as.data.frame(lapply(names(metab_quant), function(col) { if (col == "compound") { return(metab_quant[[col]]) # Keep the "sample" column as is } else { # Convert other columns to character first, then to numeric return(as.numeric(as.character(metab_quant[[col]]))) } })) # Ensure the column names remain unchanged colnames(metab_quant_wide) <- names(metab_quant) str(metab_quant_wide) ``` Convert to long format. ```{r} metab_quant_long<-metab_quant_wide%>% pivot_longer(names_to="sample", values_to="rel.quant", cols = -c(compound)) ``` ## Add in metadata ```{r} metab_quant_long<-metab_quant_long%>% mutate( colony = str_extract(sample, "^[^_]+"), # Extract everything before the underscore timepoint = str_extract(sample, "(?<=_).*"), # Extract everything after the underscore species = case_when( str_detect(sample, "ACR") ~ "Acropora", str_detect(sample, "POC") ~ "Pocillopora", str_detect(sample, "POR") ~ "Porites", TRUE ~ NA_character_ # Default to NA if no match ), type = case_when( str_detect(sample, "PBQC") ~ "PBQC", TRUE ~ "sample" ) ) ``` ```{r} #add in site metadata sites<-read_csv("../data/phys_master_timeseries.csv")%>% dplyr::select(c("colony_id_corr", "site"))%>% dplyr::rename(colony="colony_id_corr") metab_quant_long<-left_join(metab_quant_long, sites) metab_quant_long<-unique(metab_quant_long) ``` Add in protein data from lipid analyses (protein is the same because samples were taken from the same replicate). ```{r} protein<-read_xlsx("../data/metabolomics/2025-04-15_Roberts-98.xlsx", sheet="Protein") ``` Merge in protein data. ```{r} metab_quant_long<-left_join(metab_quant_long, protein) ``` ## Filtering and normalizing How many lipids do we have currently? ```{r} length(levels(as.factor(metab_quant_long$compound))) ``` 404 total metabolites ## Remove metabolites not detected in the PBQC samples Remove facility internal standard metabolites. ```{r} standards<-c("3C13-Glycine", "4C13-Alanine", "2C13-Choline", "4C13-Serine", "d3-Creatinine", "6C13-Proline", "6C13-Valine", "5C13-Threonine", "5C13-Aspartic Acid", "6C13-Asparagine", "7C13-Leucine", "7C13-iso-Leucine", "2C13-Glutamic acid", "7C13-Glutamine", "6C13-Glutamic acid", "8C13-Lysine", "6C13-Methionine", "9C13-Histidine", "10C13-Phenylalanine", "2C13-Tyrosine", "10C13-Arginine", "10C13-Tyrosine", "13C13-Tryptophan", "8C13-Cystine", "11C13-Uridine", "3C13-Lactate", "4C13-3HBA", "2N15-Xanthine", "2N15-Urate", "1C13-Citrulline", "6C13-Glucose", "4C13-Pentothenate") metab_quant_long<-metab_quant_long%>% filter(!compound %in% standards) length(levels(as.factor(metab_quant_long$compound))) ``` 372 metabolites after removing standards. Only keep metabolites that were measured in at least one of the PBQCs. ```{r} length(levels(as.factor(metab_quant_long$compound))) #view metabolites not measured in the PBQC test <- metab_quant_long %>% group_by(compound) %>% filter((type == "PBQC" & is.na(rel.quant)))%>% pull(compound) test<-unique(test) length(unique(test)) metab_quant_long<-metab_quant_long%>% filter(!compound %in% test) length(levels(as.factor(metab_quant_long$compound))) metab_quant_long<-metab_quant_long%>% dplyr::filter(!type=="PBQC")%>% dplyr::select(!type) ``` There are 187 metabolites not detected in the PBQCs. After removing those we have 185 total compounds. Remove metabolites that have NA in more than 10% of samples. ```{r} # Identify compounds with less than 25% NA in rel_quant compounds_lt10pct_NA <- metab_quant_long %>% group_by(compound) %>% summarize( total = n(), na_count = sum(is.na(rel.quant)), na_prop = na_count / total ) %>% filter(na_prop < 0.1) %>% pull(compound) length(compounds_lt10pct_NA) metab_quant_long<-metab_quant_long%>% filter(compound %in% compounds_lt10pct_NA) length(levels(as.factor(metab_quant_long$compound))) ``` We have 141 compounds in the dataset that are measured in >90% of samples. We will use imputation to account for the NAs. Rename dataframe. ```{r} data<-metab_quant_long ``` ## Normalize to tissue input Next, conduct normalization to tissue input and multiply by constants as directed by the UW facility. ```{r} data<-data%>% mutate(rel.quant.ug=rel.quant/protein.ug)%>% dplyr::select(!protein.ug)%>% dplyr::select(!rel.quant) ``` Metabolite concentration is now normalized to protein. # WGCNA Set up matrix ```{r} # 1. Pivot wider so compounds are columns, samples are rows, values are rel.quant.ug wgcna_data <- data %>% dplyr::select(sample, compound, rel.quant.ug) %>% pivot_wider(names_from = compound, values_from = rel.quant.ug) # 2. Check for missing values; WGCNA requires no missing data (or impute/fill) # Convert to matrix (remove sample column first) expr_matrix <- as.data.frame(wgcna_data) %>% column_to_rownames(var = "sample") # Optional: Impute missing values (for example, replacing NA with small number or using an imputation method) expr_matrix[is.na(expr_matrix)] <- 0 # 3. Check good genes and samples: gsg <- goodSamplesGenes(expr_matrix, verbose = 3) if (!gsg$allOK) { # Remove offending genes/samples: expr_matrix <- expr_matrix[gsg$goodSamples, gsg$goodGenes] } # 4. Now expr_matrix is ready for WGCNA input: # Note: Add meta data later for traits (e.g., timepoint, species) if you want to correlate modules ``` ```{r} ##Soft threshold # Choose a set of soft-thresholding powers powers = c(c(1:10), seq(from = 12, to=20, by=2)) # Call the network topology analysis function #the below takes a long time to run, so is commented out and the pre-run results are loaded in below. I need to save it to my desktop because it is too large to push to github. Instead, it will go on OSF sft <- pickSoftThreshold(expr_matrix, powerVector = powers, verbose = 5) #...wait for this to finish # pickSoftThreshold # performs the analysis of network topology and aids the # user in choosing a proper soft-thresholding power. # The user chooses a set of candidate powers (the function provides suitable default values) # function returns a set of network indices that should be inspected getwd() sizeGrWindow(9, 5) # set window size # png to output png("../output/09-metabolomics-WGCNA/sft.png", 1000, 1000, pointsize=20) par(mfrow = c(1,2)); cex1 = 0.9; # # Scale-free topology fit index as a function of the soft-thresholding power plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence")); text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red"); # # this line corresponds to using an R^2 cut-off abline(h=0.8,col="red") # # Mean connectivity as a function of the soft-thresholding power plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity")) text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red") dev.off() # output #I used a scale-free topology fit index **R^2 of 0.85**. This lowest recommended R^2 by Langfelder and Horvath is 0.8. I chose 0.85 because we want to use the smallest soft thresholding power that maximizes with model fit. It appears that our **soft thresholding power is 14** because it is the loweest power before the R^2=0.8 threshold that maximizes with model fit. ``` ## Step wise module construction #### Step 1: Create adjacency matrix ```{r} softPower = 7 # set the soft threshold based on the plots above # signed #to get this to work with a ton of genes >30K, I had to increase my memory limit using: memory.limit(size = 35000) adjacency_sign = adjacency(expr_matrix, power = softPower, type="signed") #Calculate adjacency ``` ### Step 2: Turn adjacency into topological overlap: Calculation of the topological overlap matrix, (TOM) and the corresponding dissimilarity, from a given adjacency matrix. ```{r} #the below takes a long time to run, so is commented out and the pre-run results are loaded in below TOM_sign = TOMsimilarity(adjacency_sign, TOMType="signed") #Translate adjacency into topological overlap matrix dissTOM_sign = 1-TOM_sign ``` ### Step 3: Call the hierarchical clustering function - plot the tree ```{r} # Call the hierarchical clustering function #to get this to work, I had to increase my memory limit using: memory.limit(size = 45000) geneTree_sign = hclust(as.dist(dissTOM_sign), method = "average"); # Plot the resulting clustering tree (dendrogram) Each leaf corresponds to a gene, branches grouping together densely are interconnected, highly co-expressed genes. sizeGrWindow(12,9) plot(geneTree_sign, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity - SIGNED", labels = FALSE, hang = 0.04) ``` ### Step 4: Set module size and 'cutreeDynamic' to create clusters ```{r} #Module identification is essentially cutting the branches off the tree in the dendrogram above. We like large modules, so we set the **minimum module size** relatively high, so we will set the minimum size at 30. minModuleSize = 30; # set this for the subseqent call... dynamicMods_sign = cutreeDynamic(dendro = geneTree_sign, distM = dissTOM_sign, deepSplit = 1, pamRespectsDendro = FALSE, minClusterSize = minModuleSize); table(dynamicMods_sign) # number of genes per module. Module 0 is reserved for unassigned genes. The are other modules will be listed largest to smallest. ``` ### Step 5: convert numeric network to colors and plot the dendrogram ```{r} # Convert numeric lables into colors dynamicColors_sign = labels2colors(dynamicMods_sign) # add colors to module labels (previously numbers) table(dynamicColors_sign) # lets look at this table... # Plot the dendrogram and colors underneath plotDendroAndColors(geneTree_sign, dynamicColors_sign, "Dynamic Tree Cut - SIGNED", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors 'SIGNED'") sizeGrWindow(8,6) png("../output/09-metabolomics-WGCNA/Dendrogram.png", 1000, 1000, pointsize=20) plotDendroAndColors(geneTree_sign, dynamicColors_sign, "Dynamic Tree Cut - SIGNED", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors 'SIGNED'") dev.off() ``` ### Step 6: Calculate Eigengenes - view thier connectivity based on 'MEDiss = 1-cor(MEs)' ```{r} # Calculate eigengenes MEList = moduleEigengenes(expr_matrix, colors = dynamicColors_sign, softPower = 14) MEs = MEList$eigengenes MEs MEs <- MEs %>% select_if(~ !any(is.na(.))) # Calculate dissimilarity of module eigengenes MEDiss = 1-cor(MEs, use = "pairwise.complete.obs"); # Cluster module eigengenes METree = hclust(as.dist(MEDiss), method = "average"); # Plot the result sizeGrWindow(7, 6) png("../output/09-metabolomics-WGCNA/ClusterEigengenes.png", 1000, 1000, pointsize=20) plot(METree, main = "Clustering of module eigengenes - SIGNED (dissimilarity calc = MEDiss = 1-cor(MEs))", xlab = "", sub = "") MEDissThres = 0.15 abline(h=MEDissThres, col = "red") dev.off() ``` ### Step 7: Specify the cut line for the dendrogram (module) - Calc MODULE EIGENGENES (mergeMEs) ```{r} MEDissThres = 0.15 # **Merge modules with >85% eigengene similarity.** Most studies use somewhere between 80-90% similarity. I will use 85% similarity as my merging threshold. # Plot the cut line into the dendrogram #abline(h=MEDissThres, col = "red") # Call an automatic merging function # merge = mergeCloseModules(dds.d0_vst, dynamicColors, cutHeight = MEDissThres, verbose = 3) merge = mergeCloseModules(expr_matrix, dynamicColors_sign, cutHeight = MEDissThres, verbose = 3) # The merged module colors mergedColors = merge$colors; # Eigengenes of the new merged modules: mergedMEs = merge$newMEs; library(dplyr) mergedMEs <- mergedMEs %>% select_if(~ !any(is.na(.))) # Cluster module eigengenes MEDiss2 = 1-cor(mergedMEs,use = 'pairwise.complete.obs'); MEDiss2 METree2 = hclust(as.dist(MEDiss2), method = "average"); # Plot the result plot(METree2, main = "Clustering of module eigengenes - SIGNED (dissimilarity calc = MEDiss = 1-cor(MEs))", xlab = "", sub = "") sizeGrWindow(7, 6) png("../output/09-metabolomics-WGCNA/ClusterEigengenes_merged.png", 1000, 1000, pointsize=20) plot(METree2, main = "Clustering of module eigengenes - SIGNED (dissimilarity calc = MEDiss = 1-cor(MEs))", xlab = "", sub = "") dev.off() ``` ### Step 8: Plot dendrogram with the cut line 'MEDissThres' ```{r} plotDendroAndColors(geneTree_sign, cbind(dynamicColors_sign, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) sizeGrWindow(12, 9) png("../output/09-metabolomics-WGCNA/ClusterDendrogram_signed.png", 1000, 1000, pointsize=20) plotDendroAndColors(geneTree_sign, cbind(dynamicColors_sign, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) dev.off() ``` ### Step 9: Commit to mergedcolors as 'MEs' and 'moduleColors' ```{r} # Rename to moduleColors moduleColors = mergedColors # Construct numerical labels corresponding to the colors colorOrder = c("grey", standardColors(50)); moduleLabels = match(moduleColors, colorOrder)-1; MEs = mergedMEs; # write csv - save the module eigengenes write.csv(MEs, file = "../output/09-metabolomics-WGCNA/WGCNA_ModuleEigengenes.csv") table(mergedColors) ```