---
title: "Metabolomic WGCNA"
author: "Jill Ashey"
date: "2025-08-11"
output: html_document
---

# Set Up    

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
```

Load libraries. 

```{r}
library(WGCNA)
library(tidyverse)
library(dpylr)
library(ggplot2)
library(readxl)
library(vegan)
library(factoextra)
library(ggfortify)
library(ComplexHeatmap)
library(viridis)
library(lme4)
library(lmerTest)
library(emmeans)
library(broom.mixed)
library(broom)
```

# Read and load in data 

## Load data. 

```{r}
metab_quant<-read_xlsx("../data/metabolomics/2025-04-15_Roberts-98.xlsx", sheet="Relative Quant Data")%>%
  rename(compound=`Current MS Compounds`)
```

Format NAs. 

```{r}
metab_quant[metab_quant == "N/A"] <- NA

str(metab_quant)

#remove kegg columns 
metab_quant<-metab_quant%>%
  dplyr::select(!"HMDB ID")%>%
  dplyr::select(!"KEGG ID")
```

Values missing or not quantified are now NA. 

Convert to numeric format. 

```{r}
# Keep the "sample" column unchanged, convert all other columns to numeric
metab_quant_wide <- as.data.frame(lapply(names(metab_quant), function(col) {
  if (col == "compound") {
    return(metab_quant[[col]])  # Keep the "sample" column as is
  } else {
    # Convert other columns to character first, then to numeric
    return(as.numeric(as.character(metab_quant[[col]])))
  }
}))

# Ensure the column names remain unchanged
colnames(metab_quant_wide) <- names(metab_quant)

str(metab_quant_wide)
```

Convert to long format. 

```{r}
metab_quant_long<-metab_quant_wide%>%
  pivot_longer(names_to="sample", values_to="rel.quant", cols = -c(compound))
```

## Add in metadata 

```{r}
metab_quant_long<-metab_quant_long%>%

  mutate(
    colony = str_extract(sample, "^[^_]+"),  # Extract everything before the underscore
    timepoint = str_extract(sample, "(?<=_).*"),  # Extract everything after the underscore
    species = case_when(
      str_detect(sample, "ACR") ~ "Acropora",
      str_detect(sample, "POC") ~ "Pocillopora",
      str_detect(sample, "POR") ~ "Porites",
      TRUE ~ NA_character_  # Default to NA if no match
    ), 
    type = case_when(
      str_detect(sample, "PBQC") ~ "PBQC", 
      TRUE ~ "sample"
      
    )
  )
```

```{r}
#add in site metadata
sites<-read_csv("../data/phys_master_timeseries.csv")%>%
  dplyr::select(c("colony_id_corr", "site"))%>%
  dplyr::rename(colony="colony_id_corr")

metab_quant_long<-left_join(metab_quant_long, sites)

metab_quant_long<-unique(metab_quant_long)
```

Add in protein data from lipid analyses (protein is the same because samples were taken from the same replicate). 

```{r}
protein<-read_xlsx("../data/metabolomics/2025-04-15_Roberts-98.xlsx", sheet="Protein")
```

Merge in protein data. 

```{r}
metab_quant_long<-left_join(metab_quant_long, protein)
```

## Filtering and normalizing 

How many lipids do we have currently? 
```{r}
length(levels(as.factor(metab_quant_long$compound)))
```
404 total metabolites 

## Remove metabolites not detected in the PBQC samples 

Remove facility internal standard metabolites. 
```{r}
standards<-c("3C13-Glycine", "4C13-Alanine", "2C13-Choline", "4C13-Serine", "d3-Creatinine", "6C13-Proline", "6C13-Valine", "5C13-Threonine", "5C13-Aspartic Acid", "6C13-Asparagine", "7C13-Leucine", "7C13-iso-Leucine", "2C13-Glutamic acid", "7C13-Glutamine", "6C13-Glutamic acid", "8C13-Lysine", "6C13-Methionine", "9C13-Histidine", "10C13-Phenylalanine", "2C13-Tyrosine", "10C13-Arginine", "10C13-Tyrosine", "13C13-Tryptophan", "8C13-Cystine", "11C13-Uridine", "3C13-Lactate", "4C13-3HBA", "2N15-Xanthine", "2N15-Urate", "1C13-Citrulline", "6C13-Glucose", "4C13-Pentothenate")

metab_quant_long<-metab_quant_long%>%
  filter(!compound %in% standards)

length(levels(as.factor(metab_quant_long$compound)))
```
372 metabolites after removing standards.  

Only keep metabolites that were measured in at least one of the PBQCs.  

```{r}
length(levels(as.factor(metab_quant_long$compound)))

#view metabolites not measured in the PBQC
test <- metab_quant_long %>%
  group_by(compound) %>%
  filter((type == "PBQC" & is.na(rel.quant)))%>%
  pull(compound)

test<-unique(test)

length(unique(test))

metab_quant_long<-metab_quant_long%>%
  filter(!compound %in% test)

length(levels(as.factor(metab_quant_long$compound)))

metab_quant_long<-metab_quant_long%>%
  dplyr::filter(!type=="PBQC")%>%
  dplyr::select(!type)
```
There are 187 metabolites not detected in the PBQCs. After removing those we have 185 total compounds.  

Remove metabolites that have NA in more than 10% of samples.  

```{r}
# Identify compounds with less than 25% NA in rel_quant
compounds_lt10pct_NA <- metab_quant_long %>%
  group_by(compound) %>%
  summarize(
    total = n(),
    na_count = sum(is.na(rel.quant)),
    na_prop = na_count / total
  ) %>%
  filter(na_prop < 0.1) %>%
  pull(compound)

length(compounds_lt10pct_NA)

metab_quant_long<-metab_quant_long%>%
  filter(compound %in% compounds_lt10pct_NA)

length(levels(as.factor(metab_quant_long$compound)))
```
We have 141 compounds in the dataset that are measured in >90% of samples. We will use imputation to account for the NAs.    

Rename dataframe. 

```{r}
data<-metab_quant_long
```

## Normalize to tissue input 

Next, conduct normalization to tissue input and multiply by constants as directed by the UW facility.  

```{r}
data<-data%>%
  mutate(rel.quant.ug=rel.quant/protein.ug)%>%
  dplyr::select(!protein.ug)%>%
  dplyr::select(!rel.quant)
```

Metabolite concentration is now normalized to protein.  

# WGCNA

Set up matrix

```{r}
# 1. Pivot wider so compounds are columns, samples are rows, values are rel.quant.ug
wgcna_data <- data %>%
  dplyr::select(sample, compound, rel.quant.ug) %>%
  pivot_wider(names_from = compound, values_from = rel.quant.ug)

# 2. Check for missing values; WGCNA requires no missing data (or impute/fill)
# Convert to matrix (remove sample column first)
expr_matrix <- as.data.frame(wgcna_data) %>% 
  column_to_rownames(var = "sample")

# Optional: Impute missing values (for example, replacing NA with small number or using an imputation method)
expr_matrix[is.na(expr_matrix)] <- 0

# 3. Check good genes and samples:
gsg <- goodSamplesGenes(expr_matrix, verbose = 3)
if (!gsg$allOK) {
  # Remove offending genes/samples:
  expr_matrix <- expr_matrix[gsg$goodSamples, gsg$goodGenes]
}

# 4. Now expr_matrix is ready for WGCNA input:
# Note: Add meta data later for traits (e.g., timepoint, species) if you want to correlate modules
```

```{r}
##Soft threshold
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# Call the network topology analysis function

#the below takes a long time to run, so is commented out and the pre-run results are loaded in below. I need to save it to my desktop because it is too large to push to github. Instead, it will go on OSF
sft <- pickSoftThreshold(expr_matrix, powerVector = powers, verbose = 5) #...wait for this to finish

# pickSoftThreshold 
#  performs the analysis of network topology and aids the
# user in choosing a proper soft-thresholding power.
# The user chooses a set of candidate powers (the function provides suitable default values)
# function returns a set of network indices that should be inspected
getwd()
sizeGrWindow(9, 5) # set window size 
# png to output 
png("../output/09-metabolomics-WGCNA/sft.png", 1000, 1000, pointsize=20)
par(mfrow = c(1,2));
cex1 = 0.9;
# # Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
    main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
    labels=powers,cex=cex1,col="red");
# # this line corresponds to using an R^2 cut-off
abline(h=0.8,col="red")
# # Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
    xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
    main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off() # output 
#I used a scale-free topology fit index **R^2 of 0.85**. This lowest recommended R^2 by Langfelder and Horvath is 0.8. I chose 0.85 because we want to use the smallest soft thresholding power that maximizes with model fit. It appears that our **soft thresholding power is 14** because it is the loweest power before the R^2=0.8 threshold that maximizes with model fit.
```

## Step wise module construction 

#### Step 1: Create adjacency matrix 

```{r}
softPower = 7 # set the soft threshold based on the plots above 

# signed 
#to get this to work with a ton of genes >30K, I had to increase my memory limit using: memory.limit(size = 35000) 
adjacency_sign = adjacency(expr_matrix, power = softPower, type="signed")  #Calculate adjacency
```

###  Step 2: Turn adjacency into topological overlap: Calculation of the topological overlap matrix, (TOM) and the corresponding dissimilarity, from a given adjacency matrix.

```{r}
#the below takes a long time to run, so is commented out and the pre-run results are loaded in below
TOM_sign = TOMsimilarity(adjacency_sign, TOMType="signed") #Translate adjacency into topological overlap matrix
dissTOM_sign   = 1-TOM_sign
```

###  Step 3: Call the hierarchical clustering function - plot the tree

```{r}
# Call the hierarchical clustering function
#to get this to work, I had to increase my memory limit using: memory.limit(size = 45000) 
geneTree_sign   = hclust(as.dist(dissTOM_sign), method = "average");

# Plot the resulting clustering tree (dendrogram) Each leaf corresponds to a gene, branches grouping together densely are interconnected, highly co-expressed genes.  
sizeGrWindow(12,9)

plot(geneTree_sign, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity - SIGNED",
     labels = FALSE, hang = 0.04)
```

###  Step 4: Set module size and 'cutreeDynamic' to create clusters 

```{r}
#Module identification is essentially cutting the branches off the tree in the dendrogram above. We like large modules, so we set the **minimum module size** relatively high, so we will set the minimum size at 30. 
minModuleSize = 30; # set this for the subseqent call...

dynamicMods_sign = cutreeDynamic(dendro = geneTree_sign, distM = dissTOM_sign,
                            deepSplit = 1, pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);
table(dynamicMods_sign) # number of genes per module. Module 0 is reserved for unassigned genes. The are other modules will be listed largest to smallest. 
```

###  Step 5: convert numeric network to colors and plot the dendrogram

```{r} 
# Convert numeric lables into colors
dynamicColors_sign = labels2colors(dynamicMods_sign) # add colors to module labels (previously numbers)
table(dynamicColors_sign) # lets look at this table...
# Plot the dendrogram and colors underneath

plotDendroAndColors(geneTree_sign, dynamicColors_sign, "Dynamic Tree Cut - SIGNED",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors 'SIGNED'")

sizeGrWindow(8,6)
png("../output/09-metabolomics-WGCNA/Dendrogram.png", 1000, 1000, pointsize=20)
plotDendroAndColors(geneTree_sign, dynamicColors_sign, "Dynamic Tree Cut - SIGNED",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors 'SIGNED'")
dev.off()
```

###  Step 6: Calculate Eigengenes - view thier connectivity based on 'MEDiss = 1-cor(MEs)'

```{r}
# Calculate eigengenes
MEList = moduleEigengenes(expr_matrix, colors = dynamicColors_sign, softPower = 14)

MEs = MEList$eigengenes
MEs
MEs <- MEs %>% 
    select_if(~ !any(is.na(.)))

# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs, use = "pairwise.complete.obs");

# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
sizeGrWindow(7, 6)
png("../output/09-metabolomics-WGCNA/ClusterEigengenes.png", 1000, 1000, pointsize=20)
plot(METree, main = "Clustering of module eigengenes - SIGNED (dissimilarity calc = MEDiss = 1-cor(MEs))",
     xlab = "", sub = "")
MEDissThres = 0.15 
abline(h=MEDissThres, col = "red")
dev.off()
```

###  Step 7: Specify the cut line for the dendrogram (module) - Calc MODULE EIGENGENES (mergeMEs)

```{r}
MEDissThres = 0.15 # **Merge modules with >85% eigengene similarity.** Most studies use somewhere between 80-90% similarity. I will use 85% similarity as my merging threshold.
# Plot the cut line into the dendrogram
#abline(h=MEDissThres, col = "red")
# Call an automatic merging function
# merge = mergeCloseModules(dds.d0_vst, dynamicColors, cutHeight = MEDissThres, verbose = 3)
merge = mergeCloseModules(expr_matrix, dynamicColors_sign, cutHeight = MEDissThres, verbose = 3)
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs;
library(dplyr)
mergedMEs <- mergedMEs %>% 
    select_if(~ !any(is.na(.)))
# Cluster module eigengenes
MEDiss2 = 1-cor(mergedMEs,use = 'pairwise.complete.obs');
MEDiss2
METree2 = hclust(as.dist(MEDiss2), method = "average");
# Plot the result
plot(METree2, main = "Clustering of module eigengenes - SIGNED (dissimilarity calc = MEDiss = 1-cor(MEs))",
     xlab = "", sub = "")


sizeGrWindow(7, 6)
png("../output/09-metabolomics-WGCNA/ClusterEigengenes_merged.png", 1000, 1000, pointsize=20)
plot(METree2, main = "Clustering of module eigengenes - SIGNED (dissimilarity calc = MEDiss = 1-cor(MEs))",
     xlab = "", sub = "")
dev.off()
```

###  Step 8: Plot dendrogram with the cut line 'MEDissThres' 

```{r}
plotDendroAndColors(geneTree_sign, cbind(dynamicColors_sign, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

sizeGrWindow(12, 9)

png("../output/09-metabolomics-WGCNA/ClusterDendrogram_signed.png", 1000, 1000, pointsize=20)
plotDendroAndColors(geneTree_sign, cbind(dynamicColors_sign, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
dev.off()
```

###  Step 9: Commit to mergedcolors as 'MEs' and 'moduleColors'
```{r}
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;
# write csv - save the module eigengenes
write.csv(MEs, file = "../output/09-metabolomics-WGCNA/WGCNA_ModuleEigengenes.csv")
table(mergedColors)
```