--- title: "22.6-Apul-multiomic-machine-learning-updatedWGBS" author: "Kathleen Durkin" date: "2025-05-25" output: github_document: toc: true number_sections: true html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bookdown::html_document2: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bibliography: references.bib --- Applying ML model using multiomic predictors -- miRNA + lncRNA + methylation as predictors of gene expression. Rerun of `22.3-Apul-multiomic-machine-learning-byTP` with revised input set of CpG sites. One bad sample, `ACR-225-TP1` had dramatically reduced the number of considered sites, so this rerun will exclude that sample. Inputs: - RNA counts matrix (raw): `../output/02.20-D-Apul-RNAseq-alignment-HiSat2/apul-gene_count_matrix.csv` - Gene sets of interest: `../output/21-Apul-annotate-miRNA-mRNA-WGCNA/filtered-gene-sets/` - sRNA/miRNA counts matrix (raw): `../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_miRNA_ShortStack_counts_formatted.txt` - lncRNA counts matrix (raw): `../output/08-Apul-lncNRA/counts.txt` - WGBS data (processed): Performed in `/timeseries_molecular/D-Apul/output/15.5-Apul-bismark/`, data in [large-file storage](https://gannet.fish.washington.edu/seashell/bu-github/timeseries_molecular/D-Apul/output/15.5-Apul-bismark/). Will be using filtered set generated by SR in `22.5-Apul-multiomic-SR` (available [here](https://github.com/urol-e5/timeseries_molecular/blob/main/D-Apul/output/22.5-Apul-multiomic-SR/Apul-filtered-WGBS-CpG-counts.csv)) - sample metadata: `../../M-multi-species/data/rna_metadata.csv` Will be predicting expression for a subset of genes related to energetic state (those annotated for the below GO terms) - Glycolysis GO:0006096 - Gluconeogenesis GO:0006094 - Lipolysis/lipid catabolism GO:0016042 - Fatty acid beta oxidation GO:0006635 - Starvation GO:0042594 - Lipid biosynthesis GO:000861 - Protein catabolic process GO:0030163 Selected GO terms evaluated at https://github.com/urol-e5/timeseries_molecular/blob/main/D-Apul/code/23-Apul-energetic-state.md # Set up ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, cache=TRUE) ``` # Load libraries ```{r, warning=FALSE, message=FALSE} library(tidyverse) library(ggplot2) library(DESeq2) library(igraph) library(psych) library(tidygraph) library(ggraph) library(WGCNA) library(edgeR) library(reshape2) library(ggcorrplot) library(corrplot) library(rvest) library(purrr) library(pheatmap) library(glmnet) library(caret) library(factoextra) library(vegan) library(ggfortify) library(genefilter) library(scales) library(purrr) ``` The model includes random processes, so set a seed for reproducability ```{r} set.seed(703) ``` # Load and format data ## RNA-seq data (mRNA, miRNA, lncRNA) ```{r load-data} ### mRNA ### # raw gene counts data (will filter and variance stabilize) Apul_genes <- read_csv("../output/02.20-D-Apul-RNAseq-alignment-HiSat2/apul-gene_count_matrix.csv") Apul_genes <- as.data.frame(Apul_genes) # format gene IDs as rownames (instead of a column) rownames(Apul_genes) <- Apul_genes$gene_id Apul_genes <- Apul_genes%>%select(!gene_id) ### miRNA ### # raw miRNA counts (will filter and variance stabilize) Apul_miRNA <- read.table(file = "../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_miRNA_ShortStack_counts_formatted.txt", header = TRUE, sep = "\t", check.names = FALSE) ### lncRNA ### # raw lncRNA counts (will filter and variance stabilize) Apul_lncRNA_full <- read.table("../output/08-Apul-lncRNA/counts.txt", header = TRUE, sep = "\t", skip = 1) # Remove info on genomic location, set lncRNA IDs as rownames rownames(Apul_lncRNA_full) <- Apul_lncRNA_full$Geneid Apul_lncRNA <- Apul_lncRNA_full %>% select(-Geneid, -Chr, -Start, -End, -Strand, -Length) ### load and format metadata ### metadata <- read_csv("../../M-multi-species/data/rna_metadata.csv")%>%select(AzentaSampleName, ColonyID, Timepoint) %>% filter(grepl("ACR", ColonyID)) metadata$Sample <- paste0(metadata$ColonyID, "-", metadata$Timepoint) rownames(metadata) <- metadata$Sample colonies <- unique(metadata$ColonyID) # Rename gene column names to include full sample info colnames(Apul_genes) <- metadata$Sample[match(colnames(Apul_genes), metadata$AzentaSampleName)] # Rename miRNA column names to match formatting colnames(Apul_miRNA) <- sub("_.*", "", colnames(Apul_miRNA)) colnames(Apul_miRNA) <- metadata$Sample[match(colnames(Apul_miRNA), metadata$AzentaSampleName)] # rename lncRNA column names to include full sample info colnames(Apul_lncRNA) <- sub("...data.", "", colnames(Apul_lncRNA)) colnames(Apul_lncRNA) <- sub(".sorted.bam", "", colnames(Apul_lncRNA)) colnames(Apul_lncRNA) <- metadata$Sample[match(colnames(Apul_lncRNA), metadata$AzentaSampleName)] ``` ## Islolate gene set Read in gene set table ```{r} # GO terms related to energetic state energetic_state_GO <- read.csv("../output/21-Apul-annotate-miRNA-mRNA-WGCNA/filtered-gene-sets/energetic_state_GO_terms_genes_FA.csv") ``` Isolate filtered counts by gene set ```{r} energetic_state_GO_genes <- Apul_genes[rownames(Apul_genes) %in% energetic_state_GO$gene,] ``` ## WGBS data ```{r, eval=FALSE} #pull processed files from Gannet # Note: Unfortunately we can't use the `cache` feature to make this process more time efficient, as it doesn't support long vectors # Define the base URL base_url <- "https://gannet.fish.washington.edu/seashell/bu-github/timeseries_molecular/D-Apul/output/15.5-Apul-bismark/" # Read the HTML page page <- read_html(base_url) # Extract links to files file_links <- page %>% html_nodes("a") %>% html_attr("href") # Filter for files ending in "processed.txt" processed_files <- file_links[grepl("processed\\.txt$", file_links)] # Create full URLs file_urls <- paste0(base_url, processed_files) # Function to read a file from URL read_processed_file <- function(url) { read_table(url, col_types = cols(.default = "c")) # Read as character to avoid parsing issues } # Import all processed files into a list processed_data <- lapply(file_urls, read_processed_file) # Name the list elements by file name names(processed_data) <- processed_files # Print structure of imported data str(processed_data) # add a header row that has "CpG" for the first column and "sample" for the second column, which will be populated by the file name processed_data <- Map(function(df, filename) { colnames(df) <- c("CpG", filename) # Rename columns return(df) }, processed_data, names(processed_data)) # Use stored file names #merge files together by "CpG" merged_data <- purrr::reduce(processed_data, full_join, by = "CpG") # Print structure of final merged data str(merged_data) ``` Replace any NA with 0. ```{r, eval=FALSE} # Convert all columns (except "CpG") to numeric and replace NAs with 0 merged_data <- merged_data %>% mutate(across(-CpG, as.numeric)) %>% # Convert all except CpG to numeric mutate(across(-CpG, ~ replace_na(.x, 0))) # Replace NA with 0 in numeric columns ``` ## Filter data sets ### WGBS Remove the sample with poor WGBS data quality (`ACR-225-TP1`) ```{r, eval=FALSE} merged_data <- merged_data %>% select(-"ACR-225-TP1_10x_processed.txt") ``` Only keep CpGs that have a non-zero value in all samples. ```{r, eval=FALSE} filtered_wgbs <- merged_data %>% filter(if_all(-CpG, ~ .x > 0)) # Ensure it's formatted as a data frame filtered_wgbs <- as.data.frame(filtered_wgbs) # Only keep the sample information in the column name. colnames(filtered_wgbs) <- gsub("^(.*?)_.*$", "\\1", colnames(filtered_wgbs)) # Set CpG IDs to rownames rownames(filtered_wgbs) <- filtered_wgbs$CpG filtered_wgbs <- filtered_wgbs %>% select(-CpG) nrow(merged_data) nrow(filtered_wgbs) ``` We had 12,093,025 CpGs before filtering and have 14872 after filtering. This is a huge improvement to the 507 CpG sites retained when we included Sample `ACR-225-TP1` Save filtered set to make code reruns/knitting quicker ```{r, eval=FALSE} write.csv(filtered_wgbs, "../output/22.6-Apul-multiomic-machine-learning-updatedWGBS/filtered-WGBS-CpG-counts.csv") ``` If knitting/rerunning code, we can load in this filtered data here, instead of loading raw counts and reprocessing. ```{r} filtered_wgbs <- read.csv("../output/22.6-Apul-multiomic-machine-learning-updatedWGBS/filtered-WGBS-CpG-counts.csv", row.names = 1, check.names = FALSE) ``` ### RNA Remove the sample with poor WGBS data quality (`ACR-225-TP1`) ```{r, eval=FALSE} energetic_state_GO_genes <- energetic_state_GO_genes %>% select(-"ACR-225-TP1") Apul_miRNA <- Apul_miRNA %>% select(-"ACR-225-TP1") Apul_lncRNA <- Apul_lncRNA %>% select(-"ACR-225-TP1") ``` Only keep genes, miRNA, and lncRNA that are present in at least one sample ```{r} # genes energetic_state_GO_genes_red <- energetic_state_GO_genes[rowSums(energetic_state_GO_genes) != 0, ] # miRNA Apul_miRNA_red <- Apul_miRNA[rowSums(Apul_miRNA) != 0, ] # lncRNA Apul_lncRNA_red <- Apul_lncRNA[rowSums(Apul_lncRNA) != 0, ] cat("Retained ", nrow(energetic_state_GO_genes_red), " of ", nrow(energetic_state_GO_genes), "genes; ", nrow(Apul_miRNA_red), " of ", nrow(Apul_miRNA), " miRNA; and ", nrow(Apul_lncRNA_red), " of ", nrow(Apul_lncRNA), " lncRNA") ``` *pOverA*: Specifying the minimum count for a proportion of samples for each gene. Setting 3/40 = 0.08. This would retain genes that are only expressed in a single season in a couple of the colonies. Additionally, setting the minimum count so that the minimum number of samples must have a gene count above a certain threshold. genes: ```{r} filt <- filterfun(pOverA(0.08, 5)) #create filter for the counts data gfilt <- genefilter(energetic_state_GO_genes_red, filt) #identify genes to keep by count filter gkeep <- energetic_state_GO_genes_red[gfilt,] #identify gene lists gn.keep <- rownames(gkeep) #gene count data filtered in PoverA, P percent of the samples have counts over A energetic_state_GO_genes_filt <- as.data.frame(energetic_state_GO_genes_red[which(rownames(energetic_state_GO_genes_red) %in% gn.keep),]) #How many rows do we have before and after filtering? cat("Pre-filtering:", nrow(energetic_state_GO_genes_red), "; Post-filtering:", nrow(energetic_state_GO_genes_filt)) ``` miRNA: ```{r} mifilt <- filterfun(pOverA(0.08, 5)) #create filter for the counts data mifilt <- genefilter(Apul_miRNA_red, mifilt) #identify miRNA to keep by count filter mikeep <- Apul_miRNA_red[mifilt,] #identify miRNA to keep by count filter mikeep <- Apul_miRNA_red[mifilt,] #identify miRNA lists mi.keep <- rownames(mikeep) #miRNA count data filtered in PoverA, P percent of the samples have counts over A Apul_miRNA_filt <- as.data.frame(Apul_miRNA_red[which(rownames(Apul_miRNA_red) %in% mi.keep),]) #How many rows do we have before and after filtering? cat("Pre-filtering:", nrow(Apul_miRNA_red), "; Post-filtering:", nrow(Apul_miRNA_filt)) ``` Of the 51 miRNA, 47 were retained. Which were removed? ```{r} setdiff(rownames(Apul_miRNA_red), rownames(Apul_miRNA_filt)) ``` lncRNA: ```{r} lncfilt <- filterfun(pOverA(0.08, 5)) #create filter for the counts data lncfilt <- genefilter(Apul_lncRNA_red, lncfilt) #identify lncRNA to keep by count filter lnckeep <- Apul_lncRNA_red[lncfilt,] #identify lncRNA to keep by count filter lnckeep <- Apul_lncRNA_red[lncfilt,] #identify lncRNA lists lnc.keep <- rownames(lnckeep) #lncRNA count data filtered in PoverA, P percent of the samples have counts over A Apul_lncRNA_filt <- as.data.frame(Apul_lncRNA_red[which(rownames(Apul_lncRNA_red) %in% lnc.keep),]) #How many rows do we have before and after filtering? cat("Pre-filtering:", nrow(Apul_lncRNA_red), "; Post-filtering:", nrow(Apul_lncRNA_filt)) ``` ## Transform data Remove Sample `ACR-225-TP` from metadata table ```{r} metadata <- metadata[!(rownames(metadata) == "ACR-225-TP1"), ] rownames(metadata) <- metadata$Sample ``` Set the order of genes, miRNA, lncRNA, wgbs, and metadata to all be the same. ```{r} # Ensure rownames of metadata are used as the desired column order desired_order <- rownames(metadata) # Reorder data frame columns energetic_state_GO_genes_filt <- energetic_state_GO_genes_filt[, desired_order] Apul_miRNA_filt <- Apul_miRNA_filt[, desired_order] Apul_lncRNA_filt <- Apul_lncRNA_filt[, desired_order] filtered_wgbs <- filtered_wgbs[, desired_order] # Check they all match identical(rownames(metadata), colnames(energetic_state_GO_genes_filt)) identical(rownames(metadata), colnames(Apul_miRNA_filt)) identical(rownames(metadata), colnames(Apul_lncRNA_filt)) identical(rownames(metadata), colnames(filtered_wgbs)) ``` Use a variance stabilized transformation for all four data sets. Variance stabilization essentially tries to make variance independent of the mean (Is this the most appropriate design to use?) genes: ```{r} dds_energetic_state_GO <- DESeqDataSetFromMatrix(countData = energetic_state_GO_genes_filt, colData = metadata, design = ~Timepoint+ColonyID) # Variance Stabilizing Transformation vsd_energetic_state_GO <- varianceStabilizingTransformation(dds_energetic_state_GO, blind = TRUE) # Must use varianceStabilizingTransformation() instead of vst() due to few input genes vsd_energetic_state_GO <- assay(vsd_energetic_state_GO) ``` miRNA: ```{r} dds_miRNA <- DESeqDataSetFromMatrix(countData = Apul_miRNA_filt, colData = metadata, design = ~Timepoint+ColonyID) # Variance Stabilizing Transformation vsd_miRNA <- varianceStabilizingTransformation(dds_miRNA, blind=TRUE) # Must use varianceStabilizingTransformation() instead of vst() due to few input genes vsd_miRNA <- assay(vsd_miRNA) ``` lncRNA: ```{r} dds_lncRNA <- DESeqDataSetFromMatrix(countData = Apul_lncRNA_filt, colData = metadata, design = ~Timepoint+ColonyID) # Variance Stabilizing Transformation vsd_lncRNA <- assay(vst(dds_lncRNA, blind = TRUE)) ``` Must round wgbs data to whole integers for normalization - need to return to this to decide if this is appropriate. ```{r} #round to integers filtered_wgbs<-filtered_wgbs %>% mutate(across(where(is.numeric), round)) dds_wgbs <- DESeqDataSetFromMatrix(countData = filtered_wgbs, colData = metadata, design = ~ Timepoint+ColonyID) # Variance Stabilizing Transformation vsd_wgbs <- assay(varianceStabilizingTransformation(dds_wgbs, blind = TRUE)) ``` ## Merge predictor features Create a merged dataset that contains the variance stabilized counts for all miRNA, lncRNA, and CpGs. ```{r} # Triple check that all three data frames have sample names in the same order identical(colnames(vsd_lncRNA), colnames(vsd_miRNA)) identical(colnames(vsd_lncRNA), colnames(vsd_wgbs)) # Bind (stack dataframes vertically, so that they match by column/sample) full_pred_counts <- rbind(vsd_lncRNA, vsd_miRNA, vsd_wgbs) # Transform so that samples are on rows and features are in columns full_pred_counts <- t(full_pred_counts) dim(full_pred_counts) ``` ## Format ```{r} # Transform the counts matrices so that samples are on the rows and gene IDs on the columns vsd_energetic_state_GO_t <- t(vsd_energetic_state_GO) # Ensure both are formatted as data frames full_pred_counts <- as.data.frame(full_pred_counts) vsd_energetic_state_GO_t <- as.data.frame(vsd_energetic_state_GO_t) # Ensure sample matching between gene and epigenetic dfs common_samples <- intersect(rownames(vsd_energetic_state_GO_t), rownames(full_pred_counts)) full_pred_counts <- full_pred_counts[common_samples, ] vsd_energetic_state_GO_t <- vsd_energetic_state_GO_t[common_samples,] ``` # Define model functions ```{r} train_models <- function(response_features, predictor_features) { models <- list() for (feature in colnames(response_features)) { y <- response_features[[feature]] # Gene expression X <- as.matrix(predictor_features) # miRNA/lncRNA/methylation as predictors # Train elastic net model (alpha = 0.5 for mix of LASSO & Ridge) # Reducing nfolds from 10 (default) to 5, since I'm using smaller sample sizes of just 10 samples per timepoint model <- cv.glmnet(X, y, alpha = 0.5, nfolds = 3) models[[feature]] <- model } return(models) } get_feature_importance <- function(models) { importance_list <- lapply(models, function(model) { coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE] # Convert to regular matrix & remove intercept # Convert to data frame coefs_df <- data.frame(Feature = rownames(coefs), Importance = as.numeric(coefs)) return(coefs_df) }) # Combine feature importance across all predicted genes importance_df <- bind_rows(importance_list) %>% group_by(Feature) %>% summarize(MeanImportance = mean(abs(Importance)), .groups = "drop") %>% arrange(desc(MeanImportance)) return(importance_df) } evaluate_model_performance <- function(models, response_features, predictor_features) { results <- data.frame(Feature = colnames(response_features), R2 = NA) for (feature in colnames(response_features)) { y <- response_features[[feature]] X <- as.matrix(predictor_features) model <- models[[feature]] preds <- predict(model, X, s = "lambda.min") R2 <- cor(y, preds)^2 # R-squared metric results[results$Feature == feature, "R2"] <- R2 } return(results) } ``` # Energetic state (GO terms) # Split training and test ## Define model training and testing function ```{r} train_models_split <- function(response_features, predictor_features, train_frac = 0.8, alpha = 0.5, nfolds = 10) { # Set up storage models <- list() performance <- data.frame(Feature = colnames(response_features), R2 = NA) # Select fraction of data sample_idx <- sample(seq_len(nrow(predictor_features)), size = floor(train_frac * nrow(predictor_features))) # Split predictor data X_train <- predictor_features[sample_idx, ] X_test <- predictor_features[-sample_idx, ] for (feature in colnames(response_features)) { # Split response data y_train <- response_features[sample_idx, feature] y_test <- response_features[-sample_idx, feature] # Check if y_train is constant if (length(unique(y_train)) == 1) { warning(sprintf("Skipping feature '%s': y_train is constant", feature)) next } # Run model on training data model <- cv.glmnet(as.matrix(X_train), y_train, alpha = alpha, nfolds = nfolds) models[[feature]] <- model # Now apply the trained model to test data preds <- predict(model, newx = as.matrix(X_test), s = "lambda.min") R2 <- cor(y_test, preds)^2 performance[performance$Feature == feature, "R2"] <- R2 } return(list(models = models, performance = performance)) } ``` # Bootstrapping ```{r bootstrap-1, cache=TRUE} n_replicates <- 25 # number of replicates r2_threshold <- 0.5 # threshold for "good" prediction TM_list <- list() MP_list <- list() FI_list <- list() # Perform bootstrapping for (i in 1:n_replicates) { cat("Beginning replicate", i) cat("\n") result <- train_models_split(vsd_energetic_state_GO_t, full_pred_counts) cat("Extracting results for replicate", i) cat("\n") trained_models <- result$models model_performance <- result$performance feature_importance <- get_feature_importance(trained_models) TM_list[[i]] <- trained_models TM_list[[i]]$Replicate <- i MP_list[[i]] <- model_performance MP_list[[i]]$Replicate <- i FI_list[[i]] <- feature_importance FI_list[[i]]$Replicate <- i } # Combine all results all_TM <- do.call(rbind, TM_list) all_MP <- do.call(rbind, MP_list) all_FI <- do.call(rbind, FI_list) ``` ```{r} # Count how often each feature exceeds the R² threshold MP_summary <- all_MP %>% group_by(Feature) %>% summarize( Mean_R2 = mean(R2, na.rm = TRUE), SD_R2 = sd(R2, na.rm = TRUE), SE_R2 = SD_R2/sqrt(n()), High_Perf_Count = sum(R2 >= r2_threshold), .groups = 'drop' ) %>% arrange(desc(High_Perf_Count)) # View top consistently high-performing features print(MP_summary) # How many genes are consistently well-predicted? nrow(MP_summary[MP_summary$Mean_R2 > 0.5,]) ``` Sort genes by average R^2 across all replicates, then plot R^2 for all replicates ```{r} # Merge mean R2 info into full replicate-level data all_MP <- merge(all_MP, MP_summary[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature") # Reorder Feature factor by Mean_R2 all_MP$Feature <- factor(all_MP$Feature, levels = MP_summary$Feature[order(MP_summary$Mean_R2)]) ``` ```{r} # Plot mean R² with error bars # Choose SD to describe variability in model performance ggplot(all_MP, aes(x = Feature, y = Mean_R2)) + geom_errorbar(aes(ymin = Mean_R2 - SD_R2, ymax = Mean_R2 + SD_R2), width = 0.4, color = "gray40", linewidth = 0.25) + geom_point(size = 2, color = "black") + geom_hline(yintercept = 0.5, linetype = "dashed", color = "blue") + theme_minimal() + labs(title = "Mean R² with Error Bars Across Gene Features", x = "Gene Expression Feature (Ordered by Mean R²)", y = "Mean R² ± SD") + theme(axis.text.x = element_text(angle = 45, hjust = 1, size=5)) + ylim(-0.2, 1.2) + scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1)) ``` ```{r} # Plot with features sorted by mean R² ggplot(all_MP, aes(x = Feature, y = R2)) + geom_point(aes(color = as.factor(Replicate)), size = 1) + geom_point(aes(y = Mean_R2), color = "black", shape = 18, size = 3) + geom_hline(yintercept = mean(all_MP$R2, na.rm = TRUE), linetype = "dashed", color = "blue") + theme_minimal() + labs(title = "Model Performance Across Gene Expression", x = "Gene Expression Feature (Ordered by Mean R²)", y = "R² (Variance Explained)", color = "Replicate") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0, 1) ``` Average importance of all predictive features and plot those with highest average importance accross all replicates ```{r} # Average importance across replicates mean_importance_df <- all_FI %>% group_by(Feature) %>% summarise(MeanImportance = mean(MeanImportance, na.rm = TRUE)) %>% arrange(desc(MeanImportance)) # Select top 50 features top_features <- mean_importance_df %>% top_n(50, MeanImportance) # Plot ggplot(top_features, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + # Flip for readability theme_minimal() + labs(title = "Top Predictive Features by Mean Importance", x = "Feature", y = "Mean Importance") ``` # Boostrapping on predictable genes Now I only want to model genes that are well-predicted over many replicates Isolate well-predicted genes ```{r} well_predicted <- MP_summary[MP_summary$Mean_R2 > 0.5,] vsd_high_perf_t <- vsd_energetic_state_GO_t[,colnames(vsd_energetic_state_GO_t) %in% well_predicted$Feature] dim(vsd_high_perf_t) ``` Of the original 99 genes annotated for energetic state GO terms, 33 are consistently well-predicted by our model (mean R^2 > 0.5, over 25 replicates). Run bootstrapping on isolated genes ```{r bootstrap-2, cache=TRUE} n_replicates <- 50 # number of replicates r2_threshold <- 0.5 # threshold for "good" prediction TM_list <- list() MP_list <- list() FI_list <- list() # Perform bootstrapping for (i in 1:n_replicates) { cat("Beginning replicate", i) cat("\n") result <- train_models_split(vsd_high_perf_t, full_pred_counts) cat("Extracting results for replicate", i) cat("\n") trained_models <- result$models model_performance <- result$performance feature_importance <- get_feature_importance(trained_models) TM_list[[i]] <- list(Replicate = i, Models = trained_models) MP_list[[i]] <- model_performance MP_list[[i]]$Replicate <- i FI_list[[i]] <- feature_importance FI_list[[i]]$Replicate <- i } # Combine all results all_MP_highperf <- do.call(rbind, MP_list) all_FI_highperf <- do.call(rbind, FI_list) ``` ```{r} # Count how often each feature exceeds the R² threshold MP_summary_highperf <- all_MP_highperf %>% group_by(Feature) %>% summarize( Mean_R2 = mean(R2, na.rm = TRUE), SD_R2 = sd(R2, na.rm = TRUE), SE_R2 = SD_R2 / sqrt(n()), High_Perf_Count = sum(R2 >= r2_threshold), .groups = 'drop' ) %>% arrange(desc(High_Perf_Count)) # View top consistently high-performing features print(MP_summary_highperf) # How many genes are consistently well-predicted? nrow(MP_summary_highperf[MP_summary_highperf$Mean_R2 > 0.5,]) ``` When I train the model on only these 33 well-predicted genes, 28 of them are again well-predicted. That's a much higher success rate! This suggests that epigenetic mechanisms may indeed be able to predict expression of these genes Sort genes by average R^2 across all replicates, then plot R^2 for all replicates ```{r} # Merge mean R2 info into full replicate-level data all_MP_highperf <- merge(all_MP_highperf, MP_summary_highperf[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature") # Reorder Feature factor by Mean_R2 all_MP_highperf$Feature <- factor(all_MP_highperf$Feature, levels = MP_summary_highperf$Feature[order(MP_summary_highperf$Mean_R2)]) ``` ```{r summarized-R2-plot} # Plot mean R² with error bars # Choose SD to describe variability in model performance ggplot(all_MP_highperf, aes(x = Feature, y = Mean_R2)) + geom_errorbar(aes(ymin = Mean_R2 - SD_R2, ymax = Mean_R2 + SD_R2), width = 0.3, color = "gray40") + geom_point(size = 2, color = "black") + geom_hline(yintercept = 0.5, linetype = "dashed", color = "blue") + theme_minimal() + labs(title = "Mean R² with Error Bars Across Gene Features", x = "Gene Expression Feature (Ordered by Mean R²)", y = "Mean R² ± SD") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(-0.2, 1.2) + scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1)) ``` ```{r} # Plot all replicate R2s, with features sorted by mean R² ggplot(all_MP_highperf, aes(x = Feature, y = R2)) + geom_point(aes(color = as.factor(Replicate)), size = 1.5) + geom_point(aes(y = Mean_R2), color = "black", shape = 18, size = 3) + geom_hline(yintercept = mean(all_MP_highperf$R2, na.rm = TRUE), linetype = "dashed", color = "blue") + theme_minimal() + labs(title = "Model Performance Across Gene Expression", x = "Gene Expression Feature (Ordered by Mean R²)", y = "R² (Variance Explained)", color = "Replicate") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ylim(0, 1) ``` ```{r} # Average importance across replicates mean_importance_df_highperf <- all_FI_highperf %>% group_by(Feature) %>% summarise(MeanImportance = mean(MeanImportance, na.rm = TRUE)) %>% arrange(desc(MeanImportance)) # Select top 50 features top_features_highperf <- mean_importance_df_highperf %>% top_n(50, MeanImportance) # Plot ggplot(top_features_highperf, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + # Flip for readability theme_minimal() + labs(title = "Top Predictive Features by Mean Importance", x = "Feature", y = "Mean Importance") ``` Look at most predictive epigenetic features for these well-predicted genes ```{r} # Define color assignments for predictor types color_palette <- c(miRNA = "#E69F00", lncRNA = "#0072B2", CpG = "#009E73") # Define well-predicted features all_features_highR2 <- all_MP_highperf %>% filter(Mean_R2 > 0.5) %>% pull(Feature) %>% unique() # Function to extract importance from a single model get_feature_importance_for_feature <- function(model) { coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE] # remove intercept data.frame(Predictor = rownames(coefs), Importance = abs(as.numeric(coefs))) } # Function to assign predictor type get_predictor_type <- function(predictor_name) { if (startsWith(predictor_name, "Cluster")) return("miRNA") if (startsWith(predictor_name, "lncRNA")) return("lncRNA") if (startsWith(predictor_name, "CpG")) return("CpG") return("Other") } # Initialize list to save top predictors top_predictors <- list() # Loop over features, aggregate importance across replicates, and plot for (target_feature in all_features_highR2) { # Extract and combine importance scores across all replicates importance_all_reps <- lapply(TM_list, function(rep_entry) { model <- rep_entry$Models[[target_feature]] if (!is.null(model)) { get_feature_importance_for_feature(model) } else { NULL } }) %>% bind_rows() # If no importance data was found, skip if (nrow(importance_all_reps) == 0) next # Compute average importance across replicates mean_importance <- importance_all_reps %>% group_by(Predictor) %>% summarise(MeanImportance = mean(Importance, na.rm = TRUE), .groups = "drop") %>% arrange(desc(MeanImportance)) %>% slice_head(n = 20) %>% mutate(Type = sapply(Predictor, get_predictor_type)) # Plot once per target feature plot <- ggplot(mean_importance, aes(x = reorder(Predictor, MeanImportance), y = MeanImportance, fill=Type)) + geom_bar(stat = "identity") + scale_fill_manual(values = color_palette) + coord_flip() + theme_minimal() + labs(title = paste("Top Predictors for", target_feature), x = "Predictor", y = "Mean Importance (across replicates)") # Save top 20 most important predictors for later use top20 <- mean_importance %>% mutate(Feature = target_feature) top_predictors[[target_feature]] <- top20 print(plot) } ``` # Plot expression of genes and their predictors ```{r, error=FALSE, warning=FALSE} # Put predictor expression in format with samples in columns full_pred_counts_t <- t(full_pred_counts) # Loop over each feature in the list for (feature in names(top_predictors)) { # Get top 5 predictors for this feature top_df <- top_predictors[[feature]] %>% slice_head(n = 5) predictors <- top_df$Predictor # Combine feature and predictors all_genes <- c(feature, predictors) # Check which genes are actually present present_genes <- all_genes[all_genes %in% rownames(full_pred_counts_t) | all_genes %in% rownames(vsd_energetic_state_GO)] if (!(feature %in% rownames(vsd_energetic_state_GO))) { warning(paste("Feature", feature, "not found in vsd_energetic_state_GO Skipping...")) next } # Extract predictor expression (only those present) predictor_expr <- full_pred_counts_t[intersect(predictors, rownames(full_pred_counts_t)), , drop = FALSE] %>% as.data.frame() # Add the feature expression feature_expr <- vsd_energetic_state_GO[feature, , drop = FALSE] %>% as.data.frame() # Combine into one data frame combined_expr <- rbind(predictor_expr, feature_expr) combined_expr$Gene <- rownames(combined_expr) # Convert to long format expr_long <- combined_expr %>% pivot_longer(-Gene, names_to = "Sample", values_to = "Expression") # Join with sample metadata expr_long <- expr_long %>% left_join(metadata, by = "Sample") # Plot, with colonies aggregated by timepoint # p <- ggplot(expr_long, aes(x = interaction(Timepoint), y = Expression, color = Gene, group = Gene)) + # geom_point(alpha = 0.7) + # geom_smooth(se = FALSE, method = "loess") + # labs(title = paste("Expression of", feature, "and Top 5 Predictors"), # x = "Colony-Timepoint", # y = "Expression") + # theme_minimal() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Plot, faceted by colony p <- ggplot(expr_long, aes(x = Timepoint, y = Expression, color = Gene, group = Gene)) + geom_point(alpha = 0.7) + geom_smooth(se = FALSE, method = "loess", linewidth = 0.5) + facet_wrap(~ColonyID) + labs(title = paste("Expression of", feature, "and Top 5 Predictors by Colony"), x = "Timepoint", y = "Expression") + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) print(p) } ``` # Functions of well-predicted genes ```{r} all_features_highR2_FA <- energetic_state_GO[energetic_state_GO$gene %in% all_features_highR2,] print(all_features_highR2_FA) ``` Save ```{r} write.csv(all_features_highR2_FA, "../output/22.6-Apul-multiomic-machine-learning-updatedWGBS/energetic_state_highperf_FA.csv") ``` Save top predictors and metrics of each well-predicted gene ```{r} # Combine all sub-tibbles into a single data frame with the parent list name as an ID top_predictors_df <- imap_dfr(top_predictors, ~ { .x }) # View the resulting data frame print(top_predictors_df) # Optional: save to CSV write.csv(top_predictors_df, "../output/22.6-Apul-multiomic-machine-learning-updatedWGBS/top_predictors.csv", row.names = FALSE) ``` Wow! Removing Sample ACR-225-TP1 had a huge impact. 30x more CpG sites were retained, and a preliminary rerun of the model (5 replicates) showed CpGs as a much more important component of predicting gene expression! Will up the number of replicates to increase confidence