--- title: "22-Apul-miRNA-mRNA-machine-learning" author: "Kathleen Durkin" date: "2025-03-31" output: github_document: toc: true number_sections: true html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bookdown::html_document2: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show code_download: true bibliography: references.bib editor_options: chunk_output_type: console --- Before running, set knit directory to "Project Directory". - Ariana I'd like to see whether phenotype can predict gene and/or miRNA expression, will be testing this using the ML approach Ariana has been trialing (see her mRNA-WGBS ML post here: https://ahuffmyer.github.io/ASH_Putnam_Lab_Notebook/E5-timeseries-molecular-mRNA-WGBS-machine-learning-analysis-Part-1/) Inputs: - RNA counts matrix (raw): `../output/02.20-D-Apul-RNAseq-alignment-HiSat2/apul-gene_count_matrix.csv` - sRNA/miRNA counts matrix (raw): `../output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_miRNA_ShortStack_counts_formatted.txt` - sample metadata: `../../M-multi-species/data/rna_metadata.csv` - physiological data: https://github.com/urol-e5/timeseries/raw/refs/heads/master/time_series_analysis/Output/master_timeseries.csv Note that I'll start by using phenotype (e.g. biomass, respiration) as the predictor, which is suitable for understanding *how external factors drive gene expression changes.* If, instead, we wanted to build some sort of predictive model, where *gene expression could be used to predict phenotype*, we could switch so that gene counts are used as the predictors. # Load libraries ```{r} library(tidyverse) library(ggplot2) library(DESeq2) library(igraph) library(psych) library(tidygraph) library(ggraph) library(WGCNA) library(edgeR) library(reshape2) library(ggcorrplot) library(corrplot) library(rvest) library(purrr) library(pheatmap) library(glmnet) library(caret) library(factoextra) library(vegan) library(ggfortify) ``` # Load and prep data Load in count matrices for RNAseq. ```{r load-data} # raw gene counts data (will filter and variance stabilize) Apul_genes <- read_csv("D-Apul/output/02.20-D-Apul-RNAseq-alignment-HiSat2/apul-gene_count_matrix.csv") Apul_genes <- as.data.frame(Apul_genes) # format gene IDs as rownames (instead of a column) rownames(Apul_genes) <- Apul_genes$gene_id Apul_genes <- Apul_genes%>%select(!gene_id) # load and format metadata metadata <- read_csv("M-multi-species/data/rna_metadata.csv")%>%select(AzentaSampleName, ColonyID, Timepoint) %>% filter(grepl("ACR", ColonyID)) metadata$Sample <- paste(metadata$AzentaSampleName, metadata$ColonyID, metadata$Timepoint, sep = "_") colonies <- unique(metadata$ColonyID) # Load physiological data phys<-read_csv("https://github.com/urol-e5/timeseries/raw/refs/heads/master/time_series_analysis/Output/master_timeseries.csv")%>%filter(colony_id_corr %in% colonies)%>% select(colony_id_corr, species, timepoint, site, Host_AFDW.mg.cm2, Sym_AFDW.mg.cm2, Am, AQY, Rd, Ik, Ic, calc.umol.cm2.hr, cells.mgAFDW, prot_mg.mgafdw, Ratio_AFDW.mg.cm2, Total_Chl, Total_Chl_cell, cre.umol.mgafdw) # format timepoint phys$timepoint <- gsub("timepoint", "TP", phys$timepoint) #add column with full sample info phys <- merge(phys, metadata, by.x = c("colony_id_corr", "timepoint"), by.y = c("ColonyID", "Timepoint")) %>% select(-AzentaSampleName) #add site information into metadata metadata$Site<-phys$site[match(metadata$ColonyID, phys$colony_id_corr)] # Rename gene column names to include full sample info (as in miRNA table) colnames(Apul_genes) <- metadata$Sample[match(colnames(Apul_genes), metadata$AzentaSampleName)] # raw miRNA counts (will filter and variance stabilize) Apul_miRNA <- read.table(file = "D-Apul/output/03.10-D-Apul-sRNAseq-expression-DESeq2/Apul_miRNA_ShortStack_counts_formatted.txt", header = TRUE, sep = "\t", check.names = FALSE) ``` ## Counts filtering Ensure there are no genes or miRNAs with 0 counts across all samples. ```{r} nrow(Apul_genes) Apul_genes_filt<-Apul_genes %>% mutate(Total = rowSums(.[, 1:40]))%>% filter(!Total==0)%>% dplyr::select(!Total) nrow(Apul_genes_filt) # miRNAs nrow(Apul_miRNA) Apul_miRNA_filt<-Apul_miRNA %>% mutate(Total = rowSums(.[, 1:40]))%>% filter(!Total==0)%>% dplyr::select(!Total) nrow(Apul_miRNA_filt) ``` Removing genes with only 0 counts reduced number from 44371 to 35869. Retained all 51 miRNAs. Will not be performing pOverA filtering for now, since LM should presumabily incorporate sample representation (Why? - Ariana) ## Physiology filtering Run PCA on physiology data to see if there are phys outliers Export data for PERMANOVA test. ```{r} test<-as.data.frame(phys) test<-test[complete.cases(test), ] ``` Build PERMANOVA model. ```{r} scaled_test <-prcomp(test%>%select(where(is.numeric)), scale=TRUE, center=TRUE) fviz_eig(scaled_test) # scale data vegan <- scale(test%>%select(where(is.numeric))) # PerMANOVA permanova<-adonis2(vegan ~ timepoint*site, data = test, method='eu') permanova ``` ```{r} pca1<-ggplot2::autoplot(scaled_test, data=test, frame.colour="timepoint", loadings=FALSE, colour="timepoint", shape="site", loadings.label.colour="black", loadings.colour="black", loadings.label=FALSE, frame=FALSE, loadings.label.size=5, loadings.label.vjust=-1, size=5) + geom_text(aes(x = PC1, y = PC2, label = paste(colony_id_corr, timepoint)), vjust = -0.5)+ theme_classic()+ theme(legend.text = element_text(size=18), legend.position="right", plot.background = element_blank(), legend.title = element_text(size=18, face="bold"), axis.text = element_text(size=18), axis.title = element_text(size=18, face="bold"));pca1 ``` Remove ACR-173, timepoint 3 sample from analysis. This is Azenta sample 1B2. ```{r} Apul_genes_filt <- Apul_genes_filt %>% select(!`1B2_ACR-173_TP3`) Apul_miRNA_filt <- Apul_miRNA_filt %>% select(!`1B2_ACR-173_TP3`) metadata <- metadata %>% filter(Sample != "1B2_ACR-173_TP3") ``` We also do not have phys data for colony 1B9 ACR-265 at TP4, so I'll remove that here as well. ```{r} Apul_genes_filt <- Apul_genes_filt%>% select(!`1B9_ACR-265_TP4`) Apul_miRNA_filt <- Apul_miRNA_filt%>% select(!`1B9_ACR-265_TP4`) metadata <- metadata %>% filter(Sample != "1B9_ACR-265_TP4") ``` ## Assign metadata and arrange order of columns Order metadata the same as the column order in the gene matrix. ```{r} list<-colnames(Apul_genes_filt) list<-as.factor(list) metadata$Sample<-as.factor(metadata$Sample) # Re-order the levels metadata$Sample <- factor(as.character(metadata$Sample), levels=list) # Re-order the data.frame metadata_ordered <- metadata[order(metadata$Sample),] metadata_ordered$Sample # Make sure the miRNA colnames are also in the same order as the gene colnames Apul_miRNA_filt <- Apul_miRNA_filt[, colnames(Apul_genes_filt)] ``` Metadata and gene count matrix are now ordered the same. ## Conduct variance stabilized transformation VST should be performed on our two input datasets (gene counts and miRNA counts) separately ### Genes: ```{r} #Set DESeq2 design dds_genes <- DESeqDataSetFromMatrix(countData = Apul_genes_filt, colData = metadata_ordered, design = ~Timepoint+ColonyID) ``` Check size factors. ```{r} SF.dds_genes <- estimateSizeFactors(dds_genes) #estimate size factors to determine if we can use vst to transform our data. Size factors should be less than 4 for us to use vst print(sizeFactors(SF.dds_genes)) #View size factors all(sizeFactors(SF.dds_genes)) < 4 ``` All size factors are less than 4, so we can use VST transformation. ```{r} vsd_genes <- vst(dds_genes, blind=TRUE) #apply a variance stabilizing transformation to minimize effects of small counts and normalize with respect to library size vsd_genes <- assay(vsd_genes) head(vsd_genes, 3) #view transformed gene count data for the first three genes in the dataset. ``` ### miRNA: ```{r} #Set DESeq2 design dds_miRNA <- DESeqDataSetFromMatrix(countData = Apul_miRNA_filt, colData = metadata_ordered, design = ~Timepoint+ColonyID) ``` Check size factors. ```{r} SF.dds_miRNA <- estimateSizeFactors(dds_miRNA) #estimate size factors to determine if we can use vst to transform our data. Size factors should be less than 4 for us to use vst print(sizeFactors(SF.dds_miRNA)) #View size factors all(sizeFactors(SF.dds_miRNA)) < 4 ``` All size factors are less than 4, so we can use VST transformation. ```{r} vsd_miRNA <- varianceStabilizingTransformation(dds_miRNA, blind=TRUE) #apply a variance stabilizing transformation to minimize effects of small counts and normalize with respect to library size. Using varianceStabilizingTransformation() instead of vst() because few input genes vsd_miRNA <- assay(vsd_miRNA) head(vsd_miRNA, 3) #view transformed gene count data for the first three genes in the dataset. ``` ## Combine counts data ```{r} # Extract variance stabilized counts as dataframes # want samples in rows, genes/miRNAs in columns vsd_genes <- as.data.frame(t(vsd_genes)) vsd_miRNA <- as.data.frame(t(vsd_miRNA)) # Double check the row names (sample names) are in same order rownames(vsd_genes) == rownames(vsd_miRNA) # Combine vst gene counts and vst miRNA counts by rows (sample names) vsd_merged <- cbind(vsd_genes, vsd_miRNA) ``` # Feature selection ## Genes + miRNA We have a large number of genes, so we'll reduce dimensionality using PCA. Note that, since we only have a few phenotypes of interest, we don't need to reduce this dataset First need to remove any genes/miRNA that are invariant ```{r} vsd_merged_filt <- vsd_merged[, apply(vsd_merged, 2, var) > 0] ncol(vsd_merged) ncol(vsd_merged_filt) colnames(vsd_merged[, apply(vsd_merged, 2, var) == 0]) ``` Removed 74 invariant genes. I was worried we lost miRNA, but it looks like everything removed was a gene (prefix "FUN")! Reduce dimensionality (genes+miRNA) ```{r} # Perform PCA on gene+miRNA expression matrix pca_merged <- prcomp(vsd_merged_filt, scale. = TRUE) # Select top PCs that explain most variance (e.g., top 50 PCs) explained_var <- summary(pca_merged)$importance[2, ] # Cumulative variance explained num_pcs <- min(which(cumsum(explained_var) > 0.95)) # Keep PCs that explain 95% variance merged_pcs <- as.data.frame(pca_merged$x[, 1:num_pcs]) # Extract selected PCs dim(merged_pcs) ``` We have 27 gene/miRNA expression PCs ## Genes only To investigate gene expression separately from miRNA expression, reduce dimensionality of genes alone. Remove any genes that are invariant ```{r} vsd_genes_filt <- vsd_genes[, apply(vsd_genes, 2, var) > 0] ncol(vsd_genes) ncol(vsd_genes_filt) ``` Removed 74 invariant genes. Reduce dimensionality ```{r} # Perform PCA on gene expression matrix pca_genes <- prcomp(vsd_genes_filt, scale. = TRUE) # Select top PCs that explain most variance (e.g., top 50 PCs) explained_var_genes <- summary(pca_genes)$importance[2, ] # Cumulative variance explained num_pcs_genes <- min(which(cumsum(explained_var_genes) > 0.95)) # Keep PCs that explain 95% variance genes_pcs <- as.data.frame(pca_genes$x[, 1:num_pcs_genes]) # Extract selected PCs dim(genes_pcs) ``` ## Physiological metrics Select physiological metrics of interest. For now we'll focus on biomass ("Host_AFDW.mg.cm2"), protein ("prot_mg.mgafdw"), and respiration ("Rd"). These are all metrics of host energy storage and expenditure. ```{r} # Assign sample IDs to row names rownames(phys) <- phys$Sample # Select metrics phys_selection <- phys %>% select(Host_AFDW.mg.cm2, prot_mg.mgafdw, Rd) # Make sure the phys rownames are in the same order as the gene/miRNA rownames phys_selection <- phys_selection[rownames(merged_pcs),] ``` # Phenotype to predict gene/miRNA expression ## The model Train elastic models to predict gene expression PCs from phys data. ```{r} train_models <- function(response_pcs, predictor_pcs) { models <- list() for (pc in colnames(response_pcs)) { y <- response_pcs[[pc]] # Gene expression PC X <- as.matrix(predictor_pcs) # Phys as predictors # Train elastic net model (alpha = 0.5 for mix of LASSO & Ridge) model <- cv.glmnet(X, y, alpha = 0.5) models[[pc]] <- model } return(models) } # Train models predicting gene expression PCs from phys data models <- train_models(merged_pcs, phys_selection) ``` Extract feature importance. ```{r} get_feature_importance <- function(models) { importance_list <- lapply(models, function(model) { coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE] # Convert to regular matrix & remove intercept # Convert to data frame coefs_df <- data.frame(Feature = rownames(coefs), Importance = as.numeric(coefs)) return(coefs_df) }) # Combine feature importance across all predicted gene PCs importance_df <- bind_rows(importance_list) %>% group_by(Feature) %>% summarize(MeanImportance = mean(abs(Importance)), .groups = "drop") %>% arrange(desc(MeanImportance)) return(importance_df) } feature_importance <- get_feature_importance(models) head(feature_importance, 20) # Top predictive phys features ``` Evaluate performance. ```{r} evaluate_model_performance <- function(models, response_pcs, predictor_pcs) { results <- data.frame(PC = colnames(response_pcs), R2 = NA) for (pc in colnames(response_pcs)) { y <- response_pcs[[pc]] X <- as.matrix(predictor_pcs) model <- models[[pc]] preds <- predict(model, X, s = "lambda.min") R2 <- cor(y, preds)^2 # R-squared metric results[results$PC == pc, "R2"] <- R2 } return(results) } performance_results <- evaluate_model_performance(models, merged_pcs, phys_selection) summary(performance_results$R2) ``` ## Results Plot results. ```{r} # Select top 20 predictive phys features top_features <- feature_importance %>% top_n(20, MeanImportance) # Plot ggplot(top_features, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + # Flip for readability theme_minimal() + labs(title = "Top 20 Predictive Phys Features", x = "Physiological Metric", y = "Mean Importance") ``` ```{r} ggplot(performance_results, aes(x = PC, y = R2)) + geom_point(color = "darkred", size = 3) + geom_hline(yintercept = mean(performance_results$R2, na.rm = TRUE), linetype = "dashed", color = "blue") + theme_minimal() + labs(title = "Model Performance Across Gene Expression PCs", x = "Gene Expression PC", y = "R² (Variance Explained)") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate labels ``` Keep in mind that, while we ran the model with physiological predictors, we're really interested in the genes/miRNA associated with these predictors View components associated with gene/miRNA PCs ```{r} # Get the PCA rotation (loadings) matrix from the original gene/miRNA PCA merged_loadings <- pca_merged$rotation # Each column corresponds to a PC # Convert to data frame and reshape for plotting merged_loadings_df <- as.data.frame(merged_loadings) %>% rownames_to_column(var = "gene_miRNA") %>% pivot_longer(-gene_miRNA, names_to = "Merged_PC", values_to = "Loading") # View top CpGs contributing most to each PC top_genes <- merged_loadings_df %>% group_by(Merged_PC) %>% arrange(desc(abs(Loading))) %>% slice_head(n = 20) # Select top 10 CpGs per PC print(top_genes) ``` View top 20 miRNA/genes associated with PC9 (the PC with the highest R^2) ```{r} print(top_genes%>%filter(Merged_PC=="PC9")) ``` Interesting, there's an miRNA in there! View predicted vs actual gene expression values to evaluate model. ```{r} # Choose a gene expression PC to visualize (e.g., the most predictable one) best_pc <- performance_results$PC[which.max(performance_results$R2)] # Extract actual and predicted values for that PC actual_values <- merged_pcs[[best_pc]] predicted_values <- predict(models[[best_pc]], as.matrix(phys_selection), s = "lambda.min") # Create data frame prediction_df <- data.frame( Actual = actual_values, Predicted = predicted_values ) # Scatter plot with regression line ggplot(prediction_df, aes(x = Actual, y = lambda.min)) + geom_point(color = "blue", alpha = 0.7) + geom_smooth(method = "lm", color = "red", se = FALSE) + theme_minimal() + labs(title = paste("Predicted vs. Actual for", best_pc), x = "Actual Gene Expression PC", y = "Predicted Gene Expression PC") + annotate("text", x = min(actual_values), y = max(predicted_values), label = paste("R² =", round(max(performance_results$R2, na.rm=TRUE), 3)), hjust = 0, color = "black", size = 5) ## `geom_smooth()` using formula = 'y ~ x' ``` # miRNA expression to predict gene expression ## The model Train elastic models to predict gene expression PCs from phys data. ```{r} # Train models predicting gene expression PCs from phys data models2 <- train_models(genes_pcs, vsd_miRNA) ``` Extract feature importance. ```{r} feature_importance2 <- get_feature_importance(models2) head(feature_importance2, 20) # Top predictive phys features ``` Evaluate performance. ```{r} performance_results2 <- evaluate_model_performance(models2, genes_pcs, vsd_miRNA) summary(performance_results2$R2) ``` ## Results Plot results. ```{r} # Select top 20 predictive phys features top_features2 <- feature_importance2 %>% top_n(20, MeanImportance) # Plot ggplot(top_features2, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + # Flip for readability theme_minimal() + labs(title = "Top 20 Predictive miRNA Features", x = "Gene Expression", y = "Mean Importance") ``` ```{r} ggplot(performance_results2, aes(x = PC, y = R2)) + geom_point(color = "darkred", size = 3) + geom_hline(yintercept = mean(performance_results2$R2, na.rm = TRUE), linetype = "dashed", color = "blue") + theme_minimal() + labs(title = "Model Performance Across Gene Expression PCs", x = "Gene Expression PC", y = "R² (Variance Explained)") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate labels ``` View components associated with gene PCs ```{r} # Get the PCA rotation (loadings) matrix from the original gene PCA genes_loadings <- pca_genes$rotation # Each column corresponds to a PC # Convert to data frame and reshape for plotting genes_loadings_df <- as.data.frame(genes_loadings) %>% rownames_to_column(var = "gene") %>% pivot_longer(-gene, names_to = "Genes_PC", values_to = "Loading") # View top genes contributing most to each PC top_genes2 <- genes_loadings_df %>% group_by(Genes_PC) %>% arrange(desc(abs(Loading))) %>% slice_head(n = 20) # Select top 10 CpGs per PC print(top_genes2) ``` View top 20 genes associated with PC11 (the most "predictable" PC, with the highest R^2) ```{r} print(top_genes2%>%filter(Genes_PC=="PC11")) ``` View predicted vs actual gene expression values to evaluate model. ```{r} # Choose a gene expression PC to visualize (e.g., the most predictable one) best_pc2 <- performance_results2$PC[which.max(performance_results2$R2)] # Extract actual and predicted values for that PC actual_values2 <- genes_pcs[[best_pc2]] predicted_values2 <- predict(models2[[best_pc2]], as.matrix(vsd_miRNA), s = "lambda.min") # Create data frame prediction_df2 <- data.frame( Actual = actual_values2, Predicted = predicted_values2 ) # Scatter plot with regression line ggplot(prediction_df2, aes(x = Actual, y = lambda.min)) + geom_point(color = "blue", alpha = 0.7) + geom_smooth(method = "lm", color = "red", se = FALSE) + theme_minimal() + labs(title = paste("Predicted vs. Actual for", best_pc2), x = "Actual Gene Expression PC", y = "Predicted Gene Expression PC") + annotate("text", x = min(actual_values2), y = max(predicted_values2), label = paste("R² =", round(max(performance_results2$R2, na.rm=TRUE), 3)), hjust = 0, color = "black", size = 5) ## `geom_smooth()` using formula = 'y ~ x' ``` We can also look att which miRNA(s) contributed most to predicting gene PCs of interest ```{r} get_feature_importance_for_pc <- function(model) { coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE] # Remove intercept coefs_df <- data.frame(Feature = rownames(coefs), Importance = abs(as.numeric(coefs))) return(coefs_df %>% arrange(desc(Importance))) # Sort by importance } # Extract feature importance for the most predictable PC #best_pc2 <- "PC11" best_pc_model2 <- models2[[best_pc2]] best_pc_importance2 <- get_feature_importance_for_pc(best_pc_model2) # View top most important miRNA for predicting this PC print(head(best_pc_importance2, 10)) # Plot ggplot(best_pc_importance2 %>% head(20), aes(x = reorder(Feature, Importance), y = Importance)) + geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + theme_minimal() + labs(title = paste("Top miRNA Predictors for", best_pc2), x = "miRNA", y = "Importance Score") ```