#!/usr/bin/env Rscript ################################################################################ # Elastic Net Regression with Permutation-Based Predictor Significance Testing # WITH PARALLEL PROCESSING FOR BOOTSTRAPPING AND PERMUTATION STEPS ################################################################################ # # OVERVIEW: # This script extends the original Elastic Net approach by adding permutation-based # significance testing for each predictor coefficient. This addresses a key limitation # of penalized regression methods: they provide coefficient estimates but not p-values. # # PARALLEL PROCESSING: # ------------------- # This version incorporates parallel processing for: # 1. Bootstrap replicates (each replicate runs independently) # 2. Permutation testing across genes (each gene's permutation test is independent) # 3. Permutations within each gene (each permutation is independent) # # This can dramatically reduce runtime when multiple cores are available. # On a 4-core machine, expect ~3-4x speedup for bootstrap and permutation steps. # # STATISTICAL RATIONALE: # ---------------------- # Elastic Net (and LASSO/Ridge) regression performs variable selection through # regularization, but the resulting coefficients lack traditional inference statistics. # This is because: # 1. The penalization introduces bias into coefficient estimates # 2. Standard asymptotic theory doesn't apply to penalized estimators # 3. The selection process invalidates classical hypothesis testing # # PERMUTATION APPROACH: # -------------------- # To obtain valid p-values, we use a permutation test framework: # 1. For each gene, fit the real Elastic Net model and record predictor coefficients # 2. Then, permute the response variable (gene expression) many times # 3. For each permutation, refit the model and record coefficients # 4. The p-value for each predictor is the proportion of permuted coefficients # that are >= the observed coefficient (in absolute value) # # REFERENCES: # ---------- # - Phipson & Smyth (2010). "Permutation P-values Should Never Be Zero" # Statistical Applications in Genetics and Molecular Biology. DOI: 10.2202/1544-6115.1585 # - Ojala & Garriga (2010). "Permutation Tests for Studying Classifier Performance" # Journal of Machine Learning Research. DOI: 10.5555/1756006.1859920 # - Altmann et al. (2010). "Permutation importance: a corrected feature importance measure" # Bioinformatics. DOI: 10.1093/bioinformatics/btq134 # ################################################################################ cat("=== Elastic Net with Permutation Significance Testing (Parallel) ===\n") cat("Loading libraries\n") suppressMessages(suppressWarnings(library(tidyverse))) suppressMessages(suppressWarnings(library(ggplot2))) suppressMessages(suppressWarnings(library(DESeq2))) suppressMessages(suppressWarnings(library(igraph))) suppressMessages(suppressWarnings(library(psych))) suppressMessages(suppressWarnings(library(tidygraph))) suppressMessages(suppressWarnings(library(ggraph))) suppressMessages(suppressWarnings(library(WGCNA))) suppressMessages(suppressWarnings(library(edgeR))) suppressMessages(suppressWarnings(library(reshape2))) suppressMessages(suppressWarnings(library(ggcorrplot))) suppressMessages(suppressWarnings(library(corrplot))) suppressMessages(suppressWarnings(library(rvest))) suppressMessages(suppressWarnings(library(purrr))) suppressMessages(suppressWarnings(library(pheatmap))) suppressMessages(suppressWarnings(library(glmnet))) suppressMessages(suppressWarnings(library(caret))) suppressMessages(suppressWarnings(library(factoextra))) suppressMessages(suppressWarnings(library(vegan))) suppressMessages(suppressWarnings(library(ggfortify))) suppressMessages(suppressWarnings(library(genefilter))) suppressMessages(suppressWarnings(library(scales))) suppressMessages(suppressWarnings(library(parallel))) # For parallel processing suppressMessages(suppressWarnings(library(doParallel))) # Additional parallel support # Set seed for reproducibility set.seed(703) ################################################################################ # LOGGING SYSTEM SETUP ################################################################################ # # This logging system writes all output to both: # 1. The terminal (stdout/stderr) for real-time monitoring # 2. A log file in the output directory for record-keeping # # We achieve this by: # 1. Creating a log file connection # 2. Using a custom tee_output() function that writes to both destinations # 3. Wrapping key output functions to use tee behavior # # Initialize logging variables (will be set up after output_dir is known) log_file_path <- NULL log_file_con <- NULL #' Initialize the logging system #' Call this after output_dir is created #' #' @param output_dir Directory where log file will be saved #' @param prefix Prefix for log filename #' @return Path to the log file init_logging <- function(output_dir, prefix = "analysis") { # Create timestamped log filename timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S") log_filename <- sprintf("%s_log_%s.txt", prefix, timestamp) log_path <- file.path(output_dir, log_filename) # Open log file connection for appending log_file_con <<- file(log_path, open = "wt") log_file_path <<- log_path # Write header to log file writeLines(sprintf("=== Log initialized: %s ===", log_path), log_file_con) writeLines(sprintf("Timestamp: %s\n", Sys.time()), log_file_con) flush(log_file_con) # Print to console too cat(sprintf("=== Log initialized: %s ===\n", log_path)) cat(sprintf("Timestamp: %s\n\n", Sys.time())) return(log_path) } #' Write message to both console and log file #' Use this instead of cat() for important messages #' #' @param ... Arguments to pass to cat() log_message <- function(...) { msg <- paste0(...) # Write to console cat(msg) # Write to log file if it's open if (!is.null(log_file_con) && isOpen(log_file_con)) { writeLines(gsub("\n$", "", msg), log_file_con) flush(log_file_con) } } #' Capture and log the output of print() calls #' #' @param x Object to print log_print <- function(x) { # Capture print output output <- capture.output(print(x)) # Write to console cat(paste(output, collapse = "\n"), "\n") # Write to log file if (!is.null(log_file_con) && isOpen(log_file_con)) { writeLines(output, log_file_con) flush(log_file_con) } } #' Close the logging system #' Call this at the end of the script close_logging <- function() { if (!is.null(log_file_con)) { tryCatch({ if (isOpen(log_file_con)) { writeLines(sprintf("\n=== Log closed: %s ===", Sys.time()), log_file_con) flush(log_file_con) close(log_file_con) } }, error = function(e) { # Silently ignore errors during cleanup }) log_file_con <<- NULL } } #' Wrapper for cat that also logs to file #' Override the default cat to enable dual output tee_cat <- function(..., file = "", sep = " ", fill = FALSE, labels = NULL, append = FALSE) { # Call original cat for console output base::cat(..., file = file, sep = sep, fill = fill, labels = labels, append = append) # Also write to log file if logging is active and not writing to another file if (file == "" && !is.null(log_file_con) && isOpen(log_file_con)) { msg <- paste(..., sep = sep) writeLines(gsub("\n$", "", msg), log_file_con) flush(log_file_con) } } #' Wrapper for print that also logs to file tee_print <- function(x, ...) { # Capture print output output <- capture.output(base::print(x, ...)) # Print to console base::cat(paste(output, collapse = "\n"), "\n") # Write to log file if (!is.null(log_file_con) && isOpen(log_file_con)) { writeLines(output, log_file_con) flush(log_file_con) } invisible(x) } # Store original functions before overriding original_cat <- base::cat original_print <- base::print ################################################################################ # PROGRESS TRACKING UTILITIES ################################################################################ # # These functions provide progress reporting for long-running operations. # # For parallel operations, real-time progress is challenging in R because # mclapply child processes can't communicate back to the parent. We use two # strategies: # 1. If pbapply package is available, use pbmclapply for built-in progress # 2. Otherwise, use a shared file counter with a monitoring thread # # Check if pbapply is available for nicer parallel progress bars has_pbapply <- requireNamespace("pbapply", quietly = TRUE) if (has_pbapply) { suppressMessages(library(pbapply)) cat("Using pbapply for progress bars\n") } else { cat("Note: Install 'pbapply' package for better parallel progress bars\n") cat(" install.packages('pbapply')\n") } #' Create a text-based progress bar string #' #' @param current Current progress count #' @param total Total count #' @param width Width of the progress bar in characters #' @param prefix Text to show before the bar #' @return Formatted progress string format_progress_bar <- function(current, total, width = 40, prefix = "Progress") { if (total == 0) return(sprintf("%s: 0/0 (0%%)", prefix)) pct <- current / total filled <- round(pct * width) empty <- width - filled bar <- paste0("[", paste(rep("=", filled), collapse = ""), ifelse(filled < width, ">", ""), paste(rep(" ", max(0, empty - 1)), collapse = ""), "]") sprintf("\r%s: %s %d/%d (%.1f%%)", prefix, bar, current, total, pct * 100) } #' Print progress update (overwrites current line for clean display) #' #' @param current Current progress count #' @param total Total count #' @param prefix Text to show before the bar #' @param newline Whether to add newline at end (use TRUE when complete) print_progress <- function(current, total, prefix = "Progress", newline = FALSE) { msg <- format_progress_bar(current, total, prefix = prefix) if (newline) { base::cat(msg, "\n") } else { base::cat(msg) flush.console() # Force immediate display } # Also write to log (without carriage return, as a simple status line) if (!is.null(log_file_con) && isOpen(log_file_con)) { if (current == total || current %% max(1, floor(total / 10)) == 0) { # Only log every 10% or at completion to avoid huge log files writeLines(sprintf("%s: %d/%d (%.1f%%)", prefix, current, total, current/total*100), log_file_con) flush(log_file_con) } } } #' Log a progress milestone to file only #' #' @param current Current count #' @param total Total count #' @param prefix Prefix for the message log_progress_milestone <- function(current, total, prefix = "Progress") { if (!is.null(log_file_con) && isOpen(log_file_con)) { writeLines(sprintf("%s: %d/%d (%.1f%%)", prefix, current, total, ifelse(total > 0, current/total*100, 0)), log_file_con) flush(log_file_con) } } ################################################################################ # COMMAND LINE ARGUMENTS ################################################################################ args <- commandArgs(trailingOnly = TRUE) cat("Raw args (length ", length(args), "):\n") print(args) # Expected positional args: # 1 genes_file # 2 miRNA_file # 3 lncRNA_file # 4 WGBS_file # 5 metadata_file # 6 output_dir # 7 excluded_samples_raw (comma-separated string like "s1,s2") # 8 alpha (optional; default 0.5) # 9 bootstrap1_reps (optional; default 50) # 10 bootstrap2_reps (optional; default 50) # 11 r2_threshold (optional; default 0.5) # 12 n_permutations (optional; default 1000) # 13 n_cores (optional; default 1) -- for parallel processing min_expected <- 6 if (length(args) < min_expected) { stop(sprintf("Usage: Rscript script.R [excluded_samples_raw] [alpha_value] [bootstrap1_reps] [bootstrap2_reps] [r2_threshold] [n_permutations] [n_cores]\nYou provided %d args.", length(args))) } genes_file <- args[1] miRNA_file <- args[2] lncRNA_file <- args[3] WGBS_file <- args[4] metadata_file <- args[5] output_dir <- args[6] # -------- Optional parameters with safe parsing ---------- # Helper functions to safely parse arguments safe_numeric <- function(x, default) { if (is.na(x) || x == "" || x == "NA" || trimws(x) == "") return(default) val <- suppressWarnings(as.numeric(trimws(x))) if (is.na(val)) return(default) return(val) } safe_integer <- function(x, default) { if (is.na(x) || x == "" || x == "NA" || trimws(x) == "") return(default) val <- suppressWarnings(as.integer(trimws(x))) if (is.na(val)) return(default) return(val) } excluded_samples_raw <- if (length(args) >= 7) args[7] else NA_character_ alpha_value <- if (length(args) >= 8) safe_numeric(args[8], 0.5) else 0.5 bootstrap1_reps <- if (length(args) >= 9) safe_integer(args[9], 50) else 50 bootstrap2_reps <- if (length(args) >= 10) safe_integer(args[10], 50) else 50 r2_threshold <- if (length(args) >= 11) safe_numeric(args[11], 0.5) else 0.5 n_permutations <- if (length(args) >= 12) safe_integer(args[12], 1000) else 1000 n_cores <- if (length(args) >= 13) safe_integer(args[13], 1) else 1 # Validate critical parameters if (is.na(n_permutations) || n_permutations < 1) { cat("Warning: Invalid n_permutations value, using default of 1000\n") n_permutations <- 1000 } if (is.na(n_cores) || n_cores < 1) { cat("Warning: Invalid n_cores value, using default of 1\n") n_cores <- 1 } # Cap n_cores at available cores max_cores <- detectCores() if (n_cores > max_cores) { cat(sprintf("Warning: Requested %d cores but only %d available. Using %d cores.\n", n_cores, max_cores, max_cores)) n_cores <- max_cores } # Parse excluded samples if (is.na(excluded_samples_raw) || excluded_samples_raw == "" || excluded_samples_raw == "NA") { excluded_samples <- character(0) } else { excluded_samples <- unlist(strsplit(excluded_samples_raw, "[,;\\s]+")) excluded_samples <- excluded_samples[excluded_samples != ""] } cat("List of samples to exclude:", excluded_samples, "\n") # Diagnostics cat("\nRunning EN model with permutation significance testing:\n") cat(" Response genes: ", genes_file, "\n") cat(" Predictor miRNA: ", miRNA_file, "\n") cat(" Predictor lncRNA: ", lncRNA_file, "\n") cat(" Predictor WGBS: ", WGBS_file, "\n") cat(" Metadata: ", metadata_file, "\n") cat(" Output dir: ", output_dir, "\n") cat(" Excluded samples: ", paste(excluded_samples, collapse = ", "), "\n") cat(" Alpha: ", alpha_value, "\n") cat(" Bootstrap1 reps: ", bootstrap1_reps, "\n") cat(" Bootstrap2 reps: ", bootstrap2_reps, "\n") cat(" R2 threshold: ", r2_threshold, "\n") cat(" N permutations: ", n_permutations, "\n") cat(" N cores: ", n_cores, "\n") cat(" Max available cores:", max_cores, "\n") # Create output directories if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE) ################################################################################ # INITIALIZE LOGGING ################################################################################ # Now that we have the output directory, initialize logging to file + console log_file_path <- init_logging(output_dir, prefix = "ElasticNet_permutation") # Override cat and print to use tee versions (dual output to console + log) # This ensures all subsequent cat() and print() calls go to both destinations cat <- tee_cat print <- tee_print cat("\nSession info:\n") print(sessionInfo()) cat("Start time:", as.character(Sys.time()), "\n\n") ################################################################################ # FUNCTION DEFINITIONS ################################################################################ cat("Defining model functions\n") #' Train Elastic Net Models for All Response Features #' #' @param response_features Data frame with response variables (genes) in columns #' @param predictor_features Data frame with predictor variables in columns #' @return List of cv.glmnet model objects, one per gene train_models <- function(response_features, predictor_features, alpha = alpha_value) { models <- list() for (feature in colnames(response_features)) { y <- response_features[[feature]] X <- as.matrix(predictor_features) model <- cv.glmnet(X, y, alpha = alpha, nfolds = 3) models[[feature]] <- model } return(models) } #' Extract Feature Importance (Coefficient Magnitudes) from Models #' #' @param models List of cv.glmnet models #' @return Data frame with Feature, MeanImportance columns get_feature_importance <- function(models) { importance_list <- lapply(models, function(model) { coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE] coefs_df <- data.frame(Feature = rownames(coefs), Importance = as.numeric(coefs)) return(coefs_df) }) importance_df <- bind_rows(importance_list) %>% group_by(Feature) %>% summarize(MeanImportance = mean(abs(Importance)), .groups = "drop") %>% arrange(desc(MeanImportance)) return(importance_df) } #' Evaluate Model Performance (R-squared) #' #' @param models List of cv.glmnet models #' @param response_features Data frame with response variables #' @param predictor_features Data frame with predictor variables #' @return Data frame with Feature, R2 columns evaluate_model_performance <- function(models, response_features, predictor_features) { results <- data.frame(Feature = colnames(response_features), R2 = NA) for (feature in colnames(response_features)) { y <- response_features[[feature]] X <- as.matrix(predictor_features) model <- models[[feature]] preds <- predict(model, X, s = "lambda.min") R2 <- cor(y, preds)^2 results[results$Feature == feature, "R2"] <- R2 } return(results) } #' Train Models with Train/Test Split (for a single bootstrap replicate) #' #' @param response_features Data frame with response variables #' @param predictor_features Data frame with predictor variables #' @param train_frac Fraction of data for training #' @param alpha Elastic net mixing parameter #' @param nfolds Number of CV folds #' @param seed Optional seed for this replicate #' @return List with 'models' and 'performance' components train_models_split <- function(response_features, predictor_features, train_frac = 0.8, alpha = alpha_value, nfolds = 10, seed = NULL) { # Set seed if provided (important for parallel reproducibility) if (!is.null(seed)) set.seed(seed) models <- list() performance <- data.frame(Feature = colnames(response_features), R2 = NA) # Sample training indices sample_idx <- sample(seq_len(nrow(predictor_features)), size = floor(train_frac * nrow(predictor_features))) X_train <- predictor_features[sample_idx, ] X_test <- predictor_features[-sample_idx, ] for (feature in colnames(response_features)) { y_train <- response_features[sample_idx, feature] y_test <- response_features[-sample_idx, feature] # Skip constant response variables if (length(unique(y_train)) == 1) { warning(sprintf("Skipping feature '%s': y_train is constant", feature)) next } model <- tryCatch({ cv.glmnet(as.matrix(X_train), y_train, alpha = alpha, nfolds = nfolds) }, error = function(e) { warning(sprintf("Error fitting model for %s: %s", feature, e$message)) return(NULL) }) if (!is.null(model)) { models[[feature]] <- model preds <- predict(model, newx = as.matrix(X_test), s = "lambda.min") R2 <- cor(y_test, preds)^2 performance[performance$Feature == feature, "R2"] <- R2 } } return(list(models = models, performance = performance)) } ################################################################################ # PARALLEL BOOTSTRAP FUNCTION (NEW) ################################################################################ #' Run Bootstrap Replicates in Parallel #' #' This function distributes bootstrap replicates across multiple cores. #' Each replicate gets a unique seed derived from the base seed + replicate number #' to ensure reproducibility while maintaining independence. #' #' @param response_features Data frame with response variables #' @param predictor_features Data frame with predictor variables #' @param n_reps Number of bootstrap replicates #' @param n_cores Number of cores to use #' @param base_seed Base seed for reproducibility #' @param alpha Elastic net alpha parameter #' @param progress_prefix Prefix for progress display #' @return List with TM_list, MP_list, FI_list #' run_bootstrap_parallel <- function(response_features, predictor_features, n_reps, n_cores, base_seed = 703, alpha = alpha_value, progress_prefix = "Bootstrap") { cat(sprintf("Running %d bootstrap replicates using %d core(s)\n", n_reps, n_cores)) # Function to run a single bootstrap replicate run_single_bootstrap <- function(i) { # Set unique seed for this replicate rep_seed <- base_seed + i result <- train_models_split(response_features, predictor_features, alpha = alpha, seed = rep_seed) trained_models <- result$models model_performance <- result$performance feature_importance <- get_feature_importance(trained_models) # Add replicate ID model_performance$Replicate <- i feature_importance$Replicate <- i return(list( models = trained_models, performance = model_performance, importance = feature_importance, replicate = i )) } # Run in parallel or sequentially if (n_cores > 1) { cat(sprintf("%s: Running in parallel mode...\n", progress_prefix)) log_progress_milestone(0, n_reps, progress_prefix) if (has_pbapply) { # Use pbapply for nice progress bars pboptions(type = "txt", style = 3, char = "=") results <- pblapply(1:n_reps, run_single_bootstrap, cl = n_cores) } else { # Fall back to mclapply without real-time progress # Show start message and completion cat(sprintf("%s: Processing %d replicates (progress shown at completion)...\n", progress_prefix, n_reps)) results <- mclapply(1:n_reps, run_single_bootstrap, mc.cores = n_cores) cat(sprintf("%s: Completed %d/%d replicates\n", progress_prefix, n_reps, n_reps)) } log_progress_milestone(n_reps, n_reps, progress_prefix) } else { cat(sprintf("%s: Running in sequential mode...\n", progress_prefix)) results <- vector("list", n_reps) for (i in 1:n_reps) { print_progress(i - 1, n_reps, prefix = progress_prefix) results[[i]] <- run_single_bootstrap(i) } print_progress(n_reps, n_reps, prefix = progress_prefix, newline = TRUE) } # Extract and organize results TM_list <- lapply(results, function(x) { if (!is.null(x)) { x$models$Replicate <- x$replicate x$models } else { NULL } }) MP_list <- lapply(results, function(x) if (!is.null(x)) x$performance else NULL) FI_list <- lapply(results, function(x) if (!is.null(x)) x$importance else NULL) # Remove NULLs TM_list <- Filter(Negate(is.null), TM_list) MP_list <- Filter(Negate(is.null), MP_list) FI_list <- Filter(Negate(is.null), FI_list) return(list( TM_list = TM_list, MP_list = MP_list, FI_list = FI_list )) } ################################################################################ # PERMUTATION SIGNIFICANCE FUNCTIONS (WITH PARALLEL SUPPORT) ################################################################################ #' Calculate Permutation-Based P-values for Elastic Net Coefficients #' #' This function implements a permutation test to assess the statistical significance #' of each predictor's coefficient in an Elastic Net model. #' #' IMPORTANT: Only predictors with non-zero observed coefficients are tested. #' Predictors with zero coefficients are assigned p-value = NA because: #' 1. Testing whether 0 differs from a null distribution is not meaningful #' 2. Elastic Net has already decided these predictors are not important #' 3. This dramatically reduces the multiple testing burden #' #' @param y Response vector (gene expression) #' @param X Predictor matrix #' @param n_perm Number of permutations #' @param alpha Elastic net mixing parameter #' @param n_cores Number of cores for parallel permutations #' @param seed Random seed for reproducibility #' @param test_only_nonzero If TRUE (default), only test predictors with non-zero coefficients #' @param zero_threshold Threshold below which coefficients are considered zero #' @return Data frame with columns: Predictor, Observed_Coef, Perm_Mean, Perm_SD, P_value #' calculate_permutation_pvalues <- function(y, X, n_perm = 1000, alpha = 0.5, n_cores = 1, seed = NULL, test_only_nonzero = TRUE, zero_threshold = 1e-10) { if (!is.null(seed)) set.seed(seed) X <- as.matrix(X) n_predictors <- ncol(X) predictor_names <- colnames(X) # Step 1: Fit the observed (real) model observed_model <- tryCatch({ cv.glmnet(X, y, alpha = alpha, nfolds = 3) }, error = function(e) { warning("Error fitting observed model: ", e$message) return(NULL) }) if (is.null(observed_model)) { return(data.frame( Predictor = predictor_names, Observed_Coef = NA, Abs_Observed_Coef = NA, Perm_Mean = NA, Perm_SD = NA, P_value = NA, Tested = FALSE, stringsAsFactors = FALSE )) } # Extract observed coefficients (excluding intercept) observed_coefs <- as.numeric(coef(observed_model, s = "lambda.min")[-1]) names(observed_coefs) <- predictor_names # Store the lambda used for the observed model (for potential fixed-lambda approach) observed_lambda <- observed_model$lambda.min # Identify which predictors to test (non-zero coefficients only) nonzero_mask <- abs(observed_coefs) > zero_threshold n_nonzero <- sum(nonzero_mask) # If no non-zero coefficients, return early if (n_nonzero == 0) { return(data.frame( Predictor = predictor_names, Observed_Coef = observed_coefs, Abs_Observed_Coef = abs(observed_coefs), Perm_Mean = NA, Perm_SD = NA, P_value = NA, Tested = FALSE, stringsAsFactors = FALSE )) } # Get indices and names of predictors to test if (test_only_nonzero) { test_indices <- which(nonzero_mask) } else { test_indices <- seq_len(n_predictors) } # Step 2: Perform permutations # Function for a single permutation - using FIXED lambda from observed model # This ensures we're comparing apples to apples run_single_permutation <- function(perm_idx) { # Generate unique seed for this permutation if (!is.null(seed)) set.seed(seed + perm_idx) y_perm <- sample(y) perm_model <- tryCatch({ cv.glmnet(X, y_perm, alpha = alpha, nfolds = 3) }, error = function(e) { return(NULL) }) if (!is.null(perm_model)) { # Extract coefficients at the fixed lambda perm_coefs <- as.numeric(coef(perm_model, s = observed_lambda)[-1]) return(perm_coefs[test_indices]) } else { return(rep(NA, length(test_indices))) } } # Run permutations if (n_cores > 1 && n_perm >= n_cores) { perm_results <- mclapply(1:n_perm, run_single_permutation, mc.cores = n_cores) } else { perm_results <- lapply(1:n_perm, run_single_permutation) } # Convert to matrix (rows = permutations, cols = tested predictors) perm_coefs <- do.call(rbind, perm_results) # Step 3: Calculate p-values for tested predictors only p_values <- rep(NA, n_predictors) perm_means <- rep(NA, n_predictors) perm_sds <- rep(NA, n_predictors) tested <- rep(FALSE, n_predictors) for (idx in seq_along(test_indices)) { j <- test_indices[idx] obs_coef <- abs(observed_coefs[j]) perm_coef_vec <- abs(perm_coefs[, idx]) perm_coef_vec <- perm_coef_vec[!is.na(perm_coef_vec)] tested[j] <- TRUE if (length(perm_coef_vec) > 0) { n_extreme <- sum(perm_coef_vec >= obs_coef) p_values[j] <- (n_extreme + 1) / (length(perm_coef_vec) + 1) perm_means[j] <- mean(perm_coef_vec) perm_sds[j] <- sd(perm_coef_vec) } } results <- data.frame( Predictor = predictor_names, Observed_Coef = observed_coefs, Abs_Observed_Coef = abs(observed_coefs), Perm_Mean = perm_means, Perm_SD = perm_sds, P_value = p_values, Tested = tested, stringsAsFactors = FALSE ) return(results) } #' Wrapper to Calculate Permutation P-values for All Genes (Parallelized) #' #' This function distributes genes across cores for parallel processing. #' Each gene's permutation test runs independently. #' #' Only predictors with non-zero coefficients are tested for significance. #' This dramatically reduces the multiple testing burden and focuses on #' biologically relevant associations. #' #' @param response_features Data frame with genes in columns #' @param predictor_features Data frame with predictors in columns #' @param n_perm Number of permutations per gene #' @param alpha Elastic net alpha parameter #' @param n_cores Number of cores for parallel processing #' @param base_seed Base seed for reproducibility #' @param progress_prefix Prefix for progress display #' @return Data frame with Gene, Predictor, Observed_Coef, P_value, etc. #' calculate_all_permutation_pvalues <- function(response_features, predictor_features, n_perm = 1000, alpha = 0.5, n_cores = 1, base_seed = 703, progress_prefix = "Permutation") { gene_names <- colnames(response_features) n_genes <- length(gene_names) n_predictors <- ncol(predictor_features) X <- as.matrix(predictor_features) cat(sprintf("\nCalculating permutation p-values for %d genes with %d permutations each\n", n_genes, n_perm)) cat(sprintf("Total predictors per gene: %d\n", n_predictors)) cat(sprintf("Using %d core(s) for gene-level parallelization\n", n_cores)) cat(sprintf("NOTE: Only non-zero coefficients will be tested (reduces multiple testing burden)\n")) # Function to process a single gene process_gene <- function(gene_idx) { gene_name <- gene_names[gene_idx] y <- response_features[[gene_name]] # Skip if response is constant if (length(unique(y)) <= 1) { return(NULL) } # Unique seed per gene for reproducibility gene_seed <- base_seed + gene_idx * 1000 # Note: we use n_cores=1 here because we're already parallelizing across genes # This avoids nested parallelization which can cause issues result <- calculate_permutation_pvalues(y, X, n_perm = n_perm, alpha = alpha, n_cores = 1, seed = gene_seed, test_only_nonzero = TRUE) result$Gene <- gene_name return(result) } # Process genes in parallel or sequentially if (n_cores > 1) { cat(sprintf("%s: Running in parallel mode across %d genes...\n", progress_prefix, n_genes)) log_progress_milestone(0, n_genes, progress_prefix) if (has_pbapply) { # Use pbapply for nice progress bars pboptions(type = "txt", style = 3, char = "=") results_list <- pblapply(1:n_genes, process_gene, cl = n_cores) } else { # Fall back to mclapply without real-time progress cat(sprintf("%s: Processing %d genes (progress shown at completion)...\n", progress_prefix, n_genes)) results_list <- mclapply(1:n_genes, process_gene, mc.cores = n_cores) cat(sprintf("%s: Completed %d/%d genes\n", progress_prefix, n_genes, n_genes)) } log_progress_milestone(n_genes, n_genes, progress_prefix) } else { cat(sprintf("%s: Running in sequential mode...\n", progress_prefix)) results_list <- vector("list", n_genes) for (i in seq_along(gene_names)) { # Update progress bar print_progress(i - 1, n_genes, prefix = progress_prefix) results_list[[i]] <- process_gene(i) } print_progress(n_genes, n_genes, prefix = progress_prefix, newline = TRUE) } # Combine results all_results <- bind_rows(results_list) # Print diagnostic summary n_tested <- sum(all_results$Tested, na.rm = TRUE) n_total <- nrow(all_results) n_nonzero <- sum(all_results$Abs_Observed_Coef > 1e-10, na.rm = TRUE) cat(sprintf("\n=== Permutation Test Summary ===\n")) cat(sprintf("Total predictor-gene combinations: %d\n", n_total)) cat(sprintf("Non-zero coefficients: %d (%.2f%%)\n", n_nonzero, 100*n_nonzero/n_total)) cat(sprintf("Predictors tested: %d\n", n_tested)) cat(sprintf("Predictors skipped (zero coef): %d\n", n_total - n_tested)) # Reorder columns all_results <- all_results %>% select(Gene, Predictor, Observed_Coef, Abs_Observed_Coef, Perm_Mean, Perm_SD, P_value, Tested) return(all_results) } #' Apply Multiple Testing Correction (FDR) #' #' FDR correction is applied ONLY to tested predictors (those with non-zero coefficients). #' This is the correct approach because: #' 1. We only performed hypothesis tests on non-zero coefficients #' 2. Including untested predictors would artificially inflate the correction #' 3. The multiple testing burden is already greatly reduced by testing only non-zero coefficients #' #' @param perm_results Data frame from calculate_all_permutation_pvalues #' @return Data frame with additional P_adj column #' apply_fdr_correction <- function(perm_results) { # Apply FDR correction only to tested predictors, within each gene perm_results <- perm_results %>% group_by(Gene) %>% mutate( # Only adjust p-values for predictors that were actually tested P_adj = ifelse(Tested, p.adjust(P_value[Tested], method = "BH")[cumsum(Tested)], NA) ) %>% ungroup() # Actually, the above is tricky. Let's do it more explicitly: perm_results <- perm_results %>% group_by(Gene) %>% mutate( P_adj = { # Get indices of tested predictors tested_idx <- which(Tested) # Initialize with NA padj <- rep(NA_real_, n()) # Apply FDR only to tested predictors if (length(tested_idx) > 0) { padj[tested_idx] <- p.adjust(P_value[tested_idx], method = "BH") } padj } ) %>% ungroup() # Report correction statistics n_tested <- sum(perm_results$Tested, na.rm = TRUE) n_genes <- n_distinct(perm_results$Gene) avg_tested_per_gene <- n_tested / n_genes cat(sprintf("\nFDR correction applied to %d tests across %d genes\n", n_tested, n_genes)) cat(sprintf("Average tests per gene: %.1f (vs. %d total predictors)\n", avg_tested_per_gene, n_distinct(perm_results$Predictor))) return(perm_results) } #' Summarize Significant Predictors #' #' @param perm_results Data frame with permutation test results #' @param fdr_threshold FDR threshold for significance #' @return List with summary statistics and significant predictor table #' summarize_significant_predictors <- function(perm_results, fdr_threshold = 0.05) { # Only consider tested predictors for significance significant <- perm_results %>% filter(Tested, P_adj < fdr_threshold) %>% arrange(Gene, P_adj) gene_summary <- significant %>% group_by(Gene) %>% summarize( N_significant = n(), Top_predictor = Predictor[which.min(P_adj)], Min_padj = min(P_adj), .groups = "drop" ) predictor_summary <- significant %>% mutate( Predictor_Type = case_when( grepl("^Cluster", Predictor) ~ "miRNA", grepl("^lncRNA", Predictor) ~ "lncRNA", grepl("^CpG", Predictor) ~ "CpG", TRUE ~ "Other" ) ) %>% group_by(Predictor_Type) %>% summarize( N_significant = n(), N_unique_predictors = n_distinct(Predictor), N_genes_affected = n_distinct(Gene), .groups = "drop" ) # Also provide summary of tested (non-zero) predictors tested_summary <- perm_results %>% filter(Tested) %>% mutate( Predictor_Type = case_when( grepl("^Cluster", Predictor) ~ "miRNA", grepl("^lncRNA", Predictor) ~ "lncRNA", grepl("^CpG", Predictor) ~ "CpG", TRUE ~ "Other" ) ) %>% group_by(Predictor_Type) %>% summarize( N_tested = n(), N_significant = sum(P_adj < fdr_threshold, na.rm = TRUE), Pct_significant = 100 * N_significant / N_tested, .groups = "drop" ) return(list( significant_predictors = significant, gene_summary = gene_summary, predictor_type_summary = predictor_summary, tested_summary = tested_summary )) } ################################################################################ # VISUALIZATION FUNCTIONS ################################################################################ #' Plot Permutation Test Results for a Single Gene plot_permutation_results <- function(gene_name, perm_results, top_n = 20, output_dir, species_code) { gene_data <- perm_results %>% filter(Gene == gene_name) %>% arrange(P_adj) %>% head(top_n) if (nrow(gene_data) == 0) return(NULL) gene_data <- gene_data %>% mutate( Significant = P_adj < 0.05, Label = sprintf("%s\n(p=%.3f)", Predictor, P_adj) ) p <- ggplot(gene_data, aes(x = reorder(Predictor, Abs_Observed_Coef), y = Abs_Observed_Coef)) + geom_point(aes(color = Significant), size = 3) + geom_errorbar(aes(ymin = Perm_Mean - Perm_SD, ymax = Perm_Mean + Perm_SD), width = 0.2, color = "gray50", alpha = 0.5) + geom_hline(yintercept = 0, linetype = "dashed", color = "gray30") + scale_color_manual(values = c("FALSE" = "gray50", "TRUE" = "red")) + coord_flip() + labs( title = sprintf("Permutation Test Results: %s", gene_name), subtitle = sprintf("Top %d predictors by coefficient magnitude", top_n), x = "Predictor", y = "Absolute Coefficient", color = "Significant\n(FDR < 0.05)", caption = "Error bars show mean ± SD of permuted coefficients" ) + theme_minimal() + theme(legend.position = "bottom") outfile <- file.path(output_dir, paste0(species_code, "_gene_plots/"), paste0(gene_name, "_permutation_test.png")) ggsave(outfile, plot = p, width = 8, height = 8) return(p) } #' Plot Summary of Permutation Results Across All Genes plot_permutation_summary <- function(perm_results, output_dir, species_code) { # 1. Distribution of p-values p1 <- ggplot(perm_results, aes(x = P_value)) + geom_histogram(bins = 50, fill = "steelblue", color = "white") + geom_vline(xintercept = 0.05, linetype = "dashed", color = "red") + labs( title = "Distribution of Permutation P-values", subtitle = "Red line indicates p = 0.05", x = "P-value", y = "Count" ) + theme_minimal() outfile <- file.path(output_dir, paste0(species_code, "_pvalue_distribution.png")) ggsave(outfile, plot = p1, width = 7, height = 5) # 2. Volcano plot p2 <- ggplot(perm_results, aes(x = Observed_Coef, y = -log10(P_adj))) + geom_point(aes(color = P_adj < 0.05), alpha = 0.3, size = 0.5) + geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") + scale_color_manual(values = c("FALSE" = "gray50", "TRUE" = "red")) + labs( title = "Volcano Plot: Effect Size vs. Significance", x = "Observed Coefficient", y = "-log10(FDR-adjusted p-value)", color = "Significant\n(FDR < 0.05)" ) + theme_minimal() + theme(legend.position = "bottom") outfile <- file.path(output_dir, paste0(species_code, "_volcano_plot.png")) ggsave(outfile, plot = p2, width = 8, height = 6) # 3. Summary by predictor type predictor_summary <- perm_results %>% mutate( Predictor_Type = case_when( grepl("^Cluster", Predictor) ~ "miRNA", grepl("^lncRNA", Predictor) ~ "lncRNA", grepl("^CpG", Predictor) ~ "CpG", TRUE ~ "Other" ), Significant = P_adj < 0.05 ) %>% group_by(Predictor_Type) %>% summarize( N_total = n(), N_significant = sum(Significant, na.rm = TRUE), Pct_significant = N_significant / N_total * 100, .groups = "drop" ) p3 <- ggplot(predictor_summary, aes(x = Predictor_Type, y = Pct_significant, fill = Predictor_Type)) + geom_bar(stat = "identity") + geom_text(aes(label = sprintf("%.1f%%\n(%d/%d)", Pct_significant, N_significant, N_total)), vjust = -0.5, size = 3) + labs( title = "Percentage of Significant Associations by Predictor Type", x = "Predictor Type", y = "% Significant (FDR < 0.05)" ) + theme_minimal() + theme(legend.position = "none") + ylim(0, max(predictor_summary$Pct_significant) * 1.2) outfile <- file.path(output_dir, paste0(species_code, "_predictor_type_summary.png")) ggsave(outfile, plot = p3, width = 6, height = 5) return(list(p1 = p1, p2 = p2, p3 = p3, predictor_summary = predictor_summary)) } ################################################################################ # DATA LOADING AND PREPROCESSING ################################################################################ ### mRNA ### cat("Loading gene counts\n") genes <- as.data.frame(read.csv(genes_file)) %>% select(-gene_id) colnames(genes) <- gsub("\\.", "-", colnames(genes)) rownames(genes) <- genes[,1] genes <- genes %>% select(-1) ### miRNA ### cat("Loading miRNA counts\n") miRNA <- read.table(file = miRNA_file, header = TRUE, sep = "\t", check.names = FALSE) colnames(miRNA) <- gsub("\\.", "-", colnames(miRNA)) rownames(miRNA) <- miRNA[,1] miRNA <- miRNA %>% select(-1) ### lncRNA ### cat("Loading lncRNA counts\n") lncRNA_full <- read.table(lncRNA_file, header = TRUE, sep = "\t", check.names = FALSE) rownames(lncRNA_full) <- lncRNA_full$Geneid lncRNA <- lncRNA_full %>% select(-Geneid, -Chr, -Start, -End, -Strand, -Length) colnames(lncRNA) <- gsub("\\.", "-", colnames(lncRNA)) ### WGBS data ### cat("Loading WGBS data\n") WGBS <- read.table(WGBS_file, header = TRUE) colnames(WGBS) <- gsub("\\.", "-", colnames(WGBS)) ### Load and format metadata ### cat("Loading metadata table\n") # Detect species prefix if (any(grepl("ACR", colnames(miRNA)))) { species_prefix <- "ACR" species_code <- "Apul" } else if (any(grepl("POR", colnames(miRNA)))) { species_prefix <- "POR" species_code <- "Peve" } else if (any(grepl("POC", colnames(miRNA)))) { species_prefix <- "POC" species_code <- "Ptuh" } else { stop("No recognized species prefix found in column names of miRNA data") } cat("Species prefix and code:", species_prefix, ",", species_code, "\n") metadata <- read_csv("../../M-multi-species/data/rna_metadata.csv", show_col_types = FALSE) %>% select(AzentaSampleName, ColonyID, Timepoint) %>% filter(grepl(species_prefix, ColonyID)) %>% as.data.frame() metadata$Sample <- paste0(metadata$ColonyID, "-", metadata$Timepoint) rownames(metadata) <- metadata$Sample colonies <- unique(metadata$ColonyID) metadata$Timepoint <- factor(metadata$Timepoint) metadata$ColonyID <- factor(metadata$ColonyID) # Create output subdirectory for gene plots dir.create(paste0(output_dir, "/", species_code, "_gene_plots"), showWarnings = FALSE) # --------- Data preprocessing ---------- ### Filter data sets genes <- genes %>% select(-any_of(excluded_samples)) miRNA <- miRNA %>% select(-any_of(excluded_samples)) lncRNA <- lncRNA %>% select(-any_of(excluded_samples)) WGBS <- WGBS %>% select(-any_of(excluded_samples)) metadata <- metadata[!(rownames(metadata) %in% excluded_samples), ] rownames(metadata) <- metadata$Sample #### WGBS WGBS_filt <- WGBS %>% filter(if_all(-1, ~ .x > 0)) WGBS_filt <- as.data.frame(WGBS_filt) rownames(WGBS_filt) <- WGBS_filt[,1] WGBS_filt <- WGBS_filt %>% select(-1) cat("Number of raw WGBS sites:", nrow(WGBS), "\n") cat("Number of WGBS sites retained after filtering:", nrow(WGBS_filt), "\n") #### RNA genes_red <- genes[rowSums(genes) != 0, ] miRNA_red <- miRNA[rowSums(miRNA) != 0, ] lncRNA_red <- lncRNA[rowSums(lncRNA) != 0, ] cat("Retained", nrow(genes_red), "of", nrow(genes), "genes;", nrow(miRNA_red), "of", nrow(miRNA), "miRNA; and", nrow(lncRNA_red), "of", nrow(lncRNA), "lncRNA through presence filtering\n") # pOverA filtering filtering_prop <- 3/ncol(genes_red) # genes: filt <- filterfun(pOverA(filtering_prop, 5)) gfilt <- genefilter(genes_red, filt) gkeep <- genes_red[gfilt,] gn.keep <- rownames(gkeep) genes_filt <- as.data.frame(genes_red[which(rownames(genes_red) %in% gn.keep),]) # miRNA: mifilt <- filterfun(pOverA(filtering_prop, 5)) mifilt <- genefilter(miRNA_red, mifilt) mikeep <- miRNA_red[mifilt,] mi.keep <- rownames(mikeep) miRNA_filt <- as.data.frame(miRNA_red[which(rownames(miRNA_red) %in% mi.keep),]) # lncRNA: lncfilt <- filterfun(pOverA(filtering_prop, 5)) lncfilt <- genefilter(lncRNA_red, lncfilt) lnckeep <- lncRNA_red[lncfilt,] lnc.keep <- rownames(lnckeep) lncRNA_filt <- as.data.frame(lncRNA_red[which(rownames(lncRNA_red) %in% lnc.keep),]) cat("Retained", nrow(genes_filt), "of", nrow(genes_red), "genes;", nrow(miRNA_filt), "of", nrow(miRNA_red), "miRNA; and", nrow(lncRNA_filt), "of", nrow(lncRNA_red), "lncRNA through pOverA filtering\n") ### Transform data desired_order <- rownames(metadata) genes_filt <- genes_filt[, desired_order] miRNA_filt <- miRNA_filt[, desired_order] lncRNA_filt <- lncRNA_filt[, desired_order] WGBS_filt <- WGBS_filt[, desired_order] # Variance stabilizing transformation ### genes ### dds_genes <- suppressMessages(DESeqDataSetFromMatrix(countData = genes_filt, colData = metadata, design = ~Timepoint+ColonyID)) vsd_genes <- suppressMessages(varianceStabilizingTransformation(dds_genes, blind = TRUE)) vsd_genes <- assay(vsd_genes) ### miRNA ### dds_miRNA <- suppressMessages(DESeqDataSetFromMatrix(countData = miRNA_filt, colData = metadata, design = ~Timepoint+ColonyID)) vsd_miRNA <- suppressMessages(varianceStabilizingTransformation(dds_miRNA, blind = TRUE)) vsd_miRNA <- assay(vsd_miRNA) ### lncRNA ### lncRNA_filt <- lncRNA_filt %>% mutate(across(where(is.numeric), round)) dds_lncRNA <- suppressMessages(DESeqDataSetFromMatrix(countData = lncRNA_filt, colData = metadata, design = ~Timepoint+ColonyID)) vsd_lncRNA <- assay(vst(dds_lncRNA, blind = TRUE)) ### WGBS ### WGBS_prop <- WGBS_filt/100 WGBS_prop[WGBS_prop == 0] <- 1e-6 WGBS_prop[WGBS_prop == 1] <- 1 - 1e-6 vsd_WGBS <- log2(WGBS_prop / (1 - WGBS_prop)) ### Scale standardization vsd_miRNA_scaled <- scale(vsd_miRNA) vsd_lncRNA_scaled <- scale(vsd_lncRNA) vsd_WGBS_scaled <- scale(vsd_WGBS) vsd_genes_scaled <- scale(vsd_genes) ### Merge predictor features identical(colnames(vsd_lncRNA_scaled), colnames(vsd_miRNA_scaled)) identical(colnames(vsd_lncRNA_scaled), colnames(vsd_WGBS_scaled)) pred_counts <- rbind(vsd_lncRNA_scaled, vsd_miRNA_scaled, vsd_WGBS_scaled) pred_counts <- t(pred_counts) cat("Predictor set dimensions:", dim(pred_counts), "\n") ### Format vsd_genes_t <- t(vsd_genes_scaled) cat("Gene set dimensions:", dim(vsd_genes_t), "\n") pred_counts <- as.data.frame(pred_counts) vsd_genes_t <- as.data.frame(vsd_genes_t) common_samples <- intersect(rownames(vsd_genes_t), rownames(pred_counts)) pred_counts <- pred_counts[common_samples, ] vsd_genes_t <- vsd_genes_t[common_samples, ] ################################################################################ # PART 1: BOOTSTRAPPED ELASTIC NET (WITH PARALLEL PROCESSING) ################################################################################ cat("\n=== PART 1: Bootstrapped Elastic Net Training ===\n") # Record start time for bootstrap bootstrap_start_time <- Sys.time() ## First round of bootstrapping (PARALLEL) cat("\n--- First Bootstrap Round ---\n") bootstrap1_results <- run_bootstrap_parallel( response_features = vsd_genes_t, predictor_features = pred_counts, n_reps = bootstrap1_reps, n_cores = n_cores, base_seed = 703, alpha = alpha_value, progress_prefix = "Bootstrap1" ) TM_list <- bootstrap1_results$TM_list MP_list <- bootstrap1_results$MP_list FI_list <- bootstrap1_results$FI_list cat("Finished first round of bootstrapping\n") # Combine results all_TM <- do.call(rbind, TM_list) all_MP <- do.call(rbind, MP_list) all_FI <- do.call(rbind, FI_list) # Summary statistics MP_summary <- all_MP %>% group_by(Feature) %>% summarize( Mean_R2 = mean(R2, na.rm = TRUE), SD_R2 = sd(R2, na.rm = TRUE), SE_R2 = SD_R2/sqrt(n()), High_Perf_Count = sum(R2 >= r2_threshold), .groups = 'drop' ) %>% arrange(desc(High_Perf_Count)) cat("On first round of bootstrapping,", sum(!is.na(MP_summary$Feature) & MP_summary$Mean_R2 > 0.5, na.rm = TRUE), "genes are consistently well-predicted\n") # Plotting all_MP <- merge(all_MP, MP_summary[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature") all_MP$Feature <- factor(all_MP$Feature, levels = MP_summary$Feature[order(MP_summary$Mean_R2)]) p <- ggplot(all_MP, aes(x = Feature, y = Mean_R2)) + geom_errorbar(aes(ymin = Mean_R2 - SD_R2, ymax = Mean_R2 + SD_R2), width = 0.4, color = "gray40", linewidth = 0.25) + geom_point(size = 2, color = "black") + geom_hline(yintercept = 0.5, linetype = "dashed", color = "blue") + theme_minimal() + labs(title = paste0(species_code, ": Bootstrap1, Mean R^2 with Error Bars"), x = "Gene (Ordered by Mean R^2)", y = "Mean R^2 ± SD") + theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5)) + scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), limits = c(-0.2, 1.2)) outfile <- file.path(output_dir, paste0(species_code, "_bootstrap1_performance_errorbars.png")) ggsave(outfile, plot = p, width = 7, height = 7) ## Second round: bootstrapping on well-predicted genes (PARALLEL) cat("\n--- Second Bootstrap Round ---\n") well_predicted <- MP_summary[MP_summary$Mean_R2 > r2_threshold, ] vsd_high_perf_t <- vsd_genes_t[, colnames(vsd_genes_t) %in% well_predicted$Feature] cat("Number of well-predicted genes for second bootstrap:", ncol(vsd_high_perf_t), "\n") bootstrap2_results <- run_bootstrap_parallel( response_features = vsd_high_perf_t, predictor_features = pred_counts, n_reps = bootstrap2_reps, n_cores = n_cores, base_seed = 1703, # Different base seed for second round alpha = alpha_value, progress_prefix = "Bootstrap2" ) TM_list <- bootstrap2_results$TM_list MP_list <- bootstrap2_results$MP_list FI_list <- bootstrap2_results$FI_list all_TM_highperf <- do.call(rbind, TM_list) all_MP_highperf <- do.call(rbind, MP_list) all_FI_highperf <- do.call(rbind, FI_list) MP_summary_highperf <- all_MP_highperf %>% group_by(Feature) %>% summarize( Mean_R2 = mean(R2, na.rm = TRUE), SD_R2 = sd(R2, na.rm = TRUE), SE_R2 = SD_R2 / sqrt(n()), High_Perf_Count = sum(R2 >= r2_threshold), .groups = 'drop' ) %>% arrange(desc(High_Perf_Count)) cat("On second round of bootstrapping,", nrow(MP_summary_highperf[MP_summary_highperf$Mean_R2 > r2_threshold, ]), "genes are again consistently well-predicted\n") # Report bootstrap timing bootstrap_end_time <- Sys.time() bootstrap_duration <- difftime(bootstrap_end_time, bootstrap_start_time, units = "mins") cat(sprintf("\nBootstrapping completed in %.2f minutes\n", bootstrap_duration)) ################################################################################ # PART 2: PERMUTATION SIGNIFICANCE TESTING (WITH PARALLEL PROCESSING) ################################################################################ cat("\n=== PART 2: Permutation-Based Significance Testing ===\n") cat("This step assesses statistical significance of each predictor coefficient\n") cat("by comparing observed coefficients to a null distribution from permuted data.\n\n") genes_for_permutation <- colnames(vsd_high_perf_t) cat(sprintf("Running permutation tests for %d well-predicted genes\n", length(genes_for_permutation))) # Record start time perm_start_time <- Sys.time() # Calculate permutation p-values (PARALLEL) perm_results <- calculate_all_permutation_pvalues( response_features = vsd_high_perf_t, predictor_features = pred_counts, n_perm = n_permutations, alpha = alpha_value, n_cores = n_cores, base_seed = 2703, progress_prefix = "Permutation" ) # Record end time and report perm_end_time <- Sys.time() perm_duration <- difftime(perm_end_time, perm_start_time, units = "mins") cat(sprintf("\nPermutation testing completed in %.2f minutes\n", perm_duration)) # Apply FDR correction cat("Applying FDR (Benjamini-Hochberg) correction for multiple testing\n") perm_results <- apply_fdr_correction(perm_results) # Summarize significant predictors cat("\n=== Significant Predictor Summary ===\n") sig_summary <- summarize_significant_predictors(perm_results, fdr_threshold = 0.05) cat(sprintf("Number of significant predictor-gene associations (FDR < 0.05): %d\n", nrow(sig_summary$significant_predictors))) cat(sprintf("Number of genes with at least one significant predictor: %d\n", nrow(sig_summary$gene_summary))) cat("\nBreakdown of TESTED predictors by type:\n") print(sig_summary$tested_summary) cat("\nBreakdown of SIGNIFICANT predictors by type:\n") print(sig_summary$predictor_type_summary) ################################################################################ # PART 3: VISUALIZATION AND OUTPUT ################################################################################ cat("\n=== PART 3: Generating Visualizations and Output Files ===\n") # 1. Summary plots cat("Creating summary plots...\n") summary_plots <- plot_permutation_summary(perm_results, output_dir, species_code) # 2. Per-gene permutation plots (for top genes) cat("Creating per-gene permutation test plots...\n") top_genes_for_plots <- sig_summary$gene_summary %>% arrange(desc(N_significant)) %>% head(20) %>% pull(Gene) for (gene in top_genes_for_plots) { plot_permutation_results(gene, perm_results, top_n = 20, output_dir, species_code) } # 3. Save full permutation results cat("Saving permutation test results...\n") write.csv(perm_results, file.path(output_dir, paste0(species_code, "_permutation_results_full.csv")), row.names = FALSE) write.csv(sig_summary$significant_predictors, file.path(output_dir, paste0(species_code, "_significant_predictors.csv")), row.names = FALSE) write.csv(sig_summary$gene_summary, file.path(output_dir, paste0(species_code, "_gene_summary.csv")), row.names = FALSE) write.csv(sig_summary$predictor_type_summary, file.path(output_dir, paste0(species_code, "_predictor_type_summary.csv")), row.names = FALSE) # 4. Create enhanced top predictors table with significance cat("Enhancing top predictors table with significance information...\n") get_predictor_type <- function(predictor_name) { if (startsWith(predictor_name, "Cluster")) return("miRNA") if (startsWith(predictor_name, "lncRNA")) return("lncRNA") if (startsWith(predictor_name, "CpG")) return("CpG") return("Other") } color_palette <- c(miRNA = "#E69F00", lncRNA = "#0072B2", CpG = "#009E73") all_MP_highperf <- merge(all_MP_highperf, MP_summary_highperf[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature") all_features_highR2 <- all_MP_highperf %>% filter(Mean_R2 > 0.5) %>% pull(Feature) %>% unique() get_feature_importance_for_feature <- function(model) { coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE] data.frame(Predictor = rownames(coefs), Importance = abs(as.numeric(coefs))) } top_predictors_with_sig <- list() for (target_feature in all_features_highR2) { gene_perm <- perm_results %>% filter(Gene == target_feature) %>% select(Predictor, P_value, P_adj) importance_all_reps <- lapply(TM_list, function(rep_entry) { model <- rep_entry[[target_feature]] if (!is.null(model)) { get_feature_importance_for_feature(model) } else { NULL } }) %>% bind_rows() if (nrow(importance_all_reps) == 0) next mean_importance <- importance_all_reps %>% group_by(Predictor) %>% summarise(MeanImportance = mean(Importance, na.rm = TRUE), .groups = "drop") %>% arrange(desc(MeanImportance)) %>% slice_head(n = 20) %>% mutate(Type = sapply(Predictor, get_predictor_type)) mean_importance <- mean_importance %>% left_join(gene_perm, by = "Predictor") %>% mutate( Significant = P_adj < 0.05, Significance_Label = case_when( P_adj < 0.001 ~ "***", P_adj < 0.01 ~ "**", P_adj < 0.05 ~ "*", TRUE ~ "" ) ) p <- ggplot(mean_importance, aes(x = reorder(Predictor, MeanImportance), y = MeanImportance, fill = Type)) + geom_bar(stat = "identity", aes(alpha = Significant)) + geom_text(aes(label = Significance_Label), hjust = -0.5, size = 4) + scale_fill_manual(values = color_palette) + scale_alpha_manual(values = c("FALSE" = 0.4, "TRUE" = 1)) + coord_flip() + theme_minimal() + labs( title = sprintf("Top Predictors for %s", target_feature), subtitle = "Significance: * p<0.05, ** p<0.01, *** p<0.001 (FDR-corrected)", x = "Predictor", y = "Mean Importance (across replicates)", alpha = "Significant\n(FDR < 0.05)" ) outfile <- file.path(output_dir, paste0(species_code, "_gene_plots/"), paste0(target_feature, "_predictors_with_sig.png")) ggsave(outfile, plot = p, width = 8, height = 8) mean_importance$Feature <- target_feature top_predictors_with_sig[[target_feature]] <- mean_importance } top_predictors_df <- bind_rows(top_predictors_with_sig) write.csv(top_predictors_df, file.path(output_dir, paste0(species_code, "_top_predictors_with_significance.csv")), row.names = FALSE) ################################################################################ # FINAL SUMMARY ################################################################################ cat("\n=== ANALYSIS COMPLETE ===\n") cat("End time:", as.character(Sys.time()), "\n\n") cat("Timing summary:\n") cat(sprintf(" - Bootstrapping: %.2f minutes\n", as.numeric(bootstrap_duration))) cat(sprintf(" - Permutation testing: %.2f minutes\n", as.numeric(perm_duration))) cat(sprintf(" - Total runtime: %.2f minutes\n", as.numeric(bootstrap_duration) + as.numeric(perm_duration))) cat("\nOutput files generated:\n") cat(sprintf(" 1. %s_permutation_results_full.csv - All predictor-gene p-values\n", species_code)) cat(sprintf(" 2. %s_significant_predictors.csv - FDR < 0.05 associations only\n", species_code)) cat(sprintf(" 3. %s_gene_summary.csv - Summary per gene\n", species_code)) cat(sprintf(" 4. %s_predictor_type_summary.csv - Summary by predictor type\n", species_code)) cat(sprintf(" 5. %s_top_predictors_with_significance.csv - Top predictors with p-values\n", species_code)) cat(sprintf(" 6. Various plots in %s/\n", output_dir)) cat("\nKey statistics:\n") cat(sprintf(" - Total genes analyzed: %d\n", ncol(vsd_genes_t))) cat(sprintf(" - Well-predicted genes (R2 > %.2f): %d\n", r2_threshold, ncol(vsd_high_perf_t))) cat(sprintf(" - Significant predictor-gene associations: %d\n", nrow(sig_summary$significant_predictors))) cat(sprintf(" - Bootstrap replicates: %d (round 1), %d (round 2)\n", bootstrap1_reps, bootstrap2_reps)) cat(sprintf(" - Permutations per gene: %d\n", n_permutations)) cat(sprintf(" - Cores used: %d\n", n_cores)) cat("\nStatistical notes:\n") cat(" - P-values were calculated using permutation testing\n") cat(" - Multiple testing correction: Benjamini-Hochberg FDR (within each gene)\n") cat(" - Significance threshold: FDR < 0.05\n") cat(sprintf("\nLog file saved to: %s\n", log_file_path)) cat("\n=== END OF SCRIPT ===\n") ################################################################################ # CLOSE LOGGING ################################################################################ close_logging() # Restore original cat and print functions cat <- original_cat print <- original_print cat(sprintf("Analysis complete. Log saved to: %s\n", log_file_path))