---
title: "26.1-rank35-optimization-ElasticNet-component24"
author: "Kathleen Durkin"
date: "2025-11-11"
output: 
  github_document:
    toc: true
    number_sections: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
  bookdown::html_document2:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
    code_download: true
bibliography: references.bib
---

Steven has run some optimization tests to settle on a rank of 35 for the Barnacle tensor decomposition tool (`26-rank35-optimization`). I'm taking the gene set from component 24 of this run and applying our Elastic Net model to predict expression of this gene set from multiomic epigenetic predictors (miRNA + lncRNA + CpG site methylation), for all 3 species.


Inputs:

-   RNA counts matrix (for gene set, raw):

    - `https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/Component_24_apul_count_matrix.csv`
    
    - `https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/Component_24_peve_count_matrix.csv`
    
    - `https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/Component_24_ptua_count_matrix.csv`

-   miRNA counts matrix (raw):  

    - `https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/10.1-format-miRNA-counts-updated/Apul_miRNA_counts_formatted_updated.txt`
    
    - `https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/10.1-format-miRNA-counts-updated/Peve_miRNA_counts_formatted_updated.txt`
    
    - `https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/10.1-format-miRNA-counts-updated/Ptuh_miRNA_counts_formatted_updated.txt`

-   lncRNA counts matrix (raw):

    - `https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/D-Apul/output/31.5-Apul-lncRNA-discovery/lncRNA_counts.clean.filtered.txt`
    
    - `https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/E-Peve/output/12-Peve-lncRNA-discovery/lncRNA_counts.clean.filtered.txt`
    
    - `https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/F-Ptua/output/06-Ptua-lncRNA-discovery/lncRNA_counts.clean.filtered.txt`

-   WGBS data (processed): Shelly's generated a WGBS count tables for a bunch of different parameter combinations (alignment score and sample coverage) (see details in [the `timeseries_molecular` wiki](https://github.com/urol-e5/timeseries_molecular/wiki/02%E2%80%90Expression-Count-Matrices)). For now, let's go with the most conservative parameter combination available for all 3 species, which is an alignment score of -0.8 and coverage in all samples (n=39 in Apul, n=37 in Peve, and n=32 in Ptuh):
  
    - `https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/22.5-Apul-multiomic-SR/Apul-filtered-WGBS-CpG-counts.csv`
    
    - `https://gannet.fish.washington.edu/metacarcinus/E5/Pevermanni/20250821_meth_Peve/merged-WGBS-CpG-counts_filtered.csv`
    
    - `https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250821_meth_Ptua/merged-WGBS-CpG-counts_filtered.csv`

-   sample metadata:  `../../M-multi-species/data/rna_metadata.csv`


# Set up 

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, cache=TRUE)
```

# Load libraries 
```{r, warning=FALSE, message=FALSE}
library(tidyverse)
library(ggplot2)
library(DESeq2)
library(igraph)
library(psych)
library(tidygraph)
library(ggraph)
library(WGCNA)
library(edgeR)
library(reshape2)
library(ggcorrplot)
library(corrplot)
library(rvest)
library(purrr)
library(pheatmap)
library(glmnet)
library(caret)
library(factoextra)
library(vegan)
library(ggfortify)
library(genefilter)
library(scales)
library(purrr)
```

The model includes random processes, so set a seed for reproducability
```{r}
set.seed(703)
```

# Define model functions

```{r}
train_models <- function(response_features, predictor_features) {
  models <- list()
  
  for (feature in colnames(response_features)) {
    y <- response_features[[feature]]  # Gene expression
    X <- as.matrix(predictor_features)  # miRNA/lncRNA/methylation as predictors
    
    # Train elastic net model (alpha = 0.5 for mix of LASSO & Ridge)
    # Reducing nfolds from 10 (default) to 5, since I'm using smaller sample sizes of just 10 samples per timepoint
    model <- cv.glmnet(X, y, alpha = 0.5, nfolds = 3)
    
    models[[feature]] <- model
  }
  
  return(models)
}

get_feature_importance <- function(models) {
  importance_list <- lapply(models, function(model) {
    coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE]  # Convert to regular matrix & remove intercept
    
    # Convert to data frame
    coefs_df <- data.frame(Feature = rownames(coefs), Importance = as.numeric(coefs))
    
    return(coefs_df)
  })
  
  # Combine feature importance across all predicted genes
  importance_df <- bind_rows(importance_list) %>%
    group_by(Feature) %>%
    summarize(MeanImportance = mean(abs(Importance)), .groups = "drop") %>%
    arrange(desc(MeanImportance))
  
  return(importance_df)
}

evaluate_model_performance <- function(models, response_features, predictor_features) {
  results <- data.frame(Feature = colnames(response_features), R2 = NA)

  for (feature in colnames(response_features)) {
    y <- response_features[[feature]]
    X <- as.matrix(predictor_features)

    model <- models[[feature]]
    preds <- predict(model, X, s = "lambda.min")

    R2 <- cor(y, preds)^2  # R-squared metric
    results[results$Feature == feature, "R2"] <- R2
  }

  return(results)
}
```


# Define training and testing function

```{r}
train_models_split <- function(response_features, predictor_features, train_frac = 0.8, alpha = 0.5, nfolds = 10) {
  
  # Set up storage
  models <- list()
  performance <- data.frame(Feature = colnames(response_features), R2 = NA)

  # Select fraction of data
  sample_idx <- sample(seq_len(nrow(predictor_features)), size = floor(train_frac * nrow(predictor_features)))

  # Split predictor data
  X_train <- predictor_features[sample_idx, ]
  X_test  <- predictor_features[-sample_idx, ]

  for (feature in colnames(response_features)) {
    # Split response data
    y_train <- response_features[sample_idx, feature]
    y_test  <- response_features[-sample_idx, feature]

    
    # Check if y_train is constant
    if (length(unique(y_train)) == 1) {
      warning(sprintf("Skipping feature '%s': y_train is constant", feature))
      next
    }
    
    # Run model on training data
    model <- cv.glmnet(as.matrix(X_train), y_train, alpha = alpha, nfolds = nfolds)
    models[[feature]] <- model

    # Now apply the trained model to test data
    preds <- predict(model, newx = as.matrix(X_test), s = "lambda.min")
    R2 <- cor(y_test, preds)^2
    performance[performance$Feature == feature, "R2"] <- R2
  }

  return(list(models = models, performance = performance))
}

```
# A. pulchra
## Load and format data 

```{r}
### mRNA ###
# raw gene counts data (will filter and variance stabilize)
Apul_genes <- as.data.frame(read_csv("https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/Component_24_apul_count_matrix.csv"))
Peve_genes <- as.data.frame(read_csv("https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/Component_24_peve_count_matrix.csv"))
Ptuh_genes <- as.data.frame(read_csv("https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/Component_24_ptua_count_matrix.csv"))

# format gene IDs as rownames (instead of a column)
rownames(Apul_genes) <- Apul_genes$gene_id
Apul_genes <- Apul_genes%>%select(!gene_id)

### miRNA ###
# raw miRNA counts (will filter and variance stabilize)
Apul_miRNA <- read.table(file = "https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/10.1-format-miRNA-counts-updated/Apul_miRNA_counts_formatted_updated.txt", header = TRUE, sep = "\t", check.names = FALSE)

# format miRNA IDs as rownames (instead of a column)
rownames(Apul_miRNA) <- Apul_miRNA$Name
Apul_miRNA <- Apul_miRNA%>%select(!Name)

### lncRNA ###
# raw lncRNA counts (will filter and variance stabilize)
Apul_lncRNA_full <- read.table("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/D-Apul/output/31.5-Apul-lncRNA-discovery/lncRNA_counts.clean.filtered.txt", header = TRUE, sep = "\t")

# Remove info on genomic location, set lncRNA IDs as rownames
rownames(Apul_lncRNA_full) <- Apul_lncRNA_full$Geneid
Apul_lncRNA <- Apul_lncRNA_full %>% select(-Geneid, -Chr, -Start, -End, -Strand, -Length)

# Format sample names to match standard (SPEC-Sample#-TP#)
colnames(Apul_lncRNA) <- gsub("\\.", "-", colnames(Apul_lncRNA))


### WGBS data ###
Apul_WGBS <- read.csv("https://gannet.fish.washington.edu/gitrepos/urol-e5/timeseries_molecular/D-Apul/output/22.5-Apul-multiomic-SR/Apul-filtered-WGBS-CpG-counts.csv")

# Format sample names to match standard (SPEC-Sample#-TP#)
colnames(Apul_WGBS) <- gsub("\\.", "-", colnames(Apul_WGBS))


### load and format metadata ###
metadata <- read_csv("../../M-multi-species/data/rna_metadata.csv")%>%select(AzentaSampleName, ColonyID, Timepoint) %>%
  filter(grepl("ACR", ColonyID))
metadata$Sample <- paste0(metadata$ColonyID, "-", metadata$Timepoint)
rownames(metadata) <- metadata$Sample

colonies <- unique(metadata$ColonyID)
```

### Filter data sets 

#### WGBS

Only keep CpGs that have a non-zero value in all samples. 

```{r}
Apul_WGBS_filt <- Apul_WGBS %>% filter(if_all(-X, ~ .x > 0))

# Ensure it's formatted as a data frame
Apul_WGBS_filt <- as.data.frame(Apul_WGBS_filt)
# Set CpG IDs to rownames
rownames(Apul_WGBS_filt) <- Apul_WGBS_filt$X
Apul_WGBS_filt <- Apul_WGBS_filt %>% select(-X)

nrow(Apul_WGBS)
nrow(Apul_WGBS_filt)
```

We had 96,785 CpGs before filtering and have 14,872 after filtering. 

#### RNA

Remove the sample with poor WGBS data quality (`ACR-225-TP1`), which was already excluded from the WGBS counts

```{r}
Apul_genes <- Apul_genes %>% select(-"ACR-225-TP1")
Apul_miRNA <- Apul_miRNA %>% select(-"ACR-225-TP1")
Apul_lncRNA <- Apul_lncRNA %>% select(-"ACR-225-TP1")
```

Ensure we only have genes, miRNA, and lncRNA that are present in at least one sample
```{r}
# genes
Apul_genes_red <- Apul_genes[rowSums(Apul_genes) != 0, ]
# miRNA
Apul_miRNA_red <- Apul_miRNA[rowSums(Apul_miRNA) != 0, ]
# lncRNA
Apul_lncRNA_red <- Apul_lncRNA[rowSums(Apul_lncRNA) != 0, ]

cat("Retained ", nrow(Apul_genes_red), " of ", nrow(Apul_genes), "genes; ",
       nrow(Apul_miRNA_red), " of ", nrow(Apul_miRNA), " miRNA; and ", 
       nrow(Apul_lncRNA_red), " of ", nrow(Apul_lncRNA), " lncRNA")
```

*pOverA*: 
Specifying the minimum count for a proportion of samples for each gene. Setting 3/40 = 0.08. This would retain genes that are only expressed in a single season in a couple of the colonies. Additionally, setting the minimum count so that the minimum number of samples must have a gene count above a certain threshold. 

genes:
```{r}
filt <- filterfun(pOverA(0.08, 5))

#create filter for the counts data
gfilt <- genefilter(Apul_genes_red, filt)

#identify genes to keep by count filter
gkeep <- Apul_genes_red[gfilt,]

#identify gene lists
gn.keep <- rownames(gkeep)

#gene count data filtered in PoverA, P percent of the samples have counts over A
Apul_genes_filt <- as.data.frame(Apul_genes_red[which(rownames(Apul_genes_red) %in% gn.keep),])

#How many rows do we have before and after filtering?
cat("Pre-filtering:", nrow(Apul_genes_red), "; Post-filtering:", nrow(Apul_genes_filt))
```

miRNA:
```{r}
mifilt <- filterfun(pOverA(0.08, 5))

#create filter for the counts data
mifilt <- genefilter(Apul_miRNA_red, mifilt)

#identify miRNA to keep by count filter
mikeep <- Apul_miRNA_red[mifilt,]

#identify miRNA to keep by count filter
mikeep <- Apul_miRNA_red[mifilt,]

#identify miRNA lists
mi.keep <- rownames(mikeep)

#miRNA count data filtered in PoverA, P percent of the samples have counts over A
Apul_miRNA_filt <- as.data.frame(Apul_miRNA_red[which(rownames(Apul_miRNA_red) %in% mi.keep),])

#How many rows do we have before and after filtering?
cat("Pre-filtering:", nrow(Apul_miRNA_red), "; Post-filtering:", nrow(Apul_miRNA_filt))
```

Of the 51 miRNA, 47 were retained. Which were removed?
```{r}
setdiff(rownames(Apul_miRNA_red), rownames(Apul_miRNA_filt))
```

lncRNA:
```{r}
lncfilt <- filterfun(pOverA(0.08, 5))

#create filter for the counts data
lncfilt <- genefilter(Apul_lncRNA_red, lncfilt)

#identify lncRNA to keep by count filter
lnckeep <- Apul_lncRNA_red[lncfilt,]

#identify lncRNA to keep by count filter
lnckeep <- Apul_lncRNA_red[lncfilt,]

#identify lncRNA lists
lnc.keep <- rownames(lnckeep)

#lncRNA count data filtered in PoverA, P percent of the samples have counts over A
Apul_lncRNA_filt <- as.data.frame(Apul_lncRNA_red[which(rownames(Apul_lncRNA_red) %in% lnc.keep),])

#How many rows do we have before and after filtering?
cat("Pre-filtering:", nrow(Apul_lncRNA_red), "; Post-filtering:", nrow(Apul_lncRNA_filt))
```

### Transform data 

Remove Sample `ACR-225-TP1` from metadata table
```{r}
metadata <-  metadata[!(rownames(metadata) == "ACR-225-TP1"), ]
rownames(metadata) <- metadata$Sample
```

Set the order of genes, miRNA, lncRNA, wgbs, and metadata to all be the same. 
```{r}
# Ensure rownames of metadata are used as the desired column order
desired_order <- rownames(metadata)

# Reorder data frame columns
Apul_genes_filt <- Apul_genes_filt[, desired_order]
Apul_miRNA_filt <- Apul_miRNA_filt[, desired_order]
Apul_lncRNA_filt <- Apul_lncRNA_filt[, desired_order]
Apul_WGBS_filt <- Apul_WGBS_filt[, desired_order]

# Check they all match
identical(rownames(metadata), colnames(Apul_genes_filt))
identical(rownames(metadata), colnames(Apul_miRNA_filt))
identical(rownames(metadata), colnames(Apul_lncRNA_filt))
identical(rownames(metadata), colnames(Apul_WGBS_filt))
```

Use a variance stabilized transformation for all four data sets. Variance stabilization essentially tries to make variance independent of the mean

(Is this the most appropriate design to use?)

genes:
```{r}
dds_Apul <- DESeqDataSetFromMatrix(countData = Apul_genes_filt, 
                              colData = metadata, 
                              design = ~Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Apul_genes <- varianceStabilizingTransformation(dds_Apul, blind = TRUE)  # Must use varianceStabilizingTransformation() instead of vst() due to few input genes
vsd_Apul_genes <- assay(vsd_Apul_genes)
```

miRNA:
```{r}
dds_miRNA <- DESeqDataSetFromMatrix(countData = Apul_miRNA_filt, 
                              colData = metadata, 
                              design = ~Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Apul_miRNA <- varianceStabilizingTransformation(dds_miRNA, blind=TRUE) # Must use varianceStabilizingTransformation() instead of vst() due to few input genes
vsd_Apul_miRNA <- assay(vsd_Apul_miRNA)

```

lncRNA:
```{r}
# All values must be integers
Apul_lncRNA_filt <- Apul_lncRNA_filt %>% mutate(across(where(is.numeric), round))

dds_lncRNA <- DESeqDataSetFromMatrix(countData = Apul_lncRNA_filt, 
                              colData = metadata, 
                              design = ~Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Apul_lncRNA <- assay(vst(dds_lncRNA, blind = TRUE))
```

Also must round WGBS data to whole integers for normalization - need to return to this to decide if this is appropriate. 
```{r}
# All values must be integers
Apul_WGBS_filt<-Apul_WGBS_filt %>% mutate(across(where(is.numeric), round))

dds_wgbs <- DESeqDataSetFromMatrix(countData = Apul_WGBS_filt, 
                              colData = metadata, 
                              design = ~ Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Apul_WGBS <- assay(varianceStabilizingTransformation(dds_wgbs, blind = TRUE))
```

### Merge predictor features

Create a merged dataset that contains the variance stabilized counts for all miRNA, lncRNA, and CpGs.

```{r}
# Triple check that all three data frames have sample names in the same order
identical(colnames(vsd_Apul_lncRNA), colnames(vsd_Apul_miRNA))
identical(colnames(vsd_Apul_lncRNA), colnames(vsd_Apul_WGBS))

# Bind (stack dataframes vertically, so that they match by column/sample)
Apul_pred_counts <- rbind(vsd_Apul_lncRNA, vsd_Apul_miRNA, vsd_Apul_WGBS)

# Transform so that samples are on rows and features are in columns
Apul_pred_counts <- t(Apul_pred_counts)

dim(Apul_pred_counts)
```

### Format

```{r}
# Transform the counts matrices so that samples are on the rows and gene IDs on the columns
vsd_Apul_genes_t <- t(vsd_Apul_genes)

# Ensure both are formatted as data frames
Apul_pred_counts <- as.data.frame(Apul_pred_counts)
vsd_Apul_genes_t <- as.data.frame(vsd_Apul_genes_t)

# Ensure sample matching between gene and epigenetic dfs
common_samples <- intersect(rownames(vsd_Apul_genes_t), rownames(Apul_pred_counts))

Apul_pred_counts <- Apul_pred_counts[common_samples, ]
vsd_Apul_genes_t <- vsd_Apul_genes_t[common_samples,]
```


## Split training and test
## Bootstrapping

```{r, cache=TRUE}
n_replicates <- 10  # number of replicates
r2_threshold <- 0.5 # threshold for "good" prediction
Apul_TM_list <- list()
Apul_MP_list <- list()
Apul_FI_list <- list()

# Perform bootstrapping
for (i in 1:n_replicates) {
  cat("Beginning replicate", i)
  cat("\n")
  Apul_result <- train_models_split(vsd_Apul_genes_t, Apul_pred_counts)
  
  cat("Extracting results for replicate", i)
  cat("\n")
  Apul_trained_models <- Apul_result$models
  Apul_model_performance <- Apul_result$performance
  Apul_feature_importance <- get_feature_importance(Apul_trained_models)
  
  Apul_TM_list[[i]] <- Apul_trained_models
  Apul_TM_list[[i]]$Replicate <- i
  
  Apul_MP_list[[i]] <- Apul_model_performance
  Apul_MP_list[[i]]$Replicate <- i
  
  Apul_FI_list[[i]] <- Apul_feature_importance
  Apul_FI_list[[i]]$Replicate <- i
}

# Combine all results
Apul_all_TM <- do.call(rbind, Apul_TM_list)
Apul_all_MP <- do.call(rbind, Apul_MP_list)
Apul_all_FI <- do.call(rbind, Apul_FI_list)
```

```{r}
# Count how often each feature exceeds the R² threshold

Apul_MP_summary <- Apul_all_MP %>%
  group_by(Feature) %>%
  summarize(
    Mean_R2 = mean(R2, na.rm = TRUE),
    SD_R2 = sd(R2, na.rm = TRUE),
    SE_R2 = SD_R2/sqrt(n()),
    High_Perf_Count = sum(R2 >= r2_threshold),
    .groups = 'drop'
  ) %>%
  arrange(desc(High_Perf_Count))

# View top consistently high-performing features
print(Apul_MP_summary)

# How many genes are consistently well-predicted?
sum(!is.na(Apul_MP_summary$Feature) & Apul_MP_summary$Mean_R2 > 0.5, na.rm = TRUE)

```

Sort genes by average R^2 across all replicates, then plot R^2 for all replicates
```{r}
# Merge mean R2 info into full replicate-level data
Apul_all_MP <- merge(Apul_all_MP, Apul_MP_summary[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature")

# Reorder Feature factor by Mean_R2
Apul_all_MP$Feature <- factor(Apul_all_MP$Feature,
                                          levels = Apul_MP_summary$Feature[order(Apul_MP_summary$Mean_R2)])
```

```{r}
# Plot mean R² with error bars
# Choose SD to describe variability in model performance
ggplot(Apul_all_MP, aes(x = Feature, y = Mean_R2)) +
  geom_errorbar(aes(ymin = Mean_R2 - SD_R2, ymax = Mean_R2 + SD_R2), width = 0.4, color = "gray40", linewidth = 0.25) +
  geom_point(size = 2, color = "black") +
  geom_hline(yintercept = 0.5,
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Apul: Mean R² with Error Bars Across Gene Features",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "Mean R² ± SD") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=5)) +
  ylim(-0.2, 1.2) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1))

```

```{r}
# Plot with features sorted by mean R²
ggplot(Apul_all_MP, aes(x = Feature, y = R2)) +
  geom_point(aes(color = as.factor(Replicate)), size = 1) +
  geom_point(aes(y = Mean_R2), color = "black", shape = 18, size = 3) + 
  geom_hline(yintercept = mean(Apul_all_MP$R2, na.rm = TRUE),
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Apul: Model Performance Across Gene Expression",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "R² (Variance Explained)",
       color = "Replicate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 1)

```

Average importance of all predictive features and plot those with highest average importance accross all replicates
```{r}
# Average importance across replicates
Apul_mean_importance_df <- Apul_all_FI %>%
  group_by(Feature) %>%
  summarise(MeanImportance = mean(MeanImportance, na.rm = TRUE)) %>%
  arrange(desc(MeanImportance))

# Select top 50 features
Apul_top_features <- Apul_mean_importance_df %>% top_n(50, MeanImportance)

# Plot
ggplot(Apul_top_features, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip for readability
  theme_minimal() +
  labs(title = "Apul: Top Predictive Features by Mean Importance",
       x = "Feature",
       y = "Mean Importance")

```

## Boostrapping on predictable genes

Now I only want to model genes that are well-predicted over many replicates

Isolate well-predicted genes
```{r}
Apul_well_predicted <- Apul_MP_summary[Apul_MP_summary$Mean_R2 > 0.5,]

vsd_Apul_high_perf_t <- vsd_Apul_genes_t[,colnames(vsd_Apul_genes_t) %in% Apul_well_predicted$Feature]
dim(vsd_Apul_high_perf_t)
```

Of the original 100 genes selected from component 24, 75 are consistently well-predicted by our model (mean R^2 > 0.5, over 10 replicates).


Run bootstrapping on isolated genes
```{r, cache=TRUE}
n_replicates <- 25  # number of replicates
r2_threshold <- 0.5 # threshold for "good" prediction
Apul_TM_list <- list()
Apul_MP_list <- list()
Apul_FI_list <- list()

# Perform bootstrapping
for (i in 1:n_replicates) {
  cat("Beginning replicate", i)
  cat("\n")
  Apul_result <- train_models_split(vsd_Apul_high_perf_t, Apul_pred_counts)
  
  cat("Extracting results for replicate", i)
  cat("\n")
  Apul_trained_models <-Apul_result$models
  Apul_model_performance <- Apul_result$performance
  Apul_feature_importance <- get_feature_importance(Apul_trained_models)
  
  Apul_TM_list[[i]] <- Apul_trained_models
  Apul_TM_list[[i]]$Replicate <- i
  
  Apul_MP_list[[i]] <- Apul_model_performance
  Apul_MP_list[[i]]$Replicate <- i
  
  Apul_FI_list[[i]] <- Apul_feature_importance
  Apul_FI_list[[i]]$Replicate <- i
}

# Combine all results
Apul_all_TM_highperf <- do.call(rbind, Apul_TM_list)
Apul_all_MP_highperf <- do.call(rbind, Apul_MP_list)
Apul_all_FI_highperf <- do.call(rbind, Apul_FI_list)
```

```{r}
# Count how often each feature exceeds the R² threshold

Apul_MP_summary_highperf <- Apul_all_MP_highperf %>%
  group_by(Feature) %>%
  summarize(
    Mean_R2 = mean(R2, na.rm = TRUE),
    SD_R2 = sd(R2, na.rm = TRUE),
    SE_R2 = SD_R2 / sqrt(n()),
    High_Perf_Count = sum(R2 >= r2_threshold),
    .groups = 'drop'
  ) %>%
  arrange(desc(High_Perf_Count))

# View top consistently high-performing features
print(Apul_MP_summary_highperf)

# How many genes are consistently well-predicted?
nrow(Apul_MP_summary_highperf[Apul_MP_summary_highperf$Mean_R2 > 0.5,])
```

When I train the model on only these 75 well-predicted genes, 68 of them are again well-predicted. That's a much higher success rate! This suggests that epigenetic mechanisms may indeed be able to predict expression of these genes


Sort genes by average R^2 across all replicates, then plot R^2 for all replicates
```{r}
# Merge mean R2 info into full replicate-level data
Apul_all_MP_highperf <- merge(Apul_all_MP_highperf, Apul_MP_summary_highperf[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature")

# Reorder Feature factor by Mean_R2
Apul_all_MP_highperf$Feature <- factor(Apul_all_MP_highperf$Feature,
                                          levels = Apul_MP_summary_highperf$Feature[order(Apul_MP_summary_highperf$Mean_R2)])
```


```{r}
# Plot mean R² with error bars
# Choose SD to describe variability in model performance
ggplot(Apul_all_MP_highperf, aes(x = Feature, y = Mean_R2)) +
  geom_errorbar(aes(ymin = Mean_R2 - SD_R2, ymax = Mean_R2 + SD_R2), width = 0.3, color = "gray40") +
  geom_point(size = 2, color = "black") +
  geom_hline(yintercept = 0.5,
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Apul: Mean R² with Error Bars Across Gene Features",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "Mean R² ± SD") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(-0.2, 1.2) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1))

```

```{r}
# Plot all replicate R2s, with features sorted by mean R²
ggplot(Apul_all_MP_highperf, aes(x = Feature, y = R2)) +
  geom_point(aes(color = as.factor(Replicate)), size = 1.5) +
  geom_point(aes(y = Mean_R2), color = "black", shape = 18, size = 3) + 
  geom_hline(yintercept = mean(Apul_all_MP_highperf$R2, na.rm = TRUE),
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Apul: Model Performance Across Gene Expression",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "R² (Variance Explained)",
       color = "Replicate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 1)

```

```{r}
# Average importance across replicates
Apul_mean_importance_df_highperf <- Apul_all_FI_highperf %>%
  group_by(Feature) %>%
  summarise(MeanImportance = mean(MeanImportance, na.rm = TRUE)) %>%
  arrange(desc(MeanImportance))

# Select top 50 features
Apul_top_features_highperf <- Apul_mean_importance_df_highperf %>% top_n(50, MeanImportance)

# Plot
ggplot(Apul_top_features_highperf, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip for readability
  theme_minimal() +
  labs(title = "Apul: Top Predictive Features by Mean Importance",
       x = "Feature",
       y = "Mean Importance")

```

Look at most predictive epigenetic features for these well-predicted genes
```{r}
# Define color assignments for predictor types
color_palette <- c(miRNA = "#E69F00", lncRNA = "#0072B2", CpG = "#009E73")

#  Define well-predicted features
Apul_all_features_highR2 <- Apul_all_MP_highperf %>%
  filter(Mean_R2 > 0.5) %>%
  pull(Feature) %>%
  unique()

# Function to extract importance from a single model
get_feature_importance_for_feature <- function(model) {
  coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE]  # remove intercept
  data.frame(Predictor = rownames(coefs), Importance = abs(as.numeric(coefs)))
}

# Function to assign predictor type
get_predictor_type <- function(predictor_name) {
  if (startsWith(predictor_name, "Cluster")) return("miRNA")
  if (startsWith(predictor_name, "lncRNA")) return("lncRNA")
  if (startsWith(predictor_name, "CpG")) return("CpG")
  return("Other")
}

# Initialize list to save top predictors
Apul_top_predictors <- list()

# Loop over features, aggregate importance across replicates, and plot
for (target_feature in Apul_all_features_highR2) {
  
  # Extract and combine importance scores across all replicates
  importance_all_reps <- lapply(Apul_TM_list, function(rep_entry) {
    model <- rep_entry[[target_feature]]
    if (!is.null(model)) {
      get_feature_importance_for_feature(model)
    } else {
      NULL
    }
  }) %>% bind_rows()

  # If no importance data was found, skip
  if (nrow(importance_all_reps) == 0) next

  # Compute average importance across replicates
  mean_importance <- importance_all_reps %>%
    group_by(Predictor) %>%
    summarise(MeanImportance = mean(Importance, na.rm = TRUE), .groups = "drop") %>%
    arrange(desc(MeanImportance)) %>%
    slice_head(n = 20) %>%
    mutate(Type = sapply(Predictor, get_predictor_type))


  # Plot once per target feature
  plot <- ggplot(mean_importance, aes(x = reorder(Predictor, MeanImportance), y = MeanImportance, fill=Type)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = color_palette) +
    coord_flip() +
    theme_minimal() +
    labs(title = paste("Top Predictors for", target_feature),
         x = "Predictor",
         y = "Mean Importance (across replicates)")

  # Save top 20 most important predictors for later use
  top20 <- mean_importance %>% 
    mutate(Feature = target_feature)
  
  Apul_top_predictors[[target_feature]] <- top20
  
  print(plot)
}
```


## Plot expression of genes and their predictors

```{r, error=FALSE, warning=FALSE}
# Put predictor expression in format with samples in columns
Apul_pred_counts_t <- t(Apul_pred_counts)

# Loop over each feature in the list
for (feature in names(Apul_top_predictors)) {
  
  # Get top 5 predictors for this feature
  top_df <- Apul_top_predictors[[feature]] %>%
    slice_head(n = 5)
  predictors <- top_df$Predictor
  
  # Combine feature and predictors
  all_genes <- c(feature, predictors)
  
  # Check which genes are actually present
  present_genes <- all_genes[all_genes %in% rownames(Apul_pred_counts_t) | all_genes %in% rownames(vsd_Apul_genes)]
  
  if (!(feature %in% rownames(vsd_Apul_genes))) {
    warning(paste("Feature", feature, "not found in vsd_Apul_genes Skipping..."))
    next
  }

  # Extract predictor expression (only those present)
  predictor_expr <- Apul_pred_counts_t[intersect(predictors, rownames(Apul_pred_counts_t)), , drop = FALSE] %>% as.data.frame()

  # Add the feature expression
  feature_expr <- vsd_Apul_genes[feature, , drop = FALSE] %>% as.data.frame()
  
  # Combine into one data frame
  combined_expr <- rbind(predictor_expr, feature_expr)
  combined_expr$Gene <- rownames(combined_expr)
  
  # Convert to long format
  expr_long <- combined_expr %>%
    pivot_longer(-Gene, names_to = "Sample", values_to = "Expression")
  
  # Join with sample metadata
  expr_long <- expr_long %>%
    left_join(metadata, by = "Sample")
  
  # Plot, with colonies aggregated by timepoint
  # p <- ggplot(expr_long, aes(x = interaction(Timepoint), y = Expression, color = Gene, group = Gene)) +
  #   geom_point(alpha = 0.7) +
  #   geom_smooth(se = FALSE, method = "loess") +
  #   labs(title = paste("Expression of", feature, "and Top 5 Predictors"),
  #        x = "Colony-Timepoint",
  #        y = "Expression") +
  #   theme_minimal() +
  #   theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  # Plot, faceted by colony
  p <- ggplot(expr_long, aes(x = Timepoint, y = Expression, color = Gene, group = Gene)) +
  geom_point(alpha = 0.7) +
  geom_smooth(se = FALSE, method = "loess", linewidth = 0.5) +
  facet_wrap(~ColonyID) +
  labs(title = paste("Expression of", feature, "and Top 5 Predictors by Colony"),
       x = "Timepoint",
       y = "Expression") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

  
  print(p)
}
```


Save top predictors and metrics of each well-predicted gene
```{r}
# Combine all sub-tibbles into a single data frame with the parent list name as an ID
Apul_top_predictors_df <- imap_dfr(Apul_top_predictors, ~ {
  .x
})

# View the resulting data frame
print(Apul_top_predictors_df)

# Optional: save to CSV
write.csv(Apul_top_predictors_df, "../output/26.1-rank35-optemization-ElasticNet-component24/Apul_top_predictors.csv", row.names = FALSE)

```


# P. evermanni
## Load and format data 

```{r}
### mRNA ###
# raw gene counts data (will filter and variance stabilize)
Peve_genes <- as.data.frame(read_csv("https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/Component_24_peve_count_matrix.csv"))

# format gene IDs as rownames (instead of a column)
rownames(Peve_genes) <- Peve_genes$gene_id
Peve_genes <- Peve_genes%>%select(!gene_id)

### miRNA ###
# raw miRNA counts (will filter and variance stabilize)
Peve_miRNA <- read.table(file = "https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/10.1-format-miRNA-counts-updated/Peve_miRNA_counts_formatted_updated.txt", header = TRUE, sep = "\t", check.names = FALSE)

# format miRNA IDs as rownames (instead of a column)
rownames(Peve_miRNA) <- Peve_miRNA$Name
Peve_miRNA <- Peve_miRNA%>%select(!Name)

### lncRNA ###
# raw lncRNA counts (will filter and variance stabilize)
Peve_lncRNA_full <- read.table("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/E-Peve/output/12-Peve-lncRNA-discovery/lncRNA_counts.clean.filtered.txt", header = TRUE, sep = "\t")

# Remove info on genomic location, set lncRNA IDs as rownames
rownames(Peve_lncRNA_full) <- Peve_lncRNA_full$Geneid
Peve_lncRNA <- Peve_lncRNA_full %>% select(-Geneid, -Chr, -Start, -End, -Strand, -Length)

# Format sample names to match standard (SPEC-Sample#-TP#)
colnames(Peve_lncRNA) <- gsub("\\.", "-", colnames(Peve_lncRNA))


### WGBS data ###
Peve_WGBS <- read.csv("https://gannet.fish.washington.edu/metacarcinus/E5/Pevermanni/20250821_meth_Peve/merged-WGBS-CpG-counts_filtered.csv")

# Format sample names to match standard (SPEC-Sample#-TP#)
colnames(Peve_WGBS) <- gsub("\\.", "-", colnames(Peve_WGBS))


### load and format metadata ###
metadata <- read_csv("../../M-multi-species/data/rna_metadata.csv")%>%select(AzentaSampleName, ColonyID, Timepoint) %>%
  filter(grepl("POR", ColonyID))
metadata$Sample <- paste0(metadata$ColonyID, "-", metadata$Timepoint)
rownames(metadata) <- metadata$Sample

colonies <- unique(metadata$ColonyID)
```

### Filter data sets 

#### WGBS

Only keep CpGs that have a non-zero value in all samples. 

```{r}
Peve_WGBS_filt <- Peve_WGBS %>% filter(if_all(-CpG, ~ .x > 0))

# Ensure it's formatted as a data frame
Peve_WGBS_filt <- as.data.frame(Peve_WGBS_filt)
# Set CpG IDs to rownames
rownames(Peve_WGBS_filt) <- Peve_WGBS_filt$CpG
Peve_WGBS_filt <- Peve_WGBS_filt %>% select(-CpG)

nrow(Peve_WGBS)
nrow(Peve_WGBS_filt)
```

We had 242,100 CpGs before filtering and have 29,465 after filtering. 

#### RNA


Remove samples which were excluded from sRNA (`POR-69-TP3`) and WGBS counts (`POR-73-TP2`)

```{r}
Peve_genes <- Peve_genes %>% select(-"POR-69-TP3", -"POR-73-TP2")
Peve_miRNA <- Peve_miRNA %>% select(-"POR-73-TP2")
Peve_lncRNA <- Peve_lncRNA %>% select(-"POR-69-TP3", -"POR-73-TP2")
Peve_WGBS <- Peve_WGBS %>% select(-"POR-69-TP3")
```

Ensure we only have genes, miRNA, and lncRNA that are present in at least one sample
```{r}
# genes
Peve_genes_red <- Peve_genes[rowSums(Peve_genes) != 0, ]
# miRNA
Peve_miRNA_red <- Peve_miRNA[rowSums(Peve_miRNA) != 0, ]
# lncRNA
Peve_lncRNA_red <- Peve_lncRNA[rowSums(Peve_lncRNA) != 0, ]

cat("Retained ", nrow(Peve_genes_red), " of ", nrow(Peve_genes), "genes; ",
       nrow(Peve_miRNA_red), " of ", nrow(Peve_miRNA), " miRNA; and ", 
       nrow(Peve_lncRNA_red), " of ", nrow(Peve_lncRNA), " lncRNA")
```

*pOverA*: 
Specifying the minimum count for a proportion of samples for each gene. Setting 3/40 = 0.08. This would retain genes that are only expressed in a single season in a couple of the colonies. Additionally, setting the minimum count so that the minimum number of samples must have a gene count above a certain threshold. 

genes:
```{r}
filt <- filterfun(pOverA(0.08, 5))

#create filter for the counts data
gfilt <- genefilter(Peve_genes_red, filt)

#identify genes to keep by count filter
gkeep <- Peve_genes_red[gfilt,]

#identify gene lists
gn.keep <- rownames(gkeep)

#gene count data filtered in PoverA, P percent of the samples have counts over A
Peve_genes_filt <- as.data.frame(Peve_genes_red[which(rownames(Peve_genes_red) %in% gn.keep),])

#How many rows do we have before and after filtering?
cat("Pre-filtering:", nrow(Peve_genes_red), "; Post-filtering:", nrow(Peve_genes_filt))
```

miRNA:
```{r}
mifilt <- filterfun(pOverA(0.08, 5))

#create filter for the counts data
mifilt <- genefilter(Peve_miRNA_red, mifilt)

#identify miRNA to keep by count filter
mikeep <- Peve_miRNA_red[mifilt,]

#identify miRNA to keep by count filter
mikeep <- Peve_miRNA_red[mifilt,]

#identify miRNA lists
mi.keep <- rownames(mikeep)

#miRNA count data filtered in PoverA, P percent of the samples have counts over A
Peve_miRNA_filt <- as.data.frame(Peve_miRNA_red[which(rownames(Peve_miRNA_red) %in% mi.keep),])

#How many rows do we have before and after filtering?
cat("Pre-filtering:", nrow(Peve_miRNA_red), "; Post-filtering:", nrow(Peve_miRNA_filt))
```

lncRNA:
```{r}
lncfilt <- filterfun(pOverA(0.08, 5))

#create filter for the counts data
lncfilt <- genefilter(Peve_lncRNA_red, lncfilt)

#identify lncRNA to keep by count filter
lnckeep <- Peve_lncRNA_red[lncfilt,]

#identify lncRNA to keep by count filter
lnckeep <- Peve_lncRNA_red[lncfilt,]

#identify lncRNA lists
lnc.keep <- rownames(lnckeep)

#lncRNA count data filtered in PoverA, P percent of the samples have counts over A
Peve_lncRNA_filt <- as.data.frame(Peve_lncRNA_red[which(rownames(Peve_lncRNA_red) %in% lnc.keep),])

#How many rows do we have before and after filtering?
cat("Pre-filtering:", nrow(Peve_lncRNA_red), "; Post-filtering:", nrow(Peve_lncRNA_filt))
```

### Transform data 

Remove `POR-69-TP3` and `POR-73-TP2` from metadata table
```{r}
metadata <- metadata[!(rownames(metadata) %in% c("POR-69-TP3", "POR-73-TP2")), ]
rownames(metadata) <- metadata$Sample
```

Set the order of genes, miRNA, lncRNA, wgbs, and metadata to all be the same. 
```{r}
# Ensure rownames of metadata are used as the desired column order
desired_order <- rownames(metadata)

# Reorder data frame columns
Peve_genes_filt <- Peve_genes_filt[, desired_order]
Peve_miRNA_filt <- Peve_miRNA_filt[, desired_order]
Peve_lncRNA_filt <- Peve_lncRNA_filt[, desired_order]
Peve_WGBS_filt <- Peve_WGBS_filt[, desired_order]

# Check they all match
identical(rownames(metadata), colnames(Peve_genes_filt))
identical(rownames(metadata), colnames(Peve_miRNA_filt))
identical(rownames(metadata), colnames(Peve_lncRNA_filt))
identical(rownames(metadata), colnames(Peve_WGBS_filt))
```

Use a variance stabilized transformation for all four data sets. Variance stabilization essentially tries to make variance independent of the mean

(Is this the most appropriate design to use?)

genes:
```{r}
dds_Peve <- DESeqDataSetFromMatrix(countData = Peve_genes_filt, 
                              colData = metadata, 
                              design = ~Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Peve_genes <- varianceStabilizingTransformation(dds_Peve, blind = TRUE)  # Must use varianceStabilizingTransformation() instead of vst() due to few input genes
vsd_Peve_genes <- assay(vsd_Peve_genes)
```

miRNA:

This is throwing an error: "Error in estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, :
every gene contains at least one zero, cannot compute log geometric means"

To avoid having excessive 0 values that interfere with geometric mean calculations, I'm going to add a pseudo-count of 1 to all counts. This is a simple workaround, but I should revisit later to determine whether its appropriate or I need to do a more in-depth fix.
```{r}
dds_miRNA <- DESeqDataSetFromMatrix(countData = Peve_miRNA_filt + 1, 
                              colData = metadata, 
                              design = ~Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Peve_miRNA <- varianceStabilizingTransformation(dds_miRNA, blind=TRUE) # Must use varianceStabilizingTransformation() instead of vst() due to few input genes
vsd_Peve_miRNA <- assay(vsd_Peve_miRNA)

```

lncRNA:
```{r}
# All values must be integers
Peve_lncRNA_filt <- Peve_lncRNA_filt %>% mutate(across(where(is.numeric), round))

dds_lncRNA <- DESeqDataSetFromMatrix(countData = Peve_lncRNA_filt, 
                              colData = metadata, 
                              design = ~Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Peve_lncRNA <- assay(vst(dds_lncRNA, blind = TRUE))
```

Also must round WGBS data to whole integers for normalization - need to return to this to decide if this is appropriate. 
```{r}
# All values must be integers
Peve_WGBS_filt<-Peve_WGBS_filt %>% mutate(across(where(is.numeric), round))

dds_wgbs <- DESeqDataSetFromMatrix(countData = Peve_WGBS_filt, 
                              colData = metadata, 
                              design = ~ Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Peve_WGBS <- assay(varianceStabilizingTransformation(dds_wgbs, blind = TRUE))
```

### Merge predictor features

Create a merged dataset that contains the variance stabilized counts for all miRNA, lncRNA, and CpGs.

```{r}
# Triple check that all three data frames have sample names in the same order
identical(colnames(vsd_Peve_lncRNA), colnames(vsd_Peve_miRNA))
identical(colnames(vsd_Peve_lncRNA), colnames(vsd_Peve_WGBS))

# Bind (stack dataframes vertically, so that they match by column/sample)
Peve_pred_counts <- rbind(vsd_Peve_lncRNA, vsd_Peve_miRNA, vsd_Peve_WGBS)

# Transform so that samples are on rows and features are in columns
Peve_pred_counts <- t(Peve_pred_counts)

dim(Peve_pred_counts)
```

### Format

```{r}
# Transform the counts matrices so that samples are on the rows and gene IDs on the columns
vsd_Peve_genes_t <- t(vsd_Peve_genes)

# Ensure both are formatted as data frames
Peve_pred_counts <- as.data.frame(Peve_pred_counts)
vsd_Peve_genes_t <- as.data.frame(vsd_Peve_genes_t)

# Ensure sample matching between gene and epigenetic dfs
common_samples <- intersect(rownames(vsd_Peve_genes_t), rownames(Peve_pred_counts))

Peve_pred_counts <- Peve_pred_counts[common_samples, ]
vsd_Peve_genes_t <- vsd_Peve_genes_t[common_samples,]
```


## Split training and test
## Bootstrapping

```{r, cache=TRUE}
n_replicates <- 10  # number of replicates
r2_threshold <- 0.5 # threshold for "good" prediction
Peve_TM_list <- list()
Peve_MP_list <- list()
Peve_FI_list <- list()

# Perform bootstrapping
for (i in 1:n_replicates) {
  cat("Beginning replicate", i)
  cat("\n")
  Peve_result <- train_models_split(vsd_Peve_genes_t, Peve_pred_counts)
  
  cat("Extracting results for replicate", i)
  cat("\n")
  Peve_trained_models <- Peve_result$models
  Peve_model_performance <- Peve_result$performance
  Peve_feature_importance <- get_feature_importance(Peve_trained_models)
  
  Peve_TM_list[[i]] <- Peve_trained_models
  Peve_TM_list[[i]]$Replicate <- i
  
  Peve_MP_list[[i]] <- Peve_model_performance
  Peve_MP_list[[i]]$Replicate <- i
  
  Peve_FI_list[[i]] <- Peve_feature_importance
  Peve_FI_list[[i]]$Replicate <- i
}

# Combine all results
Peve_all_TM <- do.call(rbind, Peve_TM_list)
Peve_all_MP <- do.call(rbind, Peve_MP_list)
Peve_all_FI <- do.call(rbind, Peve_FI_list)
```

```{r}
# Count how often each feature exceeds the R² threshold

Peve_MP_summary <- Peve_all_MP %>%
  group_by(Feature) %>%
  summarize(
    Mean_R2 = mean(R2, na.rm = TRUE),
    SD_R2 = sd(R2, na.rm = TRUE),
    SE_R2 = SD_R2/sqrt(n()),
    High_Perf_Count = sum(R2 >= r2_threshold),
    .groups = 'drop'
  ) %>%
  arrange(desc(High_Perf_Count))

# View top consistently high-performing features
print(Peve_MP_summary)

# How many genes are consistently well-predicted?
sum(!is.na(Peve_MP_summary$Feature) & Peve_MP_summary$Mean_R2 > 0.5, na.rm = TRUE)

```

Sort genes by average R^2 across all replicates, then plot R^2 for all replicates
```{r}
# Merge mean R2 info into full replicate-level data
Peve_all_MP <- merge(Peve_all_MP, Peve_MP_summary[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature")

# Reorder Feature factor by Mean_R2
Peve_all_MP$Feature <- factor(Peve_all_MP$Feature,
                                          levels = Peve_MP_summary$Feature[order(Peve_MP_summary$Mean_R2)])
```

```{r}
# Plot mean R² with error bars
# Choose SD to describe variability in model performance
ggplot(Peve_all_MP, aes(x = Feature, y = Mean_R2)) +
  geom_errorbar(aes(ymin = Mean_R2 - SD_R2, ymax = Mean_R2 + SD_R2), width = 0.4, color = "gray40", linewidth = 0.25) +
  geom_point(size = 2, color = "black") +
  geom_hline(yintercept = 0.5,
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Peve: Mean R² with Error Bars Across Gene Features",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "Mean R² ± SD") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=5)) +
  ylim(-0.2, 1.2) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1))

```

```{r}
# Plot with features sorted by mean R²
ggplot(Peve_all_MP, aes(x = Feature, y = R2)) +
  geom_point(aes(color = as.factor(Replicate)), size = 1) +
  geom_point(aes(y = Mean_R2), color = "black", shape = 18, size = 3) + 
  geom_hline(yintercept = mean(Peve_all_MP$R2, na.rm = TRUE),
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Peve: Model Performance Across Gene Expression",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "R² (Variance Explained)",
       color = "Replicate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 1)

```

Average importance of all predictive features and plot those with highest average importance accross all replicates
```{r}
# Average importance across replicates
Peve_mean_importance_df <- Peve_all_FI %>%
  group_by(Feature) %>%
  summarise(MeanImportance = mean(MeanImportance, na.rm = TRUE)) %>%
  arrange(desc(MeanImportance))

# Select top 50 features
Peve_top_features <- Peve_mean_importance_df %>% top_n(50, MeanImportance)

# Plot
ggplot(Peve_top_features, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip for readability
  theme_minimal() +
  labs(title = "Peve: Top Predictive Features by Mean Importance",
       x = "Feature",
       y = "Mean Importance")

```

## Boostrapping on predictable genes

Now I only want to model genes that are well-predicted over many replicates

Isolate well-predicted genes
```{r}
Peve_well_predicted <- Peve_MP_summary[Peve_MP_summary$Mean_R2 > 0.5,]

vsd_Peve_high_perf_t <- vsd_Peve_genes_t[,colnames(vsd_Peve_genes_t) %in% Peve_well_predicted$Feature]
dim(vsd_Peve_high_perf_t)
```

Of the original 100 genes selected from component 24, 29 are consistently well-predicted by our model (mean R^2 > 0.5, over 10 replicates).


Run bootstrapping on isolated genes
```{r, cache=TRUE}
n_replicates <- 25  # number of replicates
r2_threshold <- 0.5 # threshold for "good" prediction
Peve_TM_list <- list()
Peve_MP_list <- list()
Peve_FI_list <- list()

# Perform bootstrapping
for (i in 1:n_replicates) {
  cat("Beginning replicate", i)
  cat("\n")
  Peve_result <- train_models_split(vsd_Peve_high_perf_t, Peve_pred_counts)
  
  cat("Extracting results for replicate", i)
  cat("\n")
  Peve_trained_models <-Peve_result$models
  Peve_model_performance <- Peve_result$performance
  Peve_feature_importance <- get_feature_importance(Peve_trained_models)
  
  Peve_TM_list[[i]] <- Peve_trained_models
  Peve_TM_list[[i]]$Replicate <- i
  
  Peve_MP_list[[i]] <- Peve_model_performance
  Peve_MP_list[[i]]$Replicate <- i
  
  Peve_FI_list[[i]] <- Peve_feature_importance
  Peve_FI_list[[i]]$Replicate <- i
}

# Combine all results
Peve_all_TM_highperf <- do.call(rbind, Peve_TM_list)
Peve_all_MP_highperf <- do.call(rbind, Peve_MP_list)
Peve_all_FI_highperf <- do.call(rbind, Peve_FI_list)
```

```{r}
# Count how often each feature exceeds the R² threshold

Peve_MP_summary_highperf <- Peve_all_MP_highperf %>%
  group_by(Feature) %>%
  summarize(
    Mean_R2 = mean(R2, na.rm = TRUE),
    SD_R2 = sd(R2, na.rm = TRUE),
    SE_R2 = SD_R2 / sqrt(n()),
    High_Perf_Count = sum(R2 >= r2_threshold),
    .groups = 'drop'
  ) %>%
  arrange(desc(High_Perf_Count))

# View top consistently high-performing features
print(Peve_MP_summary_highperf)

# How many genes are consistently well-predicted?
nrow(Peve_MP_summary_highperf[Peve_MP_summary_highperf$Mean_R2 > 0.5,])
```

When I train the model on only these 29 well-predicted genes, 25 of them are again well-predicted. That's a much higher success rate! This suggests that epigenetic mechanisms may indeed be able to predict expression of these genes


Sort genes by average R^2 across all replicates, then plot R^2 for all replicates
```{r}
# Merge mean R2 info into full replicate-level data
Peve_all_MP_highperf <- merge(Peve_all_MP_highperf, Peve_MP_summary_highperf[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature")

# Reorder Feature factor by Mean_R2
Peve_all_MP_highperf$Feature <- factor(Peve_all_MP_highperf$Feature,
                                          levels = Peve_MP_summary_highperf$Feature[order(Peve_MP_summary_highperf$Mean_R2)])
```


```{r}

# Plot mean R² with error bars
# Choose SD to describe variability in model performance
ggplot(Peve_all_MP_highperf, aes(x = Feature, y = Mean_R2)) +
  geom_errorbar(aes(ymin = Mean_R2 - SD_R2, ymax = Mean_R2 + SD_R2), width = 0.3, color = "gray40") +
  geom_point(size = 2, color = "black") +
  geom_hline(yintercept = 0.5,
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Peve: Mean R² with Error Bars Across Gene Features",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "Mean R² ± SD") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(-0.2, 1.2) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1))

```

```{r}
# Plot all replicate R2s, with features sorted by mean R²
ggplot(Peve_all_MP_highperf, aes(x = Feature, y = R2)) +
  geom_point(aes(color = as.factor(Replicate)), size = 1.5) +
  geom_point(aes(y = Mean_R2), color = "black", shape = 18, size = 3) + 
  geom_hline(yintercept = mean(Peve_all_MP_highperf$R2, na.rm = TRUE),
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Peve: Model Performance Across Gene Expression",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "R² (Variance Explained)",
       color = "Replicate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 1)

```

```{r}
# Average importance across replicates
Peve_mean_importance_df_highperf <- Peve_all_FI_highperf %>%
  group_by(Feature) %>%
  summarise(MeanImportance = mean(MeanImportance, na.rm = TRUE)) %>%
  arrange(desc(MeanImportance))

# Select top 50 features
Peve_top_features_highperf <- Peve_mean_importance_df_highperf %>% top_n(50, MeanImportance)

# Plot
ggplot(Peve_top_features_highperf, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip for readability
  theme_minimal() +
  labs(title = "Peve: Top Predictive Features by Mean Importance",
       x = "Feature",
       y = "Mean Importance")

```

Look at most predictive epigenetic features for these well-predicted genes
```{r}
# Define color assignments for predictor types
color_palette <- c(miRNA = "#E69F00", lncRNA = "#0072B2", CpG = "#009E73")

#  Define well-predicted features
Peve_all_features_highR2 <- Peve_all_MP_highperf %>%
  filter(Mean_R2 > 0.5) %>%
  pull(Feature) %>%
  unique()

# Function to extract importance from a single model
get_feature_importance_for_feature <- function(model) {
  coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE]  # remove intercept
  data.frame(Predictor = rownames(coefs), Importance = abs(as.numeric(coefs)))
}

# Function to assign predictor type
get_predictor_type <- function(predictor_name) {
  if (startsWith(predictor_name, "Cluster")) return("miRNA")
  if (startsWith(predictor_name, "lncRNA")) return("lncRNA")
  if (startsWith(predictor_name, "CpG")) return("CpG")
  return("Other")
}

# Initialize list to save top predictors
Peve_top_predictors <- list()

# Loop over features, aggregate importance across replicates, and plot
for (target_feature in Peve_all_features_highR2) {
  
  # Extract and combine importance scores across all replicates
  importance_all_reps <- lapply(Peve_TM_list, function(rep_entry) {
    model <- rep_entry[[target_feature]]
    if (!is.null(model)) {
      get_feature_importance_for_feature(model)
    } else {
      NULL
    }
  }) %>% bind_rows()

  # If no importance data was found, skip
  if (nrow(importance_all_reps) == 0) next

  # Compute average importance across replicates
  mean_importance <- importance_all_reps %>%
    group_by(Predictor) %>%
    summarise(MeanImportance = mean(Importance, na.rm = TRUE), .groups = "drop") %>%
    arrange(desc(MeanImportance)) %>%
    slice_head(n = 20) %>%
    mutate(Type = sapply(Predictor, get_predictor_type))


  # Plot once per target feature
  plot <- ggplot(mean_importance, aes(x = reorder(Predictor, MeanImportance), y = MeanImportance, fill=Type)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = color_palette) +
    coord_flip() +
    theme_minimal() +
    labs(title = paste("Top Predictors for", target_feature),
         x = "Predictor",
         y = "Mean Importance (across replicates)")

  # Save top 20 most important predictors for later use
  top20 <- mean_importance %>% 
    mutate(Feature = target_feature)
  
  Peve_top_predictors[[target_feature]] <- top20
  
  print(plot)
}
```


## Plot expression of genes and their predictors

```{r, error=FALSE, warning=FALSE}
# Put predictor expression in format with samples in columns
Peve_pred_counts_t <- t(Peve_pred_counts)

# Loop over each feature in the list
for (feature in names(Peve_top_predictors)) {
  
  # Get top 5 predictors for this feature
  top_df <- Peve_top_predictors[[feature]] %>%
    slice_head(n = 5)
  predictors <- top_df$Predictor
  
  # Combine feature and predictors
  all_genes <- c(feature, predictors)
  
  # Check which genes are actually present
  present_genes <- all_genes[all_genes %in% rownames(Peve_pred_counts_t) | all_genes %in% rownames(vsd_Peve_genes)]
  
  if (!(feature %in% rownames(vsd_Peve_genes))) {
    warning(paste("Feature", feature, "not found in vsd_Peve_genes Skipping..."))
    next
  }

  # Extract predictor expression (only those present)
  predictor_expr <- Peve_pred_counts_t[intersect(predictors, rownames(Peve_pred_counts_t)), , drop = FALSE] %>% as.data.frame()

  # Add the feature expression
  feature_expr <- vsd_Peve_genes[feature, , drop = FALSE] %>% as.data.frame()
  
  # Combine into one data frame
  combined_expr <- rbind(predictor_expr, feature_expr)
  combined_expr$Gene <- rownames(combined_expr)
  
  # Convert to long format
  expr_long <- combined_expr %>%
    pivot_longer(-Gene, names_to = "Sample", values_to = "Expression")
  
  # Join with sample metadata
  expr_long <- expr_long %>%
    left_join(metadata, by = "Sample")
  
  # Plot, with colonies aggregated by timepoint
  # p <- ggplot(expr_long, aes(x = interaction(Timepoint), y = Expression, color = Gene, group = Gene)) +
  #   geom_point(alpha = 0.7) +
  #   geom_smooth(se = FALSE, method = "loess") +
  #   labs(title = paste("Expression of", feature, "and Top 5 Predictors"),
  #        x = "Colony-Timepoint",
  #        y = "Expression") +
  #   theme_minimal() +
  #   theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  # Plot, faceted by colony
  p <- ggplot(expr_long, aes(x = Timepoint, y = Expression, color = Gene, group = Gene)) +
  geom_point(alpha = 0.7) +
  geom_smooth(se = FALSE, method = "loess", linewidth = 0.5) +
  facet_wrap(~ColonyID) +
  labs(title = paste("Expression of", feature, "and Top 5 Predictors by Colony"),
       x = "Timepoint",
       y = "Expression") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

  
  print(p)
}
```


Save top predictors and metrics of each well-predicted gene
```{r}
# Combine all sub-tibbles into a single data frame with the parent list name as an ID
Peve_top_predictors_df <- imap_dfr(Peve_top_predictors, ~ {
  .x
})

# View the resulting data frame
print(Peve_top_predictors_df)

# Optional: save to CSV
write.csv(Peve_top_predictors_df, "../output/26.1-rank35-optemization-ElasticNet-component24/Peve_top_predictors.csv", row.names = FALSE)

```


# P. tuahiniensis
## Load and format data 

```{r}
### mRNA ###
# raw gene counts data (will filter and variance stabilize)
Ptuh_genes <- as.data.frame(read_csv("https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component/Component_24_ptua_count_matrix.csv"))

# format gene IDs as rownames (instead of a column)
rownames(Ptuh_genes) <- Ptuh_genes$gene_id
Ptuh_genes <- Ptuh_genes%>%select(!gene_id)

### miRNA ###
# raw miRNA counts (will filter and variance stabilize)
Ptuh_miRNA <- read.table(file = "https://github.com/urol-e5/timeseries_molecular/raw/refs/heads/main/M-multi-species/output/10.1-format-miRNA-counts-updated/Ptuh_miRNA_counts_formatted_updated.txt", header = TRUE, sep = "\t", check.names = FALSE)

# format miRNA IDs as rownames (instead of a column)
rownames(Ptuh_miRNA) <- Ptuh_miRNA$Name
Ptuh_miRNA <- Ptuh_miRNA%>%select(!Name)

### lncRNA ###
# raw lncRNA counts (will filter and variance stabilize)
Ptuh_lncRNA_full <- read.table("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/F-Ptua/output/06-Ptua-lncRNA-discovery/lncRNA_counts.clean.filtered.txt", header = TRUE, sep = "\t")

# Remove info on genomic location, set lncRNA IDs as rownames
rownames(Ptuh_lncRNA_full) <- Ptuh_lncRNA_full$Geneid
Ptuh_lncRNA <- Ptuh_lncRNA_full %>% select(-Geneid, -Chr, -Start, -End, -Strand, -Length)

# Format sample names to match standard (SPEC-Sample#-TP#)
colnames(Ptuh_lncRNA) <- gsub("\\.", "-", colnames(Ptuh_lncRNA))


### WGBS data ###
Ptuh_WGBS <- read.csv("https://gannet.fish.washington.edu/metacarcinus/E5/Ptuahiniensis/20250821_meth_Ptua/merged-WGBS-CpG-counts_filtered.csv")

# Format sample names to match standard (SPEC-Sample#-TP#)
colnames(Ptuh_WGBS) <- gsub("\\.", "-", colnames(Ptuh_WGBS))


### load and format metadata ###
metadata <- read_csv("../../M-multi-species/data/rna_metadata.csv")%>%select(AzentaSampleName, ColonyID, Timepoint) %>%
  filter(grepl("POC", ColonyID))
metadata$Sample <- paste0(metadata$ColonyID, "-", metadata$Timepoint)
rownames(metadata) <- metadata$Sample

colonies <- unique(metadata$ColonyID)
```

### Filter data sets 

#### WGBS

Only keep CpGs that have a non-zero value in all samples. 

```{r}
Ptuh_WGBS_filt <- Ptuh_WGBS %>% filter(if_all(-CpG, ~ .x > 0))

# Ensure it's formatted as a data frame
Ptuh_WGBS_filt <- as.data.frame(Ptuh_WGBS_filt)
# Set CpG IDs to rownames
rownames(Ptuh_WGBS_filt) <- Ptuh_WGBS_filt$CpG
Ptuh_WGBS_filt <- Ptuh_WGBS_filt %>% select(-CpG)

nrow(Ptuh_WGBS)
nrow(Ptuh_WGBS_filt)
```

We had 1,991,606 CpGs before filtering and have 58,872 after filtering. 

#### RNA

Remove the samples with poor WGBS data quality (`POC-201-TP3` `POC-222-TP2` `POC-255-TP4` `POC-259-TP3` `POC-259-TP4` `POC-42-TP1` `POC-42-TP3`), which was already excluded from the WGBS counts

```{r}
Ptuh_genes <- Ptuh_genes %>% select(-"POC-201-TP3", -"POC-222-TP2", -"POC-255-TP4", -"POC-259-TP3", -"POC-259-TP4", -"POC-42-TP1",  -"POC-42-TP3")
Ptuh_miRNA <- Ptuh_miRNA %>% select(-"POC-201-TP3", -"POC-222-TP2", -"POC-255-TP4", -"POC-259-TP3", -"POC-259-TP4", -"POC-42-TP1",  -"POC-42-TP3")
Ptuh_lncRNA <- Ptuh_lncRNA %>% select(-"POC-201-TP3", -"POC-222-TP2", -"POC-255-TP4", -"POC-259-TP3", -"POC-259-TP4", -"POC-42-TP1",  -"POC-42-TP3")
```

Ensure we only have genes, miRNA, and lncRNA that are present in at least one sample
```{r}
# genes
Ptuh_genes_red <- Ptuh_genes[rowSums(Ptuh_genes) != 0, ]
# miRNA
Ptuh_miRNA_red <- Ptuh_miRNA[rowSums(Ptuh_miRNA) != 0, ]
# lncRNA
Ptuh_lncRNA_red <- Ptuh_lncRNA[rowSums(Ptuh_lncRNA) != 0, ]

cat("Retained ", nrow(Ptuh_genes_red), " of ", nrow(Ptuh_genes), "genes; ",
       nrow(Ptuh_miRNA_red), " of ", nrow(Ptuh_miRNA), " miRNA; and ", 
       nrow(Ptuh_lncRNA_red), " of ", nrow(Ptuh_lncRNA), " lncRNA")
```

*pOverA*: 
Specifying the minimum count for a proportion of samples for each gene. Setting 3/40 = 0.08. This would retain genes that are only expressed in a single season in a couple of the colonies. Additionally, setting the minimum count so that the minimum number of samples must have a gene count above a certain threshold. 

genes:
```{r}
filt <- filterfun(pOverA(0.08, 5))

#create filter for the counts data
gfilt <- genefilter(Ptuh_genes_red, filt)

#identify genes to keep by count filter
gkeep <- Ptuh_genes_red[gfilt,]

#identify gene lists
gn.keep <- rownames(gkeep)

#gene count data filtered in PoverA, P percent of the samples have counts over A
Ptuh_genes_filt <- as.data.frame(Ptuh_genes_red[which(rownames(Ptuh_genes_red) %in% gn.keep),])

#How many rows do we have before and after filtering?
cat("Pre-filtering:", nrow(Ptuh_genes_red), "; Post-filtering:", nrow(Ptuh_genes_filt))
```

miRNA:
```{r}
mifilt <- filterfun(pOverA(0.08, 5))

#create filter for the counts data
mifilt <- genefilter(Ptuh_miRNA_red, mifilt)

#identify miRNA to keep by count filter
mikeep <- Ptuh_miRNA_red[mifilt,]

#identify miRNA to keep by count filter
mikeep <- Ptuh_miRNA_red[mifilt,]

#identify miRNA lists
mi.keep <- rownames(mikeep)

#miRNA count data filtered in PoverA, P percent of the samples have counts over A
Ptuh_miRNA_filt <- as.data.frame(Ptuh_miRNA_red[which(rownames(Ptuh_miRNA_red) %in% mi.keep),])

#How many rows do we have before and after filtering?
cat("Pre-filtering:", nrow(Ptuh_miRNA_red), "; Post-filtering:", nrow(Ptuh_miRNA_filt))
```

lncRNA:
```{r}
lncfilt <- filterfun(pOverA(0.08, 5))

#create filter for the counts data
lncfilt <- genefilter(Ptuh_lncRNA_red, lncfilt)

#identify lncRNA to keep by count filter
lnckeep <- Ptuh_lncRNA_red[lncfilt,]

#identify lncRNA to keep by count filter
lnckeep <- Ptuh_lncRNA_red[lncfilt,]

#identify lncRNA lists
lnc.keep <- rownames(lnckeep)

#lncRNA count data filtered in PoverA, P percent of the samples have counts over A
Ptuh_lncRNA_filt <- as.data.frame(Ptuh_lncRNA_red[which(rownames(Ptuh_lncRNA_red) %in% lnc.keep),])

#How many rows do we have before and after filtering?
cat("Pre-filtering:", nrow(Ptuh_lncRNA_red), "; Post-filtering:", nrow(Ptuh_lncRNA_filt))
```

### Transform data 

Remove samples from metadata table
```{r}
metadata <- metadata[!(rownames(metadata) %in% c("POC-201-TP3", "POC-222-TP2", "POC-255-TP4", "POC-259-TP3", "POC-259-TP4", "POC-42-TP1", "POC-42-TP3")), ]
rownames(metadata) <- metadata$Sample
```

Set the order of genes, miRNA, lncRNA, wgbs, and metadata to all be the same. 
```{r}
# Ensure rownames of metadata are used as the desired column order
desired_order <- rownames(metadata)

# Reorder data frame columns
Ptuh_genes_filt <- Ptuh_genes_filt[, desired_order]
Ptuh_miRNA_filt <- Ptuh_miRNA_filt[, desired_order]
Ptuh_lncRNA_filt <- Ptuh_lncRNA_filt[, desired_order]
Ptuh_WGBS_filt <- Ptuh_WGBS_filt[, desired_order]

# Check they all match
identical(rownames(metadata), colnames(Ptuh_genes_filt))
identical(rownames(metadata), colnames(Ptuh_miRNA_filt))
identical(rownames(metadata), colnames(Ptuh_lncRNA_filt))
identical(rownames(metadata), colnames(Ptuh_WGBS_filt))
```

Use a variance stabilized transformation for all four data sets. Variance stabilization essentially tries to make variance independent of the mean

(Is this the most appropriate design to use?)

genes:
```{r}
dds_Ptuh <- DESeqDataSetFromMatrix(countData = Ptuh_genes_filt, 
                              colData = metadata, 
                              design = ~Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Ptuh_genes <- varianceStabilizingTransformation(dds_Ptuh, blind = TRUE)  # Must use varianceStabilizingTransformation() instead of vst() due to few input genes
vsd_Ptuh_genes <- assay(vsd_Ptuh_genes)
```

miRNA:
```{r}
dds_miRNA <- DESeqDataSetFromMatrix(countData = Ptuh_miRNA_filt, 
                              colData = metadata, 
                              design = ~Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Ptuh_miRNA <- varianceStabilizingTransformation(dds_miRNA, blind=TRUE) # Must use varianceStabilizingTransformation() instead of vst() due to few input genes
vsd_Ptuh_miRNA <- assay(vsd_Ptuh_miRNA)

```

lncRNA:
```{r}
# All values must be integers
Ptuh_lncRNA_filt <- Ptuh_lncRNA_filt %>% mutate(across(where(is.numeric), round))

dds_lncRNA <- DESeqDataSetFromMatrix(countData = Ptuh_lncRNA_filt, 
                              colData = metadata, 
                              design = ~Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Ptuh_lncRNA <- assay(vst(dds_lncRNA, blind = TRUE))
```

Also must round WGBS data to whole integers for normalization - need to return to this to decide if this is appropriate. 
```{r}
# All values must be integers
Ptuh_WGBS_filt<-Ptuh_WGBS_filt %>% mutate(across(where(is.numeric), round))

dds_wgbs <- DESeqDataSetFromMatrix(countData = Ptuh_WGBS_filt, 
                              colData = metadata, 
                              design = ~ Timepoint+ColonyID)

# Variance Stabilizing Transformation
vsd_Ptuh_WGBS <- assay(varianceStabilizingTransformation(dds_wgbs, blind = TRUE))
```

### Merge predictor features

Create a merged dataset that contains the variance stabilized counts for all miRNA, lncRNA, and CpGs.

```{r}
# Triple check that all three data frames have sample names in the same order
identical(colnames(vsd_Ptuh_lncRNA), colnames(vsd_Ptuh_miRNA))
identical(colnames(vsd_Ptuh_lncRNA), colnames(vsd_Ptuh_WGBS))

# Bind (stack dataframes vertically, so that they match by column/sample)
Ptuh_pred_counts <- rbind(vsd_Ptuh_lncRNA, vsd_Ptuh_miRNA, vsd_Ptuh_WGBS)

# Transform so that samples are on rows and features are in columns
Ptuh_pred_counts <- t(Ptuh_pred_counts)

dim(Ptuh_pred_counts)
```

### Format

```{r}
# Transform the counts matrices so that samples are on the rows and gene IDs on the columns
vsd_Ptuh_genes_t <- t(vsd_Ptuh_genes)

# Ensure both are formatted as data frames
Ptuh_pred_counts <- as.data.frame(Ptuh_pred_counts)
vsd_Ptuh_genes_t <- as.data.frame(vsd_Ptuh_genes_t)

# Ensure sample matching between gene and epigenetic dfs
common_samples <- intersect(rownames(vsd_Ptuh_genes_t), rownames(Ptuh_pred_counts))

Ptuh_pred_counts <- Ptuh_pred_counts[common_samples, ]
vsd_Ptuh_genes_t <- vsd_Ptuh_genes_t[common_samples,]
```


## Split training and test
## Bootstrapping

Hitting a caching issue when running initial bootstrapping n=10, so decreasing to n=5 for now
```{r, cache=TRUE}
n_replicates <- 5  # number of replicates
r2_threshold <- 0.5 # threshold for "good" prediction
Ptuh_TM_list <- list()
Ptuh_MP_list <- list()
Ptuh_FI_list <- list()

# Perform bootstrapping
for (i in 1:n_replicates) {
  cat("Beginning replicate", i)
  cat("\n")
  Ptuh_result <- train_models_split(vsd_Ptuh_genes_t, Ptuh_pred_counts)
  
  cat("Extracting results for replicate", i)
  cat("\n")
  Ptuh_trained_models <- Ptuh_result$models
  Ptuh_model_performance <- Ptuh_result$performance
  Ptuh_feature_importance <- get_feature_importance(Ptuh_trained_models)
  
  Ptuh_TM_list[[i]] <- Ptuh_trained_models
  Ptuh_TM_list[[i]]$Replicate <- i
  
  Ptuh_MP_list[[i]] <- Ptuh_model_performance
  Ptuh_MP_list[[i]]$Replicate <- i
  
  Ptuh_FI_list[[i]] <- Ptuh_feature_importance
  Ptuh_FI_list[[i]]$Replicate <- i
}

# Combine all results
Ptuh_all_TM <- do.call(rbind, Ptuh_TM_list)
Ptuh_all_MP <- do.call(rbind, Ptuh_MP_list)
Ptuh_all_FI <- do.call(rbind, Ptuh_FI_list)
```

```{r}
# Count how often each feature exceeds the R² threshold

Ptuh_MP_summary <- Ptuh_all_MP %>%
  group_by(Feature) %>%
  summarize(
    Mean_R2 = mean(R2, na.rm = TRUE),
    SD_R2 = sd(R2, na.rm = TRUE),
    SE_R2 = SD_R2/sqrt(n()),
    High_Perf_Count = sum(R2 >= r2_threshold),
    .groups = 'drop'
  ) %>%
  arrange(desc(High_Perf_Count))

# View top consistently high-performing features
print(Ptuh_MP_summary)

# How many genes are consistently well-predicted?
sum(!is.na(Ptuh_MP_summary$Feature) & Ptuh_MP_summary$Mean_R2 > 0.5, na.rm = TRUE)

```

Sort genes by average R^2 across all replicates, then plot R^2 for all replicates
```{r}
# Merge mean R2 info into full replicate-level data
Ptuh_all_MP <- merge(Ptuh_all_MP, Ptuh_MP_summary[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature")

# Reorder Feature factor by Mean_R2
Ptuh_all_MP$Feature <- factor(Ptuh_all_MP$Feature,
                                          levels = Ptuh_MP_summary$Feature[order(Ptuh_MP_summary$Mean_R2)])
```

```{r}
# Plot mean R² with error bars
# Choose SD to describe variability in model performance
ggplot(Ptuh_all_MP, aes(x = Feature, y = Mean_R2)) +
  geom_errorbar(aes(ymin = Mean_R2 - SD_R2, ymax = Mean_R2 + SD_R2), width = 0.4, color = "gray40", linewidth = 0.25) +
  geom_point(size = 2, color = "black") +
  geom_hline(yintercept = 0.5,
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Ptuh: Mean R² with Error Bars Across Gene Features",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "Mean R² ± SD") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size=5)) +
  ylim(-0.2, 1.2) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1))

```

```{r}
# Plot with features sorted by mean R²
ggplot(Ptuh_all_MP, aes(x = Feature, y = R2)) +
  geom_point(aes(color = as.factor(Replicate)), size = 1) +
  geom_point(aes(y = Mean_R2), color = "black", shape = 18, size = 3) + 
  geom_hline(yintercept = mean(Ptuh_all_MP$R2, na.rm = TRUE),
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Ptuh: Model Performance Across Gene Expression",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "R² (Variance Explained)",
       color = "Replicate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 1)

```

Average importance of all predictive features and plot those with highest average importance accross all replicates
```{r}
# Average importance across replicates
Ptuh_mean_importance_df <- Ptuh_all_FI %>%
  group_by(Feature) %>%
  summarise(MeanImportance = mean(MeanImportance, na.rm = TRUE)) %>%
  arrange(desc(MeanImportance))

# Select top 50 features
Ptuh_top_features <- Ptuh_mean_importance_df %>% top_n(50, MeanImportance)

# Plot
ggplot(Ptuh_top_features, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip for readability
  theme_minimal() +
  labs(title = "Ptuh: Top Predictive Features by Mean Importance",
       x = "Feature",
       y = "Mean Importance")

```

## Boostrapping on predictable genes

Now I only want to model genes that are well-predicted over many replicates

Isolate well-predicted genes
```{r}
Ptuh_well_predicted <- Ptuh_MP_summary[Ptuh_MP_summary$Mean_R2 > 0.5,]

vsd_Ptuh_high_perf_t <- vsd_Ptuh_genes_t[,colnames(vsd_Ptuh_genes_t) %in% Ptuh_well_predicted$Feature]
dim(vsd_Ptuh_high_perf_t)
```

Of the original 100 genes selected from component 24, 30 are consistently well-predicted by our model (mean R^2 > 0.5, over 5 replicates).


Run bootstrapping on isolated genes
```{r, cache=TRUE}
n_replicates <- 10  # number of replicates
r2_threshold <- 0.5 # threshold for "good" prediction
Ptuh_TM_list <- list()
Ptuh_MP_list <- list()
Ptuh_FI_list <- list()

# Perform bootstrapping
for (i in 1:n_replicates) {
  cat("Beginning replicate", i)
  cat("\n")
  Ptuh_result <- train_models_split(vsd_Ptuh_high_perf_t, Ptuh_pred_counts)
  
  cat("Extracting results for replicate", i)
  cat("\n")
  Ptuh_trained_models <-Ptuh_result$models
  Ptuh_model_performance <- Ptuh_result$performance
  Ptuh_feature_importance <- get_feature_importance(Ptuh_trained_models)
  
  Ptuh_TM_list[[i]] <- Ptuh_trained_models
  Ptuh_TM_list[[i]]$Replicate <- i
  
  Ptuh_MP_list[[i]] <- Ptuh_model_performance
  Ptuh_MP_list[[i]]$Replicate <- i
  
  Ptuh_FI_list[[i]] <- Ptuh_feature_importance
  Ptuh_FI_list[[i]]$Replicate <- i
}

# Combine all results
Ptuh_all_TM_highperf <- do.call(rbind, Ptuh_TM_list)
Ptuh_all_MP_highperf <- do.call(rbind, Ptuh_MP_list)
Ptuh_all_FI_highperf <- do.call(rbind, Ptuh_FI_list)
```

```{r}
# Count how often each feature exceeds the R² threshold

Ptuh_MP_summary_highperf <- Ptuh_all_MP_highperf %>%
  group_by(Feature) %>%
  summarize(
    Mean_R2 = mean(R2, na.rm = TRUE),
    SD_R2 = sd(R2, na.rm = TRUE),
    SE_R2 = SD_R2 / sqrt(n()),
    High_Perf_Count = sum(R2 >= r2_threshold),
    .groups = 'drop'
  ) %>%
  arrange(desc(High_Perf_Count))

# View top consistently high-performing features
print(Ptuh_MP_summary_highperf)

# How many genes are consistently well-predicted?
nrow(Ptuh_MP_summary_highperf[Ptuh_MP_summary_highperf$Mean_R2 > 0.5,])
```

When I train the model on only these 20 well-predicted genes, 25 of them are again well-predicted. That's a much higher success rate! This suggests that epigenetic mechanisms may indeed be able to predict expression of these genes


Sort genes by average R^2 across all replicates, then plot R^2 for all replicates
```{r}
# Merge mean R2 info into full replicate-level data
Ptuh_all_MP_highperf <- merge(Ptuh_all_MP_highperf, Ptuh_MP_summary_highperf[, c("Feature", "Mean_R2", "SD_R2")], by = "Feature")

# Reorder Feature factor by Mean_R2
Ptuh_all_MP_highperf$Feature <- factor(Ptuh_all_MP_highperf$Feature,
                                          levels = Ptuh_MP_summary_highperf$Feature[order(Ptuh_MP_summary_highperf$Mean_R2)])
```


```{r}

# Plot mean R² with error bars
# Choose SD to describe variability in model performance
ggplot(Ptuh_all_MP_highperf, aes(x = Feature, y = Mean_R2)) +
  geom_errorbar(aes(ymin = Mean_R2 - SD_R2, ymax = Mean_R2 + SD_R2), width = 0.3, color = "gray40") +
  geom_point(size = 2, color = "black") +
  geom_hline(yintercept = 0.5,
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Ptuh: Mean R² with Error Bars Across Gene Features",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "Mean R² ± SD") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(-0.2, 1.2) +
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1))

```

```{r}
# Plot all replicate R2s, with features sorted by mean R²
ggplot(Ptuh_all_MP_highperf, aes(x = Feature, y = R2)) +
  geom_point(aes(color = as.factor(Replicate)), size = 1.5) +
  geom_point(aes(y = Mean_R2), color = "black", shape = 18, size = 3) + 
  geom_hline(yintercept = mean(Ptuh_all_MP_highperf$R2, na.rm = TRUE),
             linetype = "dashed", color = "blue") +
  theme_minimal() +
  labs(title = "Ptuh: Model Performance Across Gene Expression",
       x = "Gene Expression Feature (Ordered by Mean R²)",
       y = "R² (Variance Explained)",
       color = "Replicate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 1)

```

```{r}
# Average importance across replicates
Ptuh_mean_importance_df_highperf <- Ptuh_all_FI_highperf %>%
  group_by(Feature) %>%
  summarise(MeanImportance = mean(MeanImportance, na.rm = TRUE)) %>%
  arrange(desc(MeanImportance))

# Select top 50 features
Ptuh_top_features_highperf <- Ptuh_mean_importance_df_highperf %>% top_n(50, MeanImportance)

# Plot
ggplot(Ptuh_top_features_highperf, aes(x = reorder(Feature, MeanImportance), y = MeanImportance)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip for readability
  theme_minimal() +
  labs(title = "Ptuh: Top Predictive Features by Mean Importance",
       x = "Feature",
       y = "Mean Importance")

```

Look at most predictive epigenetic features for these well-predicted genes
```{r}
# Define color assignments for predictor types
color_palette <- c(miRNA = "#E69F00", lncRNA = "#0072B2", CpG = "#009E73")

#  Define well-predicted features
Ptuh_all_features_highR2 <- Ptuh_all_MP_highperf %>%
  filter(Mean_R2 > 0.5) %>%
  pull(Feature) %>%
  unique()

# Function to extract importance from a single model
get_feature_importance_for_feature <- function(model) {
  coefs <- as.matrix(coef(model, s = "lambda.min"))[-1, , drop = FALSE]  # remove intercept
  data.frame(Predictor = rownames(coefs), Importance = abs(as.numeric(coefs)))
}

# Function to assign predictor type
get_predictor_type <- function(predictor_name) {
  if (startsWith(predictor_name, "Cluster")) return("miRNA")
  if (startsWith(predictor_name, "lncRNA")) return("lncRNA")
  if (startsWith(predictor_name, "CpG")) return("CpG")
  return("Other")
}

# Initialize list to save top predictors
Ptuh_top_predictors <- list()

# Loop over features, aggregate importance across replicates, and plot
for (target_feature in Ptuh_all_features_highR2) {
  
  # Extract and combine importance scores across all replicates
  importance_all_reps <- lapply(Ptuh_TM_list, function(rep_entry) {
    model <- rep_entry[[target_feature]]
    if (!is.null(model)) {
      get_feature_importance_for_feature(model)
    } else {
      NULL
    }
  }) %>% bind_rows()

  # If no importance data was found, skip
  if (nrow(importance_all_reps) == 0) next

  # Compute average importance across replicates
  mean_importance <- importance_all_reps %>%
    group_by(Predictor) %>%
    summarise(MeanImportance = mean(Importance, na.rm = TRUE), .groups = "drop") %>%
    arrange(desc(MeanImportance)) %>%
    slice_head(n = 20) %>%
    mutate(Type = sapply(Predictor, get_predictor_type))


  # Plot once per target feature
  plot <- ggplot(mean_importance, aes(x = reorder(Predictor, MeanImportance), y = MeanImportance, fill=Type)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = color_palette) +
    coord_flip() +
    theme_minimal() +
    labs(title = paste("Top Predictors for", target_feature),
         x = "Predictor",
         y = "Mean Importance (across replicates)")

  # Save top 20 most important predictors for later use
  top20 <- mean_importance %>% 
    mutate(Feature = target_feature)
  
  Ptuh_top_predictors[[target_feature]] <- top20
  
  print(plot)
}
```


## Plot expression of genes and their predictors

```{r, error=FALSE, warning=FALSE}
# Put predictor expression in format with samples in columns
Ptuh_pred_counts_t <- t(Ptuh_pred_counts)

# Loop over each feature in the list
for (feature in names(Ptuh_top_predictors)) {
  
  # Get top 5 predictors for this feature
  top_df <- Ptuh_top_predictors[[feature]] %>%
    slice_head(n = 5)
  predictors <- top_df$Predictor
  
  # Combine feature and predictors
  all_genes <- c(feature, predictors)
  
  # Check which genes are actually present
  present_genes <- all_genes[all_genes %in% rownames(Ptuh_pred_counts_t) | all_genes %in% rownames(vsd_Ptuh_genes)]
  
  if (!(feature %in% rownames(vsd_Ptuh_genes))) {
    warning(paste("Feature", feature, "not found in vsd_Ptuh_genes Skipping..."))
    next
  }

  # Extract predictor expression (only those present)
  predictor_expr <- Ptuh_pred_counts_t[intersect(predictors, rownames(Ptuh_pred_counts_t)), , drop = FALSE] %>% as.data.frame()

  # Add the feature expression
  feature_expr <- vsd_Ptuh_genes[feature, , drop = FALSE] %>% as.data.frame()
  
  # Combine into one data frame
  combined_expr <- rbind(predictor_expr, feature_expr)
  combined_expr$Gene <- rownames(combined_expr)
  
  # Convert to long format
  expr_long <- combined_expr %>%
    pivot_longer(-Gene, names_to = "Sample", values_to = "Expression")
  
  # Join with sample metadata
  expr_long <- expr_long %>%
    left_join(metadata, by = "Sample")
  
  # Plot, with colonies aggregated by timepoint
  # p <- ggplot(expr_long, aes(x = interaction(Timepoint), y = Expression, color = Gene, group = Gene)) +
  #   geom_point(alpha = 0.7) +
  #   geom_smooth(se = FALSE, method = "loess") +
  #   labs(title = paste("Expression of", feature, "and Top 5 Predictors"),
  #        x = "Colony-Timepoint",
  #        y = "Expression") +
  #   theme_minimal() +
  #   theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  # Plot, faceted by colony
  p <- ggplot(expr_long, aes(x = Timepoint, y = Expression, color = Gene, group = Gene)) +
  geom_point(alpha = 0.7) +
  geom_smooth(se = FALSE, method = "loess", linewidth = 0.5) +
  facet_wrap(~ColonyID) +
  labs(title = paste("Expression of", feature, "and Top 5 Predictors by Colony"),
       x = "Timepoint",
       y = "Expression") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

  
  print(p)
}
```


Save top predictors and metrics of each well-predicted gene
```{r}
# Combine all sub-tibbles into a single data frame with the parent list name as an ID
Ptuh_top_predictors_df <- imap_dfr(Ptuh_top_predictors, ~ {
  .x
})

# View the resulting data frame
print(Ptuh_top_predictors_df)

# Optional: save to CSV
write.csv(Ptuh_top_predictors_df, "../output/26.1-rank35-optemization-ElasticNet-component24/Ptuh_top_predictors.csv", row.names = FALSE)

```