---
title: "32-R35c05-regulation"
author: "Steven Roberts"
date: "`r format(Sys.time(), '%Y-%m-%d')`"
output: 
  github_document:
    toc: true
    number_sections: true
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
    code_folding: show
---

This analysis identifies genes with high scores in Rank 05 from the barnacle tensor decomposition and investigates which epigenetic features (lncRNA, gene body methylation, miRNA) may be regulating their expression across species.

# Setup

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, cache = TRUE)
```

```{r load-libraries}
library(tidyverse)
library(ggplot2)
library(reshape2)
library(corrplot)
library(pheatmap)
library(RColorBrewer)
library(scales)
library(knitr)
library(viridis)
```

# Load Gene Factors from Barnacle Decomposition

Load the gene_factors.csv from the rank 35 optimization output.

```{r load-gene-factors}
# Path to barnacle factors
gene_factors_path <- "../output/27-rank35-optimization/lambda_gene_0.2/barnacle_factors/gene_factors.csv"

# Read gene factors
gene_factors <- read.csv(gene_factors_path, row.names = 1)

# Fix column names - columns 0-34 correspond to ranks 01-35
colnames(gene_factors) <- paste0("Rank_", sprintf("%02d", 1:35))

# View structure
dim(gene_factors)
head(gene_factors[, 1:10])
```

# Identify Genes with Scores > 0 in Rank 05

```{r rank05-genes}
# Get rank 05 scores (5th column)
rank05_scores <- gene_factors$Rank_05

# Create dataframe with gene IDs and rank 05 scores
rank05_df <- data.frame(
  Gene = rownames(gene_factors),
  Rank_05_Score = rank05_scores
) %>%
  filter(Rank_05_Score > 0) %>%
  arrange(desc(Rank_05_Score))

# Summary statistics
cat("Total genes in dataset:", nrow(gene_factors), "\n")
cat("Genes with Rank 05 score > 0:", nrow(rank05_df), "\n")
cat("\nTop 20 genes by Rank 05 score:\n")
print(head(rank05_df, 20))
```

# Plot Rank 05 Genes Across All 35 Ranks

## Prepare data for visualization

```{r prepare-heatmap-data}
# Filter gene_factors to only genes with positive rank 05 scores
rank05_genes <- rank05_df$Gene
gene_factors_filtered <- gene_factors[rank05_genes, ]

# For visualization, take top 100 genes by rank 05 score
top100_genes <- rank05_df$Gene[1:min(100, nrow(rank05_df))]
gene_factors_top100 <- gene_factors[top100_genes, ]

cat("Number of genes for heatmap:", nrow(gene_factors_top100), "\n")
```

## Heatmap of gene factor scores across ranks

```{r heatmap-rank05-genes, fig.width=12, fig.height=14}
# Create heatmap of top 100 genes across all ranks
pheatmap(
  as.matrix(gene_factors_top100),
  cluster_rows = TRUE,
  cluster_cols = FALSE,
  scale = "row",
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100),
  main = "Top 100 Rank 05 Genes: Factor Scores Across 35 Ranks",
  fontsize_row = 5,
  fontsize_col = 8,
  show_rownames = TRUE
)
```

## Line plot showing gene trajectories across ranks

```{r lineplot-trajectories, fig.width=14, fig.height=8}
# Convert to long format for ggplot
gene_factors_long <- gene_factors_top100 %>%
  rownames_to_column("Gene") %>%
  pivot_longer(
    cols = starts_with("Rank_"),
    names_to = "Rank",
    values_to = "Score"
  ) %>%
  mutate(
    Rank_Num = as.numeric(gsub("Rank_", "", Rank))
  )

# Line plot
ggplot(gene_factors_long, aes(x = Rank_Num, y = Score, group = Gene)) +
  geom_line(alpha = 0.3, color = "steelblue") +
  geom_point(data = gene_factors_long %>% filter(Rank_Num == 5), 
             aes(x = Rank_Num, y = Score), 
             color = "red", size = 2, alpha = 0.5) +
  labs(
    title = "Gene Factor Scores Across 35 Ranks",
    subtitle = "Top 100 genes by Rank 05 score (red points indicate Rank 05)",
    x = "Rank",
    y = "Factor Score"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_x_continuous(breaks = 1:35)
```

## Distribution of Rank 05 scores

```{r rank05-distribution, fig.width=10, fig.height=5}
ggplot(rank05_df, aes(x = Rank_05_Score)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "white", alpha = 0.7) +
  geom_vline(xintercept = quantile(rank05_df$Rank_05_Score, 0.9), 
             color = "red", linetype = "dashed", size = 1) +
  labs(
    title = "Distribution of Rank 05 Scores (Genes with Score > 0)",
    subtitle = "Red dashed line indicates 90th percentile",
    x = "Rank 05 Score",
    y = "Count"
  ) +
  theme_minimal()
```

# Load Expression Data

Load gene expression matrices for all three species.

```{r load-expression-data}
# Gene Expression Matrices from GitHub
apul_expr_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/apul_biomin_counts.csv"
peve_expr_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/peve_biomin_counts.csv"
ptua_expr_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/ptua_biomin_counts.csv"

# Try to load expression data - these URLs may or may not be available
tryCatch({
  apul_expr <- read.csv(apul_expr_url, row.names = 1)
  cat("Loaded Apul expression:", dim(apul_expr)[1], "genes x", dim(apul_expr)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Apul expression data. URL may not be accessible.\n")
  apul_expr <- NULL
})

tryCatch({
  peve_expr <- read.csv(peve_expr_url, row.names = 1)
  cat("Loaded Peve expression:", dim(peve_expr)[1], "genes x", dim(peve_expr)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Peve expression data. URL may not be accessible.\n")
  peve_expr <- NULL
})

tryCatch({
  ptua_expr <- read.csv(ptua_expr_url, row.names = 1)
  cat("Loaded Ptua expression:", dim(ptua_expr)[1], "genes x", dim(ptua_expr)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Ptua expression data. URL may not be accessible.\n")
  ptua_expr <- NULL
})
```

# Load Epigenetic Features

## Long non-coding RNA (lncRNA) counts

```{r load-lncrna}
# lncRNA URLs
apul_lncrna_url <- "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/D-Apul/output/31.5-Apul-lncRNA-discovery/lncRNA_counts.clean.filtered.txt"
peve_lncrna_url <- "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/E-Peve/output/12-Peve-lncRNA-discovery/lncRNA_counts.clean.filtered.txt"
ptua_lncrna_url <- "https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/F-Ptua/output/06-Ptua-lncRNA-discovery/lncRNA_counts.clean.filtered.txt"

# Load lncRNA data
tryCatch({
  apul_lncrna <- read.delim(apul_lncrna_url, row.names = 1)
  cat("Loaded Apul lncRNA:", dim(apul_lncrna)[1], "lncRNAs x", dim(apul_lncrna)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Apul lncRNA data:", conditionMessage(e), "\n")
  apul_lncrna <<- NULL
})

tryCatch({
  peve_lncrna <- read.delim(peve_lncrna_url, row.names = 1)
  cat("Loaded Peve lncRNA:", dim(peve_lncrna)[1], "lncRNAs x", dim(peve_lncrna)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Peve lncRNA data:", conditionMessage(e), "\n")
  peve_lncrna <<- NULL
})

tryCatch({
  ptua_lncrna <- read.delim(ptua_lncrna_url, row.names = 1)
  cat("Loaded Ptua lncRNA:", dim(ptua_lncrna)[1], "lncRNAs x", dim(ptua_lncrna)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Ptua lncRNA data:", conditionMessage(e), "\n")
  ptua_lncrna <<- NULL
})
```

## Gene Body Methylation

```{r load-methylation}
# Gene body methylation URLs
apul_meth_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/D-Apul/output/40-Apul-Gene-Methylation/Apul-gene-methylation_75pct.tsv"
peve_meth_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/E-Peve/output/15-Peve-Gene-Methylation/Peve-gene-methylation_75pct.tsv"
ptua_meth_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/F-Ptua/output/09-Ptua-Gene-Methylation/Ptua-gene-methylation_75pct.tsv"

# Load methylation data
tryCatch({
  apul_meth <- read.delim(apul_meth_url, row.names = 1)
  cat("Loaded Apul methylation:", dim(apul_meth)[1], "genes x", dim(apul_meth)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Apul methylation data:", conditionMessage(e), "\n")
  apul_meth <<- NULL
})

tryCatch({
  peve_meth <- read.delim(peve_meth_url, row.names = 1)
  cat("Loaded Peve methylation:", dim(peve_meth)[1], "genes x", dim(peve_meth)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Peve methylation data:", conditionMessage(e), "\n")
  peve_meth <<- NULL
})

tryCatch({
  ptua_meth <- read.delim(ptua_meth_url, row.names = 1)
  cat("Loaded Ptua methylation:", dim(ptua_meth)[1], "genes x", dim(ptua_meth)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Ptua methylation data:", conditionMessage(e), "\n")
  ptua_meth <<- NULL
})
```

## miRNA counts

```{r load-mirna}
# miRNA URLs
apul_mirna_url <- "https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/M-multi-species/output/10-format-miRNA-counts/Apul_miRNA_counts_formatted.txt"
peve_mirna_url <- "https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/M-multi-species/output/10-format-miRNA-counts/Peve_miRNA_counts_formatted.txt"
ptua_mirna_url <- "https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/M-multi-species/output/10-format-miRNA-counts/Ptuh_miRNA_counts_formatted.txt"

# Load miRNA data
tryCatch({
  apul_mirna <- read.delim(apul_mirna_url, row.names = 1)
  cat("Loaded Apul miRNA:", dim(apul_mirna)[1], "miRNAs x", dim(apul_mirna)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Apul miRNA data:", conditionMessage(e), "\n")
  apul_mirna <<- NULL
})

tryCatch({
  peve_mirna <- read.delim(peve_mirna_url, row.names = 1)
  cat("Loaded Peve miRNA:", dim(peve_mirna)[1], "miRNAs x", dim(peve_mirna)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Peve miRNA data:", conditionMessage(e), "\n")
  peve_mirna <<- NULL
})

tryCatch({
  ptua_mirna <- read.delim(ptua_mirna_url, row.names = 1)
  cat("Loaded Ptua miRNA:", dim(ptua_mirna)[1], "miRNAs x", dim(ptua_mirna)[2], "samples\n")
}, error = function(e) {
  cat("Could not load Ptua miRNA data:", conditionMessage(e), "\n")
  ptua_mirna <<- NULL
})
```

# Load Sample Mapping

```{r load-sample-mapping}
# Load sample mapping to link ortholog groups to species-specific genes
sample_mapping <- read.csv("../output/27-rank35-optimization/lambda_gene_0.2/barnacle_factors/sample_mapping.csv")
print(head(sample_mapping, 15))
```

# Correlation Analysis: Expression vs Epigenetic Features

## Helper function for correlation analysis

```{r correlation-functions}
# Function to calculate correlations between expression and epigenetic features
calculate_correlations <- function(expr_data, epi_data, expr_name, epi_name, species) {
  
  if (is.null(expr_data) || is.null(epi_data)) {
    return(NULL)
  }
  
  # Find common samples
  common_samples <- intersect(colnames(expr_data), colnames(epi_data))
  
  if (length(common_samples) < 3) {
    cat("Insufficient common samples for", species, expr_name, "vs", epi_name, "\n")
    return(NULL)
  }
  
  cat("Analyzing", species, ":", length(common_samples), "common samples for", 
      expr_name, "vs", epi_name, "\n")
  
  # Subset to common samples
  expr_subset <- expr_data[, common_samples, drop = FALSE]
  epi_subset <- epi_data[, common_samples, drop = FALSE]
  
  # Calculate pairwise correlations
  correlations <- list()
  
  for (gene in rownames(expr_subset)) {
    gene_expr <- as.numeric(expr_subset[gene, ])
    
    for (epi_feature in rownames(epi_subset)) {
      epi_values <- as.numeric(epi_subset[epi_feature, ])
      
      # Calculate correlation if there's variation
      if (sd(gene_expr, na.rm = TRUE) > 0 && sd(epi_values, na.rm = TRUE) > 0) {
        cor_result <- tryCatch({
          cor.test(gene_expr, epi_values, method = "spearman", use = "complete.obs")
        }, error = function(e) NULL)
        
        if (!is.null(cor_result)) {
          correlations[[length(correlations) + 1]] <- data.frame(
            Species = species,
            Gene = gene,
            EpigeneticFeature = epi_feature,
            FeatureType = epi_name,
            Correlation = cor_result$estimate,
            PValue = cor_result$p.value
          )
        }
      }
    }
  }
  
  if (length(correlations) > 0) {
    result <- bind_rows(correlations)
    # Adjust p-values for multiple testing
    result$AdjPValue <- p.adjust(result$PValue, method = "BH")
    return(result)
  }
  
  return(NULL)
}
```

## Run correlation analyses for each species

```{r run-correlations}
all_correlations <- list()

# Apul correlations
if (exists("apul_expr") && !is.null(apul_expr)) {
  if (exists("apul_lncrna") && !is.null(apul_lncrna)) {
    cor_apul_lncrna <- calculate_correlations(apul_expr, apul_lncrna, "Expression", "lncRNA", "Apul")
    if (!is.null(cor_apul_lncrna)) all_correlations[["apul_lncrna"]] <- cor_apul_lncrna
  }
  if (exists("apul_meth") && !is.null(apul_meth)) {
    cor_apul_meth <- calculate_correlations(apul_expr, apul_meth, "Expression", "Methylation", "Apul")
    if (!is.null(cor_apul_meth)) all_correlations[["apul_meth"]] <- cor_apul_meth
  }
  if (exists("apul_mirna") && !is.null(apul_mirna)) {
    cor_apul_mirna <- calculate_correlations(apul_expr, apul_mirna, "Expression", "miRNA", "Apul")
    if (!is.null(cor_apul_mirna)) all_correlations[["apul_mirna"]] <- cor_apul_mirna
  }
}

# Peve correlations
if (exists("peve_expr") && !is.null(peve_expr)) {
  if (exists("peve_lncrna") && !is.null(peve_lncrna)) {
    cor_peve_lncrna <- calculate_correlations(peve_expr, peve_lncrna, "Expression", "lncRNA", "Peve")
    if (!is.null(cor_peve_lncrna)) all_correlations[["peve_lncrna"]] <- cor_peve_lncrna
  }
  if (exists("peve_meth") && !is.null(peve_meth)) {
    cor_peve_meth <- calculate_correlations(peve_expr, peve_meth, "Expression", "Methylation", "Peve")
    if (!is.null(cor_peve_meth)) all_correlations[["peve_meth"]] <- cor_peve_meth
  }
  if (exists("peve_mirna") && !is.null(peve_mirna)) {
    cor_peve_mirna <- calculate_correlations(peve_expr, peve_mirna, "Expression", "miRNA", "Peve")
    if (!is.null(cor_peve_mirna)) all_correlations[["peve_mirna"]] <- cor_peve_mirna
  }
}

# Ptua correlations
if (exists("ptua_expr") && !is.null(ptua_expr)) {
  if (exists("ptua_lncrna") && !is.null(ptua_lncrna)) {
    cor_ptua_lncrna <- calculate_correlations(ptua_expr, ptua_lncrna, "Expression", "lncRNA", "Ptua")
    if (!is.null(cor_ptua_lncrna)) all_correlations[["ptua_lncrna"]] <- cor_ptua_lncrna
  }
  if (exists("ptua_meth") && !is.null(ptua_meth)) {
    cor_ptua_meth <- calculate_correlations(ptua_expr, ptua_meth, "Expression", "Methylation", "Ptua")
    if (!is.null(cor_ptua_meth)) all_correlations[["ptua_meth"]] <- cor_ptua_meth
  }
  if (exists("ptua_mirna") && !is.null(ptua_mirna)) {
    cor_ptua_mirna <- calculate_correlations(ptua_expr, ptua_mirna, "Expression", "miRNA", "Ptua")
    if (!is.null(cor_ptua_mirna)) all_correlations[["ptua_mirna"]] <- cor_ptua_mirna
  }
}

# Combine all correlations
if (length(all_correlations) > 0) {
  combined_correlations <- bind_rows(all_correlations)
  cat("\nTotal correlations calculated:", nrow(combined_correlations), "\n")
} else {
  combined_correlations <- NULL
  cat("\nNo correlations could be calculated - data may not be available\n")
}
```

## Summary Table: Significant Correlations by Species and Feature Type

```{r correlation-summary-table}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  # Filter for significant correlations (adjusted p-value < 0.05)
  sig_correlations <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  cat("Total significant correlations (adj. p < 0.05):", nrow(sig_correlations), "\n\n")
  
  # Summary by species and feature type
  correlation_summary <- sig_correlations %>%
    group_by(Species, FeatureType) %>%
    summarize(
      N_Significant = n(),
      N_Positive = sum(Correlation > 0),
      N_Negative = sum(Correlation < 0),
      Mean_AbsCorr = mean(abs(Correlation)),
      Max_AbsCorr = max(abs(Correlation)),
      N_Genes = n_distinct(Gene),
      N_Features = n_distinct(EpigeneticFeature),
      .groups = "drop"
    ) %>%
    arrange(Species, FeatureType)
  
  cat("Summary of Significant Correlations by Species and Feature Type:\n")
  print(knitr::kable(correlation_summary, digits = 3))
  
} else {
  cat("No correlation data available for summary.\n")
}
```

## Top Correlated Epigenetic Features per Species

```{r top-correlated-features}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  # Get top 20 most significant correlations per species
  top_correlations <- combined_correlations %>%
    filter(AdjPValue < 0.05) %>%
    group_by(Species) %>%
    slice_min(order_by = AdjPValue, n = 20) %>%
    ungroup() %>%
    select(Species, Gene, EpigeneticFeature, FeatureType, Correlation, AdjPValue) %>%
    arrange(Species, AdjPValue)
  
  cat("Top 20 Most Significant Correlations per Species:\n\n")
  print(knitr::kable(top_correlations, digits = 4))
  
}
```

## Visualization: Correlation Distribution by Species and Feature Type

```{r correlation-distribution-plot, fig.width=14, fig.height=8}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  # Filter significant correlations
  sig_corr <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  if (nrow(sig_corr) > 0) {
    # Violin plot of correlations by species and feature type
    p1 <- ggplot(sig_corr, aes(x = FeatureType, y = Correlation, fill = Species)) +
      geom_violin(alpha = 0.7, position = position_dodge(0.8)) +
      geom_boxplot(width = 0.2, position = position_dodge(0.8), alpha = 0.8) +
      geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
      scale_fill_brewer(palette = "Set2") +
      labs(
        title = "Distribution of Significant Correlations",
        subtitle = "Expression vs Epigenetic Features (adj. p < 0.05)",
        x = "Epigenetic Feature Type",
        y = "Spearman Correlation"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1)
      )
    
    print(p1)
  }
}
```

## Visualization: Number of Correlated Features per Gene

```{r features-per-gene-plot, fig.width=12, fig.height=8}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  sig_corr <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  if (nrow(sig_corr) > 0) {
    # Count features per gene per species
    features_per_gene <- sig_corr %>%
      group_by(Species, Gene, FeatureType) %>%
      summarize(N_Features = n(), .groups = "drop") %>%
      pivot_wider(names_from = FeatureType, values_from = N_Features, values_fill = 0)
    
    # Stacked bar plot
    features_summary <- sig_corr %>%
      group_by(Species, FeatureType) %>%
      summarize(
        N_Correlations = n(),
        N_Genes = n_distinct(Gene),
        .groups = "drop"
      )
    
    p2 <- ggplot(features_summary, aes(x = Species, y = N_Correlations, fill = FeatureType)) +
      geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
      geom_text(aes(label = N_Correlations), position = position_dodge(0.9), vjust = -0.5, size = 3) +
      scale_fill_brewer(palette = "Dark2") +
      labs(
        title = "Number of Significant Correlations by Species",
        subtitle = "Expression-Epigenetic Feature Pairs (adj. p < 0.05)",
        x = "Species",
        y = "Number of Significant Correlations",
        fill = "Feature Type"
      ) +
      theme_minimal() +
      theme(plot.title = element_text(size = 14, face = "bold"))
    
    print(p2)
  }
}
```

## Heatmap: Top Epigenetic Features Correlated with Expression

```{r top-features-heatmap, fig.width=14, fig.height=10}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  sig_corr <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  if (nrow(sig_corr) > 0) {
    # Get top 50 most frequently correlated epigenetic features
    top_features <- sig_corr %>%
      group_by(EpigeneticFeature, FeatureType) %>%
      summarize(
        N_Genes = n_distinct(Gene),
        N_Species = n_distinct(Species),
        Mean_Correlation = mean(Correlation),
        .groups = "drop"
      ) %>%
      arrange(desc(N_Species), desc(N_Genes)) %>%
      head(50)
    
    cat("\nTop 50 Epigenetic Features by Number of Correlated Genes:\n")
    print(knitr::kable(head(top_features, 30), digits = 3))
    
    # Create heatmap data
    heatmap_data <- sig_corr %>%
      filter(EpigeneticFeature %in% top_features$EpigeneticFeature) %>%
      group_by(Species, EpigeneticFeature, FeatureType) %>%
      summarize(
        Mean_Correlation = mean(Correlation),
        N_Genes = n(),
        .groups = "drop"
      )
    
    # Wide format for heatmap
    heatmap_wide <- heatmap_data %>%
      select(Species, EpigeneticFeature, Mean_Correlation) %>%
      pivot_wider(names_from = Species, values_from = Mean_Correlation, values_fill = 0)
    
    if (nrow(heatmap_wide) > 1) {
      # Convert to matrix
      heatmap_mat <- as.matrix(heatmap_wide[, -1])
      rownames(heatmap_mat) <- heatmap_wide$EpigeneticFeature
      
      # Create annotation for feature type
      feature_types <- heatmap_data %>%
        select(EpigeneticFeature, FeatureType) %>%
        distinct()
      
      row_annotation <- data.frame(
        FeatureType = feature_types$FeatureType[match(rownames(heatmap_mat), feature_types$EpigeneticFeature)]
      )
      rownames(row_annotation) <- rownames(heatmap_mat)
      
      # Heatmap
      pheatmap(
        heatmap_mat,
        cluster_rows = TRUE,
        cluster_cols = FALSE,
        color = colorRampPalette(c("blue", "white", "red"))(100),
        breaks = seq(-1, 1, length.out = 101),
        annotation_row = row_annotation,
        main = "Mean Correlation of Top Epigenetic Features Across Species",
        fontsize_row = 6,
        fontsize_col = 10
      )
    }
  }
}
```

# Cross-Species Epigenetic Regulation Analysis

## Identify Shared Regulatory Patterns

This section identifies epigenetic features that show consistent correlation patterns with gene expression across multiple species - these represent potential conserved regulatory mechanisms.

```{r cross-species-analysis}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  sig_corr <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  if (nrow(sig_corr) > 0) {
    # Find genes with significant correlations in multiple species
    gene_species_count <- sig_corr %>%
      group_by(Gene) %>%
      summarize(
        N_Species = n_distinct(Species),
        Species_List = paste(unique(Species), collapse = ", "),
        N_Features = n_distinct(EpigeneticFeature),
        Feature_Types = paste(unique(FeatureType), collapse = ", "),
        .groups = "drop"
      ) %>%
      filter(N_Species >= 2) %>%
      arrange(desc(N_Species), desc(N_Features))
    
    cat("\nGenes with Epigenetic Correlations in Multiple Species:\n")
    print(knitr::kable(head(gene_species_count, 30)))
  }
}
```

## Table: Conserved Epigenetic Features Across Species

```{r conserved-features-table}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  sig_corr <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  if (nrow(sig_corr) > 0) {
    # Find epigenetic features correlated with same genes across species
    # Group by gene and feature type to find cross-species patterns
    cross_species_features <- sig_corr %>%
      group_by(Gene, FeatureType) %>%
      summarize(
        N_Species = n_distinct(Species),
        Species = paste(sort(unique(Species)), collapse = ", "),
        Features = paste(unique(EpigeneticFeature), collapse = "; "),
        Mean_Correlation = mean(Correlation),
        Correlation_Direction = ifelse(mean(Correlation) > 0, "Positive", "Negative"),
        .groups = "drop"
      ) %>%
      filter(N_Species >= 2) %>%
      arrange(desc(N_Species), FeatureType, Gene)
    
    cat("\nConserved Epigenetic Regulation Patterns (Same Gene, Multiple Species):\n\n")
    
    if (nrow(cross_species_features) > 0) {
      # Summary by feature type
      conserved_summary <- cross_species_features %>%
        group_by(FeatureType, N_Species) %>%
        summarize(
          N_Genes = n(),
          N_Positive = sum(Correlation_Direction == "Positive"),
          N_Negative = sum(Correlation_Direction == "Negative"),
          .groups = "drop"
        )
      
      cat("Summary of Conserved Patterns by Feature Type:\n")
      print(knitr::kable(conserved_summary))
      
      cat("\n\nDetailed Conserved Patterns (showing first 50):\n")
      print(knitr::kable(head(cross_species_features, 50), digits = 3))
    } else {
      cat("No conserved patterns found across species.\n")
    }
  }
}
```

## Visualization: Cross-Species Regulatory Network

```{r cross-species-network, fig.width=14, fig.height=12}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  sig_corr <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  if (nrow(sig_corr) > 0) {
    # Create summary for visualization
    species_feature_summary <- sig_corr %>%
      group_by(FeatureType, Species) %>%
      summarize(
        N_Correlations = n(),
        N_Genes = n_distinct(Gene),
        N_Features = n_distinct(EpigeneticFeature),
        Mean_AbsCorr = mean(abs(Correlation)),
        .groups = "drop"
      )
    
    # Grouped bar chart
    p3 <- ggplot(species_feature_summary, 
                 aes(x = Species, y = N_Genes, fill = FeatureType)) +
      geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
      geom_text(aes(label = N_Genes), 
                position = position_dodge(0.9), 
                vjust = -0.5, size = 3) +
      scale_fill_brewer(palette = "Set1") +
      labs(
        title = "Number of Genes with Significant Epigenetic Correlations",
        subtitle = "By Species and Epigenetic Feature Type",
        x = "Species",
        y = "Number of Genes",
        fill = "Feature Type"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold"),
        legend.position = "bottom"
      )
    
    print(p3)
    
    # Faceted scatter plot of correlations
    p4 <- ggplot(sig_corr %>% sample_n(min(5000, n())), 
                 aes(x = Correlation, y = -log10(AdjPValue), color = Species)) +
      geom_point(alpha = 0.5, size = 1) +
      facet_wrap(~FeatureType, scales = "free_y") +
      geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
      scale_color_brewer(palette = "Set2") +
      labs(
        title = "Volcano Plot: Expression-Epigenetic Correlations",
        subtitle = "Significant correlations by feature type (sample of up to 5000 points)",
        x = "Spearman Correlation",
        y = "-log10(Adjusted P-value)"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold"),
        strip.text = element_text(face = "bold")
      )
    
    print(p4)
  }
}
```

## Species Comparison: Correlation Strength by Feature Type

```{r species-comparison-plot, fig.width=12, fig.height=8}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  sig_corr <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  if (nrow(sig_corr) > 0) {
    # Box plot comparing correlation strengths
    p5 <- ggplot(sig_corr, aes(x = Species, y = abs(Correlation), fill = FeatureType)) +
      geom_boxplot(alpha = 0.7, outlier.size = 0.5) +
      scale_fill_brewer(palette = "Pastel1") +
      labs(
        title = "Comparison of Correlation Strengths Across Species",
        subtitle = "Absolute value of significant correlations (adj. p < 0.05)",
        x = "Species",
        y = "Absolute Correlation",
        fill = "Feature Type"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold"),
        legend.position = "bottom"
      )
    
    print(p5)
  }
}
```

## Summary Table: Epigenetic Features Regulating Genes in All Three Species

```{r all-three-species-table}
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  sig_corr <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  if (nrow(sig_corr) > 0) {
    # Find genes regulated by same feature type in all 3 species
    genes_all_species <- sig_corr %>%
      group_by(Gene, FeatureType) %>%
      filter(n_distinct(Species) == 3) %>%
      summarize(
        Species = paste(sort(unique(Species)), collapse = ", "),
        Correlations = paste(sprintf("%.3f", Correlation), collapse = ", "),
        Mean_Correlation = mean(Correlation),
        All_Positive = all(Correlation > 0),
        All_Negative = all(Correlation < 0),
        Direction = case_when(
          all(Correlation > 0) ~ "Consistently Positive",
          all(Correlation < 0) ~ "Consistently Negative",
          TRUE ~ "Mixed"
        ),
        .groups = "drop"
      ) %>%
      arrange(FeatureType, desc(abs(Mean_Correlation)))
    
    if (nrow(genes_all_species) > 0) {
      cat("\n==========================================\n")
      cat("GENES WITH EPIGENETIC CORRELATIONS IN ALL THREE SPECIES\n")
      cat("==========================================\n\n")
      
      cat("These genes show significant correlation between expression and\n")
      cat("epigenetic features in Apul, Peve, AND Ptua - potential conserved regulation.\n\n")
      
      print(knitr::kable(genes_all_species, digits = 3))
      
      # Count by direction
      direction_summary <- genes_all_species %>%
        group_by(FeatureType, Direction) %>%
        summarize(N_Genes = n(), .groups = "drop")
      
      cat("\n\nSummary by Feature Type and Direction:\n")
      print(knitr::kable(direction_summary))
      
    } else {
      cat("No genes found with significant correlations in all three species.\n")
    }
  }
}
```

# Identify Cross-Species Regulatory Patterns

## Map ortholog groups to species-specific genes

```{r map-orthologs}
# The top 100 genes from rank 05 are ortholog groups (OG_XXXXX)
# We need to map these to species-specific genes to analyze epigenetic regulation

# Load ortholog annotation if available
ortho_annot_path <- "../output/12-ortho-annot/ortholog_groups_annotated.csv"

if (file.exists(ortho_annot_path)) {
  ortho_annot <- read.csv(ortho_annot_path)
  cat("Loaded ortholog annotations:", nrow(ortho_annot), "entries\n")
  print(head(ortho_annot))
} else {
  cat("Ortholog annotation file not found. Manual mapping may be required.\n")
}
```

## Summary of top Rank 05 genes

```{r top-genes-summary}
# Create summary table of top 100 rank 05 genes with their scores across key ranks
top_genes_summary <- gene_factors_top100 %>%
  rownames_to_column("Gene") %>%
  mutate(
    Rank_05 = Rank_05,
    Max_Score = apply(gene_factors_top100, 1, max),
    Max_Rank = apply(gene_factors_top100, 1, which.max)
  ) %>%
  select(Gene, Rank_05, Max_Score, Max_Rank, everything()) %>%
  arrange(desc(Rank_05))

cat("\nTop 20 genes from Rank 05 with their peak ranks:\n")
print(head(top_genes_summary[, c("Gene", "Rank_05", "Max_Score", "Max_Rank")], 20))
```

## Identify genes with consistent patterns

```{r consistent-patterns, fig.width=12, fig.height=8}
# Find genes that have high scores in rank 05 AND other ranks
# These may represent core regulatory modules

# Calculate summary statistics for each gene
gene_stats <- gene_factors_top100 %>%
  rownames_to_column("Gene") %>%
  pivot_longer(
    cols = starts_with("Rank_"),
    names_to = "Rank",
    values_to = "Score"
  ) %>%
  group_by(Gene) %>%
  summarize(
    Mean_Score = mean(Score),
    SD_Score = sd(Score),
    N_HighRanks = sum(Score > 0.5),
    Rank_05_Score = Score[Rank == "Rank_05"],
    .groups = "drop"
  ) %>%
  arrange(desc(N_HighRanks), desc(Mean_Score))

cat("\nGenes with highest mean scores across ranks:\n")
print(head(gene_stats, 20))

# Visualize gene variability
ggplot(gene_stats, aes(x = Mean_Score, y = SD_Score, color = N_HighRanks)) +
  geom_point(alpha = 0.7, size = 3) +
  scale_color_viridis_c() +
  labs(
    title = "Gene Score Variability Across Ranks",
    subtitle = "Top 100 Rank 05 genes",
    x = "Mean Score Across All Ranks",
    y = "Standard Deviation",
    color = "# Ranks\n> 0.5"
  ) +
  theme_minimal()
```

# Export Results

```{r export-results}
# Create output directory
output_dir <- "../output/32-R35c05-regulation"
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

# Export top 100 rank 05 genes
write.csv(rank05_df, 
          file.path(output_dir, "rank05_genes_positive.csv"), 
          row.names = FALSE)

# Export gene factor scores for top 100
write.csv(gene_factors_top100,
          file.path(output_dir, "top100_rank05_gene_factors.csv"))

# Export gene statistics
write.csv(gene_stats,
          file.path(output_dir, "top100_gene_statistics.csv"),
          row.names = FALSE)

# Export correlation results if available
if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) {
  
  # All correlations
  write.csv(combined_correlations,
            file.path(output_dir, "all_correlations.csv"),
            row.names = FALSE)
  
  # Significant correlations only
  sig_correlations <- combined_correlations %>%
    filter(AdjPValue < 0.05)
  
  write.csv(sig_correlations,
            file.path(output_dir, "significant_correlations.csv"),
            row.names = FALSE)
  
  # Cross-species conserved patterns
  cross_species_conserved <- sig_correlations %>%
    group_by(Gene, FeatureType) %>%
    filter(n_distinct(Species) >= 2) %>%
    summarize(
      N_Species = n_distinct(Species),
      Species = paste(sort(unique(Species)), collapse = ", "),
      Features = paste(unique(EpigeneticFeature), collapse = "; "),
      Mean_Correlation = mean(Correlation),
      Min_AdjPValue = min(AdjPValue),
      .groups = "drop"
    ) %>%
    arrange(desc(N_Species), FeatureType)
  
  write.csv(cross_species_conserved,
            file.path(output_dir, "cross_species_conserved_patterns.csv"),
            row.names = FALSE)
  
  # Summary statistics by species and feature type
  correlation_summary_export <- sig_correlations %>%
    group_by(Species, FeatureType) %>%
    summarize(
      N_Significant_Correlations = n(),
      N_Positive = sum(Correlation > 0),
      N_Negative = sum(Correlation < 0),
      N_Genes = n_distinct(Gene),
      N_EpigeneticFeatures = n_distinct(EpigeneticFeature),
      Mean_Correlation = mean(Correlation),
      Mean_AbsCorrelation = mean(abs(Correlation)),
      .groups = "drop"
    )
  
  write.csv(correlation_summary_export,
            file.path(output_dir, "correlation_summary_by_species_feature.csv"),
            row.names = FALSE)
  
  # Top correlated features per gene
  top_features_per_gene <- sig_correlations %>%
    group_by(Species, Gene) %>%
    slice_min(order_by = AdjPValue, n = 5) %>%
    ungroup()
  
  write.csv(top_features_per_gene,
            file.path(output_dir, "top_correlated_features_per_gene.csv"),
            row.names = FALSE)
  
  cat("Correlation results exported:\n")
  cat("  - all_correlations.csv\n")
  cat("  - significant_correlations.csv\n")
  cat("  - cross_species_conserved_patterns.csv\n")
  cat("  - correlation_summary_by_species_feature.csv\n")
  cat("  - top_correlated_features_per_gene.csv\n")
}

cat("\nAll results exported to:", output_dir, "\n")
```

# Session Info

```{r session-info}
sessionInfo()
```