--- title: "32-R35c05-regulation" author: "Steven Roberts" date: "`r format(Sys.time(), '%Y-%m-%d')`" output: github_document: toc: true number_sections: true html_document: theme: cosmo toc: true toc_float: true number_sections: true code_folding: show --- This analysis identifies genes with high scores in Rank 05 from the barnacle tensor decomposition and investigates which epigenetic features (lncRNA, gene body methylation, miRNA) may be regulating their expression across species. # Setup ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, cache = TRUE) ``` ```{r load-libraries} library(tidyverse) library(ggplot2) library(reshape2) library(corrplot) library(pheatmap) library(RColorBrewer) library(scales) library(knitr) library(viridis) ``` # Load Gene Factors from Barnacle Decomposition Load the gene_factors.csv from the rank 35 optimization output. ```{r load-gene-factors} # Path to barnacle factors gene_factors_path <- "../output/27-rank35-optimization/lambda_gene_0.2/barnacle_factors/gene_factors.csv" # Read gene factors gene_factors <- read.csv(gene_factors_path, row.names = 1) # Fix column names - columns 0-34 correspond to ranks 01-35 colnames(gene_factors) <- paste0("Rank_", sprintf("%02d", 1:35)) # View structure dim(gene_factors) head(gene_factors[, 1:10]) ``` # Identify Genes with Scores > 0 in Rank 05 ```{r rank05-genes} # Get rank 05 scores (5th column) rank05_scores <- gene_factors$Rank_05 # Create dataframe with gene IDs and rank 05 scores rank05_df <- data.frame( Gene = rownames(gene_factors), Rank_05_Score = rank05_scores ) %>% filter(Rank_05_Score > 0) %>% arrange(desc(Rank_05_Score)) # Summary statistics cat("Total genes in dataset:", nrow(gene_factors), "\n") cat("Genes with Rank 05 score > 0:", nrow(rank05_df), "\n") cat("\nTop 20 genes by Rank 05 score:\n") print(head(rank05_df, 20)) ``` # Plot Rank 05 Genes Across All 35 Ranks ## Prepare data for visualization ```{r prepare-heatmap-data} # Filter gene_factors to only genes with positive rank 05 scores rank05_genes <- rank05_df$Gene gene_factors_filtered <- gene_factors[rank05_genes, ] # For visualization, take top 100 genes by rank 05 score top100_genes <- rank05_df$Gene[1:min(100, nrow(rank05_df))] gene_factors_top100 <- gene_factors[top100_genes, ] cat("Number of genes for heatmap:", nrow(gene_factors_top100), "\n") ``` ## Heatmap of gene factor scores across ranks ```{r heatmap-rank05-genes, fig.width=12, fig.height=14} # Create heatmap of top 100 genes across all ranks pheatmap( as.matrix(gene_factors_top100), cluster_rows = TRUE, cluster_cols = FALSE, scale = "row", color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100), main = "Top 100 Rank 05 Genes: Factor Scores Across 35 Ranks", fontsize_row = 5, fontsize_col = 8, show_rownames = TRUE ) ``` ## Line plot showing gene trajectories across ranks ```{r lineplot-trajectories, fig.width=14, fig.height=8} # Convert to long format for ggplot gene_factors_long <- gene_factors_top100 %>% rownames_to_column("Gene") %>% pivot_longer( cols = starts_with("Rank_"), names_to = "Rank", values_to = "Score" ) %>% mutate( Rank_Num = as.numeric(gsub("Rank_", "", Rank)) ) # Line plot ggplot(gene_factors_long, aes(x = Rank_Num, y = Score, group = Gene)) + geom_line(alpha = 0.3, color = "steelblue") + geom_point(data = gene_factors_long %>% filter(Rank_Num == 5), aes(x = Rank_Num, y = Score), color = "red", size = 2, alpha = 0.5) + labs( title = "Gene Factor Scores Across 35 Ranks", subtitle = "Top 100 genes by Rank 05 score (red points indicate Rank 05)", x = "Rank", y = "Factor Score" ) + theme_minimal() + theme( plot.title = element_text(size = 14, face = "bold"), axis.text.x = element_text(angle = 45, hjust = 1) ) + scale_x_continuous(breaks = 1:35) ``` ## Distribution of Rank 05 scores ```{r rank05-distribution, fig.width=10, fig.height=5} ggplot(rank05_df, aes(x = Rank_05_Score)) + geom_histogram(bins = 50, fill = "steelblue", color = "white", alpha = 0.7) + geom_vline(xintercept = quantile(rank05_df$Rank_05_Score, 0.9), color = "red", linetype = "dashed", size = 1) + labs( title = "Distribution of Rank 05 Scores (Genes with Score > 0)", subtitle = "Red dashed line indicates 90th percentile", x = "Rank 05 Score", y = "Count" ) + theme_minimal() ``` # Load Expression Data Load gene expression matrices for all three species. ```{r load-expression-data} # Gene Expression Matrices from GitHub apul_expr_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/apul_biomin_counts.csv" peve_expr_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/peve_biomin_counts.csv" ptua_expr_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/M-multi-species/output/33-biomin-pathway-counts/ptua_biomin_counts.csv" # Try to load expression data - these URLs may or may not be available tryCatch({ apul_expr <- read.csv(apul_expr_url, row.names = 1) cat("Loaded Apul expression:", dim(apul_expr)[1], "genes x", dim(apul_expr)[2], "samples\n") }, error = function(e) { cat("Could not load Apul expression data. URL may not be accessible.\n") apul_expr <- NULL }) tryCatch({ peve_expr <- read.csv(peve_expr_url, row.names = 1) cat("Loaded Peve expression:", dim(peve_expr)[1], "genes x", dim(peve_expr)[2], "samples\n") }, error = function(e) { cat("Could not load Peve expression data. URL may not be accessible.\n") peve_expr <- NULL }) tryCatch({ ptua_expr <- read.csv(ptua_expr_url, row.names = 1) cat("Loaded Ptua expression:", dim(ptua_expr)[1], "genes x", dim(ptua_expr)[2], "samples\n") }, error = function(e) { cat("Could not load Ptua expression data. URL may not be accessible.\n") ptua_expr <- NULL }) ``` # Load Epigenetic Features ## Long non-coding RNA (lncRNA) counts ```{r load-lncrna} # lncRNA URLs apul_lncrna_url <- "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/D-Apul/output/31.5-Apul-lncRNA-discovery/lncRNA_counts.clean.filtered.txt" peve_lncrna_url <- "https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/timeseries_molecular/E-Peve/output/12-Peve-lncRNA-discovery/lncRNA_counts.clean.filtered.txt" ptua_lncrna_url <- "https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/F-Ptua/output/06-Ptua-lncRNA-discovery/lncRNA_counts.clean.filtered.txt" # Load lncRNA data tryCatch({ apul_lncrna <- read.delim(apul_lncrna_url, row.names = 1) cat("Loaded Apul lncRNA:", dim(apul_lncrna)[1], "lncRNAs x", dim(apul_lncrna)[2], "samples\n") }, error = function(e) { cat("Could not load Apul lncRNA data:", conditionMessage(e), "\n") apul_lncrna <<- NULL }) tryCatch({ peve_lncrna <- read.delim(peve_lncrna_url, row.names = 1) cat("Loaded Peve lncRNA:", dim(peve_lncrna)[1], "lncRNAs x", dim(peve_lncrna)[2], "samples\n") }, error = function(e) { cat("Could not load Peve lncRNA data:", conditionMessage(e), "\n") peve_lncrna <<- NULL }) tryCatch({ ptua_lncrna <- read.delim(ptua_lncrna_url, row.names = 1) cat("Loaded Ptua lncRNA:", dim(ptua_lncrna)[1], "lncRNAs x", dim(ptua_lncrna)[2], "samples\n") }, error = function(e) { cat("Could not load Ptua lncRNA data:", conditionMessage(e), "\n") ptua_lncrna <<- NULL }) ``` ## Gene Body Methylation ```{r load-methylation} # Gene body methylation URLs apul_meth_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/D-Apul/output/40-Apul-Gene-Methylation/Apul-gene-methylation_75pct.tsv" peve_meth_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/E-Peve/output/15-Peve-Gene-Methylation/Peve-gene-methylation_75pct.tsv" ptua_meth_url <- "https://raw.githubusercontent.com/urol-e5/timeseries-molecular-calcification/refs/heads/main/F-Ptua/output/09-Ptua-Gene-Methylation/Ptua-gene-methylation_75pct.tsv" # Load methylation data tryCatch({ apul_meth <- read.delim(apul_meth_url, row.names = 1) cat("Loaded Apul methylation:", dim(apul_meth)[1], "genes x", dim(apul_meth)[2], "samples\n") }, error = function(e) { cat("Could not load Apul methylation data:", conditionMessage(e), "\n") apul_meth <<- NULL }) tryCatch({ peve_meth <- read.delim(peve_meth_url, row.names = 1) cat("Loaded Peve methylation:", dim(peve_meth)[1], "genes x", dim(peve_meth)[2], "samples\n") }, error = function(e) { cat("Could not load Peve methylation data:", conditionMessage(e), "\n") peve_meth <<- NULL }) tryCatch({ ptua_meth <- read.delim(ptua_meth_url, row.names = 1) cat("Loaded Ptua methylation:", dim(ptua_meth)[1], "genes x", dim(ptua_meth)[2], "samples\n") }, error = function(e) { cat("Could not load Ptua methylation data:", conditionMessage(e), "\n") ptua_meth <<- NULL }) ``` ## miRNA counts ```{r load-mirna} # miRNA URLs apul_mirna_url <- "https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/M-multi-species/output/10-format-miRNA-counts/Apul_miRNA_counts_formatted.txt" peve_mirna_url <- "https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/M-multi-species/output/10-format-miRNA-counts/Peve_miRNA_counts_formatted.txt" ptua_mirna_url <- "https://raw.githubusercontent.com/urol-e5/timeseries_molecular/refs/heads/main/M-multi-species/output/10-format-miRNA-counts/Ptuh_miRNA_counts_formatted.txt" # Load miRNA data tryCatch({ apul_mirna <- read.delim(apul_mirna_url, row.names = 1) cat("Loaded Apul miRNA:", dim(apul_mirna)[1], "miRNAs x", dim(apul_mirna)[2], "samples\n") }, error = function(e) { cat("Could not load Apul miRNA data:", conditionMessage(e), "\n") apul_mirna <<- NULL }) tryCatch({ peve_mirna <- read.delim(peve_mirna_url, row.names = 1) cat("Loaded Peve miRNA:", dim(peve_mirna)[1], "miRNAs x", dim(peve_mirna)[2], "samples\n") }, error = function(e) { cat("Could not load Peve miRNA data:", conditionMessage(e), "\n") peve_mirna <<- NULL }) tryCatch({ ptua_mirna <- read.delim(ptua_mirna_url, row.names = 1) cat("Loaded Ptua miRNA:", dim(ptua_mirna)[1], "miRNAs x", dim(ptua_mirna)[2], "samples\n") }, error = function(e) { cat("Could not load Ptua miRNA data:", conditionMessage(e), "\n") ptua_mirna <<- NULL }) ``` # Load Sample Mapping ```{r load-sample-mapping} # Load sample mapping to link ortholog groups to species-specific genes sample_mapping <- read.csv("../output/27-rank35-optimization/lambda_gene_0.2/barnacle_factors/sample_mapping.csv") print(head(sample_mapping, 15)) ``` # Correlation Analysis: Expression vs Epigenetic Features ## Helper function for correlation analysis ```{r correlation-functions} # Function to calculate correlations between expression and epigenetic features calculate_correlations <- function(expr_data, epi_data, expr_name, epi_name, species) { if (is.null(expr_data) || is.null(epi_data)) { return(NULL) } # Find common samples common_samples <- intersect(colnames(expr_data), colnames(epi_data)) if (length(common_samples) < 3) { cat("Insufficient common samples for", species, expr_name, "vs", epi_name, "\n") return(NULL) } cat("Analyzing", species, ":", length(common_samples), "common samples for", expr_name, "vs", epi_name, "\n") # Subset to common samples expr_subset <- expr_data[, common_samples, drop = FALSE] epi_subset <- epi_data[, common_samples, drop = FALSE] # Calculate pairwise correlations correlations <- list() for (gene in rownames(expr_subset)) { gene_expr <- as.numeric(expr_subset[gene, ]) for (epi_feature in rownames(epi_subset)) { epi_values <- as.numeric(epi_subset[epi_feature, ]) # Calculate correlation if there's variation if (sd(gene_expr, na.rm = TRUE) > 0 && sd(epi_values, na.rm = TRUE) > 0) { cor_result <- tryCatch({ cor.test(gene_expr, epi_values, method = "spearman", use = "complete.obs") }, error = function(e) NULL) if (!is.null(cor_result)) { correlations[[length(correlations) + 1]] <- data.frame( Species = species, Gene = gene, EpigeneticFeature = epi_feature, FeatureType = epi_name, Correlation = cor_result$estimate, PValue = cor_result$p.value ) } } } } if (length(correlations) > 0) { result <- bind_rows(correlations) # Adjust p-values for multiple testing result$AdjPValue <- p.adjust(result$PValue, method = "BH") return(result) } return(NULL) } ``` ## Run correlation analyses for each species ```{r run-correlations} all_correlations <- list() # Apul correlations if (exists("apul_expr") && !is.null(apul_expr)) { if (exists("apul_lncrna") && !is.null(apul_lncrna)) { cor_apul_lncrna <- calculate_correlations(apul_expr, apul_lncrna, "Expression", "lncRNA", "Apul") if (!is.null(cor_apul_lncrna)) all_correlations[["apul_lncrna"]] <- cor_apul_lncrna } if (exists("apul_meth") && !is.null(apul_meth)) { cor_apul_meth <- calculate_correlations(apul_expr, apul_meth, "Expression", "Methylation", "Apul") if (!is.null(cor_apul_meth)) all_correlations[["apul_meth"]] <- cor_apul_meth } if (exists("apul_mirna") && !is.null(apul_mirna)) { cor_apul_mirna <- calculate_correlations(apul_expr, apul_mirna, "Expression", "miRNA", "Apul") if (!is.null(cor_apul_mirna)) all_correlations[["apul_mirna"]] <- cor_apul_mirna } } # Peve correlations if (exists("peve_expr") && !is.null(peve_expr)) { if (exists("peve_lncrna") && !is.null(peve_lncrna)) { cor_peve_lncrna <- calculate_correlations(peve_expr, peve_lncrna, "Expression", "lncRNA", "Peve") if (!is.null(cor_peve_lncrna)) all_correlations[["peve_lncrna"]] <- cor_peve_lncrna } if (exists("peve_meth") && !is.null(peve_meth)) { cor_peve_meth <- calculate_correlations(peve_expr, peve_meth, "Expression", "Methylation", "Peve") if (!is.null(cor_peve_meth)) all_correlations[["peve_meth"]] <- cor_peve_meth } if (exists("peve_mirna") && !is.null(peve_mirna)) { cor_peve_mirna <- calculate_correlations(peve_expr, peve_mirna, "Expression", "miRNA", "Peve") if (!is.null(cor_peve_mirna)) all_correlations[["peve_mirna"]] <- cor_peve_mirna } } # Ptua correlations if (exists("ptua_expr") && !is.null(ptua_expr)) { if (exists("ptua_lncrna") && !is.null(ptua_lncrna)) { cor_ptua_lncrna <- calculate_correlations(ptua_expr, ptua_lncrna, "Expression", "lncRNA", "Ptua") if (!is.null(cor_ptua_lncrna)) all_correlations[["ptua_lncrna"]] <- cor_ptua_lncrna } if (exists("ptua_meth") && !is.null(ptua_meth)) { cor_ptua_meth <- calculate_correlations(ptua_expr, ptua_meth, "Expression", "Methylation", "Ptua") if (!is.null(cor_ptua_meth)) all_correlations[["ptua_meth"]] <- cor_ptua_meth } if (exists("ptua_mirna") && !is.null(ptua_mirna)) { cor_ptua_mirna <- calculate_correlations(ptua_expr, ptua_mirna, "Expression", "miRNA", "Ptua") if (!is.null(cor_ptua_mirna)) all_correlations[["ptua_mirna"]] <- cor_ptua_mirna } } # Combine all correlations if (length(all_correlations) > 0) { combined_correlations <- bind_rows(all_correlations) cat("\nTotal correlations calculated:", nrow(combined_correlations), "\n") } else { combined_correlations <- NULL cat("\nNo correlations could be calculated - data may not be available\n") } ``` ## Summary Table: Significant Correlations by Species and Feature Type ```{r correlation-summary-table} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { # Filter for significant correlations (adjusted p-value < 0.05) sig_correlations <- combined_correlations %>% filter(AdjPValue < 0.05) cat("Total significant correlations (adj. p < 0.05):", nrow(sig_correlations), "\n\n") # Summary by species and feature type correlation_summary <- sig_correlations %>% group_by(Species, FeatureType) %>% summarize( N_Significant = n(), N_Positive = sum(Correlation > 0), N_Negative = sum(Correlation < 0), Mean_AbsCorr = mean(abs(Correlation)), Max_AbsCorr = max(abs(Correlation)), N_Genes = n_distinct(Gene), N_Features = n_distinct(EpigeneticFeature), .groups = "drop" ) %>% arrange(Species, FeatureType) cat("Summary of Significant Correlations by Species and Feature Type:\n") print(knitr::kable(correlation_summary, digits = 3)) } else { cat("No correlation data available for summary.\n") } ``` ## Top Correlated Epigenetic Features per Species ```{r top-correlated-features} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { # Get top 20 most significant correlations per species top_correlations <- combined_correlations %>% filter(AdjPValue < 0.05) %>% group_by(Species) %>% slice_min(order_by = AdjPValue, n = 20) %>% ungroup() %>% select(Species, Gene, EpigeneticFeature, FeatureType, Correlation, AdjPValue) %>% arrange(Species, AdjPValue) cat("Top 20 Most Significant Correlations per Species:\n\n") print(knitr::kable(top_correlations, digits = 4)) } ``` ## Visualization: Correlation Distribution by Species and Feature Type ```{r correlation-distribution-plot, fig.width=14, fig.height=8} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { # Filter significant correlations sig_corr <- combined_correlations %>% filter(AdjPValue < 0.05) if (nrow(sig_corr) > 0) { # Violin plot of correlations by species and feature type p1 <- ggplot(sig_corr, aes(x = FeatureType, y = Correlation, fill = Species)) + geom_violin(alpha = 0.7, position = position_dodge(0.8)) + geom_boxplot(width = 0.2, position = position_dodge(0.8), alpha = 0.8) + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + scale_fill_brewer(palette = "Set2") + labs( title = "Distribution of Significant Correlations", subtitle = "Expression vs Epigenetic Features (adj. p < 0.05)", x = "Epigenetic Feature Type", y = "Spearman Correlation" ) + theme_minimal() + theme( plot.title = element_text(size = 14, face = "bold"), axis.text.x = element_text(angle = 45, hjust = 1) ) print(p1) } } ``` ## Visualization: Number of Correlated Features per Gene ```{r features-per-gene-plot, fig.width=12, fig.height=8} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { sig_corr <- combined_correlations %>% filter(AdjPValue < 0.05) if (nrow(sig_corr) > 0) { # Count features per gene per species features_per_gene <- sig_corr %>% group_by(Species, Gene, FeatureType) %>% summarize(N_Features = n(), .groups = "drop") %>% pivot_wider(names_from = FeatureType, values_from = N_Features, values_fill = 0) # Stacked bar plot features_summary <- sig_corr %>% group_by(Species, FeatureType) %>% summarize( N_Correlations = n(), N_Genes = n_distinct(Gene), .groups = "drop" ) p2 <- ggplot(features_summary, aes(x = Species, y = N_Correlations, fill = FeatureType)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.8) + geom_text(aes(label = N_Correlations), position = position_dodge(0.9), vjust = -0.5, size = 3) + scale_fill_brewer(palette = "Dark2") + labs( title = "Number of Significant Correlations by Species", subtitle = "Expression-Epigenetic Feature Pairs (adj. p < 0.05)", x = "Species", y = "Number of Significant Correlations", fill = "Feature Type" ) + theme_minimal() + theme(plot.title = element_text(size = 14, face = "bold")) print(p2) } } ``` ## Heatmap: Top Epigenetic Features Correlated with Expression ```{r top-features-heatmap, fig.width=14, fig.height=10} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { sig_corr <- combined_correlations %>% filter(AdjPValue < 0.05) if (nrow(sig_corr) > 0) { # Get top 50 most frequently correlated epigenetic features top_features <- sig_corr %>% group_by(EpigeneticFeature, FeatureType) %>% summarize( N_Genes = n_distinct(Gene), N_Species = n_distinct(Species), Mean_Correlation = mean(Correlation), .groups = "drop" ) %>% arrange(desc(N_Species), desc(N_Genes)) %>% head(50) cat("\nTop 50 Epigenetic Features by Number of Correlated Genes:\n") print(knitr::kable(head(top_features, 30), digits = 3)) # Create heatmap data heatmap_data <- sig_corr %>% filter(EpigeneticFeature %in% top_features$EpigeneticFeature) %>% group_by(Species, EpigeneticFeature, FeatureType) %>% summarize( Mean_Correlation = mean(Correlation), N_Genes = n(), .groups = "drop" ) # Wide format for heatmap heatmap_wide <- heatmap_data %>% select(Species, EpigeneticFeature, Mean_Correlation) %>% pivot_wider(names_from = Species, values_from = Mean_Correlation, values_fill = 0) if (nrow(heatmap_wide) > 1) { # Convert to matrix heatmap_mat <- as.matrix(heatmap_wide[, -1]) rownames(heatmap_mat) <- heatmap_wide$EpigeneticFeature # Create annotation for feature type feature_types <- heatmap_data %>% select(EpigeneticFeature, FeatureType) %>% distinct() row_annotation <- data.frame( FeatureType = feature_types$FeatureType[match(rownames(heatmap_mat), feature_types$EpigeneticFeature)] ) rownames(row_annotation) <- rownames(heatmap_mat) # Heatmap pheatmap( heatmap_mat, cluster_rows = TRUE, cluster_cols = FALSE, color = colorRampPalette(c("blue", "white", "red"))(100), breaks = seq(-1, 1, length.out = 101), annotation_row = row_annotation, main = "Mean Correlation of Top Epigenetic Features Across Species", fontsize_row = 6, fontsize_col = 10 ) } } } ``` # Cross-Species Epigenetic Regulation Analysis ## Identify Shared Regulatory Patterns This section identifies epigenetic features that show consistent correlation patterns with gene expression across multiple species - these represent potential conserved regulatory mechanisms. ```{r cross-species-analysis} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { sig_corr <- combined_correlations %>% filter(AdjPValue < 0.05) if (nrow(sig_corr) > 0) { # Find genes with significant correlations in multiple species gene_species_count <- sig_corr %>% group_by(Gene) %>% summarize( N_Species = n_distinct(Species), Species_List = paste(unique(Species), collapse = ", "), N_Features = n_distinct(EpigeneticFeature), Feature_Types = paste(unique(FeatureType), collapse = ", "), .groups = "drop" ) %>% filter(N_Species >= 2) %>% arrange(desc(N_Species), desc(N_Features)) cat("\nGenes with Epigenetic Correlations in Multiple Species:\n") print(knitr::kable(head(gene_species_count, 30))) } } ``` ## Table: Conserved Epigenetic Features Across Species ```{r conserved-features-table} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { sig_corr <- combined_correlations %>% filter(AdjPValue < 0.05) if (nrow(sig_corr) > 0) { # Find epigenetic features correlated with same genes across species # Group by gene and feature type to find cross-species patterns cross_species_features <- sig_corr %>% group_by(Gene, FeatureType) %>% summarize( N_Species = n_distinct(Species), Species = paste(sort(unique(Species)), collapse = ", "), Features = paste(unique(EpigeneticFeature), collapse = "; "), Mean_Correlation = mean(Correlation), Correlation_Direction = ifelse(mean(Correlation) > 0, "Positive", "Negative"), .groups = "drop" ) %>% filter(N_Species >= 2) %>% arrange(desc(N_Species), FeatureType, Gene) cat("\nConserved Epigenetic Regulation Patterns (Same Gene, Multiple Species):\n\n") if (nrow(cross_species_features) > 0) { # Summary by feature type conserved_summary <- cross_species_features %>% group_by(FeatureType, N_Species) %>% summarize( N_Genes = n(), N_Positive = sum(Correlation_Direction == "Positive"), N_Negative = sum(Correlation_Direction == "Negative"), .groups = "drop" ) cat("Summary of Conserved Patterns by Feature Type:\n") print(knitr::kable(conserved_summary)) cat("\n\nDetailed Conserved Patterns (showing first 50):\n") print(knitr::kable(head(cross_species_features, 50), digits = 3)) } else { cat("No conserved patterns found across species.\n") } } } ``` ## Visualization: Cross-Species Regulatory Network ```{r cross-species-network, fig.width=14, fig.height=12} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { sig_corr <- combined_correlations %>% filter(AdjPValue < 0.05) if (nrow(sig_corr) > 0) { # Create summary for visualization species_feature_summary <- sig_corr %>% group_by(FeatureType, Species) %>% summarize( N_Correlations = n(), N_Genes = n_distinct(Gene), N_Features = n_distinct(EpigeneticFeature), Mean_AbsCorr = mean(abs(Correlation)), .groups = "drop" ) # Grouped bar chart p3 <- ggplot(species_feature_summary, aes(x = Species, y = N_Genes, fill = FeatureType)) + geom_bar(stat = "identity", position = "dodge", alpha = 0.8) + geom_text(aes(label = N_Genes), position = position_dodge(0.9), vjust = -0.5, size = 3) + scale_fill_brewer(palette = "Set1") + labs( title = "Number of Genes with Significant Epigenetic Correlations", subtitle = "By Species and Epigenetic Feature Type", x = "Species", y = "Number of Genes", fill = "Feature Type" ) + theme_minimal() + theme( plot.title = element_text(size = 14, face = "bold"), legend.position = "bottom" ) print(p3) # Faceted scatter plot of correlations p4 <- ggplot(sig_corr %>% sample_n(min(5000, n())), aes(x = Correlation, y = -log10(AdjPValue), color = Species)) + geom_point(alpha = 0.5, size = 1) + facet_wrap(~FeatureType, scales = "free_y") + geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") + scale_color_brewer(palette = "Set2") + labs( title = "Volcano Plot: Expression-Epigenetic Correlations", subtitle = "Significant correlations by feature type (sample of up to 5000 points)", x = "Spearman Correlation", y = "-log10(Adjusted P-value)" ) + theme_minimal() + theme( plot.title = element_text(size = 14, face = "bold"), strip.text = element_text(face = "bold") ) print(p4) } } ``` ## Species Comparison: Correlation Strength by Feature Type ```{r species-comparison-plot, fig.width=12, fig.height=8} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { sig_corr <- combined_correlations %>% filter(AdjPValue < 0.05) if (nrow(sig_corr) > 0) { # Box plot comparing correlation strengths p5 <- ggplot(sig_corr, aes(x = Species, y = abs(Correlation), fill = FeatureType)) + geom_boxplot(alpha = 0.7, outlier.size = 0.5) + scale_fill_brewer(palette = "Pastel1") + labs( title = "Comparison of Correlation Strengths Across Species", subtitle = "Absolute value of significant correlations (adj. p < 0.05)", x = "Species", y = "Absolute Correlation", fill = "Feature Type" ) + theme_minimal() + theme( plot.title = element_text(size = 14, face = "bold"), legend.position = "bottom" ) print(p5) } } ``` ## Summary Table: Epigenetic Features Regulating Genes in All Three Species ```{r all-three-species-table} if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { sig_corr <- combined_correlations %>% filter(AdjPValue < 0.05) if (nrow(sig_corr) > 0) { # Find genes regulated by same feature type in all 3 species genes_all_species <- sig_corr %>% group_by(Gene, FeatureType) %>% filter(n_distinct(Species) == 3) %>% summarize( Species = paste(sort(unique(Species)), collapse = ", "), Correlations = paste(sprintf("%.3f", Correlation), collapse = ", "), Mean_Correlation = mean(Correlation), All_Positive = all(Correlation > 0), All_Negative = all(Correlation < 0), Direction = case_when( all(Correlation > 0) ~ "Consistently Positive", all(Correlation < 0) ~ "Consistently Negative", TRUE ~ "Mixed" ), .groups = "drop" ) %>% arrange(FeatureType, desc(abs(Mean_Correlation))) if (nrow(genes_all_species) > 0) { cat("\n==========================================\n") cat("GENES WITH EPIGENETIC CORRELATIONS IN ALL THREE SPECIES\n") cat("==========================================\n\n") cat("These genes show significant correlation between expression and\n") cat("epigenetic features in Apul, Peve, AND Ptua - potential conserved regulation.\n\n") print(knitr::kable(genes_all_species, digits = 3)) # Count by direction direction_summary <- genes_all_species %>% group_by(FeatureType, Direction) %>% summarize(N_Genes = n(), .groups = "drop") cat("\n\nSummary by Feature Type and Direction:\n") print(knitr::kable(direction_summary)) } else { cat("No genes found with significant correlations in all three species.\n") } } } ``` # Identify Cross-Species Regulatory Patterns ## Map ortholog groups to species-specific genes ```{r map-orthologs} # The top 100 genes from rank 05 are ortholog groups (OG_XXXXX) # We need to map these to species-specific genes to analyze epigenetic regulation # Load ortholog annotation if available ortho_annot_path <- "../output/12-ortho-annot/ortholog_groups_annotated.csv" if (file.exists(ortho_annot_path)) { ortho_annot <- read.csv(ortho_annot_path) cat("Loaded ortholog annotations:", nrow(ortho_annot), "entries\n") print(head(ortho_annot)) } else { cat("Ortholog annotation file not found. Manual mapping may be required.\n") } ``` ## Summary of top Rank 05 genes ```{r top-genes-summary} # Create summary table of top 100 rank 05 genes with their scores across key ranks top_genes_summary <- gene_factors_top100 %>% rownames_to_column("Gene") %>% mutate( Rank_05 = Rank_05, Max_Score = apply(gene_factors_top100, 1, max), Max_Rank = apply(gene_factors_top100, 1, which.max) ) %>% select(Gene, Rank_05, Max_Score, Max_Rank, everything()) %>% arrange(desc(Rank_05)) cat("\nTop 20 genes from Rank 05 with their peak ranks:\n") print(head(top_genes_summary[, c("Gene", "Rank_05", "Max_Score", "Max_Rank")], 20)) ``` ## Identify genes with consistent patterns ```{r consistent-patterns, fig.width=12, fig.height=8} # Find genes that have high scores in rank 05 AND other ranks # These may represent core regulatory modules # Calculate summary statistics for each gene gene_stats <- gene_factors_top100 %>% rownames_to_column("Gene") %>% pivot_longer( cols = starts_with("Rank_"), names_to = "Rank", values_to = "Score" ) %>% group_by(Gene) %>% summarize( Mean_Score = mean(Score), SD_Score = sd(Score), N_HighRanks = sum(Score > 0.5), Rank_05_Score = Score[Rank == "Rank_05"], .groups = "drop" ) %>% arrange(desc(N_HighRanks), desc(Mean_Score)) cat("\nGenes with highest mean scores across ranks:\n") print(head(gene_stats, 20)) # Visualize gene variability ggplot(gene_stats, aes(x = Mean_Score, y = SD_Score, color = N_HighRanks)) + geom_point(alpha = 0.7, size = 3) + scale_color_viridis_c() + labs( title = "Gene Score Variability Across Ranks", subtitle = "Top 100 Rank 05 genes", x = "Mean Score Across All Ranks", y = "Standard Deviation", color = "# Ranks\n> 0.5" ) + theme_minimal() ``` # Export Results ```{r export-results} # Create output directory output_dir <- "../output/32-R35c05-regulation" dir.create(output_dir, showWarnings = FALSE, recursive = TRUE) # Export top 100 rank 05 genes write.csv(rank05_df, file.path(output_dir, "rank05_genes_positive.csv"), row.names = FALSE) # Export gene factor scores for top 100 write.csv(gene_factors_top100, file.path(output_dir, "top100_rank05_gene_factors.csv")) # Export gene statistics write.csv(gene_stats, file.path(output_dir, "top100_gene_statistics.csv"), row.names = FALSE) # Export correlation results if available if (!is.null(combined_correlations) && nrow(combined_correlations) > 0) { # All correlations write.csv(combined_correlations, file.path(output_dir, "all_correlations.csv"), row.names = FALSE) # Significant correlations only sig_correlations <- combined_correlations %>% filter(AdjPValue < 0.05) write.csv(sig_correlations, file.path(output_dir, "significant_correlations.csv"), row.names = FALSE) # Cross-species conserved patterns cross_species_conserved <- sig_correlations %>% group_by(Gene, FeatureType) %>% filter(n_distinct(Species) >= 2) %>% summarize( N_Species = n_distinct(Species), Species = paste(sort(unique(Species)), collapse = ", "), Features = paste(unique(EpigeneticFeature), collapse = "; "), Mean_Correlation = mean(Correlation), Min_AdjPValue = min(AdjPValue), .groups = "drop" ) %>% arrange(desc(N_Species), FeatureType) write.csv(cross_species_conserved, file.path(output_dir, "cross_species_conserved_patterns.csv"), row.names = FALSE) # Summary statistics by species and feature type correlation_summary_export <- sig_correlations %>% group_by(Species, FeatureType) %>% summarize( N_Significant_Correlations = n(), N_Positive = sum(Correlation > 0), N_Negative = sum(Correlation < 0), N_Genes = n_distinct(Gene), N_EpigeneticFeatures = n_distinct(EpigeneticFeature), Mean_Correlation = mean(Correlation), Mean_AbsCorrelation = mean(abs(Correlation)), .groups = "drop" ) write.csv(correlation_summary_export, file.path(output_dir, "correlation_summary_by_species_feature.csv"), row.names = FALSE) # Top correlated features per gene top_features_per_gene <- sig_correlations %>% group_by(Species, Gene) %>% slice_min(order_by = AdjPValue, n = 5) %>% ungroup() write.csv(top_features_per_gene, file.path(output_dir, "top_correlated_features_per_gene.csv"), row.names = FALSE) cat("Correlation results exported:\n") cat(" - all_correlations.csv\n") cat(" - significant_correlations.csv\n") cat(" - cross_species_conserved_patterns.csv\n") cat(" - correlation_summary_by_species_feature.csv\n") cat(" - top_correlated_features_per_gene.csv\n") } cat("\nAll results exported to:", output_dir, "\n") ``` # Session Info ```{r session-info} sessionInfo() ```