---
title: "36-biomin-contra"
author: "Steven Roberts"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
```

```{r helpers}
# Helpers shared by the chunks below.

# Element-wise, NA-safe test for "this flag is TRUE".
#
# Character (or factor) values are compared case-insensitively against
# "TRUE" after trimming whitespace. Any other type is compared with
# `== TRUE`, so a logical TRUE -- and, as in base R, a numeric 1 -- counts
# as TRUE. NA always yields FALSE, so the result can be used directly in
# `filter()`, `sum()` and `rowSums()` without `na.rm`.
is_flag_true <- function(x) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
  if (is.character(x)) {
    return(toupper(trimws(x)) %in% "TRUE")
  }
  !is.na(x) & x == TRUE
}

# Derive a display label from a context column name by stripping the
# "_context...", "context_dependent_" and "_methylation..." fragments.
species_label <- function(col) {
  gsub(
    "_context.*|context_dependent_|_methylation.*", "", col,
    ignore.case = TRUE
  )
}
```

# Read in Cross-Species Methylation miRNA Comparison Data

```{r read-data}
# Read CSV from remote URL
biomin_data <- read_csv("https://gannet.fish.washington.edu/v1_web/owlshell/bu-github/ConTra/output/biomin_comparison_20251130_052506/cross_species_methylation_mirna_comparison.csv")

# Display structure of the data
glimpse(biomin_data)
```

```{r view-columns}
# View column names to identify context dependent columns
colnames(biomin_data)
```

# Summary of Context Dependent Groups

```{r context-dependent-summary}
# Identify columns that contain "context" in their name (case insensitive)
context_cols <- grep(
  "context", names(biomin_data),
  ignore.case = TRUE, value = TRUE
)

cat("Context-related columns found:\n")
print(context_cols)
```

```{r filter-context-true}
# Show the distinct values held by each context column so the encoding of
# TRUE/FALSE can be checked by eye (the loop is a no-op when none matched).
for (col in context_cols) {
  cat("\nUnique values in", col, ":\n")
  print(unique(biomin_data[[col]]))
}
```

```{r summarize-context-dependent}
# Filter and summarize data where context dependent columns have TRUE values
if (length(context_cols) > 0) {
  # Keep rows where ANY context column is TRUE
  context_true_data <- biomin_data %>%
    filter(if_any(all_of(context_cols), is_flag_true))

  cat("Number of rows with context dependent = TRUE:", nrow(context_true_data), "\n\n")

  # Summarize by og_id
  if ("og_id" %in% names(context_true_data)) {
    context_summary <- context_true_data %>%
      group_by(og_id) %>%
      summarise(
        across(
          where(is.numeric),
          ~ mean(.x, na.rm = TRUE),
          .names = "mean_{.col}"
        ),
        # n() is computed AFTER across(): columns created earlier in the
        # same summarise() are visible to where(is.numeric) and would
        # otherwise yield a spurious `mean_n_records` column.
        n_records = n(),
        .groups = "drop"
      ) %>%
      relocate(n_records, .after = og_id)

    print(context_summary)
  } else {
    cat("Column 'og_id' not found in data\n")
    print(head(context_true_data))
  }
}
```

```{r detailed-view}
# Display detailed view of context-dependent records
if (exists("context_true_data") && nrow(context_true_data) > 0) {
  cat("\nDetailed view of context-dependent records:\n")
  print(context_true_data)
}
```

# Species-Specific and Shared Context-Dependent OGs

```{r species-analysis}
# Per-species context columns: every context column whose name does not
# mention "shared" or "common". When no such column exists this is simply
# `context_cols`, so the counts below are unaffected.
species_context_cols <- context_cols[
  !grepl("shared|common", context_cols, ignore.case = TRUE)
]

# Same data with og_id and the context columns moved to the front
species_summary <- biomin_data %>%
  select(og_id, all_of(context_cols), everything())

# Count how many species have TRUE for each OG
if (length(species_context_cols) > 0) {
  # Vectorised row count of TRUE flags; is_flag_true() is applied per
  # column, so columns of different types can be combined safely.
  og_species_count <- biomin_data %>%
    mutate(
      species_true_count = rowSums(
        across(all_of(species_context_cols), is_flag_true)
      )
    )

  # OGs with TRUE in exactly one species (species-specific)
  single_species_ogs <- og_species_count %>%
    filter(species_true_count == 1)

  # OGs shared across two species
  two_species_ogs <- og_species_count %>%
    filter(species_true_count == 2)

  # OGs shared across three species
  three_species_ogs <- og_species_count %>%
    filter(species_true_count == 3)

  cat("OGs with context-dependent = TRUE in exactly 1 species:", nrow(single_species_ogs), "\n")
  cat("OGs with context-dependent = TRUE in exactly 2 species:", nrow(two_species_ogs), "\n")
  cat("OGs with context-dependent = TRUE in all 3 species:", nrow(three_species_ogs), "\n")
}
```

```{r species-specific-details}
# Detailed breakdown by species
for (col in species_context_cols) {
  species_name <- species_label(col)

  true_count <- sum(is_flag_true(biomin_data[[col]]))

  cat("\n", species_name, "- OGs with context-dependent = TRUE:", true_count, "\n")

  species_ogs <- biomin_data %>%
    filter(is_flag_true(.data[[col]])) %>%
    select(og_id, all_of(col), where(is.numeric))

  if (nrow(species_ogs) > 0) {
    print(species_ogs)
  }
}
```

```{r shared-ogs-summary}
# Summary of shared OGs with quantitative metrics
if (exists("two_species_ogs") && nrow(two_species_ogs) > 0) {
  cat("\n=== OGs Shared Across Two Species ===\n")
  two_species_summary <- two_species_ogs %>%
    select(og_id, all_of(context_cols), where(is.numeric))
  print(two_species_summary)
}

if (exists("three_species_ogs") && nrow(three_species_ogs) > 0) {
  cat("\n=== OGs Shared Across All Three Species ===\n")
  three_species_summary <- three_species_ogs %>%
    select(og_id, all_of(context_cols), where(is.numeric))
  print(three_species_summary)
}
```

```{r quantitative-summary}
# Generate quantitative summary statistics for context-dependent OGs
if (exists("context_true_data") && nrow(context_true_data) > 0) {
  numeric_cols <- names(context_true_data)[
    vapply(context_true_data, is.numeric, logical(1))
  ]

  if (length(numeric_cols) > 0) {
    cat("\n=== Quantitative Summary of Context-Dependent OGs ===\n")

    quant_summary <- context_true_data %>%
      summarise(
        across(
          all_of(numeric_cols),
          list(
            mean = ~ mean(.x, na.rm = TRUE),
            sd = ~ sd(.x, na.rm = TRUE),
            min = ~ min(.x, na.rm = TRUE),
            max = ~ max(.x, na.rm = TRUE)
          ),
          .names = "{.col}_{.fn}"
        )
      )

    # Transpose for better readability
    quant_summary_long <- quant_summary %>%
      pivot_longer(everything(), names_to = "metric", values_to = "value")

    print(quant_summary_long)
  }
}
```

# Summary Paragraph

```{r generate-paragraph, results='asis'}
# Generate dynamic summary paragraph (emitted as raw markdown via 'asis')
if (exists("context_cols") && length(context_cols) > 0) {
  # Get counts per species (named by column)
  species_counts <- vapply(
    species_context_cols,
    function(col) sum(is_flag_true(biomin_data[[col]])),
    integer(1)
  )

  # Get OG lists per species
  species_ogs_list <- lapply(species_context_cols, function(col) {
    biomin_data %>%
      filter(is_flag_true(.data[[col]])) %>%
      pull(og_id)
  })
  names(species_ogs_list) <- species_context_cols

  # Build paragraph
  cat("\n## Cross-Species Context-Dependent Methylation and miRNA Analysis\n\n")

  cat("This analysis examined orthogroup (OG) patterns across species for context-dependent methylation and miRNA associations. ")

  # Species-specific findings: one sentence per species with at least one
  # OG, listing at most the first five OG ids.
  for (col in species_context_cols) {
    species_name <- species_label(col)
    ogs <- species_ogs_list[[col]]

    if (length(ogs) > 0) {
      cat("**", species_name, "** showed context-dependent patterns in ",
          length(ogs), " OGs (", paste(head(ogs, 5), collapse = ", "),
          if (length(ogs) > 5) ", ..." else "", "). ", sep = "")
    }
  }

  # Shared patterns (ids are listed only when there are ten or fewer)
  if (exists("two_species_ogs") && nrow(two_species_ogs) > 0) {
    cat("\n\nA total of **", nrow(two_species_ogs), " OGs** exhibited context-dependent patterns shared across exactly two species", sep = "")
    if (nrow(two_species_ogs) <= 10) {
      cat(" (", paste(two_species_ogs$og_id, collapse = ", "), ")", sep = "")
    }
    cat(". ")
  }

  if (exists("three_species_ogs") && nrow(three_species_ogs) > 0) {
    cat("Notably, **", nrow(three_species_ogs), " OGs** showed conserved context-dependent patterns across all three species", sep = "")
    if (nrow(three_species_ogs) <= 10) {
      cat(" (", paste(three_species_ogs$og_id, collapse = ", "), ")", sep = "")
    }
    cat(", suggesting evolutionary conservation of these regulatory mechanisms. ")
  }

  # Quantitative highlights: mean and SD of the first three numeric columns
  if (exists("context_true_data") && nrow(context_true_data) > 0) {
    numeric_cols <- names(context_true_data)[
      vapply(context_true_data, is.numeric, logical(1))
    ]

    if (length(numeric_cols) > 0) {
      cat("\n\nQuantitatively, context-dependent OGs showed the following characteristics: ")

      for (nc in head(numeric_cols, 3)) {
        mean_val <- mean(context_true_data[[nc]], na.rm = TRUE)
        sd_val <- sd(context_true_data[[nc]], na.rm = TRUE)
        cat("**", nc, "** (mean ± SD: ", round(mean_val, 3), " ± ", round(sd_val, 3), "); ", sep = "")
      }
    }
  }

  cat("\n")
}
```