---
title: "Analyzing Group ID Redundancy Across Rank 35 Components"
author: "Analysis"
date: "`r Sys.Date()`"
output: 
  html_document:
    toc: true
    toc_float: true
    code_folding: hide
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, 
                      fig.width = 14, fig.height = 10)
```

## Overview

This analysis examines whether the same `group_id` (orthogroup) appears across multiple components with the same GO BP terms, which would indicate redundancy in the cross-component frequency counts.

## Load Libraries

```{r load-libraries}
library(tidyverse)
library(ggplot2)
library(RColorBrewer)
library(knitr)
```

## Define Paths

```{r paths}
# Input directory
input_dir <- "/Users/sr320/Documents/GitHub/timeseries_molecular/M-multi-species/output/22-Visualizing-Rank-outs"

# Output directory
output_dir <- "/Users/sr320/Documents/GitHub/timeseries_molecular/M-multi-species/output/23-visualizing-rank35"

# Create output directory if it doesn't exist
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
```

## Load All Data

```{r load-data}
# Get all rank_35 top100 annotation files
files <- list.files(input_dir, 
                   pattern = "rank_35_comp.*_top100_annotation\\.csv$", 
                   full.names = TRUE)

# Sort files by component number
file_df <- data.frame(
  path = files,
  component = str_extract(basename(files), "comp\\d+")
) %>%
  mutate(comp_num = as.numeric(str_extract(component, "\\d+"))) %>%
  arrange(comp_num)

cat(sprintf("Loading data from %d files...\n", nrow(file_df)))

# Read all files and combine
all_data <- map2_df(file_df$path, file_df$component, function(path, comp) {
  read_csv(path, show_col_types = FALSE) %>%
    mutate(component = comp)
})

cat(sprintf("Total rows loaded: %d\n", nrow(all_data)))
cat(sprintf("Unique group_ids: %d\n", n_distinct(all_data$group_id)))
```

## Parse GO BP Terms with Group ID Tracking

```{r parse-go-terms}
# Function to expand GO BP terms while maintaining group_id
expand_go_bp_terms <- function(data) {
  data %>%
    filter(!is.na(go_bp) & go_bp != "") %>%
    select(group_id, component, go_bp) %>%
    mutate(
      # Split GO terms by semicolon
      go_terms = str_split(go_bp, ";")
    ) %>%
    unnest(go_terms) %>%
    mutate(
      # Clean up the term (remove GO:XXXXX part)
      go_term_clean = str_trim(go_terms),
      go_term_clean = str_replace(go_term_clean, "\\s*\\[GO:.*\\]", "")
    ) %>%
    filter(go_term_clean != "") %>%
    select(group_id, component, go_term = go_term_clean)
}

# Expand all GO terms with group_id tracking
go_expanded <- expand_go_bp_terms(all_data)

cat(sprintf("Total GO term occurrences: %d\n", nrow(go_expanded)))
cat(sprintf("Unique GO terms: %d\n", n_distinct(go_expanded$go_term)))
```

## Analyze Redundancy

```{r analyze-redundancy}
# For each GO term, count:
# 1. Total occurrences across all components
# 2. Number of unique group_ids
# 3. Number of components it appears in
# 4. How many times the same group_id appears in multiple components

go_term_analysis <- go_expanded %>%
  group_by(go_term) %>%
  summarise(
    total_occurrences = n(),
    unique_group_ids = n_distinct(group_id),
    n_components = n_distinct(component),
    # Calculate redundancy: if a group_id appears in 3 components, that's 2 redundant occurrences
    redundant_occurrences = total_occurrences - unique_group_ids
  ) %>%
  mutate(
    # Percentage of occurrences that are redundant (same group_id in multiple components)
    redundancy_pct = (redundant_occurrences / total_occurrences) * 100,
    # Average times each unique group_id appears
    avg_times_per_group = total_occurrences / unique_group_ids
  ) %>%
  arrange(desc(total_occurrences))

# Display top terms
cat("\n### Top 30 GO Terms with Redundancy Analysis\n\n")
print(kable(head(go_term_analysis, 30), digits = 2))
```

## Detailed Analysis: Top Terms

```{r top-terms-detail}
# Focus on top 30 terms from previous analysis
top_30_terms <- head(go_term_analysis$go_term, 30)

# For each top term, show which group_ids appear in multiple components
redundancy_details <- go_expanded %>%
  filter(go_term %in% top_30_terms) %>%
  group_by(go_term, group_id) %>%
  summarise(
    n_components = n_distinct(component),
    components = paste(sort(unique(component)), collapse = ", "),
    .groups = "drop"
  ) %>%
  filter(n_components > 1) %>%
  arrange(go_term, desc(n_components))

cat(sprintf("\nFound %d group_ids that appear in multiple components\n", nrow(redundancy_details)))
cat(sprintf("Across the top 30 GO terms\n\n"))

# Show examples
if (nrow(redundancy_details) > 0) {
  cat("### Examples of Group IDs Appearing in Multiple Components\n\n")
  print(kable(head(redundancy_details, 20)))
}
```

## Summary Statistics

```{r summary-stats}
# Overall statistics
overall_stats <- data.frame(
  Metric = c(
    "Total GO term occurrences",
    "Unique GO terms",
    "Unique group_ids with GO terms",
    "Redundant occurrences (same group_id in multiple components)",
    "Overall redundancy percentage"
  ),
  Value = c(
    nrow(go_expanded),
    n_distinct(go_expanded$go_term),
    n_distinct(go_expanded$group_id),
    sum(go_term_analysis$redundant_occurrences),
    round(sum(go_term_analysis$redundant_occurrences) / nrow(go_expanded) * 100, 2)
  )
)

cat("\n### Overall Statistics\n\n")
print(kable(overall_stats))

# Stats for top 30 terms
top_30_stats <- go_term_analysis %>%
  slice_head(n = 30) %>%
  summarise(
    total_occurrences = sum(total_occurrences),
    redundant_occurrences = sum(redundant_occurrences),
    avg_redundancy_pct = mean(redundancy_pct)
  )

cat("\n### Top 30 Terms Statistics\n\n")
cat(sprintf("Total occurrences: %d\n", top_30_stats$total_occurrences))
cat(sprintf("Redundant occurrences: %d\n", top_30_stats$redundant_occurrences))
cat(sprintf("Redundancy percentage: %.2f%%\n", (top_30_stats$redundant_occurrences / top_30_stats$total_occurrences * 100)))
cat(sprintf("Average redundancy per term: %.2f%%\n", top_30_stats$avg_redundancy_pct))
```

## Visualization: Redundancy Analysis

```{r redundancy-plot, fig.width=14, fig.height=10}
# Plot for top 30 terms
plot_data <- go_term_analysis %>%
  slice_head(n = 30) %>%
  mutate(
    go_term = factor(go_term, levels = rev(go_term))
  ) %>%
  pivot_longer(
    cols = c(unique_group_ids, redundant_occurrences),
    names_to = "type",
    values_to = "count"
  ) %>%
  mutate(
    type = case_when(
      type == "unique_group_ids" ~ "Unique group_ids",
      type == "redundant_occurrences" ~ "Redundant (same group_id)"
    )
  )

p1 <- ggplot(plot_data, aes(x = go_term, y = count, fill = type)) +
  geom_bar(stat = "identity", position = "stack") +
  coord_flip() +
  scale_fill_manual(
    values = c("Unique group_ids" = "steelblue", 
               "Redundant (same group_id)" = "orange"),
    name = "Occurrence Type"
  ) +
  labs(
    title = "Redundancy Analysis: Top 30 GO BP Terms",
    subtitle = "Blue = unique group_ids, Orange = redundant occurrences (same group_id in multiple components)",
    x = "GO Biological Process Term",
    y = "Count"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 11, color = "gray40"),
    axis.text.y = element_text(size = 10),
    legend.position = "bottom"
  )

print(p1)

ggsave(
  filename = file.path(output_dir, "rank35_GO_redundancy_analysis.png"),
  plot = p1,
  width = 14,
  height = 10,
  dpi = 300
)
```

## Redundancy Percentage Plot

```{r redundancy-pct-plot, fig.width=14, fig.height=10}
p2 <- go_term_analysis %>%
  slice_head(n = 30) %>%
  mutate(go_term = factor(go_term, levels = rev(go_term))) %>%
  ggplot(aes(x = go_term, y = redundancy_pct)) +
  geom_bar(stat = "identity", fill = "coral", alpha = 0.8) +
  geom_text(aes(label = sprintf("%.1f%%", redundancy_pct)), 
            hjust = -0.1, size = 3) +
  coord_flip() +
  labs(
    title = "Redundancy Percentage: Top 30 GO BP Terms",
    subtitle = "Percentage of occurrences due to same group_id appearing in multiple components",
    x = "GO Biological Process Term",
    y = "Redundancy Percentage (%)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 11, color = "gray40"),
    axis.text.y = element_text(size = 10),
    panel.grid.major.y = element_blank()
  )

print(p2)

ggsave(
  filename = file.path(output_dir, "rank35_GO_redundancy_percentage.png"),
  plot = p2,
  width = 14,
  height = 10,
  dpi = 300
)
```

## Group IDs Appearing in Most Components

```{r most-redundant-groups, fig.width=12, fig.height=8}
# Which group_ids appear in the most components?
group_id_frequency <- go_expanded %>%
  group_by(group_id) %>%
  summarise(
    n_components = n_distinct(component),
    n_go_terms = n_distinct(go_term),
    go_terms_sample = paste(head(unique(go_term), 3), collapse = "; ")
  ) %>%
  arrange(desc(n_components)) %>%
  head(50)

cat("\n### Top 50 Group IDs by Component Frequency\n\n")
print(kable(head(group_id_frequency, 50)))

# Plot
p3 <- group_id_frequency %>%
  head(30) %>%
  mutate(group_id = factor(group_id, levels = rev(group_id))) %>%
  ggplot(aes(x = group_id, y = n_components)) +
  geom_bar(stat = "identity", fill = "darkgreen", alpha = 0.7) +
  geom_text(aes(label = n_components), hjust = -0.2, size = 3) +
  coord_flip() +
  labs(
    title = "Top 30 Group IDs by Number of Components",
    subtitle = "Group IDs that appear in the most components",
    x = "Group ID",
    y = "Number of Components"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.y = element_text(size = 9),
    panel.grid.major.y = element_blank()
  )

print(p3)

ggsave(
  filename = file.path(output_dir, "rank35_most_frequent_group_ids.png"),
  plot = p3,
  width = 12,
  height = 8,
  dpi = 300
)
```

## Export Detailed Results

```{r export-data}
# Export full redundancy analysis
write_csv(
  go_term_analysis,
  file.path(output_dir, "rank35_GO_redundancy_full_analysis.csv")
)

# Export group IDs appearing in multiple components
write_csv(
  redundancy_details,
  file.path(output_dir, "rank35_redundant_group_ids_details.csv")
)

# Export group ID frequency
write_csv(
  group_id_frequency,
  file.path(output_dir, "rank35_group_id_component_frequency.csv")
)

cat("\nExported detailed results to CSV files\n")
```

## Session Info

```{r session-info}
sessionInfo()
```