Overview

This analysis visualizes the Biological Process (BP) Gene Ontology terms from the top 100 genes in each of the 5 components from the rank 05 tensor decomposition.

Load Libraries

library(tidyverse)
library(ggplot2)
library(RColorBrewer)
library(scales)

Define Paths

# Input directory
input_dir <- "/Users/sr320/Documents/GitHub/timeseries_molecular/M-multi-species/output/22-Visualizing-Rank-outs"

# Output directory
output_dir <- "/Users/sr320/Documents/GitHub/timeseries_molecular/M-multi-species/output/24-visualizing-rank05"

# Create output directory if it doesn't exist
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

Get List of Files

# Get all rank_05 top100 annotation files
files <- list.files(input_dir, 
                   pattern = "rank_05_comp.*_top100_annotation\\.csv$", 
                   full.names = TRUE)

# Sort files by component number
file_df <- data.frame(
  path = files,
  component = str_extract(basename(files), "comp\\d+")
) %>%
  mutate(comp_num = as.numeric(str_extract(component, "\\d+"))) %>%
  arrange(comp_num)

cat(sprintf("Found %d files to process\n", nrow(file_df)))
## Found 5 files to process

Function to Parse GO BP Terms

# Function to parse GO BP column
parse_go_bp <- function(go_bp_string) {
  if (is.na(go_bp_string) || go_bp_string == "") {
    return(NULL)
  }
  
  # Split by semicolon and trim whitespace
  terms <- str_split(go_bp_string, ";")[[1]] %>%
    str_trim()
  
  # Extract term name (everything before [GO:...)
  term_names <- str_replace(terms, "\\s*\\[GO:.*\\]", "")
  
  return(term_names)
}

# Function to process a single file
process_file <- function(file_path, component_name) {
  # Read the file
  data <- read_csv(file_path, show_col_types = FALSE)
  
  # Extract all GO BP terms
  all_terms <- data %>%
    filter(!is.na(go_bp) & go_bp != "") %>%
    pull(go_bp) %>%
    map(parse_go_bp) %>%
    unlist()
  
  if (length(all_terms) == 0) {
    return(NULL)
  }
  
  # Count term frequencies
  term_counts <- as.data.frame(table(all_terms)) %>%
    rename(term = all_terms, count = Freq) %>%
    arrange(desc(count)) %>%
    head(20)  # Top 20 terms
  
  term_counts$component <- component_name
  
  return(term_counts)
}

Process All Files and Generate Visualizations

# Store all results for summary
all_results <- list()

for (i in 1:nrow(file_df)) {
  file_path <- file_df$path[i]
  component <- file_df$component[i]
  comp_num <- file_df$comp_num[i]
  
  cat(sprintf("\n\n## Component %d\n\n", comp_num))
  
  # Process the file
  term_counts <- process_file(file_path, component)
  
  if (is.null(term_counts) || nrow(term_counts) == 0) {
    cat("No GO BP terms found for this component.\n\n")
    next
  }
  
  all_results[[component]] <- term_counts
  
  # Create visualization
  p <- ggplot(term_counts, aes(x = reorder(term, count), y = count)) +
    geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
    coord_flip() +
    labs(
      title = sprintf("Top 20 Biological Process GO Terms - %s", component),
      subtitle = sprintf("From top 100 genes in rank 05 decomposition"),
      x = "GO Biological Process Term",
      y = "Frequency"
    ) +
    theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 12, color = "gray40"),
      axis.text.y = element_text(size = 10),
      axis.text.x = element_text(size = 10),
      panel.grid.major.y = element_blank(),
      panel.grid.minor = element_blank()
    )
  
  print(p)
  
  # Save individual plot
  ggsave(
    filename = file.path(output_dir, sprintf("rank05_%s_BP_GO_top20.png", component)),
    plot = p,
    width = 14,
    height = 10,
    dpi = 300
  )
  
  # Print top 10 terms as table
  cat("\n### Top 10 Terms\n\n")
  print(knitr::kable(head(term_counts, 10), row.names = FALSE))
  cat("\n\n")
}

Component 1

### Top 10 Terms

term count component
negative regulation of transcription by RNA polymerase II 7 comp1
regulation of transcription by RNA polymerase II 6 comp1
positive regulation of transcription by RNA polymerase II 5 comp1
cell population proliferation 4 comp1
nervous system development 4 comp1
kidney development 3 comp1
negative regulation of cell population proliferation 3 comp1
positive regulation of gene expression 3 comp1
proteolysis 3 comp1
roof of mouth development 3 comp1

Component 2

### Top 10 Terms

term count component
negative regulation of transcription by RNA polymerase II 5 comp2
regulation of transcription by RNA polymerase II 5 comp2
positive regulation of DNA-templated transcription 4 comp2
cell division 3 comp2
cell population proliferation 3 comp2
kidney development 3 comp2
negative regulation of apoptotic process 3 comp2
positive regulation of gene expression 3 comp2
positive regulation of transcription by RNA polymerase II 3 comp2
regulation of DNA-templated transcription 3 comp2

Component 3

### Top 10 Terms

term count component
positive regulation of transcription by RNA polymerase II 7 comp3
cell differentiation 4 comp3
intracellular signal transduction 4 comp3
positive regulation of DNA-templated transcription 4 comp3
positive regulation of gene expression 4 comp3
regulation of transcription by RNA polymerase II 4 comp3
Wnt signaling pathway 3 comp3
axon guidance 3 comp3
basement membrane organization 3 comp3
homophilic cell adhesion via plasma membrane adhesion molecules 3 comp3

Component 4

### Top 10 Terms

term count component
negative regulation of transcription by RNA polymerase II 7 comp4
cell division 4 comp4
regulation of transcription by RNA polymerase II 4 comp4
DNA repair 3 comp4
Wnt signaling pathway 3 comp4
cell population proliferation 3 comp4
cellular response to leukemia inhibitory factor 3 comp4
kidney development 3 comp4
negative regulation of apoptotic process 3 comp4
nervous system development 3 comp4

Component 5

### Top 10 Terms

term count component
regulation of transcription by RNA polymerase II 7 comp5
negative regulation of transcription by RNA polymerase II 6 comp5
positive regulation of DNA-templated transcription 6 comp5
positive regulation of transcription by RNA polymerase II 5 comp5
Wnt signaling pathway 4 comp5
cell division 4 comp5
cell population proliferation 4 comp5
nervous system development 4 comp5
positive regulation of gene expression 4 comp5
cellular response to leukemia inhibitory factor 3 comp5

Summary: Most Common GO BP Terms Across All Components

# Combine all results
if (length(all_results) > 0) {
  combined_data <- bind_rows(all_results)
  
  # Get overall most common terms
  overall_counts <- combined_data %>%
    group_by(term) %>%
    summarise(
      total_count = sum(count),
      n_components = n()
    ) %>%
    arrange(desc(total_count)) %>%
    head(30)
  
  # Create overall summary plot
  p_summary <- ggplot(overall_counts, aes(x = reorder(term, total_count), y = total_count)) +
    geom_bar(stat = "identity", fill = "darkblue", alpha = 0.7) +
    geom_text(aes(label = n_components), hjust = -0.3, size = 3, color = "red") +
    coord_flip() +
    labs(
      title = "Top 30 Biological Process GO Terms Across All Rank 05 Components",
      subtitle = "Red numbers indicate how many components contain each term",
      x = "GO Biological Process Term",
      y = "Total Frequency Across All Components"
    ) +
    theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 12, color = "gray40"),
      axis.text.y = element_text(size = 10),
      axis.text.x = element_text(size = 10),
      panel.grid.major.y = element_blank(),
      panel.grid.minor = element_blank()
    )
  
  print(p_summary)
  
  # Save summary plot
  ggsave(
    filename = file.path(output_dir, "rank05_ALL_components_BP_GO_summary.png"),
    plot = p_summary,
    width = 14,
    height = 12,
    dpi = 300
  )
  
  # Save summary table
  write_csv(overall_counts, 
            file.path(output_dir, "rank05_BP_GO_summary_table.csv"))
  
  cat("\n### Top 30 GO BP Terms Summary\n\n")
  print(knitr::kable(overall_counts, row.names = FALSE))
}

## 
## ### Top 30 GO BP Terms Summary
## 
## 
## 
## |term                                                            | total_count| n_components|
## |:---------------------------------------------------------------|-----------:|------------:|
## |negative regulation of transcription by RNA polymerase II       |          28|            5|
## |regulation of transcription by RNA polymerase II                |          26|            5|
## |positive regulation of transcription by RNA polymerase II       |          23|            5|
## |positive regulation of DNA-templated transcription              |          17|            4|
## |positive regulation of gene expression                          |          17|            5|
## |Wnt signaling pathway                                           |          14|            5|
## |cell population proliferation                                   |          14|            4|
## |nervous system development                                      |          14|            4|
## |cell division                                                   |          13|            4|
## |kidney development                                              |          12|            4|
## |negative regulation of apoptotic process                        |          12|            4|
## |cellular response to leukemia inhibitory factor                 |          10|            4|
## |basement membrane organization                                  |           9|            4|
## |angiogenesis                                                    |           6|            3|
## |branching involved in ureteric bud morphogenesis                |           6|            3|
## |regulation of DNA-templated transcription                       |           6|            2|
## |regulation of cell cycle                                        |           6|            2|
## |roof of mouth development                                       |           6|            2|
## |DNA repair                                                      |           5|            2|
## |DNA replication initiation                                      |           4|            2|
## |anterior/posterior pattern specification                        |           4|            2|
## |cGMP-mediated signaling                                         |           4|            2|
## |cell differentiation                                            |           4|            1|
## |central nervous system development                              |           4|            2|
## |intracellular signal transduction                               |           4|            1|
## |axon guidance                                                   |           3|            1|
## |homophilic cell adhesion via plasma membrane adhesion molecules |           3|            1|
## |mesoderm development                                            |           3|            1|
## |negative regulation of cell population proliferation            |           3|            1|
## |ossification                                                    |           3|            1|

Component Comparison Heatmap

if (length(all_results) > 0) {
  # Get top 30 overall terms
  top_terms <- overall_counts$term[1:min(30, nrow(overall_counts))]
  
  # Create matrix of term counts by component
  heatmap_data <- combined_data %>%
    filter(term %in% top_terms) %>%
    select(component, term, count) %>%
    pivot_wider(names_from = component, values_from = count, values_fill = 0)
  
  # Convert to matrix format for better visualization
  heatmap_matrix <- heatmap_data %>%
    column_to_rownames("term") %>%
    as.matrix()
  
  # Reorder by component number
  comp_order <- str_sort(colnames(heatmap_matrix), numeric = TRUE)
  heatmap_matrix <- heatmap_matrix[, comp_order]
  
  # Convert back for ggplot
  heatmap_long <- heatmap_matrix %>%
    as.data.frame() %>%
    rownames_to_column("term") %>%
    pivot_longer(-term, names_to = "component", values_to = "count")
  
  # Order terms by total frequency
  term_order <- overall_counts$term[1:min(30, nrow(overall_counts))]
  heatmap_long$term <- factor(heatmap_long$term, levels = rev(term_order))
  
  # Order components numerically
  heatmap_long <- heatmap_long %>%
    mutate(comp_num = as.numeric(str_extract(component, "\\d+"))) %>%
    arrange(comp_num)
  heatmap_long$component <- factor(heatmap_long$component, 
                                   levels = unique(heatmap_long$component[order(heatmap_long$comp_num)]))
  
  # Create heatmap
  p_heatmap <- ggplot(heatmap_long, aes(x = component, y = term, fill = count)) +
    geom_tile(color = "white", size = 0.5) +
    scale_fill_gradient2(low = "white", mid = "lightblue", high = "darkblue", 
                        midpoint = max(heatmap_long$count)/2,
                        name = "Count") +
    labs(
      title = "GO BP Terms Distribution Across Rank 05 Components",
      subtitle = "Top 30 most frequent terms",
      x = "Component",
      y = "GO Biological Process Term"
    ) +
    theme_minimal(base_size = 10) +
    theme(
      plot.title = element_text(size = 16, face = "bold"),
      plot.subtitle = element_text(size = 12, color = "gray40"),
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
      axis.text.y = element_text(size = 9),
      panel.grid = element_blank(),
      legend.position = "right"
    )
  
  print(p_heatmap)
  
  # Save heatmap
  ggsave(
    filename = file.path(output_dir, "rank05_BP_GO_heatmap.png"),
    plot = p_heatmap,
    width = 12,
    height = 10,
    dpi = 300
  )
}

Session Info

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.7.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] C
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.3.0       RColorBrewer_1.1-3 lubridate_1.9.3    forcats_1.0.0     
##  [5] stringr_1.5.1      dplyr_1.1.4        purrr_1.0.2        readr_2.1.5       
##  [9] tidyr_1.3.1        tibble_3.2.1       ggplot2_3.5.1      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.9        utf8_1.2.4        generics_0.1.3    stringi_1.8.4    
##  [5] hms_1.1.3         digest_0.6.37     magrittr_2.0.3    evaluate_1.0.0   
##  [9] grid_4.3.2        timechange_0.3.0  fastmap_1.2.0     jsonlite_1.8.9   
## [13] fansi_1.0.6       textshaping_0.4.0 jquerylib_0.1.4   cli_3.6.3        
## [17] rlang_1.1.4       crayon_1.5.3      bit64_4.5.2       munsell_0.5.1    
## [21] withr_3.0.1       cachem_1.1.0      yaml_2.3.10       tools_4.3.2      
## [25] parallel_4.3.2    tzdb_0.4.0        colorspace_2.1-1  vctrs_0.6.5      
## [29] R6_2.5.1          lifecycle_1.0.4   bit_4.5.0         vroom_1.6.5      
## [33] ragg_1.3.3        pkgconfig_2.0.3   pillar_1.9.0      bslib_0.8.0      
## [37] gtable_0.3.6      glue_1.8.0        systemfonts_1.1.0 highr_0.11       
## [41] xfun_0.48         tidyselect_1.2.1  knitr_1.48        farver_2.1.2     
## [45] htmltools_0.5.8.1 rmarkdown_2.28    labeling_0.4.3    compiler_4.3.2