This analysis visualizes the Biological Process (BP) Gene Ontology terms from the top 100 genes in each of the 5 components from the rank 05 tensor decomposition.
library(tidyverse)
library(ggplot2)
library(RColorBrewer)
library(scales)
# Input directory
input_dir <- "/Users/sr320/Documents/GitHub/timeseries_molecular/M-multi-species/output/22-Visualizing-Rank-outs"
# Output directory
output_dir <- "/Users/sr320/Documents/GitHub/timeseries_molecular/M-multi-species/output/24-visualizing-rank05"
# Create output directory if it doesn't exist
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
# Get all rank_05 top100 annotation files
files <- list.files(input_dir,
pattern = "rank_05_comp.*_top100_annotation\\.csv$",
full.names = TRUE)
# Sort files by component number
file_df <- data.frame(
path = files,
component = str_extract(basename(files), "comp\\d+")
) %>%
mutate(comp_num = as.numeric(str_extract(component, "\\d+"))) %>%
arrange(comp_num)
cat(sprintf("Found %d files to process\n", nrow(file_df)))
## Found 5 files to process
# Function to parse GO BP column
parse_go_bp <- function(go_bp_string) {
if (is.na(go_bp_string) || go_bp_string == "") {
return(NULL)
}
# Split by semicolon and trim whitespace
terms <- str_split(go_bp_string, ";")[[1]] %>%
str_trim()
# Extract term name (everything before [GO:...)
term_names <- str_replace(terms, "\\s*\\[GO:.*\\]", "")
return(term_names)
}
# Function to process a single file
process_file <- function(file_path, component_name) {
# Read the file
data <- read_csv(file_path, show_col_types = FALSE)
# Extract all GO BP terms
all_terms <- data %>%
filter(!is.na(go_bp) & go_bp != "") %>%
pull(go_bp) %>%
map(parse_go_bp) %>%
unlist()
if (length(all_terms) == 0) {
return(NULL)
}
# Count term frequencies
term_counts <- as.data.frame(table(all_terms)) %>%
rename(term = all_terms, count = Freq) %>%
arrange(desc(count)) %>%
head(20) # Top 20 terms
term_counts$component <- component_name
return(term_counts)
}
# Store all results for summary
all_results <- list()
for (i in 1:nrow(file_df)) {
file_path <- file_df$path[i]
component <- file_df$component[i]
comp_num <- file_df$comp_num[i]
cat(sprintf("\n\n## Component %d\n\n", comp_num))
# Process the file
term_counts <- process_file(file_path, component)
if (is.null(term_counts) || nrow(term_counts) == 0) {
cat("No GO BP terms found for this component.\n\n")
next
}
all_results[[component]] <- term_counts
# Create visualization
p <- ggplot(term_counts, aes(x = reorder(term, count), y = count)) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
coord_flip() +
labs(
title = sprintf("Top 20 Biological Process GO Terms - %s", component),
subtitle = sprintf("From top 100 genes in rank 05 decomposition"),
x = "GO Biological Process Term",
y = "Frequency"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 12, color = "gray40"),
axis.text.y = element_text(size = 10),
axis.text.x = element_text(size = 10),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank()
)
print(p)
# Save individual plot
ggsave(
filename = file.path(output_dir, sprintf("rank05_%s_BP_GO_top20.png", component)),
plot = p,
width = 14,
height = 10,
dpi = 300
)
# Print top 10 terms as table
cat("\n### Top 10 Terms\n\n")
print(knitr::kable(head(term_counts, 10), row.names = FALSE))
cat("\n\n")
}
### Top 10 Terms
| term | count | component |
|---|---|---|
| negative regulation of transcription by RNA polymerase II | 7 | comp1 |
| regulation of transcription by RNA polymerase II | 6 | comp1 |
| positive regulation of transcription by RNA polymerase II | 5 | comp1 |
| cell population proliferation | 4 | comp1 |
| nervous system development | 4 | comp1 |
| kidney development | 3 | comp1 |
| negative regulation of cell population proliferation | 3 | comp1 |
| positive regulation of gene expression | 3 | comp1 |
| proteolysis | 3 | comp1 |
| roof of mouth development | 3 | comp1 |
### Top 10 Terms
| term | count | component |
|---|---|---|
| negative regulation of transcription by RNA polymerase II | 5 | comp2 |
| regulation of transcription by RNA polymerase II | 5 | comp2 |
| positive regulation of DNA-templated transcription | 4 | comp2 |
| cell division | 3 | comp2 |
| cell population proliferation | 3 | comp2 |
| kidney development | 3 | comp2 |
| negative regulation of apoptotic process | 3 | comp2 |
| positive regulation of gene expression | 3 | comp2 |
| positive regulation of transcription by RNA polymerase II | 3 | comp2 |
| regulation of DNA-templated transcription | 3 | comp2 |
### Top 10 Terms
| term | count | component |
|---|---|---|
| positive regulation of transcription by RNA polymerase II | 7 | comp3 |
| cell differentiation | 4 | comp3 |
| intracellular signal transduction | 4 | comp3 |
| positive regulation of DNA-templated transcription | 4 | comp3 |
| positive regulation of gene expression | 4 | comp3 |
| regulation of transcription by RNA polymerase II | 4 | comp3 |
| Wnt signaling pathway | 3 | comp3 |
| axon guidance | 3 | comp3 |
| basement membrane organization | 3 | comp3 |
| homophilic cell adhesion via plasma membrane adhesion molecules | 3 | comp3 |
### Top 10 Terms
| term | count | component |
|---|---|---|
| negative regulation of transcription by RNA polymerase II | 7 | comp4 |
| cell division | 4 | comp4 |
| regulation of transcription by RNA polymerase II | 4 | comp4 |
| DNA repair | 3 | comp4 |
| Wnt signaling pathway | 3 | comp4 |
| cell population proliferation | 3 | comp4 |
| cellular response to leukemia inhibitory factor | 3 | comp4 |
| kidney development | 3 | comp4 |
| negative regulation of apoptotic process | 3 | comp4 |
| nervous system development | 3 | comp4 |
### Top 10 Terms
| term | count | component |
|---|---|---|
| regulation of transcription by RNA polymerase II | 7 | comp5 |
| negative regulation of transcription by RNA polymerase II | 6 | comp5 |
| positive regulation of DNA-templated transcription | 6 | comp5 |
| positive regulation of transcription by RNA polymerase II | 5 | comp5 |
| Wnt signaling pathway | 4 | comp5 |
| cell division | 4 | comp5 |
| cell population proliferation | 4 | comp5 |
| nervous system development | 4 | comp5 |
| positive regulation of gene expression | 4 | comp5 |
| cellular response to leukemia inhibitory factor | 3 | comp5 |
# Combine all results
if (length(all_results) > 0) {
combined_data <- bind_rows(all_results)
# Get overall most common terms
overall_counts <- combined_data %>%
group_by(term) %>%
summarise(
total_count = sum(count),
n_components = n()
) %>%
arrange(desc(total_count)) %>%
head(30)
# Create overall summary plot
p_summary <- ggplot(overall_counts, aes(x = reorder(term, total_count), y = total_count)) +
geom_bar(stat = "identity", fill = "darkblue", alpha = 0.7) +
geom_text(aes(label = n_components), hjust = -0.3, size = 3, color = "red") +
coord_flip() +
labs(
title = "Top 30 Biological Process GO Terms Across All Rank 05 Components",
subtitle = "Red numbers indicate how many components contain each term",
x = "GO Biological Process Term",
y = "Total Frequency Across All Components"
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 12, color = "gray40"),
axis.text.y = element_text(size = 10),
axis.text.x = element_text(size = 10),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank()
)
print(p_summary)
# Save summary plot
ggsave(
filename = file.path(output_dir, "rank05_ALL_components_BP_GO_summary.png"),
plot = p_summary,
width = 14,
height = 12,
dpi = 300
)
# Save summary table
write_csv(overall_counts,
file.path(output_dir, "rank05_BP_GO_summary_table.csv"))
cat("\n### Top 30 GO BP Terms Summary\n\n")
print(knitr::kable(overall_counts, row.names = FALSE))
}
##
## ### Top 30 GO BP Terms Summary
##
##
##
## |term | total_count| n_components|
## |:---------------------------------------------------------------|-----------:|------------:|
## |negative regulation of transcription by RNA polymerase II | 28| 5|
## |regulation of transcription by RNA polymerase II | 26| 5|
## |positive regulation of transcription by RNA polymerase II | 23| 5|
## |positive regulation of DNA-templated transcription | 17| 4|
## |positive regulation of gene expression | 17| 5|
## |Wnt signaling pathway | 14| 5|
## |cell population proliferation | 14| 4|
## |nervous system development | 14| 4|
## |cell division | 13| 4|
## |kidney development | 12| 4|
## |negative regulation of apoptotic process | 12| 4|
## |cellular response to leukemia inhibitory factor | 10| 4|
## |basement membrane organization | 9| 4|
## |angiogenesis | 6| 3|
## |branching involved in ureteric bud morphogenesis | 6| 3|
## |regulation of DNA-templated transcription | 6| 2|
## |regulation of cell cycle | 6| 2|
## |roof of mouth development | 6| 2|
## |DNA repair | 5| 2|
## |DNA replication initiation | 4| 2|
## |anterior/posterior pattern specification | 4| 2|
## |cGMP-mediated signaling | 4| 2|
## |cell differentiation | 4| 1|
## |central nervous system development | 4| 2|
## |intracellular signal transduction | 4| 1|
## |axon guidance | 3| 1|
## |homophilic cell adhesion via plasma membrane adhesion molecules | 3| 1|
## |mesoderm development | 3| 1|
## |negative regulation of cell population proliferation | 3| 1|
## |ossification | 3| 1|
if (length(all_results) > 0) {
# Get top 30 overall terms
top_terms <- overall_counts$term[1:min(30, nrow(overall_counts))]
# Create matrix of term counts by component
heatmap_data <- combined_data %>%
filter(term %in% top_terms) %>%
select(component, term, count) %>%
pivot_wider(names_from = component, values_from = count, values_fill = 0)
# Convert to matrix format for better visualization
heatmap_matrix <- heatmap_data %>%
column_to_rownames("term") %>%
as.matrix()
# Reorder by component number
comp_order <- str_sort(colnames(heatmap_matrix), numeric = TRUE)
heatmap_matrix <- heatmap_matrix[, comp_order]
# Convert back for ggplot
heatmap_long <- heatmap_matrix %>%
as.data.frame() %>%
rownames_to_column("term") %>%
pivot_longer(-term, names_to = "component", values_to = "count")
# Order terms by total frequency
term_order <- overall_counts$term[1:min(30, nrow(overall_counts))]
heatmap_long$term <- factor(heatmap_long$term, levels = rev(term_order))
# Order components numerically
heatmap_long <- heatmap_long %>%
mutate(comp_num = as.numeric(str_extract(component, "\\d+"))) %>%
arrange(comp_num)
heatmap_long$component <- factor(heatmap_long$component,
levels = unique(heatmap_long$component[order(heatmap_long$comp_num)]))
# Create heatmap
p_heatmap <- ggplot(heatmap_long, aes(x = component, y = term, fill = count)) +
geom_tile(color = "white", size = 0.5) +
scale_fill_gradient2(low = "white", mid = "lightblue", high = "darkblue",
midpoint = max(heatmap_long$count)/2,
name = "Count") +
labs(
title = "GO BP Terms Distribution Across Rank 05 Components",
subtitle = "Top 30 most frequent terms",
x = "Component",
y = "GO Biological Process Term"
) +
theme_minimal(base_size = 10) +
theme(
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 12, color = "gray40"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
axis.text.y = element_text(size = 9),
panel.grid = element_blank(),
legend.position = "right"
)
print(p_heatmap)
# Save heatmap
ggsave(
filename = file.path(output_dir, "rank05_BP_GO_heatmap.png"),
plot = p_heatmap,
width = 12,
height = 10,
dpi = 300
)
}
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.7.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] C
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.3.0 RColorBrewer_1.1-3 lubridate_1.9.3 forcats_1.0.0
## [5] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [9] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.9 utf8_1.2.4 generics_0.1.3 stringi_1.8.4
## [5] hms_1.1.3 digest_0.6.37 magrittr_2.0.3 evaluate_1.0.0
## [9] grid_4.3.2 timechange_0.3.0 fastmap_1.2.0 jsonlite_1.8.9
## [13] fansi_1.0.6 textshaping_0.4.0 jquerylib_0.1.4 cli_3.6.3
## [17] rlang_1.1.4 crayon_1.5.3 bit64_4.5.2 munsell_0.5.1
## [21] withr_3.0.1 cachem_1.1.0 yaml_2.3.10 tools_4.3.2
## [25] parallel_4.3.2 tzdb_0.4.0 colorspace_2.1-1 vctrs_0.6.5
## [29] R6_2.5.1 lifecycle_1.0.4 bit_4.5.0 vroom_1.6.5
## [33] ragg_1.3.3 pkgconfig_2.0.3 pillar_1.9.0 bslib_0.8.0
## [37] gtable_0.3.6 glue_1.8.0 systemfonts_1.1.0 highr_0.11
## [41] xfun_0.48 tidyselect_1.2.1 knitr_1.48 farver_2.1.2
## [45] htmltools_0.5.8.1 rmarkdown_2.28 labeling_0.4.3 compiler_4.3.2