--- title: "Visualizing BP GO Processes for Rank 05 Components" author: "Analysis" date: "`r Sys.Date()`" output: html_document: toc: true toc_float: true code_folding: hide --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.width = 12, fig.height = 8) ``` ## Overview This analysis visualizes the Biological Process (BP) Gene Ontology terms from the top 100 genes in each of the 5 components from the rank 05 tensor decomposition. ## Load Libraries ```{r load-libraries} library(tidyverse) library(ggplot2) library(RColorBrewer) library(scales) ``` ## Define Paths ```{r paths} # Input directory input_dir <- "/Users/sr320/Documents/GitHub/timeseries_molecular/M-multi-species/output/22-Visualizing-Rank-outs" # Output directory output_dir <- "/Users/sr320/Documents/GitHub/timeseries_molecular/M-multi-species/output/24-visualizing-rank05" # Create output directory if it doesn't exist dir.create(output_dir, showWarnings = FALSE, recursive = TRUE) ``` ## Get List of Files ```{r get-files} # Get all rank_05 top100 annotation files files <- list.files(input_dir, pattern = "rank_05_comp.*_top100_annotation\\.csv$", full.names = TRUE) # Sort files by component number file_df <- data.frame( path = files, component = str_extract(basename(files), "comp\\d+") ) %>% mutate(comp_num = as.numeric(str_extract(component, "\\d+"))) %>% arrange(comp_num) cat(sprintf("Found %d files to process\n", nrow(file_df))) ``` ## Function to Parse GO BP Terms ```{r parse-function} # Function to parse GO BP column parse_go_bp <- function(go_bp_string) { if (is.na(go_bp_string) || go_bp_string == "") { return(NULL) } # Split by semicolon and trim whitespace terms <- str_split(go_bp_string, ";")[[1]] %>% str_trim() # Extract term name (everything before [GO:...) term_names <- str_replace(terms, "\\s*\\[GO:.*\\]", "") return(term_names) } # Function to process a single file process_file <- function(file_path, component_name) { # Read the file data <- read_csv(file_path, show_col_types = FALSE) # Extract all GO BP terms all_terms <- data %>% filter(!is.na(go_bp) & go_bp != "") %>% pull(go_bp) %>% map(parse_go_bp) %>% unlist() if (length(all_terms) == 0) { return(NULL) } # Count term frequencies term_counts <- as.data.frame(table(all_terms)) %>% rename(term = all_terms, count = Freq) %>% arrange(desc(count)) %>% head(20) # Top 20 terms term_counts$component <- component_name return(term_counts) } ``` ## Process All Files and Generate Visualizations ```{r process-files, results='asis', fig.width=14, fig.height=10} # Store all results for summary all_results <- list() for (i in 1:nrow(file_df)) { file_path <- file_df$path[i] component <- file_df$component[i] comp_num <- file_df$comp_num[i] cat(sprintf("\n\n## Component %d\n\n", comp_num)) # Process the file term_counts <- process_file(file_path, component) if (is.null(term_counts) || nrow(term_counts) == 0) { cat("No GO BP terms found for this component.\n\n") next } all_results[[component]] <- term_counts # Create visualization p <- ggplot(term_counts, aes(x = reorder(term, count), y = count)) + geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) + coord_flip() + labs( title = sprintf("Top 20 Biological Process GO Terms - %s", component), subtitle = sprintf("From top 100 genes in rank 05 decomposition"), x = "GO Biological Process Term", y = "Frequency" ) + theme_minimal(base_size = 12) + theme( plot.title = element_text(size = 16, face = "bold"), plot.subtitle = element_text(size = 12, color = "gray40"), axis.text.y = element_text(size = 10), axis.text.x = element_text(size = 10), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank() ) print(p) # Save individual plot ggsave( filename = file.path(output_dir, sprintf("rank05_%s_BP_GO_top20.png", component)), plot = p, width = 14, height = 10, dpi = 300 ) # Print top 10 terms as table cat("\n### Top 10 Terms\n\n") print(knitr::kable(head(term_counts, 10), row.names = FALSE)) cat("\n\n") } ``` ## Summary: Most Common GO BP Terms Across All Components ```{r summary-analysis, fig.width=14, fig.height=12} # Combine all results if (length(all_results) > 0) { combined_data <- bind_rows(all_results) # Get overall most common terms overall_counts <- combined_data %>% group_by(term) %>% summarise( total_count = sum(count), n_components = n() ) %>% arrange(desc(total_count)) %>% head(30) # Create overall summary plot p_summary <- ggplot(overall_counts, aes(x = reorder(term, total_count), y = total_count)) + geom_bar(stat = "identity", fill = "darkblue", alpha = 0.7) + geom_text(aes(label = n_components), hjust = -0.3, size = 3, color = "red") + coord_flip() + labs( title = "Top 30 Biological Process GO Terms Across All Rank 05 Components", subtitle = "Red numbers indicate how many components contain each term", x = "GO Biological Process Term", y = "Total Frequency Across All Components" ) + theme_minimal(base_size = 12) + theme( plot.title = element_text(size = 16, face = "bold"), plot.subtitle = element_text(size = 12, color = "gray40"), axis.text.y = element_text(size = 10), axis.text.x = element_text(size = 10), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank() ) print(p_summary) # Save summary plot ggsave( filename = file.path(output_dir, "rank05_ALL_components_BP_GO_summary.png"), plot = p_summary, width = 14, height = 12, dpi = 300 ) # Save summary table write_csv(overall_counts, file.path(output_dir, "rank05_BP_GO_summary_table.csv")) cat("\n### Top 30 GO BP Terms Summary\n\n") print(knitr::kable(overall_counts, row.names = FALSE)) } ``` ## Component Comparison Heatmap ```{r heatmap, fig.width=12, fig.height=10} if (length(all_results) > 0) { # Get top 30 overall terms top_terms <- overall_counts$term[1:min(30, nrow(overall_counts))] # Create matrix of term counts by component heatmap_data <- combined_data %>% filter(term %in% top_terms) %>% select(component, term, count) %>% pivot_wider(names_from = component, values_from = count, values_fill = 0) # Convert to matrix format for better visualization heatmap_matrix <- heatmap_data %>% column_to_rownames("term") %>% as.matrix() # Reorder by component number comp_order <- str_sort(colnames(heatmap_matrix), numeric = TRUE) heatmap_matrix <- heatmap_matrix[, comp_order] # Convert back for ggplot heatmap_long <- heatmap_matrix %>% as.data.frame() %>% rownames_to_column("term") %>% pivot_longer(-term, names_to = "component", values_to = "count") # Order terms by total frequency term_order <- overall_counts$term[1:min(30, nrow(overall_counts))] heatmap_long$term <- factor(heatmap_long$term, levels = rev(term_order)) # Order components numerically heatmap_long <- heatmap_long %>% mutate(comp_num = as.numeric(str_extract(component, "\\d+"))) %>% arrange(comp_num) heatmap_long$component <- factor(heatmap_long$component, levels = unique(heatmap_long$component[order(heatmap_long$comp_num)])) # Create heatmap p_heatmap <- ggplot(heatmap_long, aes(x = component, y = term, fill = count)) + geom_tile(color = "white", size = 0.5) + scale_fill_gradient2(low = "white", mid = "lightblue", high = "darkblue", midpoint = max(heatmap_long$count)/2, name = "Count") + labs( title = "GO BP Terms Distribution Across Rank 05 Components", subtitle = "Top 30 most frequent terms", x = "Component", y = "GO Biological Process Term" ) + theme_minimal(base_size = 10) + theme( plot.title = element_text(size = 16, face = "bold"), plot.subtitle = element_text(size = 12, color = "gray40"), axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10), axis.text.y = element_text(size = 9), panel.grid = element_blank(), legend.position = "right" ) print(p_heatmap) # Save heatmap ggsave( filename = file.path(output_dir, "rank05_BP_GO_heatmap.png"), plot = p_heatmap, width = 12, height = 10, dpi = 300 ) } ``` ## Session Info ```{r session-info} sessionInfo() ```