#!/usr/bin/env python3 """ Summarize GO Slim term occurrences across component gene annotation files. This script: 1. Reads all CSV files with "annotation" in the filename from M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component 2. Counts occurrences of terms in the goslim_names column (semicolon-separated) 3. Creates a single CSV file with term counts """ import os import sys import pandas as pd from pathlib import Path from collections import Counter def count_goslim_terms_per_component(annotation_files): """ Count occurrences of GO Slim terms per component annotation file. Args: annotation_files: List of paths to annotation CSV files Returns: Dictionary mapping component names to Counter objects with term counts """ component_term_counts = {} for file_path in annotation_files: try: # Extract component name from filename (e.g., "Component_1" from "Component_1_top100_annotation.csv") filename = file_path.stem # Gets filename without extension component_name = filename.split('_top')[0] # Extract "Component_X" part # Read the annotation file df = pd.read_csv(file_path) # Check if goslim_names column exists if 'goslim_names' not in df.columns: print(f" Warning: 'goslim_names' column not found in {file_path.name}") continue # Extract all terms from goslim_names column for this component component_terms = [] for terms_str in df['goslim_names'].dropna(): terms_str = str(terms_str) if terms_str and terms_str.strip(): # Split by semicolon and clean up each term terms = [t.strip() for t in terms_str.split(';') if t.strip()] component_terms.extend(terms) # Count occurrences for this component component_term_counts[component_name] = Counter(component_terms) except Exception as e: print(f" Error processing {file_path.name}: {e}") continue return component_term_counts def main(): # Set up paths repo_root = Path(__file__).parent.parent component_dir = repo_root / 'output' / '26-rank35-optimization' / 'lambda_gene_0.2' / 'top_genes_per_component' output_file = component_dir / 'goslim_term_counts.csv' # Check if directory exists if not component_dir.exists(): print(f"Error: Component directory not found: {component_dir}") sys.exit(1) # Find all annotation files annotation_files = sorted([f for f in component_dir.glob('*.csv') if 'annotation' in f.name.lower()]) print(f"Found {len(annotation_files)} annotation files") if len(annotation_files) == 0: print("No annotation files found!") sys.exit(1) # Count GO Slim terms per component print("\nCounting GO Slim term occurrences per component...") component_term_counts = count_goslim_terms_per_component(annotation_files) # Get all unique terms across all components all_terms = set() for term_counter in component_term_counts.values(): all_terms.update(term_counter.keys()) print(f"Found {len(all_terms)} unique GO Slim terms") # Create a matrix: rows are terms, columns are components # Sort component names numerically (Component_1, Component_2, ..., Component_35) component_names = sorted(component_term_counts.keys(), key=lambda x: int(x.split('_')[1])) # Build the results dictionary results = {'term': []} for comp in component_names: results[comp] = [] results['total'] = [] # Sort terms by total count (descending) term_totals = Counter() for comp_counts in component_term_counts.values(): term_totals.update(comp_counts) sorted_terms = [term for term, count in term_totals.most_common()] # Fill in the matrix for term in sorted_terms: results['term'].append(term) total = 0 for comp in component_names: count = component_term_counts[comp].get(term, 0) results[comp].append(count) total += count results['total'].append(total) # Create DataFrame results_df = pd.DataFrame(results) # Save to CSV results_df.to_csv(output_file, index=False) print(f"\nResults saved to: {output_file}") # Display summary statistics print(f"\nTotal term occurrences: {results_df['total'].sum()}") print(f"Number of components: {len(component_names)}") # Display top 10 terms with their totals print("\nTop 10 most common GO Slim terms:") print(results_df[['term', 'total']].head(10).to_string(index=False)) print("\nDone!") if __name__ == "__main__": main()