#!/usr/bin/env python3 """ Test script to validate the GO Slim term summarization script. """ import os import sys import pandas as pd from pathlib import Path def test_goslim_summary(): """Test that the GO Slim summary was generated correctly.""" print("Testing GO Slim term summarization...") print("=" * 70) # Set up paths repo_root = Path(__file__).parent.parent component_dir = repo_root / 'output' / '26-rank35-optimization' / 'lambda_gene_0.2' / 'top_genes_per_component' summary_file = component_dir / 'goslim_term_counts.csv' # Test 1: Check if summary file exists print("\n1. Checking if summary file exists...") if not summary_file.exists(): raise FileNotFoundError(f"Summary file not found: {summary_file}") print(f" ✓ Found summary file: {summary_file}") # Test 2: Load and validate structure print("\n2. Validating summary file structure...") df = pd.read_csv(summary_file) # Check that we have term column, component columns, and total column if 'term' not in df.columns: raise ValueError("Missing 'term' column") if 'total' not in df.columns: raise ValueError("Missing 'total' column") # Check for Component columns (should have Component_1 through Component_35) component_cols = [col for col in df.columns if col.startswith('Component_')] if len(component_cols) != 35: raise ValueError(f"Expected 35 Component columns, found {len(component_cols)}") print(f" ✓ Correct structure: term column, {len(component_cols)} component columns, total column") print(f" Total columns: {len(df.columns)}") # Test 3: Check data types print("\n3. Checking data types...") if df['term'].dtype != 'object': raise ValueError(f"Expected 'term' to be string, got {df['term'].dtype}") if not pd.api.types.is_integer_dtype(df['total']): raise ValueError(f"Expected 'total' to be integer, got {df['total'].dtype}") # Check that component columns are integers component_cols = [col for col in df.columns if col.startswith('Component_')] for col in component_cols[:3]: # Check first 3 as a sample if not pd.api.types.is_integer_dtype(df[col]): raise ValueError(f"Expected '{col}' to be integer, got {df[col].dtype}") print(f" ✓ Correct data types") # Test 4: Check for data print("\n4. Checking for data...") if len(df) == 0: raise ValueError("Summary file is empty") print(f" ✓ Found {len(df)} unique GO Slim terms") # Test 5: Validate counts are positive and totals are correct print("\n5. Validating counts...") if (df['total'] <= 0).any(): raise ValueError("Found non-positive totals") # Verify that totals match sum of component columns component_cols = [col for col in df.columns if col.startswith('Component_')] for i in range(min(5, len(df))): # Check first 5 rows row_sum = df.iloc[i][component_cols].sum() total = df.iloc[i]['total'] if row_sum != total: raise ValueError(f"Row {i}: sum of components ({row_sum}) != total ({total})") print(f" ✓ All counts are positive") print(f" ✓ Totals match sum of component columns") print(f" Total occurrences: {df['total'].sum()}") # Test 6: Check sorting (should be descending by total) print("\n6. Checking sort order...") if not df['total'].is_monotonic_decreasing: raise ValueError("Results are not sorted by total (descending)") print(f" ✓ Results sorted by total (descending)") # Test 7: Verify against source files print("\n7. Verifying against source files...") annotation_files = sorted([f for f in component_dir.glob('*.csv') if 'annotation' in f.name.lower()]) print(f" Found {len(annotation_files)} annotation files") if len(annotation_files) == 0: raise ValueError("No annotation files found for verification") # Sample a few terms to verify counts print("\n8. Spot-checking term counts...") # Get a sample annotation file sample_df = pd.read_csv(annotation_files[0]) if 'goslim_names' in sample_df.columns: sample_terms = [] for terms_str in sample_df['goslim_names'].dropna(): if terms_str and str(terms_str).strip(): terms = [t.strip() for t in str(terms_str).split(';') if t.strip()] sample_terms.extend(terms) if sample_terms: # Check if these terms exist in summary sample_term = sample_terms[0] if sample_term in df['term'].values: total = df[df['term'] == sample_term]['total'].iloc[0] print(f" ✓ Sample term '{sample_term}' found with total count {total}") else: print(f" ⚠ Sample term '{sample_term}' not found in summary") # Test 9: Display summary statistics print("\n9. Summary statistics...") print(f" Total unique terms: {len(df)}") print(f" Total occurrences: {df['total'].sum()}") print(f" Number of components: {len([col for col in df.columns if col.startswith('Component_')])}") print(f" Most common term: '{df.iloc[0]['term']}' ({df.iloc[0]['total']} occurrences)") print(f" Least common terms: {(df['total'] == 1).sum()} terms with 1 occurrence") # Display top 5 print("\n Top 5 terms:") for idx, row in df.head(5).iterrows(): print(f" {row['term']}: {row['total']}") # Summary print("\n" + "=" * 70) print("✅ ALL TESTS PASSED!") print("\nThe GO Slim term summary was generated successfully.") print(f"Output file: {summary_file}") print("=" * 70) if __name__ == '__main__': try: test_goslim_summary() except Exception as e: print(f"\n❌ TEST FAILED: {e}") sys.exit(1)