#!/usr/bin/env python3 """ Test script to validate the WGCNA module annotation output. """ import os import sys import pandas as pd from pathlib import Path def test_wgcna_annotation(): """Test that the WGCNA module annotation was generated correctly.""" print("Testing WGCNA module annotation...") print("=" * 70) # Set up paths repo_root = Path(__file__).parent.parent output_dir = repo_root / 'output' / '18-ortholog-wgcna' # Define expected output files expected_files = [ 'wgcna_module_annotation_summary.txt', 'wgcna_module_overview.csv', 'wgcna_module_go_bp_summary.csv', 'wgcna_module_go_cc_summary.csv', 'wgcna_module_go_mf_summary.csv', 'wgcna_module_goslim_summary.csv', 'wgcna_module_protein_summary.csv' ] # Test 1: Check if all output files exist print("\n1. Checking if all output files exist...") for filename in expected_files: file_path = output_dir / filename if not file_path.exists(): raise FileNotFoundError(f"Output file not found: {file_path}") print(f" ✓ Found: {filename}") # Test 2: Validate overview file structure print("\n2. Validating overview file structure...") overview_file = output_dir / 'wgcna_module_overview.csv' overview_df = pd.read_csv(overview_file) expected_columns = [ 'module', 'num_orthologs', 'num_annotated', 'annotation_coverage_%', 'num_go_bp_terms', 'num_go_cc_terms', 'num_go_mf_terms', 'num_goslim_terms', 'num_unique_proteins', 'top_go_bp', 'top_goslim', 'top_protein' ] for col in expected_columns: if col not in overview_df.columns: raise ValueError(f"Missing expected column: {col}") print(f" ✓ All expected columns present") print(f" Total modules: {len(overview_df)}") # Test 3: Check number of modules print("\n3. Checking number of modules...") if len(overview_df) != 15: raise ValueError(f"Expected 15 modules, found {len(overview_df)}") print(f" ✓ Found 15 WGCNA modules (0-14)") # Test 4: Validate module IDs print("\n4. Validating module IDs...") expected_modules = list(range(15)) actual_modules = sorted(overview_df['module'].tolist()) if actual_modules != expected_modules: raise ValueError(f"Module IDs mismatch. Expected {expected_modules}, got {actual_modules}") print(f" ✓ Module IDs are correct (0-14)") # Test 5: Check data integrity print("\n5. Checking data integrity...") # Check that num_annotated <= num_orthologs if (overview_df['num_annotated'] > overview_df['num_orthologs']).any(): raise ValueError("num_annotated is greater than num_orthologs for some modules") print(f" ✓ Annotation counts are valid") # Check that annotation_coverage matches calculation calculated_coverage = (overview_df['num_annotated'] / overview_df['num_orthologs'] * 100).round(1) if not (calculated_coverage == overview_df['annotation_coverage_%']).all(): raise ValueError("Annotation coverage percentages don't match calculations") print(f" ✓ Annotation coverage percentages are correct") # Test 6: Validate CSV summary structures print("\n6. Validating CSV summary file structures...") csv_files = [ 'wgcna_module_go_bp_summary.csv', 'wgcna_module_go_cc_summary.csv', 'wgcna_module_go_mf_summary.csv', 'wgcna_module_goslim_summary.csv', 'wgcna_module_protein_summary.csv' ] for csv_file in csv_files: file_path = output_dir / csv_file df = pd.read_csv(file_path) # Check that 'term' column exists if 'term' not in df.columns: raise ValueError(f"Missing 'term' column in {csv_file}") # Check that module columns exist (module_0 through module_14) module_cols = [f'module_{i}' for i in range(15)] for col in module_cols: if col not in df.columns: raise ValueError(f"Missing {col} column in {csv_file}") # Check that 'total' column exists if 'total' not in df.columns: raise ValueError(f"Missing 'total' column in {csv_file}") # Verify that totals are sum of module columns calculated_total = df[module_cols].sum(axis=1) if not (calculated_total == df['total']).all(): raise ValueError(f"Total column doesn't match sum of modules in {csv_file}") print(f" ✓ {csv_file} structure is valid ({len(df)} terms)") # Test 7: Check text report print("\n7. Checking text report...") report_file = output_dir / 'wgcna_module_annotation_summary.txt' with open(report_file, 'r') as f: content = f.read() # Check that all modules are present in the report for module_id in range(15): if f"MODULE {module_id}" not in content: raise ValueError(f"MODULE {module_id} not found in text report") print(f" ✓ All 15 modules present in text report") print(f" Report size: {len(content)} characters") # Test 8: Validate data values print("\n8. Validating data values...") # Check that all modules have orthologs if (overview_df['num_orthologs'] <= 0).any(): raise ValueError("Some modules have 0 orthologs") print(f" ✓ All modules have orthologs") # Check total number of orthologs total_orthologs = overview_df['num_orthologs'].sum() print(f" Total orthologs across all modules: {total_orthologs}") total_annotated = overview_df['num_annotated'].sum() print(f" Total annotated orthologs: {total_annotated}") overall_coverage = (total_annotated / total_orthologs * 100) print(f" Overall annotation coverage: {overall_coverage:.1f}%") # Test 9: Display summary statistics print("\n9. Summary statistics by module...") print("\n Top 5 modules by number of orthologs:") top_modules = overview_df.nlargest(5, 'num_orthologs')[['module', 'num_orthologs', 'num_annotated', 'annotation_coverage_%']] print(top_modules.to_string(index=False)) print("\n Modules with highest annotation coverage:") top_coverage = overview_df.nlargest(5, 'annotation_coverage_%')[['module', 'num_orthologs', 'num_annotated', 'annotation_coverage_%']] print(top_coverage.to_string(index=False)) # Summary print("\n" + "=" * 70) print("✅ ALL TESTS PASSED!") print("\nThe WGCNA module annotation was generated successfully.") print(f"Output directory: {output_dir}") print("=" * 70) if __name__ == '__main__': try: test_wgcna_annotation() except Exception as e: print(f"\n❌ TEST FAILED: {e}") import traceback traceback.print_exc() sys.exit(1)