#!/usr/bin/env python3
"""
Merge ortholog_groups.csv with annotation_with_goslim.tsv
Keep all records from ortholog_groups and join using FUN ID
"""

import pandas as pd
import sys
from pathlib import Path

def main():
    # File paths
    ortholog_file = Path("../11-orthology-analysis/ortholog_groups.csv")
    annotation_file = Path("run_20250831_172744/annotation_with_goslim.tsv")
    output_file = Path("ortholog_groups_annotated.csv")
    
    print(f"Reading ortholog groups from: {ortholog_file}")
    ortholog_df = pd.read_csv(ortholog_file)
    print(f"Ortholog groups shape: {ortholog_df.shape}")
    
    print(f"Reading annotations from: {annotation_file}")
    annotation_df = pd.read_csv(annotation_file, sep='\t')
    print(f"Annotations shape: {annotation_df.shape}")
    
    # The FUN ID is in the 'apul' column of ortholog_groups and 'query' column of annotations
    print("Merging on FUN ID (apul column from ortholog_groups, query column from annotations)")
    
    # Perform left join to keep all records from ortholog_groups
    merged_df = ortholog_df.merge(
        annotation_df, 
        left_on='apul', 
        right_on='query', 
        how='left'
    )
    
    print(f"Merged data shape: {merged_df.shape}")
    
    # Check how many ortholog groups got annotations
    annotated_count = merged_df['query'].notna().sum()
    print(f"Ortholog groups with annotations: {annotated_count}")
    print(f"Ortholog groups without annotations: {len(merged_df) - annotated_count}")
    
    # Save the merged file
    print(f"Saving merged data to: {output_file}")
    merged_df.to_csv(output_file, index=False)
    
    print("Done!")
    
    # Show sample of the merged data
    print("\nSample of merged data:")
    print(merged_df.head())
    
    # Show columns in the merged file
    print(f"\nColumns in merged file ({len(merged_df.columns)} total):")
    for i, col in enumerate(merged_df.columns):
        print(f"{i+1:2d}. {col}")

if __name__ == "__main__":
    main()