#!/usr/bin/env python3
"""
Annotate rank CSV files with ortholog group annotations.

This script:
1. Reads all CSV files from M-multi-species/output/22-Visualizing-Rank-outs
2. Joins each with ortholog_groups_annotated.csv on OG ID
3. Creates _annotation.csv files with the merged data
4. Generates a summary markdown table with predominant GO processes and gene functions
"""

import os
import sys
import pandas as pd
from pathlib import Path
from collections import Counter

def extract_predominant_terms(go_terms_list, top_n=3):
    """
    Extract the most common GO terms from a list of GO term strings.
    
    Args:
        go_terms_list: List of GO term strings (may contain semicolon-separated values)
        top_n: Number of top terms to return
    
    Returns:
        String with comma-separated top terms
    """
    all_terms = []
    for terms_str in go_terms_list:
        if pd.notna(terms_str) and terms_str.strip():
            # Split by semicolon and clean up each term
            terms = [t.strip() for t in str(terms_str).split(';') if t.strip()]
            all_terms.extend(terms)
    
    if not all_terms:
        return "N/A"
    
    # Count occurrences
    term_counts = Counter(all_terms)
    top_terms = term_counts.most_common(top_n)
    
    # Return as comma-separated string
    return ", ".join([term for term, count in top_terms])

def extract_predominant_functions(protein_names_list, top_n=3):
    """
    Extract the most common gene/protein functions.
    
    Args:
        protein_names_list: List of protein name strings
        top_n: Number of top functions to return
    
    Returns:
        String with comma-separated top functions
    """
    all_functions = []
    for name in protein_names_list:
        if pd.notna(name) and name.strip():
            # Extract the main function name (before parentheses or brackets)
            clean_name = str(name).split('(')[0].strip()
            if clean_name:
                all_functions.append(clean_name)
    
    if not all_functions:
        return "N/A"
    
    # Count occurrences
    func_counts = Counter(all_functions)
    top_funcs = func_counts.most_common(top_n)
    
    # Return as comma-separated string
    return ", ".join([func for func, count in top_funcs])

def generate_physiological_summary(go_bp_list, protein_names_list):
    """
    Generate a descriptive phrase summarizing the physiological processes.
    
    Analyzes GO biological processes and protein functions to create a concise
    summary of the predominant physiological themes in the gene list.
    
    Args:
        go_bp_list: List of GO biological process strings
        protein_names_list: List of protein names
    
    Returns:
        String describing the predominant physiological processes
    """
    if not go_bp_list or len(go_bp_list) == 0:
        return "Unannotated genes"
    
    # Extract all GO terms
    all_go_terms = []
    for terms_str in go_bp_list:
        if pd.notna(terms_str) and terms_str.strip():
            terms = [t.strip() for t in str(terms_str).split(';') if t.strip()]
            all_go_terms.extend(terms)
    
    if not all_go_terms:
        return "Unannotated genes"
    
    # Keywords for categorizing physiological processes
    process_keywords = {
        'transcription': ['transcription', 'gene expression', 'chromatin'],
        'cell_division': ['cell division', 'mitosis', 'cell cycle', 'cytokinesis'],
        'development': ['development', 'differentiation', 'morphogenesis', 'embryonic'],
        'metabolism': ['metabolic', 'biosynthetic', 'catabolic', 'metabolism'],
        'signaling': ['signal transduction', 'signaling pathway', 'receptor signaling'],
        'immune': ['immune', 'defense response', 'innate immune', 'inflammatory'],
        'transport': ['transport', 'localization', 'secretion', 'endocytosis'],
        'protein_processing': ['protein folding', 'proteolysis', 'protein modification', 'ubiquitin'],
        'cell_structure': ['cytoskeleton', 'cell adhesion', 'extracellular matrix'],
        'reproduction': ['spermatogenesis', 'fertilization', 'gamete', 'meiosis'],
        'cilium': ['cilium', 'flagell', 'axoneme'],
        'apoptosis': ['apoptosis', 'programmed cell death', 'cell death'],
        'stress': ['stress response', 'heat shock', 'oxidative stress']
    }
    
    # Count categories
    category_counts = Counter()
    for term in all_go_terms:
        term_lower = term.lower()
        for category, keywords in process_keywords.items():
            if any(keyword in term_lower for keyword in keywords):
                category_counts[category] += 1
    
    if not category_counts:
        # Fallback: extract key words from most common terms
        term_counts = Counter(all_go_terms)
        top_term = term_counts.most_common(1)[0][0] if term_counts else ""
        # Extract the main process name (before bracketed GO ID)
        main_process = top_term.split('[')[0].strip()
        if main_process:
            return f"Genes involved in {main_process.lower()}"
        return "Mixed physiological processes"
    
    # Generate summary based on top categories
    top_categories = category_counts.most_common(3)
    
    # Map category names to readable descriptions
    category_descriptions = {
        'transcription': 'gene expression regulation',
        'cell_division': 'cell division and proliferation',
        'development': 'development and differentiation',
        'metabolism': 'metabolic processes',
        'signaling': 'signal transduction',
        'immune': 'immune and defense responses',
        'transport': 'intracellular transport',
        'protein_processing': 'protein processing and degradation',
        'cell_structure': 'cytoskeletal organization',
        'reproduction': 'reproduction and gametogenesis',
        'cilium': 'cilium assembly and function',
        'apoptosis': 'apoptosis and cell death',
        'stress': 'stress response'
    }
    
    # Build summary
    if len(top_categories) == 1:
        cat = top_categories[0][0]
        return f"Genes primarily involved in {category_descriptions.get(cat, cat)}"
    elif len(top_categories) >= 2:
        primary = category_descriptions.get(top_categories[0][0], top_categories[0][0])
        secondary = category_descriptions.get(top_categories[1][0], top_categories[1][0])
        return f"Genes involved in {primary} and {secondary}"
    
    return "Mixed physiological processes"

def annotate_rank_file(rank_file_path, annotation_df, output_dir):
    """
    Annotate a single rank CSV file with ortholog annotations.
    
    Args:
        rank_file_path: Path to the rank CSV file
        annotation_df: DataFrame with ortholog annotations
        output_dir: Directory to save output files
    
    Returns:
        Tuple of (output_file_path, summary_dict)
    """
    # Read the rank file
    rank_df = pd.read_csv(rank_file_path)
    
    # The first column should be the OG ID - get its name and validate
    og_column = rank_df.columns[0]
    
    # Validate that the first column looks like an OG ID column
    if not (og_column.upper() in ['OG', 'GROUP_ID'] or 
            og_column.startswith('OG_') or 
            'OG' in og_column.upper()):
        # Check if first few values look like OG IDs
        sample_values = rank_df[og_column].head(3).astype(str)
        if not sample_values.str.contains('OG_', case=False).any():
            print(f"  Warning: First column '{og_column}' may not be an OG ID column")
    
    # Rename to match annotation file's group_id column
    rank_df_renamed = rank_df.rename(columns={og_column: 'group_id'})
    
    # Merge with annotations
    annotated_df = rank_df_renamed.merge(
        annotation_df, 
        on='group_id', 
        how='left'
    )
    
    # Create output filename with _annotation.csv suffix
    input_filename = os.path.basename(rank_file_path)
    output_filename = input_filename.replace('.csv', '_annotation.csv')
    output_path = os.path.join(output_dir, output_filename)
    
    # Save annotated file
    annotated_df.to_csv(output_path, index=False)
    
    # Generate summary statistics
    summary = {
        'filename': input_filename,
        'total_genes': len(annotated_df),
        'annotated_genes': annotated_df['protein_name'].notna().sum(),
        'predominant_go_process': extract_predominant_terms(annotated_df['go_bp'].dropna()),
        'predominant_function': extract_predominant_functions(annotated_df['protein_name'].dropna()),
        'physiological_summary': generate_physiological_summary(
            annotated_df['go_bp'].dropna().tolist(),
            annotated_df['protein_name'].dropna().tolist()
        )
    }
    
    return output_path, summary

def main():
    # Set up paths
    repo_root = Path(__file__).parent.parent.parent
    rank_dir = repo_root / 'M-multi-species' / 'output' / '22-Visualizing-Rank-outs'
    annotation_file = repo_root / 'M-multi-species' / 'output' / '12-ortho-annot' / 'ortholog_groups_annotated.csv'
    output_dir = rank_dir  # Save annotated files in the same directory
    summary_output = repo_root / 'M-multi-species' / 'output' / '22-Visualizing-Rank-outs' / 'annotation_summary.md'
    
    # Check if paths exist
    if not rank_dir.exists():
        print(f"Error: Rank directory not found: {rank_dir}")
        sys.exit(1)
    
    if not annotation_file.exists():
        print(f"Error: Annotation file not found: {annotation_file}")
        sys.exit(1)
    
    # Load the ortholog annotations
    print(f"Loading ortholog annotations from {annotation_file}...")
    annotation_df = pd.read_csv(annotation_file)
    print(f"Loaded {len(annotation_df)} ortholog group annotations")
    
    # Find all CSV files in the rank directory (excluding annotation files)
    csv_files = [f for f in rank_dir.glob('*.csv') 
                 if not f.name.endswith('_annotation.csv')]
    
    print(f"\nFound {len(csv_files)} CSV files to process")
    
    # Process each file
    summaries = []
    processed = 0
    
    for rank_file in sorted(csv_files):
        try:
            print(f"Processing {rank_file.name}...")
            output_path, summary = annotate_rank_file(rank_file, annotation_df, output_dir)
            summaries.append(summary)
            processed += 1
            
            if processed % 50 == 0:
                print(f"  Processed {processed}/{len(csv_files)} files...")
        except Exception as e:
            print(f"  Error processing {rank_file.name}: {e}")
            continue
    
    print(f"\nSuccessfully processed {processed}/{len(csv_files)} files")
    
    # Create summary markdown table
    print(f"\nGenerating summary table...")
    with open(summary_output, 'w') as f:
        f.write("# Gene List Annotation Summary\n\n")
        f.write(f"Total files processed: {len(summaries)}\n\n")
        f.write("| File | Total Genes | Annotated | Predominant GO Biological Process | Predominant Gene Function | Physiological Summary |\n")
        f.write("|------|-------------|-----------|-----------------------------------|---------------------------|-----------------------|\n")
        
        for summary in summaries:
            f.write(f"| {summary['filename']} | {summary['total_genes']} | {summary['annotated_genes']} | "
                   f"{summary['predominant_go_process']} | {summary['predominant_function']} | "
                   f"{summary['physiological_summary']} |\n")
    
    print(f"Summary table saved to {summary_output}")
    print("\nDone!")

if __name__ == "__main__":
    main()