#!/usr/bin/env python3
"""
Filter count matrices based on ortholog groups from a component.

This script:
1. Reads OG_IDs from a component file (e.g., Component_24_top100.csv)
2. Maps OG_IDs to species-specific gene IDs from ortholog_groups_annotated.csv
3. Filters count matrices for each species to include only genes from the component
4. Saves filtered matrices to the output directory
"""

import sys
import os
import csv
import pandas as pd


def read_component_og_ids(component_file):
    """
    Read OG_IDs from a component CSV file.
    
    Args:
        component_file: Path to component CSV file (e.g., Component_24_top100.csv)
        
    Returns:
        List of OG_IDs
    """
    og_ids = []
    with open(component_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            og_ids.append(row['OG_ID'])
    
    print(f"Read {len(og_ids)} OG_IDs from {component_file}")
    return og_ids


def map_og_to_species_genes(ortholog_file, og_ids):
    """
    Map OG_IDs to species-specific gene IDs.
    
    Args:
        ortholog_file: Path to ortholog_groups_annotated.csv
        og_ids: List of OG_IDs to map
        
    Returns:
        Dictionary with keys 'apul', 'peve', 'ptua', each containing list of gene IDs
    """
    og_set = set(og_ids)
    gene_mapping = {
        'apul': [],
        'peve': [],
        'ptua': []
    }
    
    with open(ortholog_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            if row['group_id'] in og_set:
                # Add gene IDs if they exist (not empty)
                if row['apul']:
                    gene_mapping['apul'].append(row['apul'])
                if row['peve']:
                    gene_mapping['peve'].append(row['peve'])
                if row['ptua']:
                    gene_mapping['ptua'].append(row['ptua'])
    
    print(f"\nMapped to species-specific genes:")
    print(f"  apul: {len(gene_mapping['apul'])} genes")
    print(f"  peve: {len(gene_mapping['peve'])} genes")
    print(f"  ptua: {len(gene_mapping['ptua'])} genes")
    
    return gene_mapping


def filter_count_matrix(count_matrix_file, gene_ids, output_file, gene_prefix="", strip_suffix=""):
    """
    Filter a count matrix to include only specified genes.
    
    Args:
        count_matrix_file: Path to input count matrix CSV
        gene_ids: List of gene IDs to keep
        output_file: Path to output filtered CSV
        gene_prefix: Prefix to add to gene IDs when looking them up (e.g., "gene-")
        strip_suffix: Suffix to remove from gene IDs before matching (e.g., "-T1")
    """
    # Create set of gene IDs to keep (with prefix and/or without suffix if needed)
    processed_ids = []
    for gene_id in gene_ids:
        # Strip suffix if specified
        if strip_suffix and gene_id.endswith(strip_suffix):
            gene_id = gene_id[:-len(strip_suffix)]
        # Add prefix if specified
        if gene_prefix:
            gene_id = f"{gene_prefix}{gene_id}"
        processed_ids.append(gene_id)
    
    genes_to_keep = set(processed_ids)
    
    print(f"\nFiltering {count_matrix_file}...")
    print(f"Looking for {len(genes_to_keep)} genes...")
    
    # Read the count matrix
    df = pd.read_csv(count_matrix_file, encoding='utf-8')
    
    # Filter rows where gene_id is in our set
    filtered_df = df[df['gene_id'].isin(genes_to_keep)]
    
    print(f"Found {len(filtered_df)} genes in count matrix")
    
    # Save filtered matrix
    filtered_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Saved to {output_file}")
    
    return len(filtered_df)


def main():
    # Base paths - use script location to find repository root
    script_dir = os.path.dirname(os.path.abspath(__file__))
    base_dir = os.path.abspath(os.path.join(script_dir, "..", ".."))
    
    # Input files
    component_file = os.path.join(
        base_dir,
        "M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component",
        "Component_24_top100.csv"
    )
    
    ortholog_file = os.path.join(
        base_dir,
        "M-multi-species/output/12-ortho-annot",
        "ortholog_groups_annotated.csv"
    )
    
    # Count matrix files
    apul_matrix = os.path.join(
        base_dir,
        "D-Apul/output/02.20-D-Apul-RNAseq-alignment-HiSat2",
        "apul-gene_count_matrix.csv"
    )
    
    peve_matrix = os.path.join(
        base_dir,
        "E-Peve/output/02.20-E-Peve-RNAseq-alignment-HiSat2",
        "peve-gene_count_matrix.csv"
    )
    
    ptua_matrix = os.path.join(
        base_dir,
        "F-Ptua/output/02.20-F-Ptua-RNAseq-alignment-HiSat2",
        "ptua-gene_count_matrix.csv"
    )
    
    # Output directory
    output_dir = os.path.join(
        base_dir,
        "M-multi-species/output/26-rank35-optimization/lambda_gene_0.2/top_genes_per_component"
    )
    
    # Verify input files exist
    for filepath in [component_file, ortholog_file, apul_matrix, peve_matrix, ptua_matrix]:
        if not os.path.exists(filepath):
            print(f"Error: Input file not found: {filepath}")
            sys.exit(1)
    
    print("=== Generating Component 24 Count Matrices ===\n")
    
    # Step 1: Read OG_IDs from component file
    og_ids = read_component_og_ids(component_file)
    
    # Step 2: Map OG_IDs to species-specific gene IDs
    gene_mapping = map_og_to_species_genes(ortholog_file, og_ids)
    
    # Step 3: Filter count matrices for each species
    # Note: 
    #   - apul IDs in ortholog file have "-T1" suffix but count matrix doesn't
    #   - peve and ptua have "gene-" prefix in count matrix
    results = {}
    
    results['apul'] = filter_count_matrix(
        apul_matrix,
        gene_mapping['apul'],
        os.path.join(output_dir, "Component_24_apul_count_matrix.csv"),
        gene_prefix="",
        strip_suffix="-T1"
    )
    
    results['peve'] = filter_count_matrix(
        peve_matrix,
        gene_mapping['peve'],
        os.path.join(output_dir, "Component_24_peve_count_matrix.csv"),
        gene_prefix="gene-"
    )
    
    results['ptua'] = filter_count_matrix(
        ptua_matrix,
        gene_mapping['ptua'],
        os.path.join(output_dir, "Component_24_ptua_count_matrix.csv"),
        gene_prefix="gene-"
    )
    
    print("\n=== Summary ===")
    print(f"Total OG_IDs in Component_24: {len(og_ids)}")
    print(f"Genes found in count matrices:")
    print(f"  apul: {results['apul']}")
    print(f"  peve: {results['peve']}")
    print(f"  ptua: {results['ptua']}")
    print("\nFiltered count matrices saved successfully!")


if __name__ == "__main__":
    main()