#!/usr/bin/env python3
"""
Analysis of miRNA, lncRNA, and DNA methylation influence on gene expression.
This script analyzes the regulatory effects of multiple molecular layers on gene expression.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

def load_and_preprocess_data():
    """Load and preprocess all data matrices."""
    print("Loading data matrices...")
    
    # Load cleaned data files
    gene_data = pd.read_csv('../data/gene_counts_cleaned.csv')
    mirna_data = pd.read_csv('../data/mirna_counts_cleaned.csv')
    lncrna_data = pd.read_csv('../data/lncrna_counts_cleaned.csv')
    methylation_data = pd.read_csv('../data/methylation_counts_cleaned.csv')
    
    return gene_data, mirna_data, lncrna_data, methylation_data

def clean_gene_data(gene_data):
    """Clean and prepare gene expression data."""
    print("Processing gene expression data...")
    
    # Set gene_id as index
    gene_data.set_index('gene_id', inplace=True)
    
    # Log2 transform (add 1 to avoid log(0))
    gene_data_log = np.log2(gene_data + 1)
    
    return gene_data, gene_data_log

def clean_mirna_data(mirna_data):
    """Clean and prepare miRNA data."""
    print("Processing miRNA data...")
    
    # Set miRNA name as index
    mirna_data.set_index('Name', inplace=True)
    
    # Log2 transform
    mirna_data_log = np.log2(mirna_data + 1)
    
    return mirna_data, mirna_data_log

def clean_lncrna_data(lncrna_data):
    """Clean and prepare lncRNA data."""
    print("Processing lncRNA data...")
    
    # Set Geneid as index
    lncrna_data.set_index('Geneid', inplace=True)
    
    # Log2 transform
    lncrna_counts_log = np.log2(lncrna_data + 1)
    
    return lncrna_data, lncrna_counts_log

def clean_methylation_data(methylation_data):
    """Clean and prepare DNA methylation data."""
    print("Processing DNA methylation data...")
    
    # Set CpG as index
    methylation_data.set_index('CpG', inplace=True)
    
    return methylation_data

def standardize_sample_names(data_dict):
    """Standardize sample names across all datasets."""
    print("Standardizing sample names...")
    
    # All datasets now have consistent sample names
    # Just return a simple mapping
    gene_samples = list(data_dict['gene'].columns)
    sample_mapping = {sample: sample for sample in gene_samples}
    
    return sample_mapping

def select_subset_genes(gene_data_log, n_genes=20):
    """Select a subset of genes for analysis."""
    print(f"Selecting {n_genes} genes for analysis...")
    
    # Select genes with highest variance (most variable)
    gene_variance = gene_data_log.var(axis=1)
    top_genes = gene_variance.nlargest(n_genes).index.tolist()
    
    print(f"Selected genes: {top_genes[:5]}... (showing first 5)")
    
    return top_genes

def create_integrated_dataset(gene_data_log, mirna_data_log, lncrna_data_log, 
                            methylation_data, sample_mapping, selected_genes):
    """Create integrated dataset with all molecular layers."""
    print("Creating integrated dataset...")
    
    # Get standardized sample names
    std_samples = list(set(sample_mapping.values()))
    std_samples.sort()
    
    # Initialize integrated dataset
    integrated_data = {}
    
    # Add gene expression data
    for gene in selected_genes:
        if gene in gene_data_log.index:
            gene_expr = gene_data_log.loc[gene]
            # Map to standardized sample names
            mapped_expr = {}
            for orig_sample, std_sample in sample_mapping.items():
                if orig_sample in gene_expr.index:
                    mapped_expr[std_sample] = gene_expr[orig_sample]
            
            integrated_data[f"gene_{gene}"] = mapped_expr
    
    # Add miRNA data (average across all miRNAs per sample)
    mirna_avg = mirna_data_log.mean(axis=0)
    for sample in std_samples:
        if sample in mirna_avg.index:
            integrated_data[f"mirna_avg"] = mirna_avg.to_dict()
    
    # Add lncRNA data (average across all lncRNAs per sample)
    lncrna_avg = lncrna_data_log.mean(axis=0)
    for sample in std_samples:
        if sample in lncrna_avg.index:
            integrated_data[f"lncrna_avg"] = lncrna_avg.to_dict()
    
    # Add methylation data (average across all CpGs per sample)
    methylation_avg = methylation_data.mean(axis=0)
    for sample in std_samples:
        if sample in methylation_avg.index:
            integrated_data[f"methylation_avg"] = methylation_avg.to_dict()
    
    # Convert to DataFrame
    integrated_df = pd.DataFrame(integrated_data)
    
    # Transpose to have samples as rows
    integrated_df = integrated_df.T
    
    return integrated_df

def analyze_regulatory_influence(integrated_df):
    """Analyze the influence of regulatory factors on gene expression."""
    print("Analyzing regulatory influence...")
    
    # Separate target variables (genes) and features (regulatory factors)
    gene_cols = [col for col in integrated_df.columns if col.startswith('gene_')]
    feature_cols = [col for col in integrated_df.columns if not col.startswith('gene_')]
    
    results = {}
    
    for gene_col in gene_cols:
        print(f"Analyzing {gene_col}...")
        
        # Prepare data for this gene
        y = integrated_df.loc[gene_col].dropna()
        X = integrated_df.loc[feature_cols, y.index].T
        
        if len(y) > 0 and len(X) > 0:
            # Standardize features
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)
            
            # Multiple linear regression
            try:
                lr_model = LinearRegression()
                lr_scores = cross_val_score(lr_model, X_scaled, y, cv=3, scoring='r2')
                lr_r2 = np.mean(lr_scores)
                
                # Ridge regression
                ridge_model = Ridge(alpha=1.0)
                ridge_scores = cross_val_score(ridge_model, X_scaled, y, cv=3, scoring='r2')
                ridge_r2 = np.mean(ridge_scores)
                
                # Lasso regression
                lasso_model = Lasso(alpha=0.1)
                lasso_scores = cross_val_score(lasso_model, X_scaled, y, cv=3, scoring='r2')
                lasso_r2 = np.mean(lasso_scores)
                
                # Random Forest
                rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
                rf_scores = cross_val_score(rf_model, X_scaled, y, cv=3, scoring='r2')
                rf_r2 = np.mean(rf_scores)
                
                results[gene_col] = {
                    'linear_r2': lr_r2,
                    'ridge_r2': ridge_r2,
                    'lasso_r2': lasso_r2,
                    'random_forest_r2': rf_r2,
                    'mean_r2': np.mean([lr_r2, ridge_r2, lasso_r2, rf_r2])
                }
                
            except Exception as e:
                print(f"Error analyzing {gene_col}: {e}")
                results[gene_col] = {
                    'linear_r2': np.nan,
                    'ridge_r2': np.nan,
                    'lasso_r2': np.nan,
                    'random_forest_r2': np.nan,
                    'mean_r2': np.nan
                }
    
    return results

def create_visualizations(integrated_df, results, output_dir):
    """Create comprehensive visualizations."""
    print("Creating visualizations...")
    
    # 1. Overall regulatory influence heatmap
    plt.figure(figsize=(12, 8))
    
    # Prepare data for heatmap
    gene_cols = [col for col in integrated_df.index if col.startswith('gene_')]
    feature_cols = [col for col in integrated_df.index if not col.startswith('gene_')]
    
    heatmap_data = []
    for gene in gene_cols:
        if gene in results:
            row = [
                results[gene]['linear_r2'],
                results[gene]['ridge_r2'],
                results[gene]['lasso_r2'],
                results[gene]['random_forest_r2']
            ]
            heatmap_data.append(row)
    
    if heatmap_data:
        heatmap_df = pd.DataFrame(heatmap_data, 
                                index=gene_cols,
                                columns=['Linear', 'Ridge', 'Lasso', 'Random Forest'])
        
        sns.heatmap(heatmap_df, annot=True, cmap='RdYlBu_r', center=0,
                   fmt='.3f', cbar_kws={'label': 'R² Score'})
        plt.title('Regulatory Influence Analysis: Model Performance by Gene')
        plt.xlabel('Regression Model')
        plt.ylabel('Gene')
        plt.tight_layout()
        plt.savefig(f'{output_dir}/regulatory_influence_heatmap.png', dpi=300, bbox_inches='tight')
        plt.close()
    
    # 2. Distribution of R² scores
    plt.figure(figsize=(10, 6))
    
    r2_scores = []
    for gene in results:
        r2_scores.append(results[gene]['mean_r2'])
    
    plt.hist(r2_scores, bins=20, alpha=0.7, edgecolor='black')
    plt.xlabel('Mean R² Score')
    plt.ylabel('Number of Genes')
    plt.title('Distribution of Regulatory Influence Scores')
    plt.axvline(np.mean(r2_scores), color='red', linestyle='--', 
                label=f'Mean: {np.mean(r2_scores):.3f}')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'{output_dir}/r2_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 3. Model comparison
    plt.figure(figsize=(10, 6))
    
    model_scores = {
        'Linear': [results[gene]['linear_r2'] for gene in results if not np.isnan(results[gene]['linear_r2'])],
        'Ridge': [results[gene]['ridge_r2'] for gene in results if not np.isnan(results[gene]['ridge_r2'])],
        'Lasso': [results[gene]['lasso_r2'] for gene in results if not np.isnan(results[gene]['lasso_r2'])],
        'Random Forest': [results[gene]['random_forest_r2'] for gene in results if not np.isnan(results[gene]['random_forest_r2'])]
    }
    
    plt.boxplot(model_scores.values(), labels=model_scores.keys())
    plt.ylabel('R² Score')
    plt.title('Model Performance Comparison')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/model_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 4. Sample correlation heatmap
    plt.figure(figsize=(12, 10))
    
    # Get sample correlation matrix
    sample_corr = integrated_df.T.corr()
    
    sns.heatmap(sample_corr, cmap='coolwarm', center=0, 
               square=True, cbar_kws={'label': 'Correlation'})
    plt.title('Sample Correlation Matrix')
    plt.tight_layout()
    plt.savefig(f'{output_dir}/sample_correlation.png', dpi=300, bbox_inches='tight')
    plt.close()

def save_results(results, integrated_df, output_dir):
    """Save analysis results to files."""
    print("Saving results...")
    
    # Save results summary
    results_df = pd.DataFrame(results).T
    results_df.to_csv(f'{output_dir}/regulatory_influence_results.csv')
    
    # Save integrated dataset
    integrated_df.to_csv(f'{output_dir}/integrated_dataset.csv')
    
    # Create summary statistics
    summary_stats = {
        'total_genes_analyzed': len(results),
        'mean_linear_r2': np.mean([results[gene]['linear_r2'] for gene in results if not np.isnan(results[gene]['linear_r2'])]),
        'mean_ridge_r2': np.mean([results[gene]['ridge_r2'] for gene in results if not np.isnan(results[gene]['ridge_r2'])]),
        'mean_lasso_r2': np.mean([results[gene]['lasso_r2'] for gene in results if not np.isnan(results[gene]['lasso_r2'])]),
        'mean_rf_r2': np.mean([results[gene]['random_forest_r2'] for gene in results if not np.isnan(results[gene]['random_forest_r2'])]),
        'best_performing_model': max(['Linear', 'Ridge', 'Lasso', 'Random Forest'], 
                                   key=lambda x: np.mean([results[gene][f'{x.lower().replace(" ", "_")}_r2'] 
                                                        for gene in results if not np.isnan(results[gene][f'{x.lower().replace(" ", "_")}_r2'])])),
        'genes_with_high_influence': len([gene for gene in results if results[gene]['mean_r2'] > 0.5])
    }
    
    # Save summary
    with open(f'{output_dir}/analysis_summary.txt', 'w') as f:
        f.write("GENE REGULATION ANALYSIS SUMMARY\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Total genes analyzed: {summary_stats['total_genes_analyzed']}\n")
        f.write(f"Mean Linear R²: {summary_stats['mean_linear_r2']:.3f}\n")
        f.write(f"Mean Ridge R²: {summary_stats['mean_ridge_r2']:.3f}\n")
        f.write(f"Mean Lasso R²: {summary_stats['mean_lasso_r2']:.3f}\n")
        f.write(f"Mean Random Forest R²: {summary_stats['mean_rf_r2']:.3f}\n")
        f.write(f"Best performing model: {summary_stats['best_performing_model']}\n")
        f.write(f"Genes with high regulatory influence (R² > 0.5): {summary_stats['genes_with_high_influence']}\n")
        
        f.write("\nTOP 5 GENES BY REGULATORY INFLUENCE:\n")
        f.write("-" * 40 + "\n")
        sorted_genes = sorted(results.items(), key=lambda x: x[1]['mean_r2'], reverse=True)
        for i, (gene, scores) in enumerate(sorted_genes[:5]):
            f.write(f"{i+1}. {gene}: Mean R² = {scores['mean_r2']:.3f}\n")
    
    print(f"Results saved to {output_dir}/")

def main():
    """Main analysis pipeline."""
    print("Starting Gene Regulation Analysis Pipeline")
    print("=" * 50)
    
    # Create output directory
    import os
    output_dir = 'output'
    os.makedirs(output_dir, exist_ok=True)
    
    try:
        # Load data
        gene_data, mirna_data, lncrna_data, methylation_data = load_and_preprocess_data()
        
        # Clean data
        gene_data, gene_data_log = clean_gene_data(gene_data)
        mirna_data, mirna_data_log = clean_mirna_data(mirna_data)
        lncrna_data, lncrna_data_log = clean_lncrna_data(lncrna_data)
        methylation_data = clean_methylation_data(methylation_data)
        
        # Standardize sample names
        sample_mapping = standardize_sample_names({
            'gene': gene_data,
            'mirna': mirna_data,
            'lncrna': lncrna_data,
            'methylation': methylation_data
        })
        
        # Select subset of genes
        selected_genes = select_subset_genes(gene_data_log, n_genes=20)
        
        # Create integrated dataset
        integrated_df = create_integrated_dataset(
            gene_data_log, mirna_data_log, lncrna_data_log, 
            methylation_data, sample_mapping, selected_genes
        )
        
        # Analyze regulatory influence
        results = analyze_regulatory_influence(integrated_df)
        
        # Create visualizations
        create_visualizations(integrated_df, results, output_dir)
        
        # Save results
        save_results(results, integrated_df, output_dir)
        
        print("\nAnalysis completed successfully!")
        print(f"Results saved to {output_dir}/")
        
    except Exception as e:
        print(f"Error in analysis pipeline: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()