#!/usr/bin/env python3 """ Cross-Species Biomin Analysis and Comparison This script: 1. Runs context_dependent_analysis.py on all three species in full-species-biomin 2. Extracts OG_XXXXX orthologous group IDs from results 3. Compares context-dependent interactions across species to identify: - Conserved regulatory patterns (same OG in multiple species) - Species-specific regulatory patterns - Cross-species correlation of effect sizes Usage: python code/run_biomin_species_comparison.py [--skip-analysis] [--output-dir PATH] Options: --skip-analysis: Skip running the per-species analysis (use existing output) --output-dir: Directory for comparison results (default: output/biomin_comparison_TIMESTAMP) """ import os import sys import subprocess import argparse import re from datetime import datetime from typing import Dict, List, Set, Tuple, Optional from collections import defaultdict import pandas as pd import numpy as np import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import seaborn as sns from scipy.stats import pearsonr, spearmanr # Species configuration SPECIES_CONFIG = { 'apul': { 'name': 'A. pulchra', 'data_dir': 'data/full-species-biomin/cleaned_apul', 'color': '#E74C3C' # Red }, 'peve': { 'name': 'P. evermanni', 'data_dir': 'data/full-species-biomin/cleaned_peve', 'color': '#3498DB' # Blue }, 'ptua': { 'name': 'P. tuahiniensis', 'data_dir': 'data/full-species-biomin/cleaned_ptua', 'color': '#2ECC71' # Green } } def get_workspace_root() -> str: """Get workspace root directory.""" return os.path.dirname(os.path.dirname(os.path.abspath(__file__))) def extract_og_id(gene_id: str) -> Optional[str]: """Extract OG_XXXXX ID from gene identifier. Handles formats like: - "('OG_13910', 'FUN_002435')" - "OG_13910" - "OG_13910_FUN_002435" """ if pd.isna(gene_id): return None gene_id = str(gene_id) # Pattern to match OG_XXXXX (digits after OG_) match = re.search(r'OG_(\d+)', gene_id) if match: return f"OG_{match.group(1)}" return None def run_species_analysis(species_key: str, workspace_root: str) -> Optional[str]: """Run context_dependent_analysis.py for a single species. Returns the output directory path if successful, None otherwise. """ config = SPECIES_CONFIG[species_key] data_dir = os.path.join(workspace_root, config['data_dir']) if not os.path.exists(data_dir): print(f" ❌ Data directory not found: {data_dir}") return None print(f"\n{'='*60}") print(f"Running analysis for {config['name']} ({species_key})") print(f"{'='*60}") # Run the analysis script script_path = os.path.join(workspace_root, 'code', 'context_dependent_analysis.py') cmd = [ sys.executable, script_path, '--data-dir', data_dir, '--n-jobs', '8' # Use reasonable parallelism ] try: result = subprocess.run(cmd, capture_output=True, text=True, cwd=workspace_root) if result.returncode != 0: print(f" ❌ Analysis failed for {species_key}") print(f" stderr: {result.stderr[:500]}") return None # Parse output directory from stdout for line in result.stdout.split('\n'): if 'Output directory:' in line: output_dir = line.split('Output directory:')[-1].strip() print(f" ✅ Analysis complete: {output_dir}") return output_dir # Fallback: find most recent output directory output_base = os.path.join(workspace_root, 'output') dirs = [d for d in os.listdir(output_base) if d.startswith('context_dependent_analysis_')] if dirs: latest = sorted(dirs)[-1] output_dir = os.path.join(output_base, latest) print(f" ✅ Analysis complete: {output_dir}") return output_dir except Exception as e: print(f" ❌ Error running analysis for {species_key}: {e}") return None return None def find_latest_output_dir(species_key: str, workspace_root: str) -> Optional[str]: """Find the most recent output directory for a species. Looks for output directories and tries to match by data patterns. """ output_base = os.path.join(workspace_root, 'output') if not os.path.exists(output_base): return None # Get all context_dependent_analysis directories dirs = [d for d in os.listdir(output_base) if d.startswith('context_dependent_analysis_') and os.path.isdir(os.path.join(output_base, d))] if not dirs: return None # Sort by timestamp (newest first) dirs = sorted(dirs, reverse=True) # Return the latest one for now # In a production setting, you might want to track which dir corresponds to which species return os.path.join(output_base, dirs[0]) def load_species_results(output_dir: str, species_key: str) -> Dict[str, pd.DataFrame]: """Load analysis results for a species.""" results = {} tables_dir = os.path.join(output_dir, 'tables') if not os.path.exists(tables_dir): print(f" ⚠️ Tables directory not found: {tables_dir}") return results # Load main context results files_to_load = [ 'methylation_mirna_context.csv', 'lncrna_mirna_context.csv', 'multi_way_interactions.csv' ] for filename in files_to_load: filepath = os.path.join(tables_dir, filename) if os.path.exists(filepath): try: df = pd.read_csv(filepath) # Extract OG IDs from target column if 'target' in df.columns: df['og_id'] = df['target'].apply(extract_og_id) elif 'gene' in df.columns: df['og_id'] = df['gene'].apply(extract_og_id) df['species'] = species_key results[filename.replace('.csv', '')] = df print(f" ✅ Loaded {filename}: {len(df)} rows") except Exception as e: print(f" ⚠️ Error loading {filename}: {e}") return results def compare_methylation_mirna_context( species_results: Dict[str, Dict[str, pd.DataFrame]], output_dir: str ) -> pd.DataFrame: """Compare methylation-miRNA context results across species. Identifies OGs with context-dependent regulation in multiple species. """ print("\n" + "="*60) print("COMPARING METHYLATION-MIRNA CONTEXT ACROSS SPECIES") print("="*60) # Collect all results all_results = [] og_by_species: Dict[str, Dict[str, pd.DataFrame]] = defaultdict(dict) for species_key, results in species_results.items(): if 'methylation_mirna_context' in results: df = results['methylation_mirna_context'] all_results.append(df) # Group by OG ID for this species for og_id, group in df.groupby('og_id'): if og_id and pd.notna(og_id): # Filter to rows with valid context_strength valid_group = group.dropna(subset=['context_strength']) if not valid_group.empty: # Take the interaction with highest context_strength for this OG best_idx = valid_group['context_strength'].idxmax() og_by_species[og_id][species_key] = valid_group.loc[best_idx] if not all_results: print(" ⚠️ No methylation-miRNA context results found") return pd.DataFrame() # Find OGs present in multiple species multi_species_ogs = {og: species for og, species in og_by_species.items() if len(species) >= 2} print(f"\n📊 Summary:") print(f" Total unique OGs across all species: {len(og_by_species)}") print(f" OGs in 2+ species: {len([og for og, sp in og_by_species.items() if len(sp) >= 2])}") print(f" OGs in all 3 species: {len([og for og, sp in og_by_species.items() if len(sp) == 3])}") # Create comparison dataframe comparison_rows = [] for og_id, species_data in og_by_species.items(): row = {'og_id': og_id, 'n_species': len(species_data)} for species_key in SPECIES_CONFIG.keys(): if species_key in species_data: data = species_data[species_key] row[f'{species_key}_context_strength'] = data.get('context_strength', np.nan) row[f'{species_key}_context_dependent'] = data.get('context_dependent', False) row[f'{species_key}_improvement'] = data.get('improvement_from_interaction', np.nan) row[f'{species_key}_r2'] = data.get('r2_with_interaction', np.nan) else: row[f'{species_key}_context_strength'] = np.nan row[f'{species_key}_context_dependent'] = np.nan row[f'{species_key}_improvement'] = np.nan row[f'{species_key}_r2'] = np.nan comparison_rows.append(row) comparison_df = pd.DataFrame(comparison_rows) # Sort by number of species and average context strength if not comparison_df.empty: strength_cols = [f'{sp}_context_strength' for sp in SPECIES_CONFIG.keys()] comparison_df['mean_context_strength'] = comparison_df[strength_cols].mean(axis=1) comparison_df = comparison_df.sort_values( ['n_species', 'mean_context_strength'], ascending=[False, False] ) # Save comparison table comparison_path = os.path.join(output_dir, 'cross_species_methylation_mirna_comparison.csv') comparison_df.to_csv(comparison_path, index=False) print(f"\n✅ Saved comparison table: {comparison_path}") # Print top conserved OGs print("\n🏆 Top 15 Conserved Context-Dependent OGs (in 2+ species):") top_conserved = comparison_df[comparison_df['n_species'] >= 2].head(15) print(top_conserved[['og_id', 'n_species', 'mean_context_strength'] + [f'{sp}_context_strength' for sp in SPECIES_CONFIG.keys()]].to_string(index=False)) return comparison_df def plot_species_comparison( comparison_df: pd.DataFrame, species_results: Dict[str, Dict[str, pd.DataFrame]], output_dir: str ) -> None: """Generate comparison visualizations.""" plots_dir = os.path.join(output_dir, 'plots') os.makedirs(plots_dir, exist_ok=True) # 1. Venn-style bar chart of OG overlaps fig, axes = plt.subplots(2, 2, figsize=(14, 12)) # OG counts by species species_keys = list(SPECIES_CONFIG.keys()) og_counts = {sp: set() for sp in species_keys} for species_key, results in species_results.items(): if 'methylation_mirna_context' in results: df = results['methylation_mirna_context'] og_counts[species_key] = set(df['og_id'].dropna().unique()) # Bar chart of total OGs per species counts = [len(og_counts[sp]) for sp in species_keys] colors = [SPECIES_CONFIG[sp]['color'] for sp in species_keys] names = [SPECIES_CONFIG[sp]['name'] for sp in species_keys] axes[0, 0].bar(names, counts, color=colors, alpha=0.7, edgecolor='black') axes[0, 0].set_ylabel('Number of OGs with Context-Dependent Regulation') axes[0, 0].set_title('OGs per Species') for i, (name, count) in enumerate(zip(names, counts)): axes[0, 0].text(i, count + 5, str(count), ha='center', fontweight='bold') # Overlap statistics all_ogs = set.union(*og_counts.values()) if og_counts.values() else set() ogs_in_2plus = set() ogs_in_3 = set() for og in all_ogs: n_species = sum(1 for sp in species_keys if og in og_counts[sp]) if n_species >= 2: ogs_in_2plus.add(og) if n_species == 3: ogs_in_3.add(og) overlap_labels = ['In 1 species only', 'In 2+ species', 'In all 3 species'] overlap_counts = [ len(all_ogs) - len(ogs_in_2plus), len(ogs_in_2plus) - len(ogs_in_3), len(ogs_in_3) ] axes[0, 1].bar(overlap_labels, overlap_counts, color=['#95A5A6', '#F39C12', '#27AE60'], alpha=0.7, edgecolor='black') axes[0, 1].set_ylabel('Number of OGs') axes[0, 1].set_title('OG Conservation Across Species') for i, count in enumerate(overlap_counts): axes[0, 1].text(i, count + 2, str(count), ha='center', fontweight='bold') # 2. Scatter plot comparing context strength between species pairs if not comparison_df.empty and len(species_keys) >= 2: # Compare first two species sp1, sp2 = species_keys[0], species_keys[1] x = comparison_df[f'{sp1}_context_strength'].dropna() y = comparison_df[f'{sp2}_context_strength'].dropna() common_idx = x.index.intersection(y.index) if len(common_idx) > 5: x_common = x.loc[common_idx] y_common = y.loc[common_idx] axes[1, 0].scatter(x_common, y_common, alpha=0.5, s=20, c='#3498DB') axes[1, 0].set_xlabel(f'{SPECIES_CONFIG[sp1]["name"]} Context Strength') axes[1, 0].set_ylabel(f'{SPECIES_CONFIG[sp2]["name"]} Context Strength') axes[1, 0].set_title(f'Context Strength: {SPECIES_CONFIG[sp1]["name"]} vs {SPECIES_CONFIG[sp2]["name"]}') # Add correlation corr, pval = spearmanr(x_common, y_common) axes[1, 0].text(0.05, 0.95, f'ρ = {corr:.3f}\np = {pval:.2e}', transform=axes[1, 0].transAxes, fontsize=10, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat')) # Add diagonal line lims = [min(x_common.min(), y_common.min()), max(x_common.max(), y_common.max())] axes[1, 0].plot(lims, lims, 'r--', alpha=0.5, label='y=x') else: axes[1, 0].text(0.5, 0.5, 'Insufficient overlapping data', ha='center', va='center', transform=axes[1, 0].transAxes) # 3. Distribution of context strength by species strength_data = [] for species_key, results in species_results.items(): if 'methylation_mirna_context' in results: df = results['methylation_mirna_context'] if 'context_strength' in df.columns: for val in df['context_strength'].dropna(): strength_data.append({ 'species': SPECIES_CONFIG[species_key]['name'], 'context_strength': val }) if strength_data: strength_df = pd.DataFrame(strength_data) species_order = [SPECIES_CONFIG[sp]['name'] for sp in species_keys] palette = {SPECIES_CONFIG[sp]['name']: SPECIES_CONFIG[sp]['color'] for sp in species_keys} sns.violinplot(data=strength_df, x='species', y='context_strength', order=species_order, palette=palette, ax=axes[1, 1]) axes[1, 1].set_xlabel('Species') axes[1, 1].set_ylabel('Context Strength') axes[1, 1].set_title('Distribution of Context Strength by Species') plt.tight_layout() plt.savefig(os.path.join(plots_dir, 'species_comparison_overview.png'), dpi=300, bbox_inches='tight') plt.close() print(f"✅ Saved comparison plots: {plots_dir}/species_comparison_overview.png") def generate_comparison_report( comparison_df: pd.DataFrame, species_results: Dict[str, Dict[str, pd.DataFrame]], output_dir: str ) -> None: """Generate markdown report summarizing cross-species comparison.""" report_path = os.path.join(output_dir, 'cross_species_comparison_report.md') with open(report_path, 'w') as f: f.write("# Cross-Species Biomin Context-Dependent Analysis Comparison\n\n") f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") f.write("## Overview\n\n") f.write("This report compares context-dependent regulatory interactions across three coral species:\n\n") for species_key, config in SPECIES_CONFIG.items(): f.write(f"- **{config['name']}** (`{species_key}`)\n") f.write("\n") f.write("## Per-Species Summary\n\n") for species_key, results in species_results.items(): config = SPECIES_CONFIG[species_key] f.write(f"### {config['name']} ({species_key})\n\n") if 'methylation_mirna_context' in results: df = results['methylation_mirna_context'] f.write(f"- Total methylation-miRNA interactions: {len(df)}\n") if 'context_dependent' in df.columns: f.write(f"- Context-dependent interactions: {df['context_dependent'].sum()}\n") if 'context_strength' in df.columns: f.write(f"- Mean context strength: {df['context_strength'].mean():.3f}\n") f.write(f"- Unique OGs: {df['og_id'].nunique()}\n") else: f.write("- No methylation-miRNA context results\n") f.write("\n") f.write("## Cross-Species Conservation\n\n") if not comparison_df.empty: n_total = len(comparison_df) n_2plus = len(comparison_df[comparison_df['n_species'] >= 2]) n_all3 = len(comparison_df[comparison_df['n_species'] == 3]) f.write(f"- **Total unique OGs:** {n_total}\n") f.write(f"- **OGs in 2+ species:** {n_2plus} ({100*n_2plus/n_total:.1f}%)\n") f.write(f"- **OGs in all 3 species:** {n_all3} ({100*n_all3/n_total:.1f}%)\n\n") f.write("### Top Conserved OGs (Context-Dependent in Multiple Species)\n\n") top_conserved = comparison_df[comparison_df['n_species'] >= 2].head(20) if not top_conserved.empty: f.write("| OG ID | # Species | Mean Strength | A. pulchra | P. evermanni | P. tuahiniensis |\n") f.write("|-------|-----------|---------------|------------|--------------|------------------|\n") for _, row in top_conserved.iterrows(): f.write(f"| {row['og_id']} | {row['n_species']} | {row['mean_context_strength']:.3f} | ") f.write(f"{row.get('apul_context_strength', np.nan):.3f} | ") f.write(f"{row.get('peve_context_strength', np.nan):.3f} | ") f.write(f"{row.get('ptua_context_strength', np.nan):.3f} |\n") f.write("\n") f.write("## Interpretation\n\n") f.write("OGs (orthologous groups) that show context-dependent regulation in multiple species suggest:\n\n") f.write("1. **Conserved regulatory mechanisms** - These genes may have fundamental roles where regulation is evolutionarily maintained\n") f.write("2. **Robust biological signals** - Conservation across species reduces the likelihood of false positives\n") f.write("3. **Candidates for functional validation** - These genes are high-priority targets for experimental follow-up\n\n") f.write("## Files Generated\n\n") f.write(f"- `cross_species_methylation_mirna_comparison.csv` - Full comparison table\n") f.write(f"- `plots/species_comparison_overview.png` - Visualization of cross-species patterns\n") f.write(f"- `cross_species_comparison_report.md` - This report\n") print(f"✅ Saved comparison report: {report_path}") def main(): parser = argparse.ArgumentParser( description="Run context-dependent analysis on all biomin species and compare results" ) parser.add_argument( '--skip-analysis', action='store_true', help='Skip running per-species analysis (use most recent existing outputs)' ) parser.add_argument( '--output-dir', type=str, default=None, help='Output directory for comparison results' ) parser.add_argument( '--species-outputs', type=str, nargs=3, metavar=('APUL_DIR', 'PEVE_DIR', 'PTUA_DIR'), help='Specify existing output directories for each species' ) args = parser.parse_args() workspace_root = get_workspace_root() # Create comparison output directory if args.output_dir: comparison_output = args.output_dir else: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") comparison_output = os.path.join(workspace_root, f"output/biomin_comparison_{timestamp}") os.makedirs(comparison_output, exist_ok=True) print("="*70) print("🧬 CROSS-SPECIES BIOMIN CONTEXT-DEPENDENT ANALYSIS") print("="*70) print(f"Workspace: {workspace_root}") print(f"Comparison output: {comparison_output}") print("="*70) # Track output directories for each species species_output_dirs = {} if args.species_outputs: # Use provided output directories species_output_dirs = { 'apul': args.species_outputs[0], 'peve': args.species_outputs[1], 'ptua': args.species_outputs[2] } elif args.skip_analysis: # Find most recent outputs (user will need to specify or we use latest) print("\n⚠️ --skip-analysis specified. Looking for existing output directories...") print(" For best results, use --species-outputs to specify exact directories.\n") # This is a simplified approach - in practice you'd want better tracking output_base = os.path.join(workspace_root, 'output') dirs = sorted([d for d in os.listdir(output_base) if d.startswith('context_dependent_analysis_')], reverse=True) if len(dirs) >= 3: # Assume last 3 are for our species (user should verify) for i, species in enumerate(['apul', 'peve', 'ptua']): if i < len(dirs): species_output_dirs[species] = os.path.join(output_base, dirs[i]) print(f" Using {dirs[i]} for {species}") else: print(" ❌ Not enough existing output directories found. Run without --skip-analysis.") return 1 else: # Run analysis for each species for species_key in SPECIES_CONFIG.keys(): output_dir = run_species_analysis(species_key, workspace_root) if output_dir: species_output_dirs[species_key] = output_dir else: print(f" ⚠️ Skipping {species_key} due to analysis failure") if len(species_output_dirs) < 2: print("\n❌ Need at least 2 species with successful analysis for comparison") return 1 # Load results for each species print("\n" + "="*60) print("LOADING SPECIES RESULTS") print("="*60) species_results = {} for species_key, output_dir in species_output_dirs.items(): print(f"\n📂 Loading {SPECIES_CONFIG[species_key]['name']}...") species_results[species_key] = load_species_results(output_dir, species_key) # Compare results across species comparison_df = compare_methylation_mirna_context(species_results, comparison_output) # Generate visualizations if not comparison_df.empty: print("\n" + "="*60) print("GENERATING VISUALIZATIONS") print("="*60) plot_species_comparison(comparison_df, species_results, comparison_output) # Generate report generate_comparison_report(comparison_df, species_results, comparison_output) print("\n" + "="*70) print("🎉 CROSS-SPECIES COMPARISON COMPLETE!") print("="*70) print(f"Results saved to: {comparison_output}") print("="*70) return 0 if __name__ == "__main__": sys.exit(main())