#!/usr/bin/env python3
"""
12-pav.py - Present-Absent Variation (PAV) Analysis

This script identifies genomic regions that are present or absent in each
sample relative to the reference genome (GCF_016432855.1_SaNama_1.0).

PAV analysis uses coverage depth from aligned PacBio HiFi reads to identify:
1. Absent regions: Genomic regions with zero/very low coverage (potential deletions)
2. Present regions: Genomic regions with sufficient coverage
3. Insertions: Novel sequences in samples not in reference (from CIGAR analysis)
4. Deletions: Sequences in reference absent from samples (from CIGAR analysis)

Author: Generated for project-lake-trout
Date: 2025-12-07
"""

import gzip
import re
import subprocess
from pathlib import Path

import pandas as pd

# =============================================================================
# Configuration
# =============================================================================

# Define samples by subspecies
# LEAN: bc2041, bc2069, bc2070, bc2068
# SISCOWET: bc2071, bc2073, bc2072, bc2096
SAMPLES_LEAN = ["bc2041", "bc2069", "bc2070", "bc2068"]
SAMPLES_SISCOWET = ["bc2071", "bc2073", "bc2072", "bc2096"]
SAMPLES = SAMPLES_LEAN + SAMPLES_SISCOWET  # All 8 samples

# Paths (relative to code directory)
BASE_DIR = Path(__file__).parent.parent
BAM_DIR = BASE_DIR / "analyses" / "05-pacbio-align"
OUTPUT_DIR = BASE_DIR / "analyses" / "11-pav"
REFERENCE = BASE_DIR / "data" / "GCF_016432855.1_SaNama_1.0_genomic.fa"
GENES_BED = BASE_DIR / "data" / "20220818-snam-GCF_016432855.1_SaNama_1.0_genes.bed"

# Analysis parameters
WINDOW_SIZE = 1000   # Window size for coverage calculation
MIN_COVERAGE = 1     # Minimum coverage to consider a region "present"
MIN_INS_SIZE = 50    # Minimum insertion size to report
MIN_DEL_SIZE = 50    # Minimum deletion size to report
THREADS = 16         # Number of threads for samtools

# =============================================================================
# Helper Functions
# =============================================================================

def run_command(cmd, description=""):
    """Run a shell command and handle errors."""
    print(f"  Running: {description or cmd[:80]}...")
    try:
        result = subprocess.run(
            cmd, shell=True, check=True,
            capture_output=True, text=True
        )
        return result.stdout
    except subprocess.CalledProcessError as e:
        print(f"  ERROR: {e.stderr}")
        raise


def ensure_directory(path):
    """Create directory if it doesn't exist."""
    Path(path).mkdir(parents=True, exist_ok=True)


def file_exists_and_nonempty(filepath):
    """Check if file exists and is not empty."""
    path = Path(filepath)
    return path.exists() and path.stat().st_size > 0

# =============================================================================
# Step 1: Generate Reference Genome Index and Chromosome Sizes
# =============================================================================

def setup_reference():
    """Index reference genome and create chromosome sizes file."""
    print("\n" + "="*60)
    print("Step 1: Setting up reference genome files")
    print("="*60)

    # Index reference if not already done
    fai_file = f"{REFERENCE}.fai"
    if not Path(fai_file).exists():
        print("  Indexing reference genome...")
        run_command(f"samtools faidx {REFERENCE}", "samtools faidx")
    else:
        print("  Reference index already exists.")

    # Create chromosome sizes file
    chrom_sizes = OUTPUT_DIR / "genome.chrom.sizes"
    run_command(
        f"cut -f1,2 {fai_file} > {chrom_sizes}",
        "Creating chromosome sizes file"
    )
    print(f"  Chromosome sizes saved to: {chrom_sizes}")

    return chrom_sizes
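
# Illustrative sketch (not called by the pipeline): genome.chrom.sizes is a
# two-column TSV of sequence name and length (columns 1-2 of the .fai index),
# which is exactly the -g format bedtools makewindows expects in Step 2.
# A hypothetical reader for sanity checks might look like this:
def _read_chrom_sizes(path):
    """Load a chrom.sizes file into a {sequence_name: length} dict."""
    sizes = {}
    with open(path) as f:
        for line in f:
            name, length = line.rstrip("\n").split("\t")[:2]
            sizes[name] = int(length)
    return sizes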

# =============================================================================
# Step 2: Calculate Coverage with samtools/bedtools
# =============================================================================

def create_genome_windows(chrom_sizes_file, window_size):
    """Create genome windows BED file."""
    windows_file = OUTPUT_DIR / "genome_windows.bed"
    cmd = f"bedtools makewindows -g {chrom_sizes_file} -w {window_size} > {windows_file}"
    run_command(cmd, "Creating genome windows")
    return windows_file


def calculate_sample_coverage(sample, windows_file):
    """Calculate coverage for a single sample using bedtools."""
    bam_file = BAM_DIR / f"{sample}.bam"

    if not bam_file.exists():
        print(f"  WARNING: BAM file not found for {sample}: {bam_file}")
        return None

    # Use bedtools coverage to get mean coverage per window.
    # This is more efficient than per-base depth for large files.
    temp_file = OUTPUT_DIR / f"{sample}.regions.bed"
    cmd = f"""bedtools coverage \
        -a {windows_file} \
        -b {bam_file} \
        -mean \
        > {temp_file}"""
    run_command(cmd, f"bedtools coverage for {sample}")

    # Gzip the output to match mosdepth format
    cmd = f"gzip -f {temp_file}"
    run_command(cmd, f"Compressing coverage file for {sample}")

    return sample


def calculate_coverage():
    """Calculate coverage for all samples using samtools/bedtools."""
    print("\n" + "="*60)
    print("Step 2: Calculating coverage with bedtools")
    print("="*60)

    # First create genome windows
    chrom_sizes = OUTPUT_DIR / "genome.chrom.sizes"
    windows_file = create_genome_windows(chrom_sizes, WINDOW_SIZE)
    print(f"  Created genome windows: {windows_file}")

    for sample in SAMPLES:
        print(f"\n  Processing {sample}...")
        calculate_sample_coverage(sample, windows_file)

    print("\n  Coverage calculation complete for all samples.")
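
# Each {sample}.regions.bed.gz line is one genome window plus its mean depth
# (bedtools coverage -mean appends a single depth column to the window BED).
# Illustrative values only:
#   NC_052307.1    0       1000    14.27
#   NC_052307.1    1000    2000    0.00
# Steps 3 and 4 below classify these windows as "absent" (mean depth == 0)
# or "present" (mean depth >= MIN_COVERAGE).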

# =============================================================================
# Step 3: Identify Absent Regions (Zero Coverage)
# =============================================================================

def extract_absent_regions(sample):
    """Extract regions with zero coverage for a sample."""
    regions_file = OUTPUT_DIR / f"{sample}.regions.bed.gz"
    absent_file = OUTPUT_DIR / f"{sample}.absent_regions.bed"
    absent_merged = OUTPUT_DIR / f"{sample}.absent_regions_merged.bed"

    if not regions_file.exists():
        print(f"  WARNING: Regions file not found for {sample}")
        return 0

    # Read and filter for zero coverage.
    # bedtools coverage -mean output: chrom, start, end, mean_coverage
    absent_regions = []
    with gzip.open(regions_file, 'rt') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 4:
                chrom, start, end = parts[0], int(parts[1]), int(parts[2])
                try:
                    cov = float(parts[3])
                except ValueError:
                    continue
                if cov == 0:
                    absent_regions.append((chrom, start, end, "absent", 0, "."))

    # Write absent regions
    with open(absent_file, 'w') as f:
        for region in absent_regions:
            f.write('\t'.join(map(str, region)) + '\n')

    # Merge adjacent regions using bedtools. The windows follow the .fai
    # sequence order, so sort first: bedtools merge requires input sorted
    # by chromosome, then start position.
    if absent_regions:
        cmd = f"""sort -k1,1 -k2,2n {absent_file} | bedtools merge -i - | \
            awk -v OFS='\\t' '{{print $1, $2, $3, "absent_region", 0, "."}}' > {absent_merged}"""
        run_command(cmd, f"Merging absent regions for {sample}")
    else:
        # Create empty file
        open(absent_merged, 'w').close()

    # Count merged regions
    with open(absent_merged, 'r') as f:
        count = sum(1 for _ in f)

    return count


def identify_absent_regions():
    """Identify absent regions for all samples."""
    print("\n" + "="*60)
    print("Step 3: Identifying absent regions (zero coverage)")
    print("="*60)

    for sample in SAMPLES:
        count = extract_absent_regions(sample)
        print(f"  {sample}: {count} absent regions")

# =============================================================================
# Step 4: Identify Present Regions (With Coverage)
# =============================================================================

def extract_present_regions(sample):
    """Extract regions with coverage for a sample."""
    regions_file = OUTPUT_DIR / f"{sample}.regions.bed.gz"
    present_file = OUTPUT_DIR / f"{sample}.present_regions.bed"
    present_merged = OUTPUT_DIR / f"{sample}.present_regions_merged.bed"

    if not regions_file.exists():
        print(f"  WARNING: Regions file not found for {sample}")
        return 0

    # Read and filter for coverage >= MIN_COVERAGE.
    # bedtools coverage -mean output: chrom, start, end, mean_coverage
    present_regions = []
    with gzip.open(regions_file, 'rt') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 4:
                chrom, start, end = parts[0], int(parts[1]), int(parts[2])
                try:
                    cov = float(parts[3])
                except ValueError:
                    continue
                if cov >= MIN_COVERAGE:
                    present_regions.append((chrom, start, end, "present", cov, "."))

    # Write present regions
    with open(present_file, 'w') as f:
        for region in present_regions:
            f.write('\t'.join(map(str, region)) + '\n')

    # Merge adjacent regions (sorted first; see Step 3)
    if present_regions:
        cmd = f"""sort -k1,1 -k2,2n {present_file} | bedtools merge -i - | \
            awk -v OFS='\\t' '{{print $1, $2, $3, "present_region", 0, "."}}' > {present_merged}"""
        run_command(cmd, f"Merging present regions for {sample}")
    else:
        open(present_merged, 'w').close()

    # Count merged regions
    with open(present_merged, 'r') as f:
        count = sum(1 for _ in f)

    return count


def identify_present_regions():
    """Identify present regions for all samples."""
    print("\n" + "="*60)
    print("Step 4: Identifying present regions (with coverage)")
    print("="*60)

    for sample in SAMPLES:
        count = extract_present_regions(sample)
        print(f"  {sample}: {count} present regions")

# =============================================================================
# Step 5: Extract Insertions and Deletions from BAM Alignments
# =============================================================================

def parse_cigar_for_indels(cigar_string, ref_pos, min_ins_size, min_del_size):
    """Parse CIGAR string and extract insertions and deletions."""
    insertions = []
    deletions = []

    # Parse CIGAR: match numbers followed by operation letters
    pattern = re.compile(r'(\d+)([MIDNSHP=X])')

    current_pos = ref_pos
    for match in pattern.finditer(cigar_string):
        length = int(match.group(1))
        op = match.group(2)

        if op == 'I' and length >= min_ins_size:
            insertions.append((current_pos, current_pos + 1, length))
        elif op == 'D' and length >= min_del_size:
            deletions.append((current_pos, current_pos + length, length))

        # Advance reference position for reference-consuming operations
        if op in ['M', 'D', 'N', '=', 'X']:
            current_pos += length

    return insertions, deletions
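
# A minimal sanity check for the CIGAR parser (illustrative; not called by
# the pipeline). With the 50 bp thresholds above, a read whose CIGAR is
# 5000M75I3000M120D2000M anchored at reference position 10000 yields one
# insertion and one deletion:
def _demo_parse_cigar():
    ins, dels = parse_cigar_for_indels("5000M75I3000M120D2000M", 10000, 50, 50)
    assert ins == [(15000, 15001, 75)]    # 75 bp insertion after 5 kb of matches
    assert dels == [(18000, 18120, 120)]  # 120 bp deletion spanning 18000-18120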

def extract_indels_from_bam(sample):
    """Extract insertions and deletions from BAM file."""
    bam_file = BAM_DIR / f"{sample}.bam"
    ins_raw = OUTPUT_DIR / f"{sample}.insertions_raw.bed"
    del_raw = OUTPUT_DIR / f"{sample}.deletions_raw.bed"
    ins_file = OUTPUT_DIR / f"{sample}.insertions.bed"
    del_file = OUTPUT_DIR / f"{sample}.deletions.bed"

    if not bam_file.exists():
        print(f"  WARNING: BAM file not found for {sample}")
        return 0, 0

    print(f"  Extracting indels from {sample}...")

    insertions = []
    deletions = []

    # Stream the BAM through samtools view, excluding unmapped, secondary,
    # and supplementary alignments (-F 0x904) so each read is counted once
    cmd = f"samtools view -@ {THREADS} -F 0x904 {bam_file}"
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, text=True)

    line_count = 0
    for line in proc.stdout:
        line_count += 1
        if line_count % 100000 == 0:
            print(f"    Processed {line_count} reads...")

        parts = line.strip().split('\t')
        if len(parts) < 6:
            continue

        chrom = parts[2]
        if chrom == '*':
            continue

        try:
            # SAM POS is 1-based; convert to 0-based for BED output
            pos = int(parts[3]) - 1
        except ValueError:
            continue

        cigar = parts[5]
        ins, dels = parse_cigar_for_indels(cigar, pos, MIN_INS_SIZE, MIN_DEL_SIZE)

        for start, end, length in ins:
            insertions.append((chrom, start, end, f"insertion_{length}bp", length, "+"))
        for start, end, length in dels:
            deletions.append((chrom, start, end, f"deletion_{length}bp", length, "+"))

    proc.wait()

    # Sort and write insertions
    insertions.sort(key=lambda x: (x[0], x[1]))
    with open(ins_raw, 'w') as f:
        for ins in insertions:
            f.write('\t'.join(map(str, ins)) + '\n')

    # Sort and write deletions
    deletions.sort(key=lambda x: (x[0], x[1]))
    with open(del_raw, 'w') as f:
        for d in deletions:
            f.write('\t'.join(map(str, d)) + '\n')

    # Merge insertions
    if insertions:
        cmd = f"""bedtools merge -i {ins_raw} -c 4,5 -o count,sum | \
            awk -v OFS='\\t' '{{print $1, $2, $3, "insertion_cluster", $4, "+", $5}}' > {ins_file}"""
        run_command(cmd, f"Merging insertions for {sample}")
    else:
        open(ins_file, 'w').close()

    # Merge deletions
    if deletions:
        cmd = f"""bedtools merge -i {del_raw} -c 4,5 -o count,sum | \
            awk -v OFS='\\t' '{{print $1, $2, $3, "deletion_cluster", $4, "+", $5}}' > {del_file}"""
        run_command(cmd, f"Merging deletions for {sample}")
    else:
        open(del_file, 'w').close()

    # Count results
    ins_count = sum(1 for _ in open(ins_file)) if file_exists_and_nonempty(ins_file) else 0
    del_count = sum(1 for _ in open(del_file)) if file_exists_and_nonempty(del_file) else 0

    return ins_count, del_count


def extract_all_indels():
    """Extract insertions and deletions for all samples."""
    print("\n" + "="*60)
    print("Step 5: Extracting insertions and deletions from BAM files")
    print("="*60)

    for sample in SAMPLES:
        ins_count, del_count = extract_indels_from_bam(sample)
        print(f"  {sample}: {ins_count} insertions, {del_count} deletions")

# =============================================================================
# Step 6: Create Combined PAV Feature Files
# =============================================================================

def create_combined_pav(sample):
    """Create combined PAV feature file for a sample."""
    absent_file = OUTPUT_DIR / f"{sample}.absent_regions_merged.bed"
    ins_file = OUTPUT_DIR / f"{sample}.insertions.bed"
    del_file = OUTPUT_DIR / f"{sample}.deletions.bed"
    pav_file = OUTPUT_DIR / f"{sample}.pav_features.bed"

    features = []

    # Add absent regions
    if file_exists_and_nonempty(absent_file):
        with open(absent_file, 'r') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) >= 3:
                    chrom, start, end = parts[0], int(parts[1]), int(parts[2])
                    features.append((chrom, start, end, "ABSENT", end - start, "."))

    # Add insertions
    if file_exists_and_nonempty(ins_file):
        with open(ins_file, 'r') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) >= 6:
                    chrom, start, end = parts[0], int(parts[1]), int(parts[2])
                    score = parts[4] if len(parts) > 4 else "0"
                    strand = parts[5] if len(parts) > 5 else "."
                    features.append((chrom, start, end, "INSERTION", score, strand))

    # Add deletions
    if file_exists_and_nonempty(del_file):
        with open(del_file, 'r') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) >= 6:
                    chrom, start, end = parts[0], int(parts[1]), int(parts[2])
                    score = parts[4] if len(parts) > 4 else "0"
                    strand = parts[5] if len(parts) > 5 else "."
                    features.append((chrom, start, end, "DELETION", score, strand))

    # Sort features
    features.sort(key=lambda x: (x[0], x[1]))

    # Write combined file with header
    with open(pav_file, 'w') as f:
        f.write("#chrom\tstart\tend\tfeature_type\tscore\tstrand\n")
        for feat in features:
            f.write('\t'.join(map(str, feat)) + '\n')

    return len(features)


def create_all_combined_pav():
    """Create combined PAV files for all samples."""
    print("\n" + "="*60)
    print("Step 6: Creating combined PAV feature files")
    print("="*60)

    for sample in SAMPLES:
        count = create_combined_pav(sample)
        print(f"  {sample}: {count} total PAV features")
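
# A {sample}.pav_features.bed file interleaves the three feature classes in
# coordinate-sorted order; score is span length for ABSENT and merged read
# count for INSERTION/DELETION clusters. Illustrative values only:
#   #chrom        start   end     feature_type  score  strand
#   NC_052307.1   15000   18000   ABSENT        3000   .
#   NC_052307.1   52310   52311   INSERTION     2      +
#   NC_052307.1   87200   87350   DELETION      3      +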
features.append((chrom, start, end, "DELETION", score, strand)) # Sort features features.sort(key=lambda x: (x[0], x[1])) # Write combined file with header with open(pav_file, 'w') as f: f.write("#chrom\tstart\tend\tfeature_type\tscore\tstrand\n") for feat in features: f.write('\t'.join(map(str, feat)) + '\n') return len(features) def create_all_combined_pav(): """Create combined PAV files for all samples.""" print("\n" + "="*60) print("Step 6: Creating combined PAV feature files") print("="*60) for sample in SAMPLES: count = create_combined_pav(sample) print(f" {sample}: {count} total PAV features") # ============================================================================= # Step 7: Intersect PAV Features with Gene Annotations # ============================================================================= def intersect_with_genes(sample): """Intersect PAV features with gene annotations.""" absent_file = OUTPUT_DIR / f"{sample}.absent_regions_merged.bed" del_file = OUTPUT_DIR / f"{sample}.deletions.bed" ins_file = OUTPUT_DIR / f"{sample}.insertions.bed" results = {} # Intersect absent regions with genes if file_exists_and_nonempty(absent_file): out_file = OUTPUT_DIR / f"{sample}.genes_in_absent_regions.bed" cmd = f"bedtools intersect -a {GENES_BED} -b {absent_file} -wa -wb > {out_file}" run_command(cmd, f"Intersecting absent regions with genes for {sample}") results['absent'] = sum(1 for _ in open(out_file)) if file_exists_and_nonempty(out_file) else 0 # Intersect deletions with genes if file_exists_and_nonempty(del_file): out_file = OUTPUT_DIR / f"{sample}.genes_with_deletions.bed" cmd = f"bedtools intersect -a {GENES_BED} -b {del_file} -wa -wb > {out_file}" run_command(cmd, f"Intersecting deletions with genes for {sample}") results['deletions'] = sum(1 for _ in open(out_file)) if file_exists_and_nonempty(out_file) else 0 # Intersect insertions with genes if file_exists_and_nonempty(ins_file): out_file = OUTPUT_DIR / f"{sample}.genes_with_insertions.bed" cmd = f"bedtools intersect -a {GENES_BED} -b {ins_file} -wa -wb > {out_file}" run_command(cmd, f"Intersecting insertions with genes for {sample}") results['insertions'] = sum(1 for _ in open(out_file)) if file_exists_and_nonempty(out_file) else 0 return results def intersect_all_with_genes(): """Intersect PAV features with genes for all samples.""" print("\n" + "="*60) print("Step 7: Intersecting PAV features with gene annotations") print("="*60) for sample in SAMPLES: results = intersect_with_genes(sample) print(f" {sample} genes affected:") for feat_type, count in results.items(): print(f" - {feat_type}: {count}") # ============================================================================= # Step 8: Generate Summary Statistics # ============================================================================= def generate_summary(): """Generate summary statistics for all samples.""" print("\n" + "="*60) print("Step 8: Generating summary statistics") print("="*60) summary_data = [] for sample in SAMPLES: absent_file = OUTPUT_DIR / f"{sample}.absent_regions_merged.bed" del_file = OUTPUT_DIR / f"{sample}.deletions.bed" ins_file = OUTPUT_DIR / f"{sample}.insertions.bed" # Count absent regions n_absent = 0 absent_bp = 0 if file_exists_and_nonempty(absent_file): with open(absent_file, 'r') as f: for line in f: parts = line.strip().split('\t') if len(parts) >= 3: n_absent += 1 absent_bp += int(parts[2]) - int(parts[1]) # Count deletions n_del = 0 del_bp = 0 if file_exists_and_nonempty(del_file): with open(del_file, 'r') as f: for line in 

# =============================================================================
# Step 8: Generate Summary Statistics
# =============================================================================

def generate_summary():
    """Generate summary statistics for all samples."""
    print("\n" + "="*60)
    print("Step 8: Generating summary statistics")
    print("="*60)

    summary_data = []

    for sample in SAMPLES:
        absent_file = OUTPUT_DIR / f"{sample}.absent_regions_merged.bed"
        del_file = OUTPUT_DIR / f"{sample}.deletions.bed"
        ins_file = OUTPUT_DIR / f"{sample}.insertions.bed"

        # Count absent regions
        n_absent = 0
        absent_bp = 0
        if file_exists_and_nonempty(absent_file):
            with open(absent_file, 'r') as f:
                for line in f:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        n_absent += 1
                        absent_bp += int(parts[2]) - int(parts[1])

        # Count deletions
        n_del = 0
        del_bp = 0
        if file_exists_and_nonempty(del_file):
            with open(del_file, 'r') as f:
                for line in f:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        n_del += 1
                        del_bp += int(parts[2]) - int(parts[1])

        # Count insertions
        n_ins = 0
        if file_exists_and_nonempty(ins_file):
            with open(ins_file, 'r') as f:
                n_ins = sum(1 for _ in f)

        summary_data.append({
            'sample': sample,
            'n_absent_regions': n_absent,
            'absent_total_bp': absent_bp,
            'n_deletions': n_del,
            'deletion_total_bp': del_bp,
            'n_insertions': n_ins
        })

    # Create DataFrame and save
    df = pd.DataFrame(summary_data)
    summary_file = OUTPUT_DIR / "pav_summary_stats.csv"
    df.to_csv(summary_file, index=False)

    print("\nSummary Statistics:")
    print(df.to_string(index=False))
    print(f"\n  Summary saved to: {summary_file}")

    return df

# =============================================================================
# Step 9: Compare PAV Across Samples
# =============================================================================

def compare_samples():
    """Compare PAV regions across all samples."""
    print("\n" + "="*60)
    print("Step 9: Comparing PAV across samples")
    print("="*60)

    # Keep sample names and files paired so the -names labels passed to
    # multiinter stay in sync with the files that actually exist
    pairs = [
        (s, str(OUTPUT_DIR / f"{s}.absent_regions_merged.bed"))
        for s in SAMPLES
    ]
    pairs = [(s, f) for s, f in pairs if file_exists_and_nonempty(f)]

    if len(pairs) < 2:
        print("  Not enough samples with absent regions for comparison.")
        return

    existing_samples = [s for s, _ in pairs]
    existing_files = [f for _, f in pairs]

    # Run multiinter
    multiinter_out = OUTPUT_DIR / "absent_regions_multiinter.bed"
    cmd = f"""bedtools multiinter \
        -i {' '.join(existing_files)} \
        -header \
        -names {' '.join(existing_samples)} \
        > {multiinter_out}"""
    run_command(cmd, "Running bedtools multiinter")

    # Filter for regions absent in all compared samples
    all_samples_out = OUTPUT_DIR / "absent_in_all_samples.bed"
    n_samples = len(existing_samples)
    cmd = f"awk '$4 == {n_samples}' {multiinter_out} > {all_samples_out}"
    run_command(cmd, "Filtering regions absent in all samples")

    # Filter for regions absent in only one sample
    one_sample_out = OUTPUT_DIR / "absent_in_one_sample.bed"
    cmd = f"awk '$4 == 1' {multiinter_out} > {one_sample_out}"
    run_command(cmd, "Filtering sample-specific absent regions")

    # Count results
    all_count = sum(1 for _ in open(all_samples_out)) if file_exists_and_nonempty(all_samples_out) else 0
    one_count = sum(1 for _ in open(one_sample_out)) if file_exists_and_nonempty(one_sample_out) else 0

    print(f"\n  Regions absent in all samples: {all_count}")
    print(f"  Regions absent in only one sample: {one_count}")
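
# With -header and -names, multiinter output looks like this (illustrative
# values; one 0/1 column per sample follows the "list" column):
#   chrom         start   end     num  list            bc2041  ...  bc2096
#   NC_052307.1   15000   16000   8    bc2041,...      1       ...  1
# Column 4 ("num") is the number of samples whose absent regions overlap the
# interval, which is what the awk filters above select on.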

# =============================================================================
# Main Function
# =============================================================================

def main():
    """Run the complete PAV analysis."""
    print("="*60)
    print("Present-Absent Variation (PAV) Analysis")
    print("="*60)
    print(f"\nSamples: {', '.join(SAMPLES)}")
    print(f"Output directory: {OUTPUT_DIR}")
    print(f"Reference genome: {REFERENCE}")

    # Ensure output directory exists
    ensure_directory(OUTPUT_DIR)

    # Run analysis steps
    setup_reference()
    calculate_coverage()
    identify_absent_regions()
    identify_present_regions()
    extract_all_indels()
    create_all_combined_pav()
    intersect_all_with_genes()
    generate_summary()
    compare_samples()

    print("\n" + "="*60)
    print("PAV Analysis Complete!")
    print("="*60)
    print(f"\nOutput files are in: {OUTPUT_DIR}")
    print("\nKey output files per sample:")
    print("  - {sample}.pav_features.bed            : Combined PAV features")
    print("  - {sample}.absent_regions_merged.bed   : Zero-coverage regions")
    print("  - {sample}.insertions.bed              : Insertions ≥50bp")
    print("  - {sample}.deletions.bed               : Deletions ≥50bp")
    print("  - {sample}.genes_in_absent_regions.bed : Genes in absent regions")
    print("\nCross-sample files:")
    print("  - pav_summary_stats.csv          : Summary statistics")
    print("  - absent_regions_multiinter.bed  : Multi-sample comparison")
    print("  - absent_in_all_samples.bed      : Regions absent in all samples")
    print("  - absent_in_one_sample.bed       : Sample-specific absent regions")


if __name__ == "__main__":
    main()