#!/usr/bin/env python3
"""
15.5-diff-pav.py - Stringent Differential PAV Analysis: Siscowet vs Lean Trout

This script identifies insertions and deletions that are specific to each
ecotype (Siscowet vs Lean) using STRINGENT criteria:
1. A PAV must be present in ALL 4 samples of one ecotype
2. A PAV must be absent in ALL 4 samples of the other ecotype
3. Only PAVs > 100bp are considered

Sample Classification:
LEAN: bc2041, bc2069, bc2070, bc2068
SISCOWET: bc2071, bc2073, bc2072, bc2096

Output files are formatted for genome browser visualization with:
- Color coding (blue for Lean-specific, red for Siscowet-specific)
- Descriptive names
- Track headers for UCSC/IGV compatibility

Author: Generated for project-lake-trout
Date: 2025-12-09
"""

import os
import subprocess
import pandas as pd
from pathlib import Path
from collections import defaultdict
import tempfile

# =============================================================================
# Configuration
# =============================================================================

# Define samples by ecotype
SAMPLES_LEAN = ["bc2041", "bc2069", "bc2070", "bc2068"]
SAMPLES_SISCOWET = ["bc2071", "bc2073", "bc2072", "bc2096"]

# Paths (resolved relative to this script's location)
BASE_DIR = Path(__file__).parent.parent
INPUT_DIR = BASE_DIR / "analyses" / "11-pav"
OUTPUT_DIR = BASE_DIR / "analyses" / "15-diff-pav"

# BED file color coding (RGB format for genome browsers)
COLOR_LEAN = "0,0,255"        # Blue for Lean-specific
COLOR_SISCOWET = "255,0,0"    # Red for Siscowet-specific
COLOR_SHARED = "128,128,128"  # Gray for shared variants

# Stringent parameters
MIN_OVERLAP_FRACTION = 0.5   # 50% reciprocal overlap to consider same variant
MIN_PAV_SIZE = 100           # Minimum PAV size in bp (>100bp)
REQUIRE_ALL_SAMPLES = True   # Require presence in ALL samples of ecotype

# =============================================================================
# Helper Functions
# =============================================================================


def run_command(cmd, description=""):
    """Run a shell command, echoing a short description, and return stdout.

    Raises subprocess.CalledProcessError (after printing stderr) on failure.

    NOTE: shell=True is required here because the commands built by this
    script use shell redirection ("sort ... > out", "bedtools ... > out").
    All command pieces are internally-generated paths, not untrusted input.
    """
    print(f"  Running: {description or cmd[:80]}...")
    try:
        result = subprocess.run(
            cmd, shell=True, check=True, capture_output=True, text=True
        )
        return result.stdout
    except subprocess.CalledProcessError as e:
        print(f"  ERROR: {e.stderr}")
        raise


def ensure_directory(path):
    """Create directory (and parents) if it doesn't exist."""
    Path(path).mkdir(parents=True, exist_ok=True)


def file_exists_and_nonempty(filepath):
    """Check if file exists and is not empty."""
    path = Path(filepath)
    return path.exists() and path.stat().st_size > 0


def get_available_samples(samples, variant_type):
    """Return the subset of `samples` that have a non-empty BED file
    for `variant_type` ("insertions" or "deletions") in INPUT_DIR."""
    available = []
    for sample in samples:
        filepath = INPUT_DIR / f"{sample}.{variant_type}.bed"
        if file_exists_and_nonempty(filepath):
            available.append(sample)
    return available


def count_bed_lines(filepath):
    """Count data lines in a BED file.

    Skips comment lines ('#'), track headers, and blank lines.
    (Counting blank lines would inflate every variant count reported.)
    """
    if not file_exists_and_nonempty(filepath):
        return 0
    count = 0
    with open(filepath, 'r') as f:
        for line in f:
            # BUGFIX: blank lines were previously counted as variants.
            if line.strip() and not line.startswith('#') and not line.startswith('track'):
                count += 1
    return count


def load_bed_file(filepath, min_size=0):
    """
    Load a BED file and return a list of variant dicts
    (keys: chrom, start, end, size, line).

    Only records strictly larger than `min_size` bp are kept.
    """
    variants = []
    if not file_exists_and_nonempty(filepath):
        return variants
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('#') or line.startswith('track'):
                continue
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                chrom = parts[0]
                start = int(parts[1])
                end = int(parts[2])
                size = end - start
                # Filter by minimum size (strict: must exceed min_size)
                if size > min_size:
                    variants.append({
                        'chrom': chrom,
                        'start': start,
                        'end': end,
                        'size': size,
                        'line': line.strip()
                    })
    return variants


def write_bed_file(variants, filepath):
    """Write variant dicts to a minimal 3-column BED file."""
    with open(filepath, 'w') as f:
        for v in variants:
            f.write(f"{v['chrom']}\t{v['start']}\t{v['end']}\n")


def variants_overlap(v1, v2, min_overlap_frac=0.5):
    """
    Check if two variants overlap with at least `min_overlap_frac`
    RECIPROCAL overlap (the overlap must cover that fraction of BOTH
    variants' lengths).
    """
    if v1['chrom'] != v2['chrom']:
        return False

    # Calculate overlap interval
    overlap_start = max(v1['start'], v2['start'])
    overlap_end = min(v1['end'], v2['end'])
    if overlap_start >= overlap_end:
        return False

    overlap_size = overlap_end - overlap_start

    # Calculate overlap fraction for both variants
    frac1 = overlap_size / v1['size']
    frac2 = overlap_size / v2['size']

    # Reciprocal overlap: both must meet threshold
    return frac1 >= min_overlap_frac and frac2 >= min_overlap_frac


# =============================================================================
# Step 1: Find variants present in ALL samples of an ecotype (stringent)
# =============================================================================


def find_variants_in_all_samples(samples, variant_type):
    """
    Find variants that are present in ALL samples of an ecotype.
    Uses an iterative `bedtools intersect` approach: start from the first
    sample's size-filtered calls and repeatedly keep only those with a
    reciprocal overlap in each subsequent sample.

    Returns the path to a BED file of variants present in all samples,
    or None if none survive (or no data is available).
    """
    available = get_available_samples(samples, variant_type)
    if len(available) < len(samples):
        print(f"  WARNING: Only {len(available)}/{len(samples)} samples available for {variant_type}")
        print(f"    Available: {available}")
        print(f"    Missing: {set(samples) - set(available)}")
    if len(available) == 0:
        return None

    print(f"  Processing {len(available)} samples for {variant_type}")

    # Load first sample as starting point (filtered by size)
    first_sample = available[0]
    first_file = INPUT_DIR / f"{first_sample}.{variant_type}.bed"

    # Create temporary directory for intermediate files
    temp_dir = OUTPUT_DIR / "temp"
    ensure_directory(temp_dir)

    # Filter first file by size and sort
    current_file = temp_dir / f"current.{variant_type}.bed"
    variants = load_bed_file(first_file, min_size=MIN_PAV_SIZE)
    print(f"    {first_sample}: {len(variants)} variants > {MIN_PAV_SIZE}bp")
    if not variants:
        return None
    write_bed_file(variants, current_file)

    # Sort (bedtools is much faster on position-sorted input)
    sorted_file = temp_dir / f"current.sorted.{variant_type}.bed"
    cmd = f"sort -k1,1 -k2,2n {current_file} > {sorted_file}"
    run_command(cmd, f"Sorting {first_sample}")
    current_file = sorted_file

    # Iteratively intersect with remaining samples
    for sample in available[1:]:
        sample_file = INPUT_DIR / f"{sample}.{variant_type}.bed"

        # Filter sample by size
        sample_filtered = temp_dir / f"{sample}.filtered.{variant_type}.bed"
        variants = load_bed_file(sample_file, min_size=MIN_PAV_SIZE)
        print(f"    {sample}: {len(variants)} variants > {MIN_PAV_SIZE}bp")
        if not variants:
            print(f"    WARNING: No variants > {MIN_PAV_SIZE}bp in {sample}")
            # Can't have variants in ALL samples if one sample has none
            return None
        write_bed_file(variants, sample_filtered)

        # Sort sample file
        sample_sorted = temp_dir / f"{sample}.sorted.{variant_type}.bed"
        cmd = f"sort -k1,1 -k2,2n {sample_filtered} > {sample_sorted}"
        run_command(cmd, f"Sorting {sample}")

        # Intersect with current set (keep only variants in BOTH):
        #   -f + -r : reciprocal overlap fraction
        #   -wa     : write the original entry from file A
        #   -u      : report each A entry at most once
        next_file = temp_dir / f"intersect.{sample}.{variant_type}.bed"
        cmd = f"""bedtools intersect \
            -a {current_file} \
            -b {sample_sorted} \
            -wa \
            -f {MIN_OVERLAP_FRACTION} \
            -r \
            -u \
            > {next_file}"""
        run_command(cmd, f"Intersecting with {sample}")

        # Count remaining variants
        remaining = count_bed_lines(next_file)
        print(f"    After intersecting with {sample}: {remaining} variants remain")
        if remaining == 0:
            print(f"    No variants present in all samples up to {sample}")
            return None
        current_file = next_file

    # Final result
    final_count = count_bed_lines(current_file)
    print(f"  Found {final_count} variants present in ALL {len(available)} samples")

    # Copy to output
    output_file = OUTPUT_DIR / f"stringent.all_{len(available)}_samples.{variant_type}.bed"
    cmd = f"cp {current_file} {output_file}"
    run_command(cmd, "Copying final result")
    return output_file


# =============================================================================
# Step 2: Find ecotype-specific variants (stringent)
# =============================================================================


def find_stringent_ecotype_specific(ecotype_all_file, other_ecotype_files,
                                    output_file, ecotype_name, variant_type):
    """
    Find variants present in ALL samples of one ecotype and ABSENT from
    ALL samples of the other ecotype.

    ecotype_all_file: BED file with variants present in ALL samples of this ecotype
    other_ecotype_files: list of BED files for each sample of the other ecotype

    Returns `output_file` on success, None if nothing survives.
    """
    if ecotype_all_file is None or not file_exists_and_nonempty(ecotype_all_file):
        print(f"  No variants present in all {ecotype_name} samples")
        return None

    # Create temp directory
    temp_dir = OUTPUT_DIR / "temp"
    ensure_directory(temp_dir)

    current_file = ecotype_all_file

    # Subtract each sample from the other ecotype
    for other_file in other_ecotype_files:
        if not file_exists_and_nonempty(other_file):
            # Warn: a missing sample weakens the "absent from ALL" guarantee,
            # since we cannot verify absence in that sample.
            print(f"    WARNING: missing/empty file skipped: {other_file}")
            continue

        sample_name = Path(other_file).stem.split('.')[0]

        # Filter other sample by size first
        other_filtered = temp_dir / f"other.{sample_name}.filtered.bed"
        variants = load_bed_file(other_file, min_size=MIN_PAV_SIZE)
        if not variants:
            print(f"    {sample_name}: no variants > {MIN_PAV_SIZE}bp (skipping)")
            continue
        write_bed_file(variants, other_filtered)

        # Sort
        other_sorted = temp_dir / f"other.{sample_name}.sorted.bed"
        cmd = f"sort -k1,1 -k2,2n {other_filtered} > {other_sorted}"
        run_command(cmd, f"Sorting {sample_name}")

        # Remove any overlapping variants:
        #   -v : keep only A entries with NO reciprocal overlap in B
        next_file = temp_dir / f"subtract.{sample_name}.bed"
        cmd = f"""bedtools intersect \
            -a {current_file} \
            -b {other_sorted} \
            -v \
            -f {MIN_OVERLAP_FRACTION} \
            -r \
            > {next_file}"""
        run_command(cmd, f"Subtracting {sample_name}")

        remaining = count_bed_lines(next_file)
        print(f"    After removing overlaps with {sample_name}: {remaining} variants remain")
        current_file = next_file

    # Copy final result
    final_count = count_bed_lines(current_file)
    if final_count > 0:
        cmd = f"cp {current_file} {output_file}"
        run_command(cmd, f"Saving {ecotype_name}-specific variants")
        print(f"  Found {final_count} {ecotype_name}-specific {variant_type}")
        return output_file
    else:
        print(f"  No {ecotype_name}-specific {variant_type} found")
        return None


# =============================================================================
# Step 3: Create genome browser tracks
# =============================================================================


def create_genome_browser_bed(input_file, output_file, track_name,
                              track_description, color, variant_type):
    """
    Format a BED file for genome browser visualization with a track header.
    Creates BED9 format (chrom..itemRgb) with color coding.

    Returns `output_file`, or None if the input is missing/empty.
    """
    if not file_exists_and_nonempty(input_file):
        print(f"  Skipping empty file: {input_file}")
        return None

    # Read input
    variants = []
    with open(input_file, 'r') as f:
        for line in f:
            if line.startswith('#') or line.startswith('track'):
                continue
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                chrom = parts[0]
                start = int(parts[1])
                end = int(parts[2])
                size = end - start

                # Create descriptive name
                if variant_type == "insertions":
                    name = f"INS_{size}bp"
                elif variant_type == "deletions":
                    name = f"DEL_{size}bp"
                else:
                    name = f"VAR_{size}bp"

                # Score (use column 4 if available, else 0), clamped into the
                # BED-spec score range [0, 1000].
                score = parts[3] if len(parts) > 3 else "0"
                try:
                    # BUGFIX: was a bare except and no lower bound; negative
                    # scores are invalid in BED files.
                    score = max(0, min(int(float(score)), 1000))
                except (ValueError, TypeError):
                    score = 0

                variants.append({
                    'chrom': chrom,
                    'start': start,
                    'end': end,
                    'name': name,
                    'score': score,
                    'strand': '.',
                    'thickStart': start,
                    'thickEnd': end,
                    'itemRgb': color
                })

    # Sort by position
    variants.sort(key=lambda x: (x['chrom'], x['start']))

    # Write output with track header
    with open(output_file, 'w') as f:
        # Track header for UCSC genome browser / IGV
        f.write(f'track name="{track_name}" description="{track_description}" ')
        f.write(f'visibility=pack itemRgb="On"\n')
        for v in variants:
            f.write(f"{v['chrom']}\t{v['start']}\t{v['end']}\t{v['name']}\t")
            f.write(f"{v['score']}\t{v['strand']}\t{v['thickStart']}\t{v['thickEnd']}\t")
            f.write(f"{v['itemRgb']}\n")
    return output_file


# =============================================================================
# Step 4: Generate summary statistics
# =============================================================================


def generate_summary():
    """Generate summary statistics for the stringent differential PAV
    analysis and write them to stringent_diff_pav_summary.csv.

    Returns the summary as a pandas DataFrame.
    """
    summary_data = []
    variant_types = ['insertions', 'deletions']
    categories = ['lean_specific', 'siscowet_specific']

    for vtype in variant_types:
        for category in categories:
            bed_file = OUTPUT_DIR / f"stringent.{category}.{vtype}.browser.bed"
            count = count_bed_lines(bed_file)

            # Calculate total bp and size stats
            total_bp = 0
            sizes = []
            if file_exists_and_nonempty(bed_file):
                with open(bed_file, 'r') as f:
                    for line in f:
                        if line.startswith('#') or line.startswith('track'):
                            continue
                        parts = line.strip().split('\t')
                        if len(parts) >= 3:
                            size = int(parts[2]) - int(parts[1])
                            total_bp += size
                            sizes.append(size)

            min_size = min(sizes) if sizes else 0
            max_size = max(sizes) if sizes else 0
            mean_size = sum(sizes) / len(sizes) if sizes else 0

            summary_data.append({
                'variant_type': vtype,
                'category': category,
                'count': count,
                'total_bp': total_bp,
                'min_size': min_size,
                'max_size': max_size,
                'mean_size': round(mean_size, 1)
            })

    df = pd.DataFrame(summary_data)
    summary_file = OUTPUT_DIR / "stringent_diff_pav_summary.csv"
    df.to_csv(summary_file, index=False)
    return df


# =============================================================================
# Step 5: Create combined visualization tracks
# =============================================================================


def create_combined_tracks():
    """
    Create combined track files with all ecotype-specific variants in one
    track, colored by ecotype (blue=Lean, red=Siscowet) and name-prefixed
    LEAN_/SISC_ so they remain distinguishable in the browser.
    """
    print("\n" + "="*60)
    print("Creating combined visualization tracks")
    print("="*60)

    for vtype in ['insertions', 'deletions']:
        combined_file = OUTPUT_DIR / f"stringent.all_{vtype}_by_ecotype.browser.bed"
        all_variants = []

        # Add Lean-specific (blue)
        lean_file = OUTPUT_DIR / f"stringent.lean_specific.{vtype}.browser.bed"
        if file_exists_and_nonempty(lean_file):
            with open(lean_file, 'r') as f:
                for line in f:
                    if not line.startswith('track') and line.strip():
                        parts = line.strip().split('\t')
                        if len(parts) >= 9:
                            parts[3] = f"LEAN_{parts[3]}"  # Prefix name
                            all_variants.append(parts)

        # Add Siscowet-specific (red)
        sisc_file = OUTPUT_DIR / f"stringent.siscowet_specific.{vtype}.browser.bed"
        if file_exists_and_nonempty(sisc_file):
            with open(sisc_file, 'r') as f:
                for line in f:
                    if not line.startswith('track') and line.strip():
                        parts = line.strip().split('\t')
                        if len(parts) >= 9:
                            parts[3] = f"SISC_{parts[3]}"  # Prefix name
                            all_variants.append(parts)

        # Sort all variants by position
        all_variants.sort(key=lambda x: (x[0], int(x[1])))

        # Write combined file
        with open(combined_file, 'w') as f:
            vtype_cap = vtype.capitalize()
            f.write(f'track name="Stringent_All_{vtype_cap}_by_Ecotype" ')
            f.write(f'description="Stringent {vtype} (>100bp, in all 4 samples): Blue=Lean, Red=Siscowet" ')
            f.write('visibility=pack itemRgb="On"\n')
            for v in all_variants:
                f.write('\t'.join(v) + '\n')

        print(f"  Created: {combined_file}")
        print(f"    Total variants: {len(all_variants)}")


# =============================================================================
# Cleanup
# =============================================================================


def cleanup_temp_files():
    """Remove the temporary working directory and its contents."""
    temp_dir = OUTPUT_DIR / "temp"
    if temp_dir.exists():
        import shutil
        shutil.rmtree(temp_dir)
        print("  Cleaned up temporary files")


# =============================================================================
# Main Analysis
# =============================================================================


def main():
    """Run the stringent differential PAV analysis."""
    print("="*60)
    print("STRINGENT Differential PAV Analysis: Siscowet vs Lean Trout")
    print("="*60)
    print(f"\nStringent Criteria:")
    print(f"  1. PAV must be present in ALL 4 samples of one ecotype")
    print(f"  2. PAV must be absent from ALL 4 samples of other ecotype")
    print(f"  3. PAV size must be > {MIN_PAV_SIZE}bp")
    print(f"  4. Reciprocal overlap threshold: {MIN_OVERLAP_FRACTION*100}%")
    print(f"\nLean samples: {', '.join(SAMPLES_LEAN)}")
    print(f"Siscowet samples: {', '.join(SAMPLES_SISCOWET)}")
    print(f"Input directory: {INPUT_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")

    # Create output directory
    ensure_directory(OUTPUT_DIR)

    # Check available data
    print("\n" + "="*60)
    print("Checking available data")
    print("="*60)
    lean_ins = get_available_samples(SAMPLES_LEAN, "insertions")
    lean_del = get_available_samples(SAMPLES_LEAN, "deletions")
    sisc_ins = get_available_samples(SAMPLES_SISCOWET, "insertions")
    sisc_del = get_available_samples(SAMPLES_SISCOWET, "deletions")
    print(f"  Lean insertions: {len(lean_ins)}/4 samples available")
    print(f"  Lean deletions: {len(lean_del)}/4 samples available")
    print(f"  Siscowet insertions: {len(sisc_ins)}/4 samples available")
    print(f"  Siscowet deletions: {len(sisc_del)}/4 samples available")

    # Get file paths for other ecotype samples
    lean_ins_files = [INPUT_DIR / f"{s}.insertions.bed" for s in SAMPLES_LEAN]
    lean_del_files = [INPUT_DIR / f"{s}.deletions.bed" for s in SAMPLES_LEAN]
    sisc_ins_files = [INPUT_DIR / f"{s}.insertions.bed" for s in SAMPLES_SISCOWET]
    sisc_del_files = [INPUT_DIR / f"{s}.deletions.bed" for s in SAMPLES_SISCOWET]

    # ==========================================================================
    # Process Insertions
    # ==========================================================================
    print("\n" + "="*60)
    print("Step 1: Processing INSERTIONS (stringent)")
    print("="*60)

    # Find insertions in ALL Lean samples
    print("\n  Finding insertions present in ALL Lean samples...")
    lean_ins_all = find_variants_in_all_samples(SAMPLES_LEAN, "insertions")

    # Find insertions in ALL Siscowet samples
    print("\n  Finding insertions present in ALL Siscowet samples...")
    sisc_ins_all = find_variants_in_all_samples(SAMPLES_SISCOWET, "insertions")

    # Find Lean-specific insertions (in all Lean, absent from all Siscowet)
    print("\n  Finding LEAN-specific insertions...")
    lean_specific_ins = OUTPUT_DIR / "stringent.lean_specific.insertions.bed"
    find_stringent_ecotype_specific(
        lean_ins_all, sisc_ins_files, lean_specific_ins, "Lean", "insertions"
    )

    # Find Siscowet-specific insertions (in all Siscowet, absent from all Lean)
    print("\n  Finding SISCOWET-specific insertions...")
    sisc_specific_ins = OUTPUT_DIR / "stringent.siscowet_specific.insertions.bed"
    find_stringent_ecotype_specific(
        sisc_ins_all, lean_ins_files, sisc_specific_ins, "Siscowet", "insertions"
    )

    # Create genome browser tracks
    print("\n  Creating genome browser tracks for insertions...")
    create_genome_browser_bed(
        lean_specific_ins,
        OUTPUT_DIR / "stringent.lean_specific.insertions.browser.bed",
        "Stringent_Lean_Insertions",
        f"Insertions in ALL 4 Lean samples, absent from ALL Siscowet (>{MIN_PAV_SIZE}bp, blue)",
        COLOR_LEAN,
        "insertions"
    )
    create_genome_browser_bed(
        sisc_specific_ins,
        OUTPUT_DIR / "stringent.siscowet_specific.insertions.browser.bed",
        "Stringent_Siscowet_Insertions",
        f"Insertions in ALL 4 Siscowet samples, absent from ALL Lean (>{MIN_PAV_SIZE}bp, red)",
        COLOR_SISCOWET,
        "insertions"
    )

    # ==========================================================================
    # Process Deletions
    # ==========================================================================
    print("\n" + "="*60)
    print("Step 2: Processing DELETIONS (stringent)")
    print("="*60)

    # Find deletions in ALL Lean samples
    print("\n  Finding deletions present in ALL Lean samples...")
    lean_del_all = find_variants_in_all_samples(SAMPLES_LEAN, "deletions")

    # Find deletions in ALL Siscowet samples
    print("\n  Finding deletions present in ALL Siscowet samples...")
    sisc_del_all = find_variants_in_all_samples(SAMPLES_SISCOWET, "deletions")

    # Find Lean-specific deletions (in all Lean, absent from all Siscowet)
    print("\n  Finding LEAN-specific deletions...")
    lean_specific_del = OUTPUT_DIR / "stringent.lean_specific.deletions.bed"
    find_stringent_ecotype_specific(
        lean_del_all, sisc_del_files, lean_specific_del, "Lean", "deletions"
    )

    # Find Siscowet-specific deletions (in all Siscowet, absent from all Lean)
    print("\n  Finding SISCOWET-specific deletions...")
    sisc_specific_del = OUTPUT_DIR / "stringent.siscowet_specific.deletions.bed"
    find_stringent_ecotype_specific(
        sisc_del_all, lean_del_files, sisc_specific_del, "Siscowet", "deletions"
    )

    # Create genome browser tracks
    print("\n  Creating genome browser tracks for deletions...")
    create_genome_browser_bed(
        lean_specific_del,
        OUTPUT_DIR / "stringent.lean_specific.deletions.browser.bed",
        "Stringent_Lean_Deletions",
        f"Deletions in ALL 4 Lean samples, absent from ALL Siscowet (>{MIN_PAV_SIZE}bp, blue)",
        COLOR_LEAN,
        "deletions"
    )
    create_genome_browser_bed(
        sisc_specific_del,
        OUTPUT_DIR / "stringent.siscowet_specific.deletions.browser.bed",
        "Stringent_Siscowet_Deletions",
        f"Deletions in ALL 4 Siscowet samples, absent from ALL Lean (>{MIN_PAV_SIZE}bp, red)",
        COLOR_SISCOWET,
        "deletions"
    )

    # ==========================================================================
    # Create combined visualization tracks
    # ==========================================================================
    create_combined_tracks()

    # ==========================================================================
    # Generate summary statistics
    # ==========================================================================
    print("\n" + "="*60)
    print("Generating Summary Statistics")
    print("="*60)
    summary_df = generate_summary()
    print("\nStringent Differential PAV Summary:")
    print(summary_df.to_string(index=False))

    # ==========================================================================
    # Cleanup
    # ==========================================================================
    print("\n" + "="*60)
    print("Cleaning up")
    print("="*60)
    cleanup_temp_files()

    # ==========================================================================
    # Final output
    # ==========================================================================
    print("\n" + "="*60)
    print("Stringent Differential PAV Analysis Complete!")
    print("="*60)
    print(f"\nStringent Criteria Applied:")
    print(f"  - PAV present in ALL 4 samples of one ecotype")
    print(f"  - PAV absent from ALL 4 samples of other ecotype")
    print(f"  - PAV size > {MIN_PAV_SIZE}bp")
    print(f"\nOutput files in: {OUTPUT_DIR}")
    print("\nGenome Browser Track Files (stringent):")
    print("  Lean-specific (Blue):")
    print("    - stringent.lean_specific.insertions.browser.bed")
    print("    - stringent.lean_specific.deletions.browser.bed")
    print("  Siscowet-specific (Red):")
    print("    - stringent.siscowet_specific.insertions.browser.bed")
    print("    - stringent.siscowet_specific.deletions.browser.bed")
    print("  Combined tracks:")
    print("    - stringent.all_insertions_by_ecotype.browser.bed")
    print("    - stringent.all_deletions_by_ecotype.browser.bed")
    print("\nSummary file:")
    print("  - stringent_diff_pav_summary.csv")
    print("\nTo view in UCSC Genome Browser:")
    print("  1. Go to https://genome.ucsc.edu/cgi-bin/hgCustom")
    print("  2. Choose the appropriate genome assembly")
    print("  3. Upload the .browser.bed files as custom tracks")


if __name__ == "__main__":
    main()