#!/usr/bin/env python3
"""
Convert annotation_summary.md to CSV format
"""

import csv
import re
import os
from pathlib import Path

# Get the script directory and construct relative paths
script_dir = Path(__file__).parent
repo_root = script_dir.parent.parent
output_dir = repo_root / "M-multi-species" / "output" / "22-Visualizing-Rank-outs"

# Input and output file paths
input_file = output_dir / "annotation_summary.md"
output_file = output_dir / "annotation_summary.csv"

# A delimiter-row cell holds only dashes, colons and whitespace (e.g. ":---:").
_SEPARATOR_CELL = re.compile(r'^[\s:-]+$')


def _split_row(line):
    """Split one Markdown table row into stripped cell strings.

    A pipe preceded by a backslash (``\\|``) is kept as a literal ``|`` inside
    the cell instead of being treated as a column boundary. Any other
    backslash sequence is passed through unchanged. The empty first/last
    elements produced by the row's outer pipes are dropped.
    """
    cells = []
    current = []
    chars = iter(line)
    for ch in chars:
        if ch == '\\':
            # Consume the escaped character so an escaped pipe cannot be
            # mistaken for a delimiter (and "\\\\|" still ends the cell).
            escaped = next(chars, '')
            current.append('|' if escaped == '|' else ch + escaped)
        elif ch == '|':
            cells.append(''.join(current))
            current = []
        else:
            current.append(ch)
    cells.append(''.join(current))

    cells = [cell.strip() for cell in cells]
    # Remove only the leading and trailing empty elements from pipe splitting
    if cells and cells[0] == '':
        cells = cells[1:]
    if cells and cells[-1] == '':
        cells = cells[:-1]
    return cells


def parse_markdown_table(lines):
    """Extract the header and data rows of a Markdown table.

    Args:
        lines: Iterable of text lines (trailing newlines are allowed).

    Returns:
        A ``(header, data_rows)`` tuple. ``header`` is the list of cells of
        the first table row, or ``None`` when no table row is found.
        ``data_rows`` is a list of cell lists for every later table row.

    Blank lines, lines starting with ``#``, lines starting with
    ``Total files processed`` and lines without a pipe are ignored, as are
    delimiter rows such as ``|---|:---:|``.
    """
    header = None
    data_rows = []

    for raw_line in lines:
        line = raw_line.strip()

        # Skip empty lines, headings and the summary-count line
        if not line or line.startswith('#'):
            continue
        if line.startswith('Total files processed'):
            continue

        # Only lines containing a pipe are table rows
        if '|' not in line:
            continue

        cells = _split_row(line)

        # Skip the delimiter row (only dashes/colons/whitespace in every cell)
        if all(_SEPARATOR_CELL.match(cell) for cell in cells):
            continue

        # The first real row is the header; everything after it is data
        if header is None:
            header = cells
        else:
            data_rows.append(cells)

    return header, data_rows


def convert_markdown_to_csv(source, destination):
    """Convert the Markdown table in *source* to a CSV file at *destination*.

    Both files are handled as UTF-8 so the result does not depend on the
    platform's default encoding.

    Returns:
        The number of data rows written (the header is not counted).
    """
    with open(source, 'r', encoding='utf-8') as f:
        header, data_rows = parse_markdown_table(f)

    # newline='' lets the csv module control line endings itself
    with open(destination, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if header:
            writer.writerow(header)
        writer.writerows(data_rows)

    return len(data_rows)


def main():
    """Convert the configured annotation summary and report the row count."""
    row_count = convert_markdown_to_csv(input_file, output_file)
    print(f"Converted {row_count} rows from {input_file} to {output_file}")


if __name__ == "__main__":
    main()