""" Note that before running this, I took the output of augment_metadata.py and did the following:
- Ran temp_merge_treatment.py.
- Added the species as the Family (it was blank before) for PRJNA778545.
- Changed 'Injection' values to 'Injected' for uniformity.
- Fixed the values in the former 'collection_time' column: they were 0.0, 3.0,
  and 24.0, and I corrected them to 0, 30, and 1 (i.e. 24 hours) respectively,
  according to the paper.
- Renamed that column to 'Collection_Interval_Days'.
I hope that captures all the ad-hoc changes I made.
"""
import os
import pandas as pd

# Directory containing the metadata CSVs (update this path as needed).
output_dir = "/home/syost/git/Cvirg_Pmarinus_RNAseq/data/"
# The input is the previously augmented metadata; the result is written under
# a different file name in the same directory.
updated_metadata_path, further_updated_metadata_path = (
    os.path.join(output_dir, csv_name)
    for csv_name in ("augmented_metadata.csv", "further_augmented_metadata.csv")
)
# Read the augmented metadata table into memory.
merged_df = pd.read_csv(updated_metadata_path)

# Parse Collection_Date as datetimes so dates can be subtracted; values that
# cannot be parsed become NaT (errors='coerce').
merged_df['Collection_Date'] = pd.to_datetime(merged_df['Collection_Date'], errors='coerce')

# Where Collection_Interval_Days is missing, use the number of days between a
# sample's Collection_Date and the earliest Collection_Date in its BioProject.
for project_id in merged_df['BioProject'].unique():
    in_project = merged_df['BioProject'] == project_id
    first_collection = merged_df.loc[in_project, 'Collection_Date'].min()
    days_since_first = (merged_df['Collection_Date'] - first_collection).dt.days
    needs_interval = in_project & merged_df['Collection_Interval_Days'].isna()
    # The assignment aligns on the row index, so only the masked rows are written.
    merged_df.loc[needs_interval, 'Collection_Interval_Days'] = days_since_first

# Approximate length of 14 months in days, using an average month of 30.44 days.
days_in_14_months = round(14 * 30.44)

# These BioProjects get one fixed Collection_Interval_Days value for all of
# their rows, replacing whatever was stored or computed before.
interval_overrides = (
    ('PRJNA894694', 7.0),
    ('PRJNA604121', days_in_14_months),
)
for override_project, override_days in interval_overrides:
    merged_df.loc[merged_df['BioProject'] == override_project, 'Collection_Interval_Days'] = override_days


# Short study label for each BioProject accession.
study_mapping = {
    "PRJNA590205": "P&S 2020",
    "PRJNA691949": "P&S 2021",
    "PRJNA894694": "P&S 2023",
    "PRJNA778545": "Chan et al. 2021",
    "PRJNA604121": "Johnson et al. 2020",
}

# Add a Study column by looking up each row's BioProject; accessions absent
# from the mapping yield missing values.
merged_df = merged_df.assign(Study=merged_df['BioProject'].map(study_mapping))

# Put Study and BioProject first; all other columns keep their relative order.
leading_columns = ['Study', 'BioProject']
column_order = leading_columns + [
    name for name in merged_df.columns if name not in leading_columns
]
merged_df = merged_df.loc[:, column_order]

# Write the result to the new CSV, leaving out the DataFrame's row index.
merged_df.to_csv(path_or_buf=further_updated_metadata_path, index=False)

# Report where the output was written.
completion_message = f"Processing complete! Updated metadata saved to: {further_updated_metadata_path}"
print(completion_message)