""" This is a one-off script to add the treatment column from SraRunTable (4).csv
which was missing because the treatment heading was misspelled
"""
import os
import pandas as pd

output_dir = "/home/syost/git/Cvirg_Pmarinus_RNAseq/data/"  # Update this path as needed

# Input: metadata table whose treatment column needs to be filled in
first_metadata_file = os.path.join(output_dir, "updated_metadata.csv")
# NOTE: not used by this script; the result is written to
# "updated_first_metadata.csv" (see main()).
updated_metadata_path = os.path.join(output_dir, "updated_metadata_2.csv")
# Input: SRA run table that carries the treatment values
second_metadata_file = os.path.join(output_dir, "SraRunTable (4).csv")


def merge_treatment(metadata, run_table, key="Experiment", column="treatment"):
    """Fill ``column`` of ``metadata`` with the values found in ``run_table``.

    Rows are matched on ``key``. Where ``run_table`` supplies a non-missing
    value it replaces the existing one; otherwise the existing value (if
    any) is kept. The number and order of rows in ``metadata`` are always
    preserved.

    Args:
        metadata: DataFrame to update. It must contain ``key``; ``column``
            is created if it is absent.
        run_table: DataFrame providing ``key`` and ``column``.
        key: Name of the column used to match rows.
        column: Name of the column to copy over.

    Returns:
        A ``(updated, num_changed)`` tuple: a copy of ``metadata`` with
        ``column`` filled in, and the count of rows whose value in
        ``column`` is non-missing and differs from the value it had before.
    """
    # Keep a single non-missing value per key. Without this, a key that
    # appears several times in run_table would multiply the matching rows
    # of metadata during the merge. Rows with a missing key are dropped
    # because pandas matches null keys against each other.
    lookup = (
        run_table[[key, column]]
        .dropna(subset=[key, column])
        .drop_duplicates(subset=key, keep="first")
    )

    # Merge on the key column only, so no other column names can collide.
    # The right-hand keys are unique, so the result has exactly one row per
    # metadata row, in the original order, on a fresh 0..n-1 index.
    incoming = metadata[[key]].merge(lookup, on=key, how="left")[column]

    if column in metadata.columns:
        previous = metadata[column].reset_index(drop=True)
        combined = incoming.combine_first(previous)
        changed = combined.notna() & (combined != previous)
    else:
        # Nothing to compare against: every value filled in counts as a change.
        combined = incoming
        changed = combined.notna()

    updated = metadata.copy()
    # Assign positionally so the caller's index on ``metadata`` is irrelevant.
    updated[column] = combined.to_numpy()
    return updated, int(changed.sum())


def main():
    """Read both CSV files, update the treatment column, save and report."""
    df1 = pd.read_csv(first_metadata_file)
    df2 = pd.read_csv(second_metadata_file)

    df1, num_changed = merge_treatment(df1, df2)

    # Save the updated DataFrame
    df1.to_csv(os.path.join(output_dir, "updated_first_metadata.csv"), index=False)

    # Print the report
    print("Updated first metadata file saved as 'updated_first_metadata.csv'.")
    print(f"Number of rows where the treatment value was changed: {num_changed}")


if __name__ == "__main__":
    main()