import os
import pandas as pd

output_dir = "/home/syost/git/Cvirg_Pmarinus_RNAseq/data/"  # Update this path as needed
# Load the metadata CSV file
metadata_file_path = os.path.join(output_dir, "merged_metadata.csv")
updated_metadata_path = os.path.join(output_dir, "updated_metadata.csv")
metadata_df = pd.read_csv(metadata_file_path)

# Manually defined second table (replace this with a CSV read if available)
trait_data = pd.DataFrame([
    ["P&S 2020", "PRJNA590205", "242", "susceptible"],
    ["P&S 2020", "PRJNA590205", "266", "resistant"],
    ["P&S 2021", "PRJNA691949", "286", "unknown"],
    ["P&S 2023", "PRJNA894694", "084", "tolerant"],
    ["P&S 2023", "PRJNA894694", "089", "tolerant"],
    ["P&S 2023", "PRJNA894694", "090", "sensitive"],
    ["P&S 2023", "PRJNA894694", "120", "sensitive"],
    ["Johnson et al. 2020", "PRJNA604121", "A", "resistant"],
    ["Johnson et al. 2020", "PRJNA604121", "B", "resistant"],
    ["Johnson et al. 2020", "PRJNA604121", "C", "susceptible"],
    ["Johnson et al. 2020", "PRJNA604121", "D", "susceptible"]
], columns=["Study", "BioProject", "Family", "Trait"])

# Ensure Family is treated as a string
trait_data["Family"] = trait_data["Family"].astype(str)

# Create a dictionary mapping BioProject to Family lengths
family_lengths = trait_data.set_index("BioProject")["Family"].astype(str).apply(len).to_dict()

# Extract the corresponding number of digits from BREED dynamically
metadata_df["BREED_suffix"] = metadata_df.apply(
    lambda row: str(row["BREED"])[-family_lengths.get(row["BioProject"], 0):], axis=1
)

# Merge using BioProject and dynamically extracted Family ID from BREED
merged_df = metadata_df.merge(
    trait_data[["BioProject", "Family", "Trait"]],
    how="left",
    left_on=["BioProject", "BREED_suffix"],
    right_on=["BioProject", "Family"],
    suffixes=("", "_trait1")
)

# Perform the second merge: Match BioProject and Infection level
merged_df = merged_df.merge(trait_data[["BioProject", "Family", "Trait"]], 
                            how='left', left_on=["BioProject", "infection"],
                            right_on=["BioProject", "Family"], suffixes=('', '_trait2'))

# Assign TRSS correctly, ensuring we prioritize the second table's Trait column
merged_df["TRSS"] = merged_df["Trait_trait1"].combine_first(merged_df["Trait_trait2"])

# Ensure TRSS values are locked before proceeding further
trss_values_after_fix = merged_df["TRSS"].notna().sum()

# Update the TRS column based on BioProject PRJNA778545 and Trait values
condition = merged_df["BioProject"] == "PRJNA778545"
merged_df.loc[condition, "TRSS"] = merged_df.loc[condition, "Trait"].map({"tolerant": "resistant", "sensitive": "susceptible"}).fillna("TBD")
merged_df.loc[condition, "BREED"] = merged_df.loc[condition, "Trait"].map({"tolerant": "Crassostrea gigas", "sensitive": "Crassostrea virginica"}).fillna("TBD")

# Append the 'infection' value to 'BREED', prefixed with an underscore, where 'infection' is not NaN
merged_df.loc[merged_df['infection'].notna(), 'BREED'] = merged_df['BREED'] + '_' + merged_df['infection']


# Ensure the TRSS column is not accidentally removed
columns_to_keep = list(metadata_df.columns) + ["TRSS"]
columns_to_keep.remove("BREED_suffix")
columns_to_keep.remove("infection")
merged_df = merged_df[columns_to_keep]

# Save updated metadata file
merged_df.to_csv(updated_metadata_path, index=False)

print(f"Processing complete! Updated metadata saved to: {updated_metadata_path}")