# mvppred/scripts/train_final.py
# Author: Md Wasi Ul Kabir
# Initial commit (8bb21fb)
# scripts/train_final.py
from __future__ import annotations

import os
import sys
from pathlib import Path

# Thread caps must be exported BEFORE numpy is imported: OpenBLAS/MKL/OMP
# read these environment variables once at library load time, so setting
# them after `import numpy` has no effect.
os.environ["OMP_NUM_THREADS"] = "64"
os.environ["OPENBLAS_NUM_THREADS"] = "64"
os.environ["MKL_NUM_THREADS"] = "64"
os.environ["NUMEXPR_NUM_THREADS"] = "64"

import numpy as np
import pandas as pd

# Add project root to PYTHONPATH so `import src...` works when running as a script
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from src.pipeline import run_10fold_cv_paper_like, train_and_save_final_models
def load_data() -> tuple[pd.DataFrame, list[str], list[str]]:
    """Load and row-align the morphology and performance CSVs.

    Returns:
        data: row-aligned concatenation of both CSVs, with the categorical
            columns one-hot encoded (missing values get their own dummy).
        morph_cols: feature column names (numeric morphology columns plus
            the generated one-hot columns).
        targets: performance target column names.

    Raises:
        ValueError: if the two CSVs have different row counts (there is no
            shared ID column, so rows cannot be safely aligned), or if any
            expected target column is missing after the merge.
    """
    morph_path = "Data/morph_dataset.csv"
    perf_path = "Data/performance_dataset.csv"
    morph_df = pd.read_csv(morph_path)
    perf_df = pd.read_csv(perf_path)

    # -1 is the datasets' missing-value sentinel; convert to NaN.
    morph_df = morph_df.replace(-1, np.nan)
    perf_df = perf_df.replace(-1, np.nan)

    # If row counts differ, stop (can't safely align row-by-row).
    if len(morph_df) != len(perf_df):
        raise ValueError(
            f"Row counts differ: morph={len(morph_df)} perf={len(perf_df)}. "
            "Need a shared ID column to merge."
        )

    # Row-aligned join (indices reset so positions, not labels, line up).
    data = pd.concat(
        [morph_df.reset_index(drop=True), perf_df.reset_index(drop=True)],
        axis=1,
    )

    # Targets mirror the performance CSV columns (excluding the ecomorph label).
    targets = [
        "sprint", "endurance", "bite", "distance_capacity",
        "jump_distance", "jump_vel", "jump_accel", "jump_power", "angle",
    ]

    # Categorical columns to one-hot encode (based on the CSV headers).
    cat_cols = ["taxon", "genus", "species", "sex", "ecomorph"]

    # Numeric morphology columns: everything in morph_df except categoricals.
    # Morph headers: taxon, genus, species, sex, mass, svl, hl, hw, ...
    morph_numeric = [c for c in morph_df.columns if c not in cat_cols]

    # One-hot encode; dummy_na=True keeps missing values as their own category.
    # NOTE(review): the Int64 cast assumes the categorical columns are
    # integer-coded — it would fail on raw string labels. Confirm upstream.
    data[cat_cols] = data[cat_cols].astype("Int64")  # keep missing-safe ints
    data = pd.get_dummies(data, columns=cat_cols, dummy_na=True)

    # Feature set = numeric morphology columns + the generated one-hot columns
    # for taxon/genus/species/sex/ecomorph.
    dummy_cols = [c for c in data.columns if any(c.startswith(f"{cc}_") for cc in cat_cols)]
    morph_cols = morph_numeric + dummy_cols

    # Sanity check: ensure all targets survived the merge.
    missing = [t for t in targets if t not in data.columns]
    if missing:
        raise ValueError(f"Targets missing from merged data: {missing}. Check perf CSV header.")

    return data, morph_cols, targets
def main():
    """Entry point: load the merged dataset, then fit and persist final models."""
    data, morph_cols, targets = load_data()
    bundle_paths = train_and_save_final_models(
        data=data,
        morph_cols=morph_cols,
        targets=targets,
        save_dir="artifacts_inference",
        experiment_name="InferenceModels_SM2_KNN",
        experiment_tag="final_fit",
    )
    print("\nSaved bundles:")
    print(bundle_paths)


if __name__ == "__main__":
    main()