# Upload-page residue kept for provenance (not part of the script):
#   sinxcosx11's picture
#   Automated model and inference script upload v2
#   3961ee7 verified
import pandas as pd
import numpy as np
import joblib
import os
import logging
from pymatgen.core import Composition
import re
from .constants import KNOWN_ELEMENT_SYMBOLS, ATMOSPHERE_CONFIG, MIXING_METHOD_CONFIG, MAGPIE_FEATURIZER, MAGPIE_LABELS, matminer_available
from .feature_engineering_utils import standardize_chemical_formula, generate_compositional_features
from .process_feature_utils import generate_process_features_for_input, generate_stoichiometry_features_for_input
# Artifact locations, expressed relative to this script's directory
# (they are joined onto os.path.dirname(__file__) at load time).
MODEL_DIR = "../models"
PREPROCESSOR_DIR = "../models"  # preprocessors currently share the models directory
ELEMENTAL_DATA_PATH = os.path.join(MODEL_DIR, "df_elements_processed.pkl")

# Module-level caches filled in by load_all_artifacts_once().
DF_ELEMENTS_PROCESSED_GLOBAL = None  # processed elemental data table, once loaded
ESSENTIAL_OBJECTS = dict()  # models / encoders / imputers / scalers / feature columns
def load_all_artifacts_once():
    """Load the elemental data table and every per-model artifact into the module caches.

    The first successful call fills ``DF_ELEMENTS_PROCESSED_GLOBAL`` and the
    ``ESSENTIAL_OBJECTS`` sub-dictionaries (``models``, ``encoders``,
    ``imputers``, ``scalers``, ``feature_columns``) and records the outcome
    under ``ESSENTIAL_OBJECTS["loaded_successfully"]``. Later calls return
    immediately once that flag is truthy; after a failed attempt the next
    call tries again.

    Returns:
        bool: True when all artifacts are loaded (or were already loaded).
        False when the elemental data cannot be read, or when any artifact
        of any model fails to load.
    """
    global DF_ELEMENTS_PROCESSED_GLOBAL, ESSENTIAL_OBJECTS, matminer_available, MAGPIE_FEATURIZER, MAGPIE_LABELS

    if ESSENTIAL_OBJECTS.get("loaded_successfully"):
        logging.info("Artifacts already loaded.")
        return True

    logging.info("--- Loading Essential Artifacts for Prediction ---")
    script_dir = os.path.dirname(__file__)

    # Resolve the path before entering the try block so the except handler
    # can always reference it in its log message.
    elemental_data_full_path = os.path.join(script_dir, ELEMENTAL_DATA_PATH)
    try:
        # SECURITY: pd.read_pickle (and joblib.load below) unpickle the file
        # and can execute arbitrary code; only load artifacts you trust.
        DF_ELEMENTS_PROCESSED_GLOBAL = pd.read_pickle(elemental_data_full_path)
        ESSENTIAL_OBJECTS["elemental_data"] = DF_ELEMENTS_PROCESSED_GLOBAL
        logging.info(f"Loaded processed elemental data from {elemental_data_full_path}")
    except Exception as e:
        logging.critical(f"CRITICAL: Error loading elemental data from {elemental_data_full_path}: {e}")
        return False

    if not matminer_available:  # Attempt to re-init if constants.py didn't catch it
        # NOTE(review): these assignments rebind the names in THIS module only;
        # other modules that did `from .constants import ...` keep their own
        # bindings. Confirm that is sufficient for the featurization helpers.
        try:
            from matminer.featurizers.composition import ElementProperty
            MAGPIE_FEATURIZER = ElementProperty.from_preset("magpie", impute_nan=True)
            MAGPIE_LABELS = [f'magpie_{label.replace(" ", "_")}' for label in MAGPIE_FEATURIZER.feature_labels()]
            matminer_available = True
            logging.info("Matminer re-initialized in inference script.")
        except Exception:
            # Best-effort fallback. `except Exception` (rather than a bare
            # `except:`) lets KeyboardInterrupt / SystemExit propagate.
            logging.warning("Matminer could not be re-initialized in inference script.")
            logging.debug("Matminer re-initialization failure details:", exc_info=True)

    ESSENTIAL_OBJECTS["models"] = {}
    ESSENTIAL_OBJECTS["encoders"] = {}
    ESSENTIAL_OBJECTS["imputers"] = {}
    ESSENTIAL_OBJECTS["scalers"] = {}
    ESSENTIAL_OBJECTS["feature_columns"] = {}

    all_loaded_successfully = True
    for model_type_key in ["temperature_bin", "atmosphere_category"]:
        model_artifact_name = f"{model_type_key}_tuned"
        try:
            ESSENTIAL_OBJECTS["models"][model_type_key] = joblib.load(os.path.join(script_dir, MODEL_DIR, f"{model_artifact_name}_lgbm_model.joblib"))
            ESSENTIAL_OBJECTS["encoders"][model_type_key] = joblib.load(os.path.join(script_dir, MODEL_DIR, f"{model_artifact_name}_label_encoder.joblib"))
            ESSENTIAL_OBJECTS["imputers"][model_type_key] = joblib.load(os.path.join(script_dir, PREPROCESSOR_DIR, f"{model_artifact_name}_imputer.joblib"))
            ESSENTIAL_OBJECTS["scalers"][model_type_key] = joblib.load(os.path.join(script_dir, PREPROCESSOR_DIR, f"{model_artifact_name}_scaler.joblib"))
            ESSENTIAL_OBJECTS["feature_columns"][model_type_key] = joblib.load(os.path.join(script_dir, PREPROCESSOR_DIR, f"{model_artifact_name}_feature_columns.joblib"))
            logging.info(f"Loaded artifacts for {model_artifact_name} model.")
        except Exception as e:
            logging.error(f"Error loading one or more artifacts for '{model_artifact_name}': {e}. Predictions for it may fail.")
            # Mark the model itself as unavailable so predictors can skip it.
            ESSENTIAL_OBJECTS["models"][model_type_key] = None
            all_loaded_successfully = False

    ESSENTIAL_OBJECTS["loaded_successfully"] = all_loaded_successfully
    return all_loaded_successfully
def create_feature_vector_for_prediction(raw_synthesis_input, model_target_name):
    """Build the single-row feature frame that one prediction model expects.

    The row is assembled from the target formula, aggregated precursor
    compositional features, process-operation features and stoichiometry
    features, then the selected numeric columns are imputed and scaled with
    the preprocessors stored for ``model_target_name``.

    Args:
        raw_synthesis_input: Mapping describing the synthesis. The keys read
            here are ``'target_formula_raw'``, ``'precursor_formulas_raw'``,
            ``'operations_simplified_list'``, ``'reactants_simplified'`` and
            ``'products_simplified'``; missing or ``None`` list-valued
            entries are treated as empty lists.
        model_target_name: Key into the loaded artifact dictionaries
            (e.g. ``"temperature_bin"``).

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are the model's expected
        feature columns, or ``None`` when the elemental data or the feature
        column list is unavailable, or when imputation/scaling fails.
    """
    if DF_ELEMENTS_PROCESSED_GLOBAL is None:
        logging.error("Elemental data not loaded. Call load_all_artifacts_once() first.")
        return None

    expected_feature_cols = ESSENTIAL_OBJECTS.get("feature_columns", {}).get(model_target_name)
    # Explicit None/length test: this module does not fix the container type
    # of the stored column list, and `not x` raises for array-like objects
    # with more than one element.
    if expected_feature_cols is None or len(expected_feature_cols) == 0:
        logging.error(f"Feature column list for '{model_target_name}' not found in loaded artifacts.")
        return None
    expected_feature_cols = list(expected_feature_cols)

    # Flag/one-hot style columns start at 0; everything else starts as NaN so
    # the imputer can fill whatever is not computed below.
    zero_default_prefixes = ("ops_", "proc_has_", "elem_block_")
    feature_dict = {}
    for col in expected_feature_cols:
        zero_default = (
            col.startswith(zero_default_prefixes)
            or "is_stoichiometric" in col
            or "is_elements_only" in col
        )
        feature_dict[col] = 0 if zero_default else np.nan

    # Target Compositional Features
    std_target_output = standardize_chemical_formula(raw_synthesis_input.get('target_formula_raw'), "predict_target")
    target_comp_feats = generate_compositional_features(std_target_output, DF_ELEMENTS_PROCESSED_GLOBAL, "predict_target_comp")
    # `or {}` guards against the helper returning None for an unusable formula.
    for k, v in (target_comp_feats or {}).items():
        feature_key = f'target_{k}'
        if feature_key in feature_dict:
            feature_dict[feature_key] = v

    # Precursor Compositional Features
    precursor_formulas_raw = raw_synthesis_input.get('precursor_formulas_raw') or []
    std_precursors_outputs = [standardize_chemical_formula(p, f"predict_prec_{i}") for i, p in enumerate(precursor_formulas_raw)]

    num_valid_precursors = 0
    num_stoich_precursors = 0
    num_elements_only_precursors = 0
    precursor_comp_feats_list = []
    for std_p_output in std_precursors_outputs:
        if std_p_output is None:
            continue
        num_valid_precursors += 1
        if isinstance(std_p_output, str):
            num_stoich_precursors += 1
        elif isinstance(std_p_output, dict) and std_p_output.get('type') == 'elements_only':
            num_elements_only_precursors += 1
        precursor_comp_feats_list.append(generate_compositional_features(std_p_output, DF_ELEMENTS_PROCESSED_GLOBAL, "predict_prec_comp"))

    feature_dict['num_valid_precursors'] = num_valid_precursors
    feature_dict['all_prec_are_stoichiometric'] = (num_stoich_precursors == num_valid_precursors) if num_valid_precursors > 0 else False
    feature_dict['any_prec_is_elements_only'] = (num_elements_only_precursors > 0) if num_valid_precursors > 0 else False

    if precursor_comp_feats_list:
        df_prec_feats = pd.DataFrame(precursor_comp_feats_list)
        numeric_cols_df_prec = df_prec_feats.select_dtypes(include=np.number)
        if not numeric_cols_df_prec.empty:
            # A throwaway "H2O" featurization is used only to discover which
            # compositional feature names are numeric.
            temp_sample_df = pd.DataFrame([generate_compositional_features("H2O", DF_ELEMENTS_PROCESSED_GLOBAL)])
            numeric_sample_comp_keys = [
                k for k in temp_sample_df.columns
                if pd.api.types.is_numeric_dtype(temp_sample_df[k]) and k not in ['is_stoichiometric_formula']
            ]
            for agg_func_name in ['mean', 'std', 'min', 'max', 'sum']:
                aggregated_vals = getattr(numeric_cols_df_prec, agg_func_name)()
                for feat_name_suffix in numeric_sample_comp_keys:
                    agg_feat_key = f"{agg_func_name}_prec_{feat_name_suffix}"
                    if agg_feat_key in feature_dict and feat_name_suffix in aggregated_vals:
                        feature_dict[agg_feat_key] = aggregated_vals[feat_name_suffix]

    # Process Features
    process_input_ops_list = raw_synthesis_input.get('operations_simplified_list') or []
    # Category vocabularies are recovered from the one-hot column names the
    # model was trained with; sorted so the order is deterministic across runs.
    all_atm_cats = sorted({col.split('ops_atm_cat_')[-1] for col in expected_feature_cols if col.startswith('ops_atm_cat_')})
    all_mix_meths = sorted({col.split('ops_mix_meth_')[-1] for col in expected_feature_cols if col.startswith('ops_mix_meth_')})
    proc_feats_generated = generate_process_features_for_input(process_input_ops_list, all_atm_cats, all_mix_meths)
    for k, v in proc_feats_generated.items():
        if k in feature_dict:
            feature_dict[k] = v

    # Stoichiometry features
    reactants_simplified = raw_synthesis_input.get('reactants_simplified') or []
    products_simplified = raw_synthesis_input.get('products_simplified') or []
    stoich_feats_generated = generate_stoichiometry_features_for_input(reactants_simplified, products_simplified, standardize_chemical_formula)
    for k, v in stoich_feats_generated.items():
        if k in feature_dict:
            feature_dict[k] = v

    # `columns=` both orders the row and drops helper keys the model does not use.
    feature_vector_df = pd.DataFrame([feature_dict], columns=expected_feature_cols)

    # Impute and Scale
    imputer = ESSENTIAL_OBJECTS.get("imputers", {}).get(model_target_name)
    scaler = ESSENTIAL_OBJECTS.get("scalers", {}).get(model_target_name)
    untransformed_prefixes = ('ops_', 'proc_has_', 'elem_block_')
    untransformed_columns = ['is_stoichiometric_formula', 'all_prec_are_stoichiometric', 'any_prec_is_elements_only', 'num_valid_precursors']
    numerical_features_for_transform = [
        col for col in expected_feature_cols
        if pd.api.types.is_numeric_dtype(feature_vector_df[col].dtype)
        and not col.startswith(untransformed_prefixes)
        and col not in untransformed_columns
    ]

    # Identity checks instead of truthiness: an estimator object that defines
    # __len__ could otherwise be mistaken for "missing".
    if imputer is not None and scaler is not None and numerical_features_for_transform:
        try:
            feature_vector_df[numerical_features_for_transform] = feature_vector_df[numerical_features_for_transform].astype(np.float64)
            feature_vector_df[numerical_features_for_transform] = imputer.transform(feature_vector_df[numerical_features_for_transform])
            feature_vector_df[numerical_features_for_transform] = scaler.transform(feature_vector_df[numerical_features_for_transform])
            logging.info("Feature vector imputed and scaled for prediction.")
        except Exception as e_transform:
            logging.error(f"Error during imputation/scaling for prediction: {e_transform}", exc_info=True)
            return None
    else:
        logging.warning("Imputer, Scaler or numerical features missing for prediction. Proceeding with caution.")

    return feature_vector_df
def predict_synthesis_outcome(raw_synthesis_input):
    """Run every available prediction model on one synthesis description.

    Artifacts are loaded on demand if they have not been loaded successfully
    yet.

    Args:
        raw_synthesis_input: Synthesis description forwarded unchanged to
            ``create_feature_vector_for_prediction``.

    Returns:
        dict: Keyed by model type (``"temperature_bin"``,
        ``"atmosphere_category"``). Each value is either a dict with
        ``'predicted_label'`` and ``'probabilities'`` (class name -> score),
        or an error string when feature construction or inference failed.
        Model types whose model is unavailable are omitted. An empty dict is
        returned when the artifacts cannot be loaded.
    """
    artifacts_ready = ESSENTIAL_OBJECTS.get("loaded_successfully")
    if not artifacts_ready and not load_all_artifacts_once():
        logging.error("Essential artifacts could not be loaded. Cannot make predictions.")
        return {}

    predictions = {}
    for model_type in ("temperature_bin", "atmosphere_category"):
        # Guard: nothing to do for a model that was not loaded.
        if not ESSENTIAL_OBJECTS["models"].get(model_type):
            logging.warning(f"{model_type} model not available for prediction.")
            continue

        logging.info(f"\n--- Predicting {model_type} ---")
        feature_vector = create_feature_vector_for_prediction(raw_synthesis_input, model_type)
        if feature_vector is None:
            logging.error(f"Could not create feature vector for {model_type} model.")
            predictions[model_type] = "Feature vector creation error"
            continue

        model = ESSENTIAL_OBJECTS["models"][model_type]
        encoder = ESSENTIAL_OBJECTS["encoders"][model_type]
        try:
            pred_encoded = model.predict(feature_vector)
            pred_proba = model.predict_proba(feature_vector)
            pred_label = encoder.inverse_transform(pred_encoded)[0]

            # Pair each encoder class with the score for the single input row.
            class_probabilities = {}
            for cls, prob in zip(encoder.classes_, pred_proba[0]):
                class_probabilities[str(cls)] = prob

            predictions[model_type] = {
                'predicted_label': pred_label,
                'probabilities': class_probabilities,
            }
            logging.info(f"Predicted {model_type}: {pred_label}")
            logging.info(f"Probabilities: {predictions[model_type]['probabilities']}")
        except Exception as e:
            logging.error(f"Error during {model_type} prediction: {e}", exc_info=True)
            predictions[model_type] = f"Prediction Error: {e}"

    return predictions
if __name__ == '__main__':
    # This block is for testing this inference script directly.
    # Without a configured handler the root logger drops INFO records, so the
    # loading/prediction diagnostics above would be invisible. basicConfig is
    # a no-op when the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)

    # Ensure artifacts are loaded
    if not load_all_artifacts_once():
        print("Exiting due to failure in loading essential artifacts.")
    else:
        print("\n--- Example Interactive Prediction ---")
        example_input_with_ops_list = {
            'target_formula_raw': "YBa2Cu3O7",
            'precursor_formulas_raw': ["Y2O3", "BaCO3", "CuO"],
            'operations_simplified_list': [
                {
                    'type': 'MixingOperation',
                    'string': 'Mix precursors by ball milling for 4h',
                    'conditions': {'duration': [{'value': 4, 'unit': 'h'}]},
                },
                {
                    'type': 'HeatingOperation',
                    'string': 'Calcined at 900C for 12h in air',
                    'conditions': {
                        'heating_temperature': [{'value': 900, 'unit': 'C'}],
                        'heating_time': [{'value': 12, 'unit': 'h'}],
                        'atmosphere': 'Air',
                    },
                },
                {
                    'type': 'HeatingOperation',
                    'string': 'Sintered at 950C for 24h in O2',
                    # NOTE(review): the description says 24h but heating_time
                    # is 20 — confirm which value is intended.
                    'conditions': {
                        'heating_temperature': [{'value': 950, 'unit': 'C'}],
                        'heating_time': [{'value': 20, 'unit': 'h'}],
                        'atmosphere': 'Oxygen',
                    },
                },
            ],
            'reactants_simplified': [
                {'material': 'Y2O3', 'amount': 0.5},
                {'material': 'BaCO3', 'amount': 2.0},
                {'material': 'CuO', 'amount': 3.0},
            ],
            'products_simplified': [{'material': 'YBa2Cu3O7', 'amount': 1.0}],
        }
        predictions = predict_synthesis_outcome(example_input_with_ops_list)
        print(f"\nFinal Predictions for example input: {predictions}")