# synthesis_condition_temperaturepredict / src/feature_engineering_utils.py
# sinxcosx11's picture
# Automated model and inference script upload v2
# 3961ee7 verified
import pandas as pd
import numpy as np
from pymatgen.core import Composition, Element as PymatgenElement
import ast
import re
import logging
from .constants import KNOWN_ELEMENT_SYMBOLS, MAGPIE_FEATURIZER, MAGPIE_LABELS, matminer_available
# --- Formula Cleaning and Standardization ---
def clean_formula_string_advanced(formula_str_original):
    """Strip annotations from a raw formula string using a fixed regex pipeline.

    Non-string inputs are returned unchanged. For strings the steps run in
    this order:

    1. If the last parenthesised group looks like a formula and the text in
       front of it looks like a name/number rather than part of a formula,
       keep only the parenthesised content.
    2. Drop a leading single Greek or Latin letter followed by a hyphen.
    3. Drop hydrate notations (``·nH2O``, ``*5H2O``, ``(H2O)2`` ...).
    4. Drop a trailing parenthesised annotation (normality such as ``2N``,
       a number/percentage, or one of a fixed list of keywords) and a leading
       parenthesised number that directly precedes an uppercase letter.
    5. Convert integer fractions (``1/2``) to decimals rounded to 4 places.
    6. Drop a leading alphabetic prefix followed by a hyphen, then strip
       surrounding `` .,;·*()`` characters.

    Args:
        formula_str_original: Raw formula text (any other type is passed
            through untouched).

    Returns:
        The cleaned string, or the original object if it was not a ``str``.
    """
    if not isinstance(formula_str_original, str):
        return formula_str_original

    def replace_frac(match):
        """Return the decimal form of an ``a/b`` match, or the match text itself."""
        try:
            num = float(match.group(1))
            den = float(match.group(2))
            return str(round(num / den, 4)) if den != 0 else match.group(0)
        except (ValueError, ZeroDivisionError, OverflowError):
            # Leave the text untouched if the numbers cannot be evaluated.
            return match.group(0)

    cleaned = formula_str_original.strip()

    # --- Step 1: "name (Formula)" style inputs -----------------------------
    paren_match = re.search(r'\(([^()]+)\)[^()]*$', cleaned)
    if paren_match:
        potential_formula_in_parens = paren_match.group(1).strip()
        part_before_parens = cleaned[:paren_match.start()].strip()
        looks_like_formula = (
            len(potential_formula_in_parens) > 1
            and re.search(r"[A-Z]", potential_formula_in_parens)
            and re.fullmatch(r"[A-Za-z0-9\.\(\)\[\]]+", potential_formula_in_parens)
        )
        if looks_like_formula:
            # NOTE(review): an empty prefix also triggers the extraction, so
            # "(NH4)2SO4" is reduced to "NH4", and a bare "N" prefix matches
            # the normality pattern ("N(CH3)3" -> "CH3"). Kept as-is to
            # preserve existing behavior -- confirm whether this is intended.
            prefix_is_not_formula = (
                not part_before_parens
                or " " in part_before_parens
                or len(part_before_parens) > len(potential_formula_in_parens) + 5
                or (part_before_parens.isalpha() and len(part_before_parens) > 4)
                or re.fullmatch(r"\d+(\.\d+)?", part_before_parens)
                or re.fullmatch(r"\d*N", part_before_parens, re.IGNORECASE)
            )
            prefix_is_numeric_only = (
                not re.search(r"[A-Za-z]", part_before_parens)
                and re.search(r"\d", part_before_parens)
            )
            if prefix_is_not_formula or prefix_is_numeric_only:
                cleaned = potential_formula_in_parens

    # --- Step 2: single-letter prefixes such as "α-" or "a-" ---------------
    cleaned = re.sub(r"^[αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]-", "", cleaned)
    cleaned = re.sub(r"^[a-zA-Z]-", "", cleaned)

    # --- Step 3: hydrate notations ----------------------------------------
    cleaned = re.sub(r"[·*]\s*\d*(\.\d+)?[nNxX]?\s*H2O", "", cleaned)
    cleaned = re.sub(r"\s*\(\s*H2O\s*\)\s*\d*(\.\d+)?", "", cleaned)
    cleaned = re.sub(r"·\s*H2O", "", cleaned)

    # --- Step 4: parenthesised annotations --------------------------------
    cleaned = re.sub(
        r"\s*\(\s*(?:\d*N|\d+(?:\.\d+)?%?|solution|gas|powder|aq|amorphous|amorph|polytype|phase|\d{1,4})\s*\)\s*$",
        "",
        cleaned,
        flags=re.IGNORECASE,
    )
    cleaned = re.sub(r"^\s*\(\s*\d+(\.\d+)?\s*\)\s*(?=[A-Z])", "", cleaned)

    # --- Step 5: fractions -> decimals ------------------------------------
    # First fractions that follow a letter, digit or ")", then one at the start.
    cleaned = re.sub(r"(?<=[A-Za-z\d\)])(\d+)\s*/\s*(\d+)", replace_frac, cleaned)
    cleaned = re.sub(r"^(\d+)\s*/\s*(\d+)", replace_frac, cleaned)

    # --- Step 6: remaining alphabetic prefixes and stray punctuation ------
    cleaned = re.sub(r"^\s*\(?[a-zA-Z\s]+\)?-", "", cleaned)
    cleaned = re.sub(r"^[a-zA-Z]+-", "", cleaned)
    cleaned = cleaned.strip(" .,;·*()")
    return cleaned
def is_plausible_formula_for_pymatgen(cleaned_formula_str, entry_identifier):
    """Heuristically decide whether a cleaned string is worth parsing as a formula.

    Returns ``False`` when the input is not a non-blank string, when it
    contains ``+``, ``==``, ``->`` or ``;``, or when it matches one of the
    variable-stoichiometry patterns below AND contains a stand-alone
    ``x``/``y``/``z``/``δ``/``n`` character (not adjacent to another letter on
    the checked sides) whose upper-cased form is not in
    ``KNOWN_ELEMENT_SYMBOLS``. Otherwise returns ``True``.

    Args:
        cleaned_formula_str: Candidate formula text.
        entry_identifier: Not used by this function; kept for interface
            compatibility with callers.

    Returns:
        bool: ``True`` if the string passes the checks.
    """
    if not isinstance(cleaned_formula_str, str) or not cleaned_formula_str.strip():
        return False
    if any(token in cleaned_formula_str for token in ('+', '==', '->', ';')):
        return False

    # Patterns that hint at a variable subscript such as "1-x" or "0.5x".
    # All of them are applied case-insensitively.
    variable_indicators = (
        r"[A-Za-z]\d*\s*[-+*]\s*[xyzδδn]",
        r"[xyzδδn]\s*[-+*]",
        r"[A-Za-z]\d*\(\s*\d*\s*[-+]\s*[xyzδδn]\s*\)?",
        r"(?<![A-Za-z])(?:[1-9]\d*|0)?\.\d*[xyzδδn]",
        r"[xyzδδn]\d+",
        r"[A-Za-z]\s*[xyzδδn]\s*\d*",
        r"1-[xyzδδn]",
    )
    has_indicator = any(
        re.search(pattern, cleaned_formula_str, re.IGNORECASE)
        for pattern in variable_indicators
    )
    if not has_indicator:
        return True

    # The candidate variables do not depend on which indicator matched, so
    # they are collected once instead of once per matching pattern.
    variable_char_pattern = r"(?i)(?<![A-Z])([xyzδδn])(?![a-z])"
    possible_vars = re.findall(variable_char_pattern, cleaned_formula_str)
    return not any(
        pv.upper() not in KNOWN_ELEMENT_SYMBOLS
        for pv in possible_vars
        if len(pv) == 1
    )
def standardize_chemical_formula(raw_formula_str, entry_identifier="Unknown_Entry"):
    """Turn a raw formula string into a reduced formula or an element-set fallback.

    The string is cleaned with ``clean_formula_string_advanced``. If it then
    passes ``is_plausible_formula_for_pymatgen`` and pymatgen can parse it
    into a ``Composition`` whose elements are all in
    ``KNOWN_ELEMENT_SYMBOLS``, the reduced formula (spaces removed) is
    returned. Otherwise the known element symbols found in the cleaned text
    are returned in a dict.

    Args:
        raw_formula_str: Raw formula text.
        entry_identifier: Label used only to build the context string passed
            to the plausibility check and to the debug log.

    Returns:
        * ``str`` - reduced formula, when parsing succeeds;
        * ``dict`` - ``{'type': 'elements_only', 'elements': set, 'original_cleaned': str}``
          when only element symbols could be extracted;
        * ``None`` - for non-string/blank input or when nothing usable is found.
    """
    if not isinstance(raw_formula_str, str) or not raw_formula_str.strip():
        return None

    cleaned_formula_str = clean_formula_string_advanced(raw_formula_str)
    if not cleaned_formula_str:
        return None

    context = f"{entry_identifier} (Original: '{raw_formula_str}', Cleaned: '{cleaned_formula_str}')"
    if is_plausible_formula_for_pymatgen(cleaned_formula_str, context):
        comp_formula_for_pymatgen = cleaned_formula_str.replace(" ", "")
        if not comp_formula_for_pymatgen:
            return None
        try:
            comp = Composition(comp_formula_for_pymatgen)
            if all(el.symbol in KNOWN_ELEMENT_SYMBOLS for el in comp.elements):
                return comp.get_reduced_formula_and_factor()[0].replace(" ", "")
        except Exception as exc:
            # Best-effort parse: fall through to the element-only fallback,
            # but leave a trace instead of swallowing the error silently.
            logging.getLogger(__name__).debug(
                "pymatgen could not parse formula for %s: %s", context, exc
            )

    # Fallback: collect every token shaped like an element symbol that is known.
    extracted_elements = {
        el
        for el in re.findall(r"([A-Z][a-z]?)", cleaned_formula_str)
        if el in KNOWN_ELEMENT_SYMBOLS
    }
    if extracted_elements:
        return {
            'type': 'elements_only',
            'elements': extracted_elements,
            'original_cleaned': cleaned_formula_str,
        }
    return None
def get_valence_features(valences_input, entry_identifier="Unknown_Entry"):
    """Summarise a list of valences as average, minimum and maximum.

    Args:
        valences_input: A list of valences, or a string holding a Python
            literal list (parsed with ``ast.literal_eval``).
        entry_identifier: Not used by this function; kept for interface
            compatibility with callers.

    Returns:
        dict with keys ``avg_valence``, ``min_valence`` and ``max_valence``.
        All three are ``np.nan`` when the input is not a non-empty list, the
        string cannot be parsed, or the list holds no ``int``/``float``
        entries.
    """

    def _all_nan():
        # Fresh dict per call so callers can mutate the result safely.
        return {'avg_valence': np.nan, 'min_valence': np.nan, 'max_valence': np.nan}

    valences_list = valences_input
    if isinstance(valences_input, str):
        try:
            valences_list = ast.literal_eval(valences_input)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval can also exhaust memory/recursion on pathological
            # strings; treat every parse failure as "no valences".
            valences_list = []

    if not isinstance(valences_list, list) or not valences_list:
        return _all_nan()

    # bool is a subclass of int; True/False are not valences, so skip them.
    numeric_valences = [
        v for v in valences_list
        if isinstance(v, (int, float)) and not isinstance(v, bool)
    ]
    if not numeric_valences:
        return _all_nan()

    return {
        'avg_valence': np.mean(numeric_valences),
        'min_valence': np.min(numeric_valences),
        'max_valence': np.max(numeric_valences),
    }
def generate_compositional_features(formula_input, df_elements_processed, entry_identifier="Unknown_Formula"):
    """Build a flat feature dict for one formula.

    Two kinds of input are handled:

    * ``str`` - parsed with pymatgen ``Composition``; amount-weighted averages
      and min/max/range/variance statistics are computed from
      ``df_elements_processed``, plus Magpie features when matminer is
      available.
    * ``dict`` with ``type == 'elements_only'`` - unweighted statistics over
      the listed elements that are present in ``df_elements_processed``.

    Any other input (including ``None``) yields the default dict.

    Args:
        formula_input: Reduced formula string, ``elements_only`` dict, or None.
        df_elements_processed: Element property table; the code looks
            elements up by symbol in its index and reads property columns by
            name (assumes a pandas DataFrame indexed by element symbol -
            TODO confirm against caller).
        entry_identifier: Label used only in debug log messages.

    Returns:
        dict: the default features (NaN / False / 0) overlaid with whatever
        could be computed.

    NOTE(review): the stoichiometric branch also writes
    ``var_electronegativity`` and ``range_atomic_radius``, which are not part
    of the defaults, so those keys only exist for parsed formulas. Kept to
    preserve the existing feature layout.
    """
    unweighted_source_cols = [
        'Atomic_Weight', 'Electronegativity', 'Atomic_Radius', 'Melting_Point',
        'Density', 'avg_valence', 'valence_electrons_estimated',
    ]

    # --- Defaults (key order matters: it defines the output layout) --------
    default_feature_dict = {'is_stoichiometric_formula': False, 'num_elements_in_formula': 0}
    basic_props = [
        'avg_atomic_weight', 'avg_electronegativity', 'avg_atomic_radius',
        'avg_melting_point', 'avg_density', 'avg_specific_heat',
        'avg_thermal_conductivity', 'avg_heat_of_fusion', 'sum_atomic_weight',
        'range_electronegativity', 'min_electronegativity', 'max_electronegativity',
        'var_atomic_radius', 'min_atomic_radius', 'max_atomic_radius',
        'avg_valence_of_comp', 'avg_est_valence_electrons',
    ]
    # All "avg_*" names first, then "min_*", "max_*", "var_*".
    unweighted_props = [
        f'{stat}_{prop.lower()}_unweighted'
        for stat in ('avg', 'min', 'max', 'var')
        for prop in unweighted_source_cols
    ]
    for k in basic_props + unweighted_props:
        default_feature_dict[k] = np.nan
    if matminer_available and MAGPIE_LABELS:
        for label in MAGPIE_LABELS:
            default_feature_dict[label] = np.nan

    if formula_input is None:
        return default_feature_dict.copy()

    features = {}
    if isinstance(formula_input, str):
        try:
            comp = Composition(formula_input)
            el_amt_dict = comp.get_el_amt_dict()
            total_atoms = sum(el_amt_dict.values())
            if total_atoms == 0:
                return {**default_feature_dict, 'is_stoichiometric_formula': False}

            features['is_stoichiometric_formula'] = True
            features['num_elements_in_formula'] = len(el_amt_dict)

            # feature name -> element-table column
            props_for_avg_mapping = {
                'avg_atomic_weight': 'Atomic_Weight',
                'avg_electronegativity': 'Electronegativity',
                'avg_atomic_radius': 'Atomic_Radius',
                'avg_melting_point': 'Melting_Point',
                'avg_density': 'Density',
                'avg_specific_heat': 'Specific_Heat',
                'avg_thermal_conductivity': 'Thermal_Conductivity',
                'avg_heat_of_fusion': 'Heat_of_Fusion',
                'avg_valence_of_comp': 'avg_valence',
                'avg_est_valence_electrons': 'valence_electrons_estimated',
            }
            element_values_for_stats_mapping = {
                'electronegativity': 'Electronegativity',
                'atomic_radius': 'Atomic_Radius',
            }
            current_props_for_avg = {k: [] for k in props_for_avg_mapping}
            current_element_values_for_stats = {k: [] for k in element_values_for_stats_mapping}
            valid_elements_for_avg_count = {k: 0 for k in props_for_avg_mapping}

            for el_obj, amt in el_amt_dict.items():
                el_symbol_str = el_obj.symbol if isinstance(el_obj, PymatgenElement) else str(el_obj)
                if el_symbol_str not in KNOWN_ELEMENT_SYMBOLS:
                    continue
                if el_symbol_str not in df_elements_processed.index:
                    continue
                el_props_series = df_elements_processed.loc[el_symbol_str]

                # Amount-weighted sums; the weight total is tracked per
                # feature so missing values do not dilute the average.
                for feat_key, elem_col_name in props_for_avg_mapping.items():
                    val = el_props_series.get(elem_col_name, np.nan)
                    if pd.notna(val):
                        current_props_for_avg[feat_key].append(val * amt)
                        valid_elements_for_avg_count[feat_key] += amt

                # For min/max/var the value is repeated once per atom; the
                # amount is rounded to an integer, so fractional amounts that
                # round to 0 contribute nothing here.
                for feat_key, elem_col_name in element_values_for_stats_mapping.items():
                    val = el_props_series.get(elem_col_name, np.nan)
                    if pd.notna(val):
                        current_element_values_for_stats[feat_key].extend([val] * int(round(amt)))

            for key, val_list in current_props_for_avg.items():
                if valid_elements_for_avg_count[key] > 0:
                    features[key] = np.nansum(val_list) / valid_elements_for_avg_count[key]
                else:
                    features[key] = np.nan

            features['sum_atomic_weight'] = comp.weight

            for key, val_list in current_element_values_for_stats.items():
                clean_val_list = [v for v in val_list if pd.notna(v)]
                if clean_val_list:
                    features[f'range_{key}'] = np.max(clean_val_list) - np.min(clean_val_list)
                    features[f'min_{key}'] = np.min(clean_val_list)
                    features[f'max_{key}'] = np.max(clean_val_list)
                    features[f'var_{key}'] = np.var(clean_val_list)
                else:
                    for stat in ['range_', 'min_', 'max_', 'var_']:
                        features[f'{stat}{key}'] = np.nan

            if matminer_available and MAGPIE_FEATURIZER:
                try:
                    magpie_vals = MAGPIE_FEATURIZER.featurize(comp)
                    for i, label in enumerate(MAGPIE_LABELS):
                        features[label] = magpie_vals[i]
                except Exception as exc:
                    # Magpie features are optional; keep whatever was set and
                    # leave the rest at their NaN defaults.
                    logging.getLogger(__name__).debug(
                        "Magpie featurization failed for %s: %s", entry_identifier, exc
                    )
        except Exception as exc:
            # Best effort: features computed before the failure are kept, but
            # the row is marked as not stoichiometric.
            logging.getLogger(__name__).debug(
                "Compositional featurization failed for %s: %s", entry_identifier, exc
            )
            features['is_stoichiometric_formula'] = False
    elif isinstance(formula_input, dict) and formula_input.get('type') == 'elements_only':
        features['is_stoichiometric_formula'] = False
        elements_present = formula_input.get('elements', set())
        valid_elements = [el for el in elements_present if el in df_elements_processed.index]
        features['num_elements_in_formula'] = len(valid_elements)
        if valid_elements:
            element_props_subset = df_elements_processed.loc[valid_elements]
            for prop_col in unweighted_source_cols:
                if prop_col not in element_props_subset.columns:
                    continue
                clean_vals = element_props_subset[prop_col].dropna()
                if clean_vals.empty:
                    continue
                suffix = f'{prop_col.lower()}_unweighted'
                features[f'avg_{suffix}'] = clean_vals.mean()
                features[f'min_{suffix}'] = clean_vals.min()
                features[f'max_{suffix}'] = clean_vals.max()
                features[f'var_{suffix}'] = clean_vals.var()

    final_features = default_feature_dict.copy()
    final_features.update(features)
    return final_features