|
|
|
import ast
import logging
import re

import numpy as np
import pandas as pd
from pymatgen.core import Composition, Element as PymatgenElement

from .constants import KNOWN_ELEMENT_SYMBOLS, MAGPIE_FEATURIZER, MAGPIE_LABELS, matminer_available
|
|
|
|
|
def _paren_partners(text):
    """Map the index of every matched parenthesis in *text* to its partner's index."""
    partners = {}
    open_stack = []
    for index, char in enumerate(text):
        if char == "(":
            open_stack.append(index)
        elif char == ")" and open_stack:
            start = open_stack.pop()
            partners[start] = index
            partners[index] = start
    return partners


def _strip_formula_edges(text):
    """Trim separators and unmatched or fully-wrapping parentheses from both ends."""
    while True:
        text = text.strip(" .,;·*")
        if not text:
            return text
        partners = _paren_partners(text)
        last = len(text) - 1
        first_partner = partners.get(0)
        if text[0] in "()" and first_partner is None:
            # Dangling parenthesis at the start.
            text = text[1:]
        elif text[0] == "(" and first_partner == last:
            # One pair wraps the whole string, e.g. "(Fe2O3)".
            text = text[1:-1]
        elif text[-1] in "()" and partners.get(last) is None:
            # Dangling parenthesis at the end.
            text = text[:-1]
        else:
            # Remaining edge parentheses are matched groups such as "(NH4)2SO4".
            return text


def _fraction_to_decimal(match):
    """Replace a regex match of 'num/den' by its decimal value rounded to 4 places."""
    try:
        numerator = float(match.group(1))
        denominator = float(match.group(2))
        if denominator == 0:
            return match.group(0)
        return str(round(numerator / denominator, 4))
    except (ValueError, OverflowError):
        return match.group(0)


def clean_formula_string_advanced(formula_str_original):
    """Heuristically strip annotations from a raw formula string.

    Non-string input is returned unchanged. For strings the following steps
    are applied in order:

    1. If the last parenthesised group looks like a formula and the text in
       front of it looks like a name, a purity grade or a number, only the
       parenthesised content is kept. A group that is directly followed by a
       digit or an upper-case letter (``(NH4)2SO4``) is treated as part of the
       formula and is never extracted.
    2. A leading Greek-letter or single Latin-letter prefix followed by ``-``
       is removed.
    3. Hydrate water notations (``·5H2O``, ``*nH2O``, ``(H2O)6``) are removed.
    4. A trailing parenthesised qualifier (``(powder)``, ``(aq)``, ``(5N)``,
       ``(99.9%)`` ...) and a leading parenthesised number are removed.
    5. Simple ``a/b`` fractions are converted to decimals (4 decimal places).
    6. A remaining leading alphabetic prefix ending in ``-`` is removed.
    7. Separator characters and unmatched (or fully wrapping) parentheses are
       trimmed from both ends; matched groups at the edges are preserved.

    Returns:
        The cleaned string (possibly empty), or the original object when it
        is not a string.
    """
    if not isinstance(formula_str_original, str):
        return formula_str_original

    cleaned = formula_str_original.strip()

    paren_match = re.search(r'\(([^()]+)\)[^()]*$', cleaned)
    if paren_match:
        potential_formula_in_parens = paren_match.group(1).strip()
        part_before_parens = cleaned[:paren_match.start()].strip()
        # Text that immediately follows the closing parenthesis of the group.
        text_after_parens = cleaned[paren_match.end(1) + 1:]
        # "(NH4)2SO4": a subscript or another element symbol follows the
        # group, so the group is a fragment of the formula, not the formula.
        group_is_part_of_formula = re.match(r"[A-Z\d]", text_after_parens) is not None
        looks_like_formula = (
            len(potential_formula_in_parens) > 1
            and re.search(r"[A-Z]", potential_formula_in_parens)
            and re.fullmatch(r"[A-Za-z0-9\.\(\)\[\]]+", potential_formula_in_parens)
        )
        if looks_like_formula and not group_is_part_of_formula:
            prefix_is_annotation = (
                not part_before_parens
                or " " in part_before_parens
                or len(part_before_parens) > len(potential_formula_in_parens) + 5
                or (part_before_parens.isalpha() and len(part_before_parens) > 4)
                or re.fullmatch(r"\d+(\.\d+)?", part_before_parens)
                or re.fullmatch(r"\d*N", part_before_parens, re.IGNORECASE)
            )
            prefix_is_numeric = (
                not re.search(r"[A-Za-z]", part_before_parens)
                and re.search(r"\d", part_before_parens)
            )
            if prefix_is_annotation or prefix_is_numeric:
                cleaned = potential_formula_in_parens

    # Phase / polymorph prefixes such as "α-" or "c-".
    cleaned = re.sub(r"^[αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]-", "", cleaned)
    cleaned = re.sub(r"^[a-zA-Z]-", "", cleaned)

    # Water of hydration.
    cleaned = re.sub(r"[·*]\s*\d*(\.\d+)?[nNxX]?\s*H2O", "", cleaned)
    cleaned = re.sub(r"\s*\(\s*H2O\s*\)\s*\d*(\.\d+)?", "", cleaned)
    cleaned = re.sub(r"·\s*H2O", "", cleaned)

    # Trailing qualifier in parentheses and leading parenthesised number.
    cleaned = re.sub(
        r"\s*\(\s*(?:\d*N|\d+(?:\.\d+)?%?|solution|gas|powder|aq|amorphous|amorph|polytype|phase|\d{1,4})\s*\)\s*$",
        "",
        cleaned,
        flags=re.IGNORECASE,
    )
    cleaned = re.sub(r"^\s*\(\s*\d+(\.\d+)?\s*\)\s*(?=[A-Z])", "", cleaned)

    # Fractions used as subscripts or as a leading coefficient.
    cleaned = re.sub(r"(?<=[A-Za-z\d\)])(\d+)\s*/\s*(\d+)", _fraction_to_decimal, cleaned)
    cleaned = re.sub(r"^(\d+)\s*/\s*(\d+)", _fraction_to_decimal, cleaned)

    # Any remaining leading alphabetic prefix ending in a hyphen.
    cleaned = re.sub(r"^\s*\(?[a-zA-Z\s]+\)?-", "", cleaned)
    cleaned = re.sub(r"^[a-zA-Z]+-", "", cleaned)

    return _strip_formula_edges(cleaned)
|
|
|
def is_plausible_formula_for_pymatgen(cleaned_formula_str, entry_identifier):
    """Cheap pre-filter deciding whether a cleaned string is worth parsing.

    Returns ``False`` when the input is not a string, is blank, or contains
    one of ``+``, ``==``, ``->`` or ``;``. Otherwise the string is searched
    (case-insensitively) for variable-composition notation. If any such
    notation is found, every x/y/z/δ/n character that is not adjacent to a
    letter is collected, and ``False`` is returned when one of them,
    upper-cased, is not in ``KNOWN_ELEMENT_SYMBOLS``. In all other cases
    ``True`` is returned.

    ``entry_identifier`` is accepted for interface compatibility and is not
    used by this function.
    """
    if not isinstance(cleaned_formula_str, str):
        return False
    if not cleaned_formula_str.strip():
        return False

    # Reaction equations and multi-entry strings are never single formulas.
    for marker in ('+', '==', '->', ';'):
        if marker in cleaned_formula_str:
            return False

    variable_indicators = (
        r"[A-Za-z]\d*\s*[-+*]\s*[xyzδδn]",
        r"[xyzδδn]\s*[-+*]",
        r"[A-Za-z]\d*\(\s*\d*\s*[-+]\s*[xyzδδn]\s*\)?",
        r"(?<![A-Za-z])(?:[1-9]\d*|0)?\.\d*[xyzδδn]",
        r"[xyzδδn]\d+",
        r"[A-Za-z]\s*[xyzδδn]\s*\d*",
        r"1-[xyzδδn]",
    )
    variable_char_pattern = r"(?i)(?<![A-Z])([xyzδδn])(?![a-z])"

    uses_variable_notation = any(
        re.search(indicator, cleaned_formula_str, re.IGNORECASE)
        for indicator in variable_indicators
    )
    if not uses_variable_notation:
        return True

    # The candidate letters do not depend on which indicator fired, so they
    # are collected once.
    for candidate in re.findall(variable_char_pattern, cleaned_formula_str):
        if len(candidate) == 1 and candidate.upper() not in KNOWN_ELEMENT_SYMBOLS:
            return False
    return True
|
|
|
def standardize_chemical_formula(raw_formula_str, entry_identifier="Unknown_Entry"):
    """Turn a raw formula string into a standard representation.

    The string is cleaned with ``clean_formula_string_advanced`` and, when
    ``is_plausible_formula_for_pymatgen`` accepts it, parsed with pymatgen's
    ``Composition``.

    Args:
        raw_formula_str: Raw formula text.
        entry_identifier: Label used to identify the entry in debug log
            messages and in the plausibility check.

    Returns:
        * ``str`` - the reduced formula without spaces, when pymatgen parses
          the cleaned string and every element is in
          ``KNOWN_ELEMENT_SYMBOLS``;
        * ``dict`` - ``{'type': 'elements_only', 'elements': <set of symbols>,
          'original_cleaned': <cleaned string>}`` when parsing is not possible
          but known element symbols can still be extracted from the text;
        * ``None`` - for non-string or blank input, or when nothing usable
          remains.
    """
    if not isinstance(raw_formula_str, str) or not raw_formula_str.strip():
        return None

    cleaned_formula_str = clean_formula_string_advanced(raw_formula_str)
    if not cleaned_formula_str:
        return None

    entry_label = f"{entry_identifier} (Original: '{raw_formula_str}', Cleaned: '{cleaned_formula_str}')"
    if is_plausible_formula_for_pymatgen(cleaned_formula_str, entry_label):
        comp_formula_for_pymatgen = cleaned_formula_str.replace(" ", "")
        if not comp_formula_for_pymatgen:
            return None
        try:
            comp = Composition(comp_formula_for_pymatgen)
            if all(el.symbol in KNOWN_ELEMENT_SYMBOLS for el in comp.elements):
                return comp.get_reduced_formula_and_factor()[0].replace(" ", "")
        except Exception as exc:
            # Parsing is best-effort: fall through to the elements-only
            # fallback, but leave a trace instead of failing silently.
            logging.getLogger(__name__).debug(
                "pymatgen could not parse %s: %s", entry_label, exc
            )

    # Fallback: collect anything that looks like a known element symbol.
    extracted_elements = {
        el
        for el in re.findall(r"([A-Z][a-z]?)", cleaned_formula_str)
        if el in KNOWN_ELEMENT_SYMBOLS
    }
    if extracted_elements:
        return {
            'type': 'elements_only',
            'elements': extracted_elements,
            'original_cleaned': cleaned_formula_str,
        }
    return None
|
|
|
def get_valence_features(valences_input, entry_identifier="Unknown_Entry"):
    """Summarise a collection of valences as average, minimum and maximum.

    Args:
        valences_input: A list, tuple or NumPy array of valences, or a string
            holding the Python literal of such a sequence (parsed with
            ``ast.literal_eval``).
        entry_identifier: Accepted for interface compatibility; not used.

    Returns:
        A dict with keys ``'avg_valence'``, ``'min_valence'`` and
        ``'max_valence'``. All three are ``numpy.nan`` when the input cannot
        be interpreted as a non-empty sequence or contains no usable number.
        Booleans, NaN entries and non-numeric entries are ignored.
    """
    nan_features = {'avg_valence': np.nan, 'min_valence': np.nan, 'max_valence': np.nan}

    valences = valences_input
    if isinstance(valences, str):
        try:
            valences = ast.literal_eval(valences)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return nan_features
    if isinstance(valences, np.ndarray):
        valences = valences.tolist()
    if not isinstance(valences, (list, tuple)) or not valences:
        return nan_features

    numeric_valences = [
        value
        for value in valences
        if isinstance(value, (int, float, np.integer, np.floating))
        and not isinstance(value, bool)  # True/False are not valences
        and value == value  # NaN is the only number not equal to itself
    ]
    if not numeric_valences:
        return nan_features

    return {
        'avg_valence': np.mean(numeric_valences),
        'min_valence': np.min(numeric_valences),
        'max_valence': np.max(numeric_valences),
    }
|
|
|
|
|
def generate_compositional_features(formula_input, df_elements_processed, entry_identifier="Unknown_Formula"):
    """Build a flat feature dictionary for one formula.

    Args:
        formula_input: One of
            * ``None`` - a dictionary of default values is returned;
            * ``str`` - a formula parsed with pymatgen's ``Composition``;
              amount-weighted features are computed;
            * ``dict`` with ``'type' == 'elements_only'`` and an
              ``'elements'`` collection of symbols - unweighted statistics
              over those elements are computed.
            Any other input yields the defaults.
        df_elements_processed: DataFrame of per-element properties, indexed by
            element symbol. Columns are looked up by name (``'Atomic_Weight'``,
            ``'Electronegativity'``, ...); missing columns or values are
            skipped.
        entry_identifier: Label used in debug log messages.

    Returns:
        A dict that always contains every default key (NaN where a feature
        could not be computed), ``'is_stoichiometric_formula'`` and
        ``'num_elements_in_formula'``.

    Notes:
        * For string formulas, min/max/range consider every element with a
          positive amount and the variance is weighted by amount. For integer
          amounts this equals the variance of the values repeated ``amount``
          times; fractional amounts are no longer rounded (rounding used to
          drop elements with an amount below 0.5).
        * For ``elements_only`` input the variance is ``pandas.Series.var``,
          i.e. the pandas default sample variance, so it is NaN for a single
          element.
    """
    logger = logging.getLogger(__name__)

    unweighted_source_columns = ['Atomic_Weight', 'Electronegativity', 'Atomic_Radius', 'Melting_Point', 'Density', 'avg_valence', 'valence_electrons_estimated']
    basic_props = [
        'avg_atomic_weight', 'avg_electronegativity', 'avg_atomic_radius', 'avg_melting_point',
        'avg_density', 'avg_specific_heat', 'avg_thermal_conductivity', 'avg_heat_of_fusion',
        'sum_atomic_weight', 'range_electronegativity', 'min_electronegativity', 'max_electronegativity',
        'var_atomic_radius', 'min_atomic_radius', 'max_atomic_radius', 'avg_valence_of_comp',
        'avg_est_valence_electrons',
    ]

    default_feature_dict = {'is_stoichiometric_formula': False, 'num_elements_in_formula': 0}
    for name in basic_props:
        default_feature_dict[name] = np.nan
    for stat in ('avg', 'min', 'max', 'var'):
        for column in unweighted_source_columns:
            default_feature_dict[f'{stat}_{column.lower()}_unweighted'] = np.nan
    if matminer_available and MAGPIE_LABELS:
        for label in MAGPIE_LABELS:
            default_feature_dict[label] = np.nan
    # These two are produced for string formulas; listing them here (last, so
    # the key order of stoichiometric rows is unchanged) gives every row the
    # same set of keys.
    for name in ('var_electronegativity', 'range_atomic_radius'):
        default_feature_dict.setdefault(name, np.nan)

    if formula_input is None:
        return default_feature_dict.copy()

    features = {}

    if isinstance(formula_input, str):
        try:
            comp = Composition(formula_input)
            el_amt_dict = comp.get_el_amt_dict()
            total_atoms = sum(el_amt_dict.values())
            if total_atoms == 0:
                return {**default_feature_dict, 'is_stoichiometric_formula': False}

            features['is_stoichiometric_formula'] = True
            features['num_elements_in_formula'] = len(el_amt_dict)

            props_for_avg_mapping = {
                'avg_atomic_weight': 'Atomic_Weight',
                'avg_electronegativity': 'Electronegativity',
                'avg_atomic_radius': 'Atomic_Radius',
                'avg_melting_point': 'Melting_Point',
                'avg_density': 'Density',
                'avg_specific_heat': 'Specific_Heat',
                'avg_thermal_conductivity': 'Thermal_Conductivity',
                'avg_heat_of_fusion': 'Heat_of_Fusion',
                'avg_valence_of_comp': 'avg_valence',
                'avg_est_valence_electrons': 'valence_electrons_estimated',
            }
            element_values_for_stats_mapping = {
                'electronegativity': 'Electronegativity',
                'atomic_radius': 'Atomic_Radius',
            }

            weighted_terms = {key: [] for key in props_for_avg_mapping}
            weight_totals = {key: 0 for key in props_for_avg_mapping}
            # (value, amount) pairs per statistic family.
            stat_samples = {key: [] for key in element_values_for_stats_mapping}

            for el_obj, amt in el_amt_dict.items():
                el_symbol_str = el_obj.symbol if isinstance(el_obj, PymatgenElement) else str(el_obj)
                if el_symbol_str not in KNOWN_ELEMENT_SYMBOLS:
                    continue
                if el_symbol_str not in df_elements_processed.index:
                    continue
                el_props_series = df_elements_processed.loc[el_symbol_str]

                for feat_key, elem_col_name in props_for_avg_mapping.items():
                    val = el_props_series.get(elem_col_name, np.nan)
                    if pd.notna(val):
                        weighted_terms[feat_key].append(val * amt)
                        weight_totals[feat_key] += amt

                for feat_key, elem_col_name in element_values_for_stats_mapping.items():
                    val = el_props_series.get(elem_col_name, np.nan)
                    if pd.notna(val):
                        stat_samples[feat_key].append((val, amt))

            # Amount-weighted means over the elements that have the property.
            for key, terms in weighted_terms.items():
                if weight_totals[key] > 0:
                    features[key] = np.nansum(terms) / weight_totals[key]
                else:
                    features[key] = np.nan

            features['sum_atomic_weight'] = comp.weight

            for key, samples in stat_samples.items():
                present = [(value, weight) for value, weight in samples if weight > 0]
                if present:
                    values = np.asarray([value for value, _ in present], dtype=float)
                    weights = np.asarray([weight for _, weight in present], dtype=float)
                    lowest = values.min()
                    highest = values.max()
                    weighted_mean = np.average(values, weights=weights)
                    features[f'range_{key}'] = highest - lowest
                    features[f'min_{key}'] = lowest
                    features[f'max_{key}'] = highest
                    features[f'var_{key}'] = np.average((values - weighted_mean) ** 2, weights=weights)
                else:
                    for stat in ['range_', 'min_', 'max_', 'var_']:
                        features[f'{stat}{key}'] = np.nan

            if matminer_available and MAGPIE_FEATURIZER:
                try:
                    magpie_vals = MAGPIE_FEATURIZER.featurize(comp)
                    for label, magpie_value in zip(MAGPIE_LABELS, magpie_vals):
                        features[label] = magpie_value
                except Exception:
                    # Magpie features are optional; the defaults stay NaN.
                    logger.debug("Magpie featurization failed for %s", entry_identifier, exc_info=True)
        except Exception:
            # Best-effort: keep whatever was computed, but flag the formula
            # as not usable stoichiometrically.
            logger.debug("Weighted feature generation failed for %s", entry_identifier, exc_info=True)
            features['is_stoichiometric_formula'] = False

    elif isinstance(formula_input, dict) and formula_input.get('type') == 'elements_only':
        features['is_stoichiometric_formula'] = False
        elements_present = formula_input.get('elements', set())
        valid_elements = [el for el in elements_present if el in df_elements_processed.index]
        features['num_elements_in_formula'] = len(valid_elements)

        if valid_elements:
            element_props_subset = df_elements_processed.loc[valid_elements]
            for prop_col in unweighted_source_columns:
                if prop_col not in element_props_subset.columns:
                    continue
                clean_vals = element_props_subset[prop_col].dropna()
                if clean_vals.empty:
                    continue
                column_key = prop_col.lower()
                features[f'avg_{column_key}_unweighted'] = clean_vals.mean()
                features[f'min_{column_key}_unweighted'] = clean_vals.min()
                features[f'max_{column_key}_unweighted'] = clean_vals.max()
                features[f'var_{column_key}_unweighted'] = clean_vals.var()

    final_features = default_feature_dict.copy()
    final_features.update(features)
    return final_features
|
|