# File size: 11,701 Bytes
# 3961ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137

import pandas as pd
import numpy as np
from pymatgen.core import Composition, Element as PymatgenElement
import ast
import re
import logging
from .constants import KNOWN_ELEMENT_SYMBOLS, MAGPIE_FEATURIZER, MAGPIE_LABELS, matminer_available

# --- Formula Cleaning and Standardization ---
def clean_formula_string_advanced(formula_str_original):
    """Strip annotations, prefixes and hydration water from a raw formula string.

    The cleaning steps run in a fixed order (later steps see the output of
    earlier ones):

    1. If the last parenthesised group looks like a formula and the text in
       front of it looks like a name, a number or a concentration, keep only
       the parenthesised formula (e.g. ``"Iron oxide (Fe2O3)"`` -> ``"Fe2O3"``).
    2. Drop a leading single Greek or Latin letter followed by ``-``
       (phase prefixes such as ``"a-Si"``).
    3. Remove hydration water written with a dot/asterisk or as ``(H2O)``.
    4. Remove a trailing parenthesised annotation (``(amorphous)``, ``(aq)``,
       a number, a percentage, ...) and a leading parenthesised number.
    5. Replace simple integer fractions (``1/2``) by their decimal value,
       rounded to four places.
    6. Drop remaining leading alphabetic ``word-`` prefixes and strip
       punctuation from both ends.

    Args:
        formula_str_original: Raw formula text. Non-string values are
            returned unchanged.

    Returns:
        The cleaned string (possibly empty), or the input itself when it is
        not a string.
    """
    if not isinstance(formula_str_original, str):
        return formula_str_original

    cleaned = formula_str_original.strip()

    # Step 1: inspect the last non-nested "(...)" group of the string.
    paren_match = re.search(r'\(([^()]+)\)[^()]*$', cleaned)
    if paren_match:
        potential_formula_in_parens = paren_match.group(1).strip()
        part_before_parens = cleaned[:paren_match.start()].strip()
        looks_like_formula = (
            len(potential_formula_in_parens) > 1
            and re.search(r"[A-Z]", potential_formula_in_parens)
            and re.fullmatch(r"[A-Za-z0-9\.\(\)\[\]]+", potential_formula_in_parens)
        )
        if looks_like_formula:
            # The prefix reads like a name / number / normality rather than
            # like the leading part of a formula such as "Ca" in "Ca(OH)2".
            prefix_is_descriptive = (
                not part_before_parens
                or " " in part_before_parens
                or len(part_before_parens) > len(potential_formula_in_parens) + 5
                or (part_before_parens.isalpha() and len(part_before_parens) > 4)
                or re.fullmatch(r"\d+(\.\d+)?", part_before_parens)
                or re.fullmatch(r"\d*N", part_before_parens, re.IGNORECASE)
            )
            prefix_is_numeric_only = (
                not re.search(r"[A-Za-z]", part_before_parens)
                and re.search(r"\d", part_before_parens)
            )
            if prefix_is_descriptive or prefix_is_numeric_only:
                cleaned = potential_formula_in_parens

    # Step 2: single-letter phase prefixes ("α-", "a-").
    cleaned = re.sub(r"^[αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]-", "", cleaned)
    cleaned = re.sub(r"^[a-zA-Z]-", "", cleaned)

    # Step 3: hydration water in its various spellings.
    cleaned = re.sub(r"[·*]\s*\d*(\.\d+)?[nNxX]?\s*H2O", "", cleaned)
    cleaned = re.sub(r"\s*\(\s*H2O\s*\)\s*\d*(\.\d+)?", "", cleaned)
    cleaned = re.sub(r"·\s*H2O", "", cleaned)

    # Step 4: trailing annotations and a leading parenthesised number.
    cleaned = re.sub(
        r"\s*\(\s*(?:\d*N|\d+(?:\.\d+)?%?|solution|gas|powder|aq|amorphous|amorph|polytype|phase|\d{1,4})\s*\)\s*$",
        "",
        cleaned,
        flags=re.IGNORECASE,
    )
    cleaned = re.sub(r"^\s*\(\s*\d+(\.\d+)?\s*\)\s*(?=[A-Z])", "", cleaned)

    def replace_frac(match):
        """Return the decimal form of an ``a/b`` match, or the match text itself."""
        try:
            num = float(match.group(1))
            den = float(match.group(2))
            # A zero denominator leaves the original text untouched.
            return str(round(num / den, 4)) if den != 0 else match.group(0)
        except (ValueError, OverflowError, ZeroDivisionError):
            # Only conversion/arithmetic problems are expected here; do not
            # trap KeyboardInterrupt or SystemExit the way a bare except does.
            return match.group(0)

    # Step 5: fractions following a symbol/digit/closing paren, then a
    # fraction at the very start of the string.
    cleaned = re.sub(r"(?<=[A-Za-z\d\)])(\d+)\s*/\s*(\d+)", replace_frac, cleaned)
    cleaned = re.sub(r"^(\d+)\s*/\s*(\d+)", replace_frac, cleaned)

    # Step 6: remaining alphabetic "word-" prefixes and edge punctuation.
    cleaned = re.sub(r"^\s*\(?[a-zA-Z\s]+\)?-", "", cleaned)
    cleaned = re.sub(r"^[a-zA-Z]+-", "", cleaned)
    cleaned = cleaned.strip(" .,;·*()")
    return cleaned

def is_plausible_formula_for_pymatgen(cleaned_formula_str, entry_identifier):
    """Decide whether a cleaned formula is worth handing to pymatgen.

    A string is rejected when it is empty or not a string, when it contains
    reaction/list markers (``+``, ``==``, ``->``, ``;``), or when it matches
    one of the variable-stoichiometry patterns and one of the stand-alone
    variable letters found in it is not a known element symbol once
    upper-cased.

    Args:
        cleaned_formula_str: Formula text after cleaning.
        entry_identifier: Caller-supplied label; not used by this function.

    Returns:
        ``True`` when the formula looks parseable, otherwise ``False``.
    """
    if not isinstance(cleaned_formula_str, str):
        return False
    if not cleaned_formula_str.strip():
        return False

    reaction_markers = ('+', '==', '->', ';')
    if any(marker in cleaned_formula_str for marker in reaction_markers):
        return False

    variable_indicators = (
        r"[A-Za-z]\d*\s*[-+*]\s*[xyzδδn]",
        r"[xyzδδn]\s*[-+*]",
        r"[A-Za-z]\d*\(\s*\d*\s*[-+]\s*[xyzδδn]\s*\)?",
        r"(?<![A-Za-z])(?:[1-9]\d*|0)?\.\d*[xyzδδn]",
        r"[xyzδδn]\d+",
        r"[A-Za-z]\s*[xyzδδn]\s*\d*",
        r"1-[xyzδδn]",
    )
    looks_variable = any(
        re.search(indicator, cleaned_formula_str, re.IGNORECASE)
        for indicator in variable_indicators
    )
    if not looks_variable:
        return True

    # The candidate letters do not depend on which indicator matched, so one
    # scan of the string is enough.
    variable_char_pattern = r"(?i)(?<![A-Z])([xyzδδn])(?![a-z])"
    candidates = re.findall(variable_char_pattern, cleaned_formula_str)
    for candidate in candidates:
        if len(candidate) == 1 and candidate.upper() not in KNOWN_ELEMENT_SYMBOLS:
            return False
    return True

def standardize_chemical_formula(raw_formula_str, entry_identifier="Unknown_Entry"):
    """Turn a raw formula string into a reduced formula or an element set.

    The raw text is cleaned first. If the cleaned text passes the
    plausibility check and pymatgen parses it into a composition made only
    of known elements, the reduced formula (spaces removed) is returned.
    Otherwise the element symbols that can be spotted in the cleaned text
    are returned in an ``elements_only`` dictionary.

    Args:
        raw_formula_str: Formula text as found in the data.
        entry_identifier: Label that is passed on to the plausibility check.

    Returns:
        * ``str`` - reduced formula when parsing succeeds,
        * ``dict`` with keys ``type`` (``'elements_only'``), ``elements``
          (set of symbols) and ``original_cleaned`` when only elements could
          be recognised,
        * ``None`` when the input is unusable or nothing was recognised.
    """
    if not (isinstance(raw_formula_str, str) and raw_formula_str.strip()):
        return None

    cleaned_formula_str = clean_formula_string_advanced(raw_formula_str)
    if not cleaned_formula_str:
        return None

    context_label = f"{entry_identifier} (Original: '{raw_formula_str}', Cleaned: '{cleaned_formula_str}')"
    if is_plausible_formula_for_pymatgen(cleaned_formula_str, context_label):
        try:
            compact_formula = cleaned_formula_str.replace(" ", "")
            if not compact_formula:
                return None
            composition = Composition(compact_formula)
            only_known_elements = all(
                element.symbol in KNOWN_ELEMENT_SYMBOLS
                for element in composition.elements
            )
            if only_known_elements:
                reduced_formula = composition.get_reduced_formula_and_factor()[0]
                return reduced_formula.replace(" ", "")
        except Exception:
            # Parsing is best effort; any failure falls through to the
            # element-spotting fallback below.
            pass

    extracted_elements = set()
    for symbol in re.findall(r"([A-Z][a-z]?)", cleaned_formula_str):
        if symbol in KNOWN_ELEMENT_SYMBOLS:
            extracted_elements.add(symbol)
    if not extracted_elements:
        return None
    return {
        'type': 'elements_only',
        'elements': extracted_elements,
        'original_cleaned': cleaned_formula_str,
    }

def get_valence_features(valences_input, entry_identifier="Unknown_Entry"):
    """Summarise a collection of valences as average, minimum and maximum.

    Args:
        valences_input: A list or tuple of valences, or a string holding the
            Python literal of one (e.g. ``"[2, 3]"`` or ``"(2, 3)"``).
            Strings are parsed with ``ast.literal_eval``; nothing is executed.
        entry_identifier: Caller-supplied label; not used by this function.

    Returns:
        dict with the keys ``avg_valence``, ``min_valence`` and
        ``max_valence``. All three are ``np.nan`` when the input cannot be
        parsed, is not a list/tuple, is empty, or holds no numeric entries.
    """
    nan_features = {'avg_valence': np.nan, 'min_valence': np.nan, 'max_valence': np.nan}

    valences = valences_input
    if isinstance(valences_input, str):
        try:
            valences = ast.literal_eval(valences_input)
        except (ValueError, SyntaxError, TypeError):
            return nan_features

    # Tuples are accepted as well as lists: "(2, 3)" is as valid a literal
    # as "[2, 3]" and used to be discarded silently.
    if not isinstance(valences, (list, tuple)) or len(valences) == 0:
        return nan_features

    # NumPy scalars (e.g. np.int64 coming out of pandas) are numbers too;
    # booleans are ints in Python but are never meaningful valences.
    numeric_valences = [
        v for v in valences
        if isinstance(v, (int, float, np.integer, np.floating)) and not isinstance(v, bool)
    ]
    if not numeric_valences:
        return nan_features

    return {
        'avg_valence': np.mean(numeric_valences),
        'min_valence': np.min(numeric_valences),
        'max_valence': np.max(numeric_valences),
    }


def _composition_weighted_stats(values, amounts):
    """Return ``(min, max, range, variance)`` of element values weighted by atom amounts."""
    vals = np.asarray(values, dtype=float)
    wts = np.asarray(amounts, dtype=float)
    present = wts > 0
    vals, wts = vals[present], wts[present]
    if vals.size == 0:
        return np.nan, np.nan, np.nan, np.nan
    lowest, highest = vals.min(), vals.max()
    centre = np.average(vals, weights=wts)
    # Population variance with the amounts as weights. For integer amounts
    # this equals np.var over a list in which every value is repeated
    # "amount" times (up to floating-point round-off).
    variance = np.average((vals - centre) ** 2, weights=wts)
    return lowest, highest, highest - lowest, variance


def generate_compositional_features(formula_input, df_elements_processed, entry_identifier="Unknown_Formula"):
    """Build a flat feature dictionary for one standardized formula.

    Args:
        formula_input: One of
            * ``None`` - the all-default feature dictionary is returned;
            * ``str`` - a formula parsed with pymatgen's ``Composition``;
              amount-weighted averages and statistics are computed;
            * ``dict`` with ``type == 'elements_only'`` and an ``elements``
              collection - unweighted statistics over those elements.
            Any other value yields the defaults.
        df_elements_processed: Element property table that is looked up by
            element symbol through its index. Columns are read by name and a
            missing column is treated as missing data.
        entry_identifier: Label used in debug log messages only.

    Returns:
        dict containing ``is_stoichiometric_formula``,
        ``num_elements_in_formula``, the weighted and unweighted feature
        keys (``np.nan`` where not computed) and, when matminer and the
        Magpie labels are available, one key per Magpie label.

    Notes:
        * Min/max/range/variance of electronegativity and atomic radius use
          the real (possibly fractional) element amounts as weights. The
          previous ``int(round(amount))`` repetition dropped elements whose
          amount rounded to zero and distorted other fractional amounts.
        * ``var_electronegativity`` and ``range_atomic_radius`` are only
          present in the result of the string branch; they are not part of
          the default key set. This is kept as is for compatibility.
        * The weighted variance is a population variance, whereas the
          unweighted branch uses pandas' ``Series.var`` (sample variance),
          as before.
    """
    logger = logging.getLogger(__name__)

    element_columns = [
        'Atomic_Weight', 'Electronegativity', 'Atomic_Radius', 'Melting_Point',
        'Density', 'avg_valence', 'valence_electrons_estimated',
    ]
    basic_props = [
        'avg_atomic_weight', 'avg_electronegativity', 'avg_atomic_radius',
        'avg_melting_point', 'avg_density', 'avg_specific_heat',
        'avg_thermal_conductivity', 'avg_heat_of_fusion', 'sum_atomic_weight',
        'range_electronegativity', 'min_electronegativity', 'max_electronegativity',
        'var_atomic_radius', 'min_atomic_radius', 'max_atomic_radius',
        'avg_valence_of_comp', 'avg_est_valence_electrons',
    ]
    # Same order as before: all avg_* keys, then min_*, max_* and var_*.
    unweighted_props = [
        f'{stat}_{column.lower()}_unweighted'
        for stat in ('avg', 'min', 'max', 'var')
        for column in element_columns
    ]

    default_feature_dict = {'is_stoichiometric_formula': False, 'num_elements_in_formula': 0}
    default_feature_dict.update(dict.fromkeys(basic_props + unweighted_props, np.nan))
    if matminer_available and MAGPIE_LABELS:
        default_feature_dict.update(dict.fromkeys(MAGPIE_LABELS, np.nan))

    if formula_input is None:
        return default_feature_dict.copy()

    features = {}
    if isinstance(formula_input, str):
        try:
            comp = Composition(formula_input)
            el_amt_dict = comp.get_el_amt_dict()
            if sum(el_amt_dict.values()) == 0:
                return {**default_feature_dict, 'is_stoichiometric_formula': False}

            features['is_stoichiometric_formula'] = True
            features['num_elements_in_formula'] = len(el_amt_dict)

            # feature key -> column of df_elements_processed
            props_for_avg_mapping = {
                'avg_atomic_weight': 'Atomic_Weight',
                'avg_electronegativity': 'Electronegativity',
                'avg_atomic_radius': 'Atomic_Radius',
                'avg_melting_point': 'Melting_Point',
                'avg_density': 'Density',
                'avg_specific_heat': 'Specific_Heat',
                'avg_thermal_conductivity': 'Thermal_Conductivity',
                'avg_heat_of_fusion': 'Heat_of_Fusion',
                'avg_valence_of_comp': 'avg_valence',
                'avg_est_valence_electrons': 'valence_electrons_estimated',
            }
            element_values_for_stats_mapping = {
                'electronegativity': 'Electronegativity',
                'atomic_radius': 'Atomic_Radius',
            }

            weighted_terms = {key: [] for key in props_for_avg_mapping}
            weight_totals = {key: 0 for key in props_for_avg_mapping}
            stat_values = {key: [] for key in element_values_for_stats_mapping}
            stat_amounts = {key: [] for key in element_values_for_stats_mapping}

            for el_obj, amt in el_amt_dict.items():
                el_symbol_str = el_obj.symbol if isinstance(el_obj, PymatgenElement) else str(el_obj)
                if el_symbol_str not in KNOWN_ELEMENT_SYMBOLS:
                    continue
                if el_symbol_str not in df_elements_processed.index:
                    continue
                el_props_series = df_elements_processed.loc[el_symbol_str]

                for feat_key, elem_col_name in props_for_avg_mapping.items():
                    val = el_props_series.get(elem_col_name, np.nan)
                    if pd.notna(val):
                        # Each property is averaged only over the atoms for
                        # which a value exists, hence the per-key total.
                        weighted_terms[feat_key].append(val * amt)
                        weight_totals[feat_key] += amt

                for feat_key, elem_col_name in element_values_for_stats_mapping.items():
                    val = el_props_series.get(elem_col_name, np.nan)
                    if pd.notna(val):
                        stat_values[feat_key].append(val)
                        stat_amounts[feat_key].append(amt)

            for key, terms in weighted_terms.items():
                total = weight_totals[key]
                features[key] = np.nansum(terms) / total if total > 0 else np.nan

            features['sum_atomic_weight'] = comp.weight

            for key in element_values_for_stats_mapping:
                lowest, highest, spread, variance = _composition_weighted_stats(
                    stat_values[key], stat_amounts[key]
                )
                features[f'range_{key}'] = spread
                features[f'min_{key}'] = lowest
                features[f'max_{key}'] = highest
                features[f'var_{key}'] = variance

            if matminer_available and MAGPIE_FEATURIZER:
                try:
                    magpie_vals = MAGPIE_FEATURIZER.featurize(comp)
                    features.update(zip(MAGPIE_LABELS, magpie_vals))
                except Exception:
                    # Magpie features are optional; the NaN defaults stay.
                    logger.debug("Magpie featurization failed for %s", entry_identifier, exc_info=True)
        except Exception:
            # Best effort: whatever was computed so far is kept, but the
            # entry is flagged as non-stoichiometric.
            logger.debug("Could not featurize formula %r for %s", formula_input, entry_identifier, exc_info=True)
            features['is_stoichiometric_formula'] = False
    elif isinstance(formula_input, dict) and formula_input.get('type') == 'elements_only':
        features['is_stoichiometric_formula'] = False
        elements_present = formula_input.get('elements', set())
        valid_elements = [el for el in elements_present if el in df_elements_processed.index]
        features['num_elements_in_formula'] = len(valid_elements)
        if valid_elements:
            element_props_subset = df_elements_processed.loc[valid_elements]
            for prop_col in element_columns:
                if prop_col not in element_props_subset.columns:
                    continue
                clean_vals = element_props_subset[prop_col].dropna()
                if clean_vals.empty:
                    continue
                column_key = prop_col.lower()
                features[f'avg_{column_key}_unweighted'] = clean_vals.mean()
                features[f'min_{column_key}_unweighted'] = clean_vals.min()
                features[f'max_{column_key}_unweighted'] = clean_vals.max()
                features[f'var_{column_key}_unweighted'] = clean_vals.var()

    final_features = default_feature_dict.copy()
    final_features.update(features)
    return final_features