|
|
|
import pandas as pd |
|
import numpy as np |
|
import re |
|
from .constants import ATMOSPHERE_CONFIG, MIXING_METHOD_CONFIG |
|
|
|
def _extract_numerical_value_from_op_condition(condition_entry, target_keys=['value', 'max_value', 'values']): |
|
if isinstance(condition_entry, list) and condition_entry: |
|
if isinstance(condition_entry[0], dict): |
|
for key in target_keys: |
|
val = condition_entry[0].get(key) |
|
if val is not None: |
|
if isinstance(val, list) and val: |
|
try: return float(val[0]) |
|
except: continue |
|
try: return float(val) |
|
except: continue |
|
elif isinstance(condition_entry[0], (int, float, np.number)): |
|
try: return float(condition_entry[0]) |
|
except: pass |
|
elif isinstance(condition_entry, dict): |
|
for key in target_keys: |
|
val = condition_entry.get(key) |
|
if val is not None: |
|
if isinstance(val, list) and val: |
|
try: return float(val[0]) |
|
except: continue |
|
try: return float(val) |
|
except: continue |
|
elif isinstance(condition_entry, (int, float, np.number)): |
|
try: return float(condition_entry) |
|
except: pass |
|
return np.nan |
|
|
|
def _extract_atmosphere_from_op(op_conditions_dict, op_string, atm_config_local, entry_identifier): |
|
atm_specific = atm_config_local["default_specific"] |
|
atm_category = atm_config_local["default_category"] |
|
found_atm = False |
|
if isinstance(op_conditions_dict, dict): |
|
atm_source_key_val = op_conditions_dict.get('atmosphere') |
|
if not atm_source_key_val and 'text' in op_conditions_dict and isinstance(op_conditions_dict['text'], str) : |
|
atm_source_key_val = op_conditions_dict['text'] |
|
if atm_source_key_val: |
|
atm_str_to_parse = None |
|
if isinstance(atm_source_key_val, list) and atm_source_key_val: atm_str_to_parse = str(atm_source_key_val[0]) |
|
elif isinstance(atm_source_key_val, str): atm_str_to_parse = atm_source_key_val |
|
elif isinstance(atm_source_key_val, dict): atm_str_to_parse = str(atm_source_key_val.get('gas', atm_source_key_val.get('value', ''))) |
|
if atm_str_to_parse: |
|
atm_str_lower = atm_str_to_parse.lower() |
|
for pattern_regex, specific, category in atm_config_local["patterns"]: |
|
if specific.lower() == atm_str_lower or re.search(pattern_regex, atm_str_to_parse, re.IGNORECASE): |
|
atm_specific, atm_category, found_atm = specific, category, True; break |
|
if not found_atm and '/' in atm_str_lower: atm_specific, atm_category, found_atm = atm_str_to_parse, "Mixed", True |
|
if not found_atm and isinstance(op_string, str) and op_string: |
|
for pattern_regex, specific, category in atm_config_local["patterns"]: |
|
if re.search(pattern_regex, op_string, re.IGNORECASE): |
|
atm_specific, atm_category, found_atm = specific, category, True; break |
|
return atm_specific, atm_category |
|
|
|
def _extract_mixing_method_from_op(op_dict, op_string, mix_config_local, entry_identifier): |
|
mix_method = mix_config_local["default_method"] |
|
op_type = str(op_dict.get('type', '')).lower() |
|
if isinstance(op_string, str) and op_string: |
|
for pattern_regex, method_name in mix_config_local["patterns"]: |
|
if re.search(pattern_regex, op_string, re.IGNORECASE): return method_name |
|
if 'mix' in op_type or 'grind' in op_type or 'mill' in op_type: |
|
for pattern_regex, method_name in mix_config_local["patterns"]: |
|
if re.search(pattern_regex, op_type, re.IGNORECASE): return method_name |
|
if op_type.strip() and op_type not in ["mixing", "liquidgrinding", "solutionmixing", "grinding"]: return op_type |
|
return mix_method |
|
|
|
def _extract_thermal_conditions(conditions_dict, op_string, entry_identifier): |
|
temps, durs = [], [] |
|
if isinstance(conditions_dict, dict): |
|
temp_data = conditions_dict.get('heating_temperature') |
|
if temp_data: |
|
extracted_temp = _extract_numerical_value_from_op_condition(temp_data) |
|
if pd.notna(extracted_temp): temps.append(extracted_temp) |
|
dur_data = conditions_dict.get('heating_time') |
|
if dur_data: |
|
extracted_dur = _extract_numerical_value_from_op_condition(dur_data) |
|
if pd.notna(extracted_dur): durs.append(extracted_dur) |
|
return temps, durs |
|
|
|
def parse_single_operation_detailed_for_input(op_dict_raw, entry_identifier="predict_op"):
    """Turn one raw operation dict into a flat dict of per-operation features.

    Reads the ``'type'``, ``'string'`` and ``'conditions'`` entries of
    ``op_dict_raw``; type and string are lower-cased before keyword checks.

    Args:
        op_dict_raw: Raw operation description. Anything that is not a dict
            yields an empty result.
        entry_identifier: Label forwarded to the extraction helpers.

    Returns:
        ``{}`` for non-dict input; otherwise a dict with the temperature and
        duration lists, atmosphere (specific, category), mixing method, and
        boolean ``op_is_*`` flags.
    """
    if not isinstance(op_dict_raw, dict):
        return {}

    type_lc = str(op_dict_raw.get('type', 'UnknownType')).lower()
    text_lc = str(op_dict_raw.get('string', '')).lower()
    conditions = op_dict_raw.get('conditions', {})

    temps, durs = _extract_thermal_conditions(conditions, text_lc, entry_identifier)
    atm_specific, atm_category = _extract_atmosphere_from_op(
        conditions, text_lc, ATMOSPHERE_CONFIG, entry_identifier
    )
    mixing_method = _extract_mixing_method_from_op(
        op_dict_raw, text_lc, MIXING_METHOD_CONFIG, entry_identifier
    )

    def type_mentions(*keywords):
        return any(word in type_lc for word in keywords)

    grinding_methods = [
        'grinding', 'ball_milling', 'planetary_milling',
        'attritor_milling', 'shaker_milling', 'mortar_pestle',
    ]

    # Insertion order of the keys below is kept identical to the original.
    return {
        'op_temp_C_list': temps,
        'op_duration_h_list': durs,
        'op_atmosphere_specific': atm_specific,
        'op_atmosphere_category': atm_category,
        'op_mixing_method': mixing_method,
        # NOTE(review): a type containing 'calcine' sets op_is_calcination but
        # not op_is_heating (only 'calcination' is checked here) -- confirm intent.
        'op_is_heating': type_mentions('heat', 'anneal', 'sinter', 'calcination'),
        'op_is_mixing': 'mix' in type_lc or mixing_method != MIXING_METHOD_CONFIG["default_method"],
        'op_is_grinding': (
            type_mentions('grind', 'mill')
            or 'pulverize' in text_lc
            or mixing_method in grinding_methods
        ),
        'op_is_shaping': 'shap' in type_lc,
        # 'drying' contains 'dry', so a single substring test covers both.
        'op_is_drying': 'dry' in type_lc,
        'op_is_quenching': 'quench' in type_lc,
        'op_is_annealing': 'anneal' in type_lc or 'anneal' in text_lc,
        'op_is_sintering': 'sinter' in type_lc or 'sinter' in text_lc,
        'op_is_calcination': type_mentions('calcine', 'calcination') or 'calcination' in text_lc,
    }
|
|
|
def generate_process_features_for_input(operations_simplified_list, all_possible_atm_categories, all_possible_mix_methods):
    """Aggregate per-operation features into one flat process-level feature dict.

    Each entry of ``operations_simplified_list`` is parsed with
    ``parse_single_operation_detailed_for_input`` and folded into counters,
    boolean ``proc_has_*`` flags, temperature statistics and two one-hot
    groups (atmosphere category, mixing method).

    Args:
        operations_simplified_list: List of raw operation dicts. A non-list is
            treated as an empty list. Entries that are not dicts still count
            towards ``proc_num_total_steps`` but contribute nothing else.
        all_possible_atm_categories: Iterable of category names; one
            ``ops_atm_cat_<name>`` column (initially 0) is created per name.
        all_possible_mix_methods: Iterable of method names; one
            ``ops_mix_meth_<name>`` column (initially 0) is created per name.

    Returns:
        Dict of aggregated features. Temperature statistics stay ``np.nan``
        when no temperature was found; ``proc_total_heating_duration_h`` is
        ``np.nan`` when there is no heating step or the summed duration is 0.
        Units are not converted here (presumably deg C / hours, per key names).
    """
    aggregated_ops_features = {
        'proc_total_heating_duration_h': 0.0, 'proc_max_temperature_C': np.nan,
        'proc_min_temperature_C': np.nan, 'proc_avg_temperature_C': np.nan,
        'proc_primary_heating_temp_C': np.nan,
        'proc_num_total_steps': 0, 'proc_num_heating_steps': 0,
        'proc_num_mixing_steps': 0, 'proc_num_grinding_steps': 0,
        'proc_has_annealing': False, 'proc_has_sintering': False,
        'proc_has_calcination': False, 'proc_has_quenching': False,
        'proc_has_shaping': False, 'proc_has_drying': False,
    }
    for cat in all_possible_atm_categories:
        aggregated_ops_features[f"ops_atm_cat_{cat}"] = 0
    for meth in all_possible_mix_methods:
        aggregated_ops_features[f"ops_mix_meth_{meth}"] = 0

    if not isinstance(operations_simplified_list, list):
        operations_simplified_list = []
    aggregated_ops_features['proc_num_total_steps'] = len(operations_simplified_list)

    default_atm_category = ATMOSPHERE_CONFIG["default_category"]
    default_mix_method = MIXING_METHOD_CONFIG["default_method"]

    all_temps_in_reaction = []
    heating_steps = []          # one {'temp', 'duration'} record per heating operation
    mixing_methods_found = []   # non-default mixing methods, in order of appearance
    parsed_atm_category = default_atm_category
    atm_category_found = False

    step_flag_to_process_flag = (
        ('op_is_shaping', 'proc_has_shaping'),
        ('op_is_sintering', 'proc_has_sintering'),
        ('op_is_drying', 'proc_has_drying'),
        ('op_is_quenching', 'proc_has_quenching'),
        ('op_is_annealing', 'proc_has_annealing'),
        ('op_is_calcination', 'proc_has_calcination'),
    )

    for op_idx, op_dict_raw in enumerate(operations_simplified_list):
        op_features = parse_single_operation_detailed_for_input(op_dict_raw, f"predict_op_{op_idx}")
        if not op_features:
            # Non-dict entries parse to {}. Previously the missing atmosphere
            # key compared unequal to the default and the subsequent direct
            # lookup raised KeyError; such entries are now simply skipped.
            continue

        op_temps = op_features.get('op_temp_C_list') or []
        op_durations = op_features.get('op_duration_h_list') or []
        all_temps_in_reaction.extend(op_temps)

        if op_features.get('op_is_heating'):
            aggregated_ops_features['proc_num_heating_steps'] += 1
            step_duration = np.nansum(op_durations)  # 0.0 for an empty list
            if op_durations:
                aggregated_ops_features['proc_total_heating_duration_h'] += step_duration
            heating_steps.append({
                'temp': np.nanmax(op_temps) if op_temps else np.nan,
                'duration': step_duration,
            })

        if op_features.get('op_is_mixing'):
            aggregated_ops_features['proc_num_mixing_steps'] += 1
            current_mix_method = op_features.get('op_mixing_method', default_mix_method)
            if current_mix_method != default_mix_method:
                mixing_methods_found.append(current_mix_method)

        if op_features.get('op_is_grinding'):
            aggregated_ops_features['proc_num_grinding_steps'] += 1

        for op_flag, proc_flag in step_flag_to_process_flag:
            if op_features.get(op_flag):
                aggregated_ops_features[proc_flag] = True

        # The first operation with a non-default atmosphere category wins.
        op_atm_category = op_features.get('op_atmosphere_category')
        if not atm_category_found and op_atm_category != default_atm_category:
            parsed_atm_category = op_atm_category
            atm_category_found = True

    if heating_steps:
        # Primary heating step: highest temperature, ties broken by duration;
        # steps without a temperature rank lowest.
        primary_heat_step = max(
            heating_steps,
            key=lambda step: (
                step['temp'] if pd.notna(step['temp']) else -float('inf'),
                step['duration'],
            ),
        )
        if pd.notna(primary_heat_step['temp']):
            aggregated_ops_features['proc_primary_heating_temp_C'] = primary_heat_step['temp']
        # The former fallback to the primary step's atmosphere was unreachable:
        # if no operation set a non-default category above, every heating step
        # carries the default category too.

    parsed_mix_method = mixing_methods_found[0] if mixing_methods_found else default_mix_method

    # One-hot columns are only set when they were pre-created from the
    # caller-supplied vocabularies; unknown labels are silently ignored.
    atm_ohe_col = f"ops_atm_cat_{parsed_atm_category}"
    if atm_ohe_col in aggregated_ops_features:
        aggregated_ops_features[atm_ohe_col] = 1

    mix_ohe_col = f"ops_mix_meth_{parsed_mix_method}"
    if mix_ohe_col in aggregated_ops_features:
        aggregated_ops_features[mix_ohe_col] = 1

    if all_temps_in_reaction:
        aggregated_ops_features['proc_max_temperature_C'] = np.nanmax(all_temps_in_reaction)
        aggregated_ops_features['proc_min_temperature_C'] = np.nanmin(all_temps_in_reaction)
        aggregated_ops_features['proc_avg_temperature_C'] = np.nanmean(all_temps_in_reaction)

    total_duration = aggregated_ops_features['proc_total_heating_duration_h']
    if (
        aggregated_ops_features['proc_num_heating_steps'] == 0
        or pd.isna(total_duration)
        or total_duration == 0
    ):
        # A zero total is reported as missing rather than as a real 0 h.
        aggregated_ops_features['proc_total_heating_duration_h'] = np.nan

    return aggregated_ops_features
|
|
|
def _stoich_amount_as_float(item):
    """Return ``item['amount']`` as a float, or NaN when missing or not numeric."""
    amount = item.get('amount')
    if amount is None:
        return np.nan
    try:
        return float(amount)
    except (TypeError, ValueError, OverflowError):
        # e.g. "1/2", "x", a list, pandas NA: treated as missing, not fatal.
        return np.nan


def generate_stoichiometry_features_for_input(reactants_simplified, products_simplified, standardize_fn_local):
    """Build fixed-width stoichiometric coefficient features.

    Produces ``reactant1_coeff`` .. ``reactant3_coeff`` and ``product1_coeff`` ..
    ``product2_coeff`` from the ``'amount'`` entry of the first three reactants
    and first two products, plus the total reactant and product counts.

    Args:
        reactants_simplified: Sequence of reactant entries (dicts are read,
            other entries are ignored); falsy input counts as zero reactants.
        products_simplified: Sequence of product entries, handled the same way.
        standardize_fn_local: Unused here; kept for interface compatibility.

    Returns:
        Dict with the coefficient columns (``np.nan`` where the slot is empty,
        the entry is not a dict, or the amount is missing or non-numeric) and
        ``num_reactants_in_reaction`` / ``num_products_in_reaction``.
    """
    groups = (
        ('reactant', reactants_simplified, 3),
        ('product', products_simplified, 2),
    )

    stoich_features = {}
    # Pre-create every coefficient column so the output width is constant.
    for label, _items, slots in groups:
        for position in range(1, slots + 1):
            stoich_features[f'{label}{position}_coeff'] = np.nan

    for label, items, slots in groups:
        stoich_features[f'num_{label}s_in_reaction'] = len(items) if items else 0
        if not items:
            continue
        for position, item in enumerate(items[:slots], start=1):
            if isinstance(item, dict):
                stoich_features[f'{label}{position}_coeff'] = _stoich_amount_as_float(item)

    return stoich_features
|
|