"""Build flat feature dictionaries from operation and stoichiometry records."""

import re

import numpy as np
import pandas as pd

from .constants import ATMOSPHERE_CONFIG, MIXING_METHOD_CONFIG

# Exceptions ``float()`` can raise for a value it cannot convert.
_FLOAT_ERRORS = (TypeError, ValueError, OverflowError)

# Operation types considered too generic to be reported as a mixing method.
_GENERIC_MIXING_TYPES = ("mixing", "liquidgrinding", "solutionmixing", "grinding")

# Mixing-method names that mark an operation as a grinding step.
_GRINDING_METHODS = (
    'grinding',
    'ball_milling',
    'planetary_milling',
    'attritor_milling',
    'shaker_milling',
    'mortar_pestle',
)

# (per-operation flag, process-level flag) pairs propagated during aggregation.
_PROCESS_FLAG_BY_OP_FLAG = (
    ('op_is_shaping', 'proc_has_shaping'),
    ('op_is_sintering', 'proc_has_sintering'),
    ('op_is_drying', 'proc_has_drying'),
    ('op_is_quenching', 'proc_has_quenching'),
    ('op_is_annealing', 'proc_has_annealing'),
    ('op_is_calcination', 'proc_has_calcination'),
)


def _first_float_from_mapping(mapping, target_keys):
    """Return the first value under *target_keys* that converts to float.

    A non-empty list value is represented by its first element. Returns
    ``None`` when no key yields a convertible value.
    """
    for key in target_keys:
        candidate = mapping.get(key)
        if candidate is None:
            continue
        if isinstance(candidate, list) and candidate:
            candidate = candidate[0]
        try:
            return float(candidate)
        except _FLOAT_ERRORS:
            continue
    return None


def _extract_numerical_value_from_op_condition(condition_entry, target_keys=('value', 'max_value', 'values')):
    """Pull a single float out of a condition entry.

    Args:
        condition_entry: A number, a dict, or a list whose first element is a
            number or a dict. Only the first element of a list is inspected.
        target_keys: Dict keys tried in order when the entry (or its first
            element) is a dict.

    Returns:
        The extracted float, or ``np.nan`` when nothing convertible is found.
    """
    candidate = condition_entry
    if isinstance(candidate, list):
        if not candidate:
            return np.nan
        candidate = candidate[0]

    if isinstance(candidate, dict):
        found = _first_float_from_mapping(candidate, target_keys)
        return np.nan if found is None else found

    if isinstance(candidate, (int, float, np.number)):
        try:
            return float(candidate)
        except _FLOAT_ERRORS:
            return np.nan

    return np.nan


def _atmosphere_text_from_conditions(conditions):
    """Return the atmosphere text found in *conditions*, or ``None``.

    Reads the ``'atmosphere'`` entry and falls back to a string ``'text'``
    entry when the former is missing or empty.
    """
    if not isinstance(conditions, dict):
        return None
    source = conditions.get('atmosphere')
    if not source and isinstance(conditions.get('text'), str):
        source = conditions['text']
    if not source:
        return None
    if isinstance(source, list):
        return str(source[0])
    if isinstance(source, str):
        return source
    if isinstance(source, dict):
        return str(source.get('gas', source.get('value', '')))
    return None


def _extract_atmosphere_from_op(op_conditions_dict, op_string, atm_config_local, entry_identifier):
    """Determine the atmosphere of one operation.

    Args:
        op_conditions_dict: Conditions of the operation; ignored unless a dict.
        op_string: Free-text description searched when the conditions give no
            match.
        atm_config_local: Mapping with ``"default_specific"``,
            ``"default_category"`` and ``"patterns"`` (an iterable of
            ``(regex, specific, category)`` triples).
        entry_identifier: Unused; kept for interface compatibility.

    Returns:
        A ``(specific, category)`` tuple; the configured defaults when nothing
        matches.
    """
    atm_specific = atm_config_local["default_specific"]
    atm_category = atm_config_local["default_category"]

    atm_text = _atmosphere_text_from_conditions(op_conditions_dict)
    if atm_text:
        atm_text_lower = atm_text.lower()
        for pattern_regex, specific, category in atm_config_local["patterns"]:
            if specific.lower() == atm_text_lower or re.search(pattern_regex, atm_text, re.IGNORECASE):
                return specific, category
        # An unmatched text containing '/' is reported verbatim as a mixture.
        if '/' in atm_text_lower:
            return atm_text, "Mixed"

    if isinstance(op_string, str) and op_string:
        for pattern_regex, specific, category in atm_config_local["patterns"]:
            if re.search(pattern_regex, op_string, re.IGNORECASE):
                return specific, category

    return atm_specific, atm_category


def _extract_mixing_method_from_op(op_dict, op_string, mix_config_local, entry_identifier):
    """Determine the mixing method of one operation.

    Args:
        op_dict: Raw operation record; its ``'type'`` entry is consulted.
        op_string: Free-text description, searched first.
        mix_config_local: Mapping with ``"default_method"`` and ``"patterns"``
            (an iterable of ``(regex, method_name)`` pairs).
        entry_identifier: Unused; kept for interface compatibility.

    Returns:
        The matched method name, the lower-cased operation type for a
        non-generic mix/grind/mill type, or the configured default.
    """
    mix_method = mix_config_local["default_method"]
    raw_type = op_dict.get('type', '') if isinstance(op_dict, dict) else ''
    op_type = str(raw_type).lower()

    if isinstance(op_string, str) and op_string:
        for pattern_regex, method_name in mix_config_local["patterns"]:
            if re.search(pattern_regex, op_string, re.IGNORECASE):
                return method_name

    if any(keyword in op_type for keyword in ('mix', 'grind', 'mill')):
        for pattern_regex, method_name in mix_config_local["patterns"]:
            if re.search(pattern_regex, op_type, re.IGNORECASE):
                return method_name
        # NOTE(review): this fallback is assumed to apply only to
        # mix/grind/mill types; the original indentation was lost - confirm.
        if op_type.strip() and op_type not in _GENERIC_MIXING_TYPES:
            return op_type

    return mix_method


def _extract_thermal_conditions(conditions_dict, op_string, entry_identifier):
    """Collect heating temperatures and durations from a conditions dict.

    Args:
        conditions_dict: Conditions of the operation; ignored unless a dict.
        op_string: Unused; kept for interface compatibility.
        entry_identifier: Unused; kept for interface compatibility.

    Returns:
        ``(temps, durs)``: two lists holding at most one non-NaN float each,
        read from ``'heating_temperature'`` and ``'heating_time'``.
    """
    temps, durs = [], []
    if not isinstance(conditions_dict, dict):
        return temps, durs

    for condition_key, bucket in (('heating_temperature', temps), ('heating_time', durs)):
        raw_value = conditions_dict.get(condition_key)
        if not raw_value:
            continue
        extracted = _extract_numerical_value_from_op_condition(raw_value)
        if pd.notna(extracted):
            bucket.append(extracted)

    return temps, durs


def parse_single_operation_detailed_for_input(op_dict_raw, entry_identifier="predict_op"):
    """Turn one raw operation record into a dictionary of ``op_*`` features.

    Args:
        op_dict_raw: Operation record with optional ``'type'``, ``'string'``
            and ``'conditions'`` entries.
        entry_identifier: Label forwarded to the extraction helpers.

    Returns:
        A dict of per-operation features, or ``{}`` when *op_dict_raw* is not
        a dict.
    """
    if not isinstance(op_dict_raw, dict):
        return {}

    op_type_lower = str(op_dict_raw.get('type', 'UnknownType')).lower()
    op_string_lower = str(op_dict_raw.get('string', '')).lower()
    conditions = op_dict_raw.get('conditions', {})

    temps, durs = _extract_thermal_conditions(conditions, op_string_lower, entry_identifier)
    atm_specific, atm_category = _extract_atmosphere_from_op(
        conditions, op_string_lower, ATMOSPHERE_CONFIG, entry_identifier
    )
    mixing_method = _extract_mixing_method_from_op(
        op_dict_raw, op_string_lower, MIXING_METHOD_CONFIG, entry_identifier
    )

    op_features = {
        'op_temp_C_list': temps,
        'op_duration_h_list': durs,
        'op_atmosphere_specific': atm_specific,
        'op_atmosphere_category': atm_category,
        'op_mixing_method': mixing_method,
    }
    op_features['op_is_heating'] = any(
        k in op_type_lower for k in ('heat', 'anneal', 'sinter', 'calcination')
    )
    op_features['op_is_mixing'] = (
        'mix' in op_type_lower
        or mixing_method != MIXING_METHOD_CONFIG["default_method"]
    )
    op_features['op_is_grinding'] = (
        any(k in op_type_lower for k in ('grind', 'mill'))
        or 'pulverize' in op_string_lower
        or mixing_method in _GRINDING_METHODS
    )
    op_features['op_is_shaping'] = 'shap' in op_type_lower
    # 'dry' also covers 'drying', so a single substring test suffices.
    op_features['op_is_drying'] = 'dry' in op_type_lower
    op_features['op_is_quenching'] = 'quench' in op_type_lower
    op_features['op_is_annealing'] = 'anneal' in op_type_lower or 'anneal' in op_string_lower
    op_features['op_is_sintering'] = 'sinter' in op_type_lower or 'sinter' in op_string_lower
    op_features['op_is_calcination'] = (
        any(k in op_type_lower for k in ('calcine', 'calcination'))
        or 'calcination' in op_string_lower
    )
    return op_features


def _heating_step_rank(step):
    """Sort key for heating steps: temperature first (NaN lowest), then duration."""
    temp = step['temp']
    return (temp if pd.notna(temp) else -float('inf'), step['duration'])


def generate_process_features_for_input(operations_simplified_list, all_possible_atm_categories, all_possible_mix_methods):
    """Aggregate per-operation features into process-level features.

    Args:
        operations_simplified_list: List of raw operation records. Anything
            that is not a list is treated as an empty list.
        all_possible_atm_categories: Categories for which an
            ``ops_atm_cat_<category>`` indicator column is created.
        all_possible_mix_methods: Methods for which an
            ``ops_mix_meth_<method>`` indicator column is created.

    Returns:
        A dict with ``proc_*`` counts, flags and temperature/duration
        statistics plus the indicator columns. At most one atmosphere and one
        mixing indicator is set to 1, and only if its column exists.
    """
    default_atm_category = ATMOSPHERE_CONFIG["default_category"]
    default_mix_method = MIXING_METHOD_CONFIG["default_method"]

    aggregated_ops_features = {
        'proc_total_heating_duration_h': 0.0,
        'proc_max_temperature_C': np.nan,
        'proc_min_temperature_C': np.nan,
        'proc_avg_temperature_C': np.nan,
        'proc_primary_heating_temp_C': np.nan,
        'proc_num_total_steps': 0,
        'proc_num_heating_steps': 0,
        'proc_num_mixing_steps': 0,
        'proc_num_grinding_steps': 0,
        'proc_has_annealing': False,
        'proc_has_sintering': False,
        'proc_has_calcination': False,
        'proc_has_quenching': False,
        'proc_has_shaping': False,
        'proc_has_drying': False,
    }
    for cat in all_possible_atm_categories:
        aggregated_ops_features[f"ops_atm_cat_{cat}"] = 0
    for meth in all_possible_mix_methods:
        aggregated_ops_features[f"ops_mix_meth_{meth}"] = 0

    if not isinstance(operations_simplified_list, list):
        operations_simplified_list = []
    aggregated_ops_features['proc_num_total_steps'] = len(operations_simplified_list)

    all_temps = []
    heating_steps = []
    mixing_methods_found = []
    atm_already_set = False
    chosen_atm_category = default_atm_category
    chosen_mix_method = default_mix_method

    for op_idx, op_dict_raw in enumerate(operations_simplified_list):
        op_features = parse_single_operation_detailed_for_input(op_dict_raw, f"predict_op_{op_idx}")
        if not op_features:
            # Non-dict entries yield no features. They remain counted in
            # proc_num_total_steps but must not reach the atmosphere lookup
            # below, which would otherwise fail on the missing key.
            continue

        temps = op_features.get('op_temp_C_list')
        durations = op_features.get('op_duration_h_list')

        if temps:
            all_temps.extend(temps)

        if op_features.get('op_is_heating'):
            # NOTE(review): duration accumulation and step recording are
            # assumed to belong to heating operations only - confirm.
            aggregated_ops_features['proc_num_heating_steps'] += 1
            if durations:
                aggregated_ops_features['proc_total_heating_duration_h'] += np.nansum(durations)
            heating_steps.append({
                'temp': np.nanmax(temps) if temps else np.nan,
                'duration': np.nansum(op_features.get('op_duration_h_list', [0.0])),
                'atm_category': op_features.get('op_atmosphere_category'),
                'is_anneal': op_features.get('op_is_annealing'),
                'is_sinter': op_features.get('op_is_sintering'),
                'is_calcine': op_features.get('op_is_calcination'),
            })

        if op_features.get('op_is_mixing'):
            aggregated_ops_features['proc_num_mixing_steps'] += 1
            current_mix_method = op_features.get('op_mixing_method', default_mix_method)
            if current_mix_method != default_mix_method:
                mixing_methods_found.append(current_mix_method)

        if op_features.get('op_is_grinding'):
            aggregated_ops_features['proc_num_grinding_steps'] += 1

        for op_flag, proc_flag in _PROCESS_FLAG_BY_OP_FLAG:
            if op_features.get(op_flag):
                aggregated_ops_features[proc_flag] = True

        # The first operation with a non-default atmosphere category wins.
        op_atm_category = op_features.get('op_atmosphere_category')
        if not atm_already_set and op_atm_category != default_atm_category:
            chosen_atm_category = op_atm_category
            atm_already_set = True

    if heating_steps:
        primary_heat_step = max(heating_steps, key=_heating_step_rank)
        if pd.notna(primary_heat_step['temp']):
            aggregated_ops_features['proc_primary_heating_temp_C'] = primary_heat_step['temp']
        if not atm_already_set and primary_heat_step.get('atm_category') != default_atm_category:
            chosen_atm_category = primary_heat_step['atm_category']

    if mixing_methods_found:
        chosen_mix_method = mixing_methods_found[0]

    # Indicator columns are only set when they were declared by the caller.
    atm_ohe_col = f"ops_atm_cat_{chosen_atm_category}"
    if atm_ohe_col in aggregated_ops_features:
        aggregated_ops_features[atm_ohe_col] = 1
    mix_ohe_col = f"ops_mix_meth_{chosen_mix_method}"
    if mix_ohe_col in aggregated_ops_features:
        aggregated_ops_features[mix_ohe_col] = 1

    if all_temps:
        aggregated_ops_features['proc_max_temperature_C'] = np.nanmax(all_temps)
        aggregated_ops_features['proc_min_temperature_C'] = np.nanmin(all_temps)
        aggregated_ops_features['proc_avg_temperature_C'] = np.nanmean(all_temps)

    # A zero or missing total is reported as NaN rather than 0.
    total_duration = aggregated_ops_features['proc_total_heating_duration_h']
    if (
        aggregated_ops_features['proc_num_heating_steps'] == 0
        or pd.isna(total_duration)
        or total_duration == 0
    ):
        aggregated_ops_features['proc_total_heating_duration_h'] = np.nan

    return aggregated_ops_features


def _coefficient_or_nan(amount):
    """Convert *amount* to float; NaN when it is missing or not numeric."""
    if amount is None:
        return np.nan
    try:
        return float(amount)
    except _FLOAT_ERRORS:
        return np.nan


def generate_stoichiometry_features_for_input(reactants_simplified, products_simplified, standardize_fn_local):
    """Build coefficient and count features for reactants and products.

    Args:
        reactants_simplified: Sequence of reactant records; the ``'amount'``
            of the first three dict entries is used.
        products_simplified: Sequence of product records; the ``'amount'`` of
            the first two dict entries is used.
        standardize_fn_local: Unused; kept for interface compatibility.

    Returns:
        A dict with ``reactant{1..3}_coeff``, ``product{1..2}_coeff``,
        ``num_reactants_in_reaction`` and ``num_products_in_reaction``.
        Coefficients are NaN when the amount is missing or not numeric.
    """
    max_r, max_p = 3, 2
    stoich_features = {}
    for slot in range(1, max_r + 1):
        stoich_features[f'reactant{slot}_coeff'] = np.nan
    for slot in range(1, max_p + 1):
        stoich_features[f'product{slot}_coeff'] = np.nan

    for items, prefix, limit in (
        (reactants_simplified, 'reactant', max_r),
        (products_simplified, 'product', max_p),
    ):
        stoich_features[f'num_{prefix}s_in_reaction'] = len(items) if items else 0
        if not items:
            continue
        for slot, item in enumerate(items[:limit], start=1):
            if isinstance(item, dict):
                stoich_features[f'{prefix}{slot}_coeff'] = _coefficient_or_nan(item.get('amount'))

    return stoich_features