import ast
from itertools import product
from typing import List

import pandas as pd

from config import EXCHANGE_RATES

# Placeholder used wherever a classification level is missing.
_UNCLASSIFIED = 'Unclassified'

# EMSEC columns are numbered EMSEC1 .. EMSEC5.
_EMSEC_LEVELS = range(1, 6)

# Columns added to every expanded row, in output order.
_CLASSIFICATION_COLUMNS = (
    'Sector',
    'Industry',
    'Sub_industry',
    'Theme',
    'Technology',
    'Sub_Technology',
)


def convert_to_usd(value, country):
    """Convert a local-currency amount to USD using EXCHANGE_RATES.

    Args:
        value: Amount in local currency. Missing values (as judged by
            ``pd.isna``) are returned unchanged.
        country: Key looked up in ``EXCHANGE_RATES``. If it is missing,
            ``value`` is returned unchanged.

    Returns:
        ``value`` divided by the rate found for ``country``. Countries absent
        from ``EXCHANGE_RATES`` fall back to a rate of 1.0.
    """
    # NOTE(review): presumably rates are "local units per 1 USD" since the
    # amount is divided by the rate -- confirm against config.EXCHANGE_RATES.
    if pd.isna(value) or pd.isna(country):
        return value
    return value / EXCHANGE_RATES.get(country, 1.0)


def parse_emtec_list(txt: str) -> List[str]:
    """Safely parse the EMTEC list string.

    Args:
        txt: Normally the string form of a Python list literal (for example
            ``"['A', 'B']"``). A value that is already a list, tuple or set
            is accepted and returned as a list.

    Returns:
        The parsed items as a list. An empty list is returned for missing
        values, blank strings, ``'[]'``, text that is not a valid literal,
        and literals that do not evaluate to a list, tuple or set.
    """
    # Already-parsed collections are passed through instead of being dropped.
    if isinstance(txt, (list, tuple, set)):
        return list(txt)
    # None, NaN, numbers, etc. cannot hold a list literal.
    if not isinstance(txt, str):
        return []

    text = txt.strip()
    if text in ('', '[]'):
        return []

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the failures literal_eval raises on malformed input.
        return []

    # A bare scalar (e.g. "'AI'" or "5") is not a list; returning it would
    # make callers iterate characters or fail on a non-iterable.
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return []


def _or_unclassified(value):
    """Return ``value``, or 'Unclassified' when it is missing or falsy."""
    # NaN is truthy and pd.NA raises on truth-testing, so a plain
    # ``value or default`` does not catch pandas missing values.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return _UNCLASSIFIED
    return value or _UNCLASSIFIED


def _collect_emsec_levels(row: pd.Series) -> List[dict]:
    """Return one sector/industry/sub_industry dict per non-missing EMSEC code."""
    levels: List[dict] = []
    for i in _EMSEC_LEVELS:
        emsec_code = row.get(f'EMSEC{i}')
        if pd.notna(emsec_code):
            levels.append({
                'sector': _or_unclassified(row.get(f'EMSEC{i}_Sector')),
                'industry': _or_unclassified(row.get(f'EMSEC{i}_Industry')),
                'sub_industry': emsec_code,
            })
    return levels


def _collect_emtec_combos(row: pd.Series) -> List[dict]:
    """Return every theme/technology/sub_technology combination for a row."""
    lvl1 = parse_emtec_list(row.get('EMTEC_LEVEL1'))
    if not lvl1:
        # Without a level-1 theme the whole hierarchy is unclassified.
        return [{
            'theme': _UNCLASSIFIED,
            'technology': _UNCLASSIFIED,
            'sub_technology': _UNCLASSIFIED,
        }]

    # Empty lower levels still yield one combination per theme.
    lvl2 = parse_emtec_list(row.get('EMTEC_LEVEL2')) or [_UNCLASSIFIED]
    lvl3 = parse_emtec_list(row.get('EMTEC_LEVEL3')) or [_UNCLASSIFIED]
    return [
        {'theme': l1, 'technology': l2, 'sub_technology': l3}
        for l1, l2, l3 in product(lvl1, lvl2, lvl3)
    ]


def create_multiple_classification_data(df: pd.DataFrame) -> pd.DataFrame:
    """Generate one row per every EMSEC x EMTEC combination (cross-product).

    Args:
        df: Input frame. Reads the columns ``EMSEC1``..``EMSEC5`` with their
            ``EMSEC{i}_Sector`` / ``EMSEC{i}_Industry`` companions, and
            ``EMTEC_LEVEL1``..``EMTEC_LEVEL3``. Absent columns are treated
            as missing values.

    Returns:
        A new DataFrame holding, for each input row, a copy of that row for
        every EMSEC x EMTEC combination, with the columns ``Sector``,
        ``Industry``, ``Sub_industry``, ``Theme``, ``Technology`` and
        ``Sub_Technology`` set. Input rows without any non-missing EMSEC
        code produce no output rows. When nothing is produced, the result is
        an empty frame that still carries the input columns plus the six
        classification columns.
    """
    expanded_rows: List[dict] = []

    for _, row in df.iterrows():
        emsec_levels = _collect_emsec_levels(row)
        if not emsec_levels:
            continue

        emtec_combos = _collect_emtec_combos(row)
        # Convert the row once; each combination gets its own shallow copy.
        base = row.to_dict()

        for emsec, emtec in product(emsec_levels, emtec_combos):
            expanded_rows.append({
                **base,
                'Sector': emsec['sector'],
                'Industry': emsec['industry'],
                'Sub_industry': emsec['sub_industry'],
                'Theme': emtec['theme'],
                'Technology': emtec['technology'],
                'Sub_Technology': emtec['sub_technology'],
            })

    if not expanded_rows:
        # Keep a usable schema so downstream column access does not fail.
        extra = [c for c in _CLASSIFICATION_COLUMNS if c not in df.columns]
        return pd.DataFrame(columns=[*df.columns, *extra])

    return pd.DataFrame(expanded_rows)