File size: 2,654 Bytes
bd24fca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import ast
from typing import List
import pandas as pd

from config import EXCHANGE_RATES


def convert_to_usd(value, country):
    """Divide a local-currency amount by its country's EXCHANGE_RATES entry.

    The amount is returned untouched when either ``value`` or ``country``
    is missing (``None``/NaN).  Countries that have no entry in
    ``EXCHANGE_RATES`` fall back to a rate of ``1.0``, i.e. the amount is
    passed through numerically unchanged.
    """
    has_missing_input = pd.isna(value) or pd.isna(country)
    if not has_missing_input:
        # presumably rates are expressed as local units per 1 USD — verify
        # against config.EXCHANGE_RATES
        rate = EXCHANGE_RATES.get(country, 1.0)
        return value / rate
    return value


def parse_emtec_list(txt: str) -> List[str]:
    """Safely parse the EMTEC list string into a real ``list``.

    Args:
        txt: Text holding a Python list literal (e.g. ``"['A', 'B']"``).
            Missing values (``None``/NaN) and other non-string scalars are
            tolerated.  An already-materialised ``list``/``tuple`` is
            accepted as well and returned as a new list.

    Returns:
        The parsed items as a list.  An empty list is returned when the
        input is missing, blank, ``'[]'``, not a valid Python literal, or a
        literal that is not a list/tuple/set (for example a bare quoted
        string or a number), so callers can always iterate the result
        element by element.
    """
    # Already a sequence: nothing to parse, just hand back a list copy.
    if isinstance(txt, (list, tuple)):
        return list(txt)

    # None, NaN and any other non-text scalar carry no list to parse.
    if not isinstance(txt, str):
        return []

    text = txt.strip()
    if text in ('', '[]'):
        return []

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Errors ast.literal_eval can raise on malformed input.
        return []

    # A valid literal is not necessarily a list ("'AI'" is a str, "5" an int);
    # returning those would make callers iterate characters or crash.
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return []


def create_multiple_classification_data(df: pd.DataFrame) -> pd.DataFrame:
    """Generate one row per every EMSEC × EMTEC combination (cross‑product).

    For each input row:

    * EMSEC: every non-missing ``EMSEC1`` … ``EMSEC5`` value yields one
      sector entry, using ``EMSEC{i}_Sector`` / ``EMSEC{i}_Industry`` for the
      labels (missing or empty labels become ``'Unclassified'``).
    * EMTEC: ``EMTEC_LEVEL1`` × ``EMTEC_LEVEL2`` × ``EMTEC_LEVEL3`` are
      parsed with ``parse_emtec_list`` and fully crossed; an empty level 2
      or level 3 contributes a single ``'Unclassified'``.  When level 1 is
      empty the row gets one all-``'Unclassified'`` EMTEC entry.

    Args:
        df: Source frame; one row per record, carrying the EMSEC*/EMTEC*
            columns described above.

    Returns:
        A new DataFrame holding a copy of the source row for each
        EMSEC × EMTEC pair, with ``Sector``, ``Industry``, ``Sub_industry``,
        ``Theme``, ``Technology`` and ``Sub_Technology`` set.  Source rows
        without any EMSEC code produce no output rows.  If nothing is
        produced, the empty result still carries the input columns plus the
        six classification columns.
    """
    unclassified = 'Unclassified'
    classification_columns = [
        'Sector', 'Industry', 'Sub_industry',
        'Theme', 'Technology', 'Sub_Technology',
    ]

    def label_or_unclassified(value):
        # NaN is truthy (and pd.NA raises in a boolean context), so a plain
        # `value or default` would let missing labels leak into the output.
        if pd.isna(value) or not value:
            return unclassified
        return value

    expanded_rows: List[dict] = []

    for _, row in df.iterrows():
        # ── 1) Collect valid EMSEC levels ------------------------------------------------
        emsec_list = []
        for i in range(1, 6):
            emsec_code = row.get(f'EMSEC{i}')
            if pd.notna(emsec_code):
                emsec_list.append({
                    'sector': label_or_unclassified(row.get(f'EMSEC{i}_Sector')),
                    'industry': label_or_unclassified(row.get(f'EMSEC{i}_Industry')),
                    'sub_industry': emsec_code,
                })

        # No EMSEC code means no output rows for this record.
        if not emsec_list:
            continue

        # ── 2) Collect EMTEC hierarchy ---------------------------------------------------
        lvl1 = parse_emtec_list(row.get('EMTEC_LEVEL1'))
        lvl2 = parse_emtec_list(row.get('EMTEC_LEVEL2'))
        lvl3 = parse_emtec_list(row.get('EMTEC_LEVEL3'))

        if lvl1:
            emtec_combos: List[dict] = [
                {'theme': l1, 'technology': l2, 'sub_technology': l3}
                for l1 in lvl1
                for l2 in (lvl2 or [unclassified])
                for l3 in (lvl3 or [unclassified])
            ]
        else:
            emtec_combos = [{
                'theme': unclassified,
                'technology': unclassified,
                'sub_technology': unclassified,
            }]

        # ── 3) Produce cross‑product rows ----------------------------------------------
        base_row = row.to_dict()  # converted once, copied per combination
        for emsec in emsec_list:
            for emtec in emtec_combos:
                expanded_rows.append({
                    **base_row,
                    'Sector': emsec['sector'],
                    'Industry': emsec['industry'],
                    'Sub_industry': emsec['sub_industry'],
                    'Theme': emtec['theme'],
                    'Technology': emtec['technology'],
                    'Sub_Technology': emtec['sub_technology'],
                })

    if not expanded_rows:
        # Keep the schema stable so downstream column access does not fail
        # on an empty result.
        columns = list(dict.fromkeys([*df.columns, *classification_columns]))
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(expanded_rows)