joseph-data committed on
Commit 3e12d11 · unverified · 1 Parent(s): 5ad2292

updated the app

.gitattributes ADDED
@@ -0,0 +1,4 @@
+ data/daioe_simple.csv filter=lfs diff=lfs merge=lfs -text
+ data/daioe_weighted.csv filter=lfs diff=lfs merge=lfs -text
+ data/*.csv filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
.gitattributes copy ADDED
@@ -0,0 +1,4 @@
+ data/daioe_simple.csv filter=lfs diff=lfs merge=lfs -text
+ data/daioe_weighted.csv filter=lfs diff=lfs merge=lfs -text
+ data/*.csv filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -8,6 +8,8 @@ wheels/
 
  # Virtual environments
  .venv
+ .ruff_cache
+ .vscode
 
  # Project-specific artifacts
  test_notebooks/
@@ -24,3 +26,9 @@ data/04_translation_files/
  scripts/03_translate_ssyk2012.py
  scripts/03_translate_ssyk96.py
  _brand.yml
+ data/daioe_simple.csv
+ data/daioe_weighted.csv
+
+
+ test.py
+ test2.py
app.py CHANGED
@@ -1,207 +1,321 @@
- """
- Shiny app: Employment headcount by age group for a selected SSYK3 occupation,
- indexed to 2022 = 1. Uses SCB AKU employment pulled via scripts/04_occ.py.
- """
-
- from __future__ import annotations
-
- from functools import lru_cache
  from pathlib import Path
- from typing import Dict, List
-
- import matplotlib.pyplot as plt
- import pandas as pd
- from shiny import App, render, ui
-
- ROOT = Path(__file__).resolve().parent
- OCC_PATH = ROOT / "scripts" / "04_occ.py"
-
- # Age groups available from SCB; keep order consistent for the UI and legend.
- AGE_ORDER: List[str] = [
-     "16-24",
-     "25-29",
-     "30-34",
-     "35-39",
-     "40-44",
-     "45-49",
-     "50-54",
-     "55-59",
-     "60-64",
- ]
- AGE_LABELS: Dict[str, str] = {age: f"{age} years" for age in AGE_ORDER}
-
-
- def _load_occ_module():
-     """Load the employment fetcher from scripts/04_occ.py."""
-     import importlib.util
-
-     spec = importlib.util.spec_from_file_location("scripts.occ", OCC_PATH)
-     module = importlib.util.module_from_spec(spec)
-     assert spec.loader is not None
-     spec.loader.exec_module(module)
-     return module
-
-
- @lru_cache(maxsize=1)
- def load_employment() -> pd.DataFrame:
-     """Fetch SCB AKU employment by occupation, age, and year."""
-     occ_mod = _load_occ_module()
-     df = occ_mod.fetch_scb_aku_occupations()
-     df = df.rename(columns={"code_3": "code"})
-     df["code"] = df["code"].astype(str).str.zfill(3)
-     df["year"] = df["year"].astype(int)
-     df["value"] = df["value"].astype(int)
-     df = df[df["age"].isin(AGE_ORDER)].copy()
-     return df
-
-
- @lru_cache(maxsize=1)
- def profession_choices() -> Dict[str, str]:
-     """
-     Build a mapping of SSYK3 codes to display labels.
-     Uses the most frequent occupation label observed for each code.
-     """
-     df = load_employment()
-     df = df[df["code"].str.len() == 3].copy()
-     df = df.dropna(subset=["occupation"])
-
-     def pick_label(group: pd.Series) -> str:
-         return group.mode().iat[0] if not group.mode().empty else group.iloc[0]
-
-     labels = (
-         df.groupby("code")["occupation"]
-         .apply(pick_label)
-         .reset_index()
-         .sort_values("code")
-     )
-     return {row.code: f"{row.code} - {row.occupation}" for row in labels.itertuples()}
-
-
- @lru_cache(maxsize=1)
- def available_years() -> List[int]:
-     """Years present in the employment series, sorted ascending."""
-     df = load_employment()
-     return sorted(df["year"].unique().tolist())
-
-
- def build_headcount(code: str, ages: List[str], base_year: int | None) -> pd.DataFrame:
-     """
-     Filter employment to a single SSYK3 code and selected age groups.
-     Optionally index each age group to the selected base year.
-     """
-     emp = load_employment()
-     filtered = emp[(emp["code"] == code) & (emp["age"].isin(ages))].copy()
-     if filtered.empty:
-         return filtered
-
-     if base_year is not None:
-         base = (
-             filtered[filtered["year"] == base_year][["age", "value"]]
-             .rename(columns={"value": "base_value"})
-             .set_index("age")
-         )
-         filtered["base_value"] = filtered["age"].map(base["base_value"])
-         filtered = filtered[filtered["base_value"].notna()].copy()
-         if filtered.empty:
-             return filtered
-         filtered["metric"] = filtered["value"] / filtered["base_value"]
-     else:
-         filtered["metric"] = filtered["value"]
-
-     filtered["age_label"] = filtered["age"].map(AGE_LABELS)
-     filtered = filtered.sort_values(["age", "year"])
-     return filtered
-
-
- def make_headcount_plot(df: pd.DataFrame, title: str, base_year: int | None):
-     """Create a line plot of headcount by age group for one occupation."""
-     fig, ax = plt.subplots(figsize=(10, 6))
-
-     palette = [
-         "#0072B2",
-         "#009E73",
-         "#E69F00",
-         "#D55E00",
-         "#CC79A7",
-         "#56B4E9",
-         "#999999",
-         "#F0E442",
-         "#8C564B",
-     ]
-
-     for idx, (age, group) in enumerate(df.groupby("age_label")):
-         ax.plot(group["year"], group["metric"], label=age, color=palette[idx % len(palette)], linewidth=2)
-
-     if base_year is not None:
-         ax.axvline(base_year, color="#555555", linestyle="--", linewidth=1, alpha=0.7)
-     ax.set_xlabel("Year")
-     ylabel = f"Normalized headcount (base={base_year})" if base_year is not None else "Headcount"
-     ax.set_ylabel(ylabel)
-     ax.set_title(f"Headcount over time by age group\n{title}")
-     ax.legend(title="Age group", loc="upper left")
-     ax.grid(True, linestyle="--", alpha=0.2)
-     fig.tight_layout()
-     return fig
-
-
- profession_map = profession_choices()
- default_code = next(iter(profession_map.keys()), "")
-
- app_ui = ui.page_fluid(
-     ui.h2("Headcount over time by age group"),
-     ui.input_select(
-         "profession",
-         "SSYK 3-digit occupation",
-         choices=profession_map,
-         selected=default_code,
-     ),
-     ui.input_select(
-         "base_year",
-         "Base year (optional)",
-         choices={"": "No indexing (show raw values)", **{str(y): str(y) for y in available_years()}},
-         selected="",
-     ),
-     ui.input_checkbox_group(
-         "age_groups",
-         "Age groups",
-         choices={age: AGE_LABELS[age] for age in AGE_ORDER},
-         selected=AGE_ORDER,
-         inline=True,
-     ),
-     ui.output_plot("headcount_plot", width="100%", height="650px"),
-     ui.markdown(
-         "Data: SCB AKU employment. Select a base year to normalize, or leave blank to see raw headcount."
-     ),
  )


- def server(input, output, session):
-     @render.plot
-     def headcount_plot():
-         code = input.profession()
-         ages = input.age_groups()
-         base_year_raw = input.base_year()
-         base_year = int(base_year_raw) if base_year_raw else None
-         if not code or not ages:
-             fig, ax = plt.subplots(figsize=(8, 3))
-             ax.text(0.5, 0.5, "Select an occupation and at least one age group.", ha="center", va="center")
-             ax.axis("off")
-             return fig

-         df = build_headcount(code, ages, base_year)
-         if df.empty:
-             fig, ax = plt.subplots(figsize=(8, 3))
-             ax.text(0.5, 0.5, "No data available for this selection.", ha="center", va="center")
-             ax.axis("off")
-             return fig

-         title = profession_map.get(code, code)
-         return make_headcount_plot(df, title, base_year)


- app = App(app_ui, server)


- if __name__ == "__main__":
-     # Run with: shiny run --reload app_headcount_age.py
-     app.run()
  from pathlib import Path
+
+ import plotly.graph_objects as go
+ import plotly.express as px
+ from plotly.subplots import make_subplots
+
+ from shiny import reactive
+ from shiny.express import input, ui
+ from shinywidgets import render_plotly, output_widget
+ from src.config import (
+     DEFAULT_LEVEL,
+     DEFAULT_YEAR_RANGE,
+     LEVEL_OPTIONS,
+     GLOBAL_YEAR_MIN,
+     GLOBAL_YEAR_MAX,
  )

+ from src.data_manager import load_payload


+ # Helpers for UI mapping
+ LEVEL_CHOICES = {value: label for label, value in LEVEL_OPTIONS}
+ YEAR_RANGE_DEFAULT = list(range(DEFAULT_YEAR_RANGE[0], DEFAULT_YEAR_RANGE[1] + 1))

+ # ======================================================
+ # UI LAYOUT
+ # ======================================================
+ css_file = Path(__file__).parent / "css" / "theme.css"

+ ui.include_css(css_file)

+ ui.page_opts(
+     fillable=False,
+     fillable_mobile=True,
+     full_width=True,
+     id="page",
+     lang="en",
+ )


+ with ui.sidebar(open="desktop", position="right"):
+     ui.input_select(
+         "level", "Select Occupation level", LEVEL_CHOICES, selected=DEFAULT_LEVEL
+     )
+     ui.input_selectize(
+         "selectize",
+         "Select Occupation title(s)",
+         {},
+         multiple=True,
+         options=(
+             {
+                 "placeholder": "Statisticians...",
+                 "create": False,
+                 "plugins": ["clear_button"],
+             }
+         ),
+     )
+     # ui.input_radio_buttons(
+     #     "count_mode",
+     #     "Employed persons display",
+     #     {"raw": "Raw counts", "index": "Index to base year"},
+     #     selected="raw",
+     # )
+     # with ui.panel_conditional("input.count_mode == 'index'"):
+     #     ui.input_select(
+     #         "base_year",
+     #         "Base year",
+     #         YEAR_RANGE_DEFAULT,
+     #         selected=2022,
+     #     )
+
+     ui.input_slider(
+         "year_range",
+         "Year range",
+         min=GLOBAL_YEAR_MIN,
+         max=GLOBAL_YEAR_MAX,
+         value=DEFAULT_YEAR_RANGE,
+         step=1,
+         sep="",
+     )
+     ui.input_action_button("refresh_data", "Refresh data", class_="btn-primary")
+
+
+ # ======================================================
+ # REACTIVE STATE
+ # ======================================================
+
+ # Reactive value to store the loaded payload
+ payload_store = reactive.Value(load_payload())
+
+
+ @reactive.effect
+ @reactive.event(input.refresh_data)
+ def _refresh_payload():
+     with ui.Progress() as progress:
+         progress.set(message="Refreshing data...", value=0.1)
+         # Force recompute in data manager
+         updated = load_payload(force_recompute=True)
+         progress.set(message="Updating UI...", value=0.8)
+         payload_store.set(updated)
+         progress.set(message="Done", value=1.0)
+
+
+ # Build Selectize choices per selected level
+ @reactive.calc
+ def level_label_choices():
+     df = payload_store()
+     lvl = int(input.level())
+     subset = df[df["level"] == lvl][["code", "label"]].dropna().drop_duplicates()
+     choices_list = []
+     for _, row in subset.iterrows():
+         key = row["label"]
+         value = f"{row['code']} - {row['label']}"
+         choices_list.append((key, value))
+
+     # Sort by the code (extract code from display value)
+     choices_list.sort(key=lambda x: x[1].split(" - ")[0])
+
+     # Convert to dictionary while maintaining order
+     return {key: value for key, value in choices_list}
+
+
+ # keep selectize choices in sync with level selection
+ @reactive.effect
+ def _sync_selectize_choices():
+     choices = level_label_choices()
+     current = input.selectize() or []
+
+     # only keep items still valid
+     valid_selected = [s for s in current if s in choices]
+
+     # apply a default when nothing valid remains
+     if not valid_selected and choices:
+         # pick the first option (or slice for multiple defaults)
+         valid_selected = [next(iter(choices))]
+
+     ui.update_selectize("selectize", choices=choices, selected=valid_selected)
+
+
+ # Filtered data based on UI inputs
+ @reactive.calc
+ def filtered_data():
+     df = payload_store()
+     level = int(input.level())
+     year_min, year_max = input.year_range()
+     selected_titles = input.selectize()
+
+     idx_level = df["level"] == level
+     idx_year = df["year"].between(year_min, year_max)
+
+     # If no titles selected, return empty dataframe
+     if not selected_titles:
+         return df[idx_level & idx_year & (df["label"] == "")].copy()  # Empty result
+
+     idx_title = df["label"].isin(selected_titles)
+     filtered_df = df[idx_level & idx_year & idx_title]
+
+     return filtered_df
+
+
+ # # Warning message for no selections
+ # with ui.div(style="margin: 20px;"):
+
+ #     @render.ui
+ #     def selection_status():
+ #         if not input.selectize():
+ #             return ui.div(
+ #                 ui.tags.div(
+ #                     "⚠️ Please select at least one occupation title to view data.",
+ #                     style="background-color: #fff3cd; color: #856404; padding: 15px; border: 1px solid #ffeaa7; border-radius: 5px; text-align: center; font-weight: bold;",
+ #                 )
+ #             )
+ #         else:
+ #             return ui.div()  # Return empty div when selections exist
+
+
+ # @render_plotly
+ # def data_table():
+ #     df = filtered_data()
+
+ #     # Show message if no data available
+ #     if df.empty:
+ #         fig = go.Figure()
+ #         fig.add_annotation(
+ #             text="No data available. Please select occupation titles.",
+ #             xref="paper",
+ #             yref="paper",
+ #             x=0.5,
+ #             y=0.5,
+ #             showarrow=False,
+ #             font=dict(size=16),
+ #         )
+ #         fig.update_layout(
+ #             xaxis=dict(visible=False), yaxis=dict(visible=False), plot_bgcolor="white"
+ #         )
+ #         return fig
+
+ #     fig = go.Figure(
+ #         data=go.Table(
+ #             header=dict(
+ #                 values=list(df.columns), fill_color="paleturquoise", align="left"
+ #             ),
+ #             cells=dict(
+ #                 values=[df[col] for col in df.columns],
+ #                 fill_color="lavender",
+ #                 align="left",
+ #             ),
+ #         )
+ #     )
+ #     return fig
+
+
+ with ui.div(style="display:flex; justify-content:center;"):
+     output_widget("employment_plot")
+
+ @render_plotly
+ def employment_plot():  # the function name must match the output_widget id above
+     df = filtered_data()
+
+     age_groups = sorted(df["age"].dropna().unique())
+
+     occupations = sorted(df["label"].dropna().unique())
+     # Use a Plotly qualitative palette
+     palette = px.colors.qualitative.Plotly
+     # Cycle safely if occupations > palette length
+     occ_color_map = {
+         occ: palette[i % len(palette)] for i, occ in enumerate(occupations)
+     }
+
+     # ------------------------------------------------------------------
+     # 2. Create multi-row subplot scaffolding
+     # ------------------------------------------------------------------
+     subplot_titles = [
+         f"<b>Employed Persons Aged {age} Years by Occupation</b>" for age in age_groups
+     ]
+
+     fig = make_subplots(
+         rows=len(age_groups),
+         cols=1,
+         shared_xaxes=False,
+         vertical_spacing=0.03,
+         subplot_titles=subplot_titles,
+     )
+
+     # ------------------------------------------------------------------
+     # 3. Add traces per age group and exposure level
+     # ------------------------------------------------------------------
+
+     # Need to pre-define the max row number for the final x-axis update
+
+     for i, age in enumerate(age_groups, start=1):
+         df_age = df[df["age"] == age]
+
+         # Aggregate by Year and Label
+         df_plot = df_age.groupby(["year", "label"], as_index=False)[
+             "employment"
+         ].sum()
+
+         for occ_title, sub in df_plot.groupby("label"):
+             fig.add_trace(
+                 go.Scatter(
+                     x=sub["year"],
+                     y=sub["employment"],
+                     mode="lines+markers",
+                     showlegend=True
+                     if i == 1
+                     else False,  # Show legend only in the first subplot
+                     name=occ_title,
+                     line=dict(color=occ_color_map[occ_title], width=3),
+                     # Add group/age info to the hover template for debugging/clarity
+                     hovertemplate=f"Age: {age}<br>Year: %{{x}}<br>Employment: %{{y:,}}<extra>{occ_title}</extra>",
+                 ),
+                 row=i,
+                 col=1,
+             )
+
+         # Y-axis update must be inside the loop to target the current row (i)
+         fig.update_yaxes(
+             title_text="Number of Employed Persons",
+             tickformat=",",
+             rangemode="tozero",
+             row=i,
+             col=1,
+         )
+
+         # X-axis update must be inside the loop to target the current row (i)
+         fig.update_xaxes(
+             title_text="Year",
+             tickmode="linear",
+             dtick=1,
+             row=i,
+             col=1,
+         )
+
+     # ------------------------------------------------------------------
+     # 4. Global layout tweaks
+     # ------------------------------------------------------------------
+     fig.update_annotations(yshift=30)
+     fig.update_layout(
+         height=700 * len(age_groups),
+         width=1200,
+         legend_traceorder="normal",
+         legend=dict(
+             title="Occupation Title(s)",
+             orientation="v",
+             yanchor="top",
+             y=1.0,
+             xanchor="left",
+             x=-0.5,
+             bordercolor="#c7c7c7",
+             borderwidth=2,
+             bgcolor="#f9f9f9",
+             font=dict(size=10),
+         ),
+         margin=dict(t=100, l=50, r=80, b=40),
+         plot_bgcolor="#f5f7fb",
+         xaxis_showgrid=True,
+     )
+
+     return fig
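
Note on the shinywidgets wiring above: with explicit placement, the `output_widget` id must equal the name of the decorated render function, so the renderer is listed as `employment_plot` (the raw commit named it `employment_plot2`, which would have left the `employment_plot` placeholder empty). A minimal self-contained sketch of the same pattern, assuming Shiny Express with shinywidgets; the file name and data below are made up for illustration:

# sketch.py -- illustrative only; run with: shiny run sketch.py
import plotly.express as px
from shiny.express import ui
from shinywidgets import output_widget, render_plotly

with ui.div(style="display:flex; justify-content:center;"):
    output_widget("demo_plot")  # placeholder: id must equal the renderer's name

@render_plotly
def demo_plot():
    # the function name "demo_plot" supplies the widget id that fills the placeholder
    return px.line(x=[2014, 2015, 2016], y=[10, 12, 9])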
css/theme.css ADDED
@@ -0,0 +1,44 @@
+ /*-- scss:defaults --*/
+ $link-color: #39729E;
+ $text-muted: #6a737b;
+
+ /*-- scss:rules --*/
+
+ .layout-example {
+   background: $gray-500;
+   color: $white;
+   text-align: center;
+   margin-bottom: 1em;
+   font-family: $font-family-monospace;
+   font-size: .875em;
+   font-weight: 600;
+   padding-top: 1em;
+   border-radius: 3px;
+ }
+
+ .left {
+   text-align: left;
+   padding-left: 1em;
+ }
+
+ .right {
+   text-align: right;
+   padding-right: 1em;
+ }
+
+ .hello-quarto-banner h1 {
+   margin-top: 0;
+   margin-bottom: 0.5rem;
+ }
+
+ #quarto-announcement {
+   padding: 1em;
+   font-size: 1em;
+   font-weight: bold;
+   color: $white;
+   background-color: #447099;
+ }
+
+ #quarto-announcement a {
+   color: $white;
+ }
data/01_translation_files/ssyk96_en.xlsx DELETED
Binary file (19.9 kB)
 
data/03_daioe_aggregated/daioe_ssyk2012_emp_weighted.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/03_daioe_aggregated/daioe_ssyk2012_simple_avg.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/03_daioe_aggregated/daioe_ssyk96_emp_weighted.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/03_daioe_aggregated/daioe_ssyk96_simple_avg.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/scb_employment_v1.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f9eb2919a2a005828571797bd3c3005300e5c32a50c169c304787f97e998c5b
+ size 4277339
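
The three lines above are not the CSV itself but a Git LFS pointer, produced by the `*.csv filter=lfs` rules added in .gitattributes: the real 4.3 MB file lives in LFS storage, keyed by the sha256 oid. A small illustrative parser for such pointer files (hypothetical helper, not part of the commit):

# Parse the version/oid/size fields of a Git LFS pointer file (illustrative only).
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict[str, str]:
    fields = {}
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        key, _, value = line.partition(" ")  # each line is "<key> <value>"
        fields[key] = value
    return fields

# e.g. parse_lfs_pointer("data/scb_employment_v1.csv")["size"] -> "4277339"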
main.py DELETED
@@ -1,72 +0,0 @@
- from __future__ import annotations
-
- import argparse
- import importlib.util
- from pathlib import Path
- from typing import Iterable
-
-
- PROJECT_ROOT = Path(__file__).resolve().parent
- SCRIPTS_DIR = PROJECT_ROOT / "scripts"
-
-
- def load_module(name: str, filename: str):
-     """Import a script with a numeric prefix via importlib."""
-     spec = importlib.util.spec_from_file_location(name, SCRIPTS_DIR / filename)
-     module = importlib.util.module_from_spec(spec)
-     if spec.loader is None:  # pragma: no cover - defensive
-         raise ImportError(f"Could not load module '{name}' from {filename}")
-     spec.loader.exec_module(module)
-     return module
-
-
- SCB_PULL = load_module("scb_pull", "01_scbPull.py")
- WEIGHTING = load_module("weighting", "02_weighting.py")
-
-
- def run_pipeline(taxonomies: Iterable[WEIGHTING.Taxonomy]):
-     """Run SCB pull + weighting for each taxonomy and collect output paths."""
-     summary = []
-     for taxonomy in taxonomies:
-         scb_path = SCB_PULL.pull_taxonomy(taxonomy)
-         weighted_path, simple_path = WEIGHTING.run_weighting(taxonomy)
-         summary.append(
-             {
-                 "taxonomy": taxonomy,
-                 "scb": scb_path,
-                 "weighted": weighted_path,
-                 "simple": simple_path,
-             }
-         )
-     return summary
-
-
- def parse_args() -> argparse.Namespace:
-     parser = argparse.ArgumentParser(
-         description="Pull SCB data and build employment-weighted DAIOE aggregates",
-     )
-     parser.add_argument(
-         "--taxonomy",
-         action="append",
-         choices=["ssyk2012", "ssyk96"],
-         help="Taxonomy to refresh (can be provided multiple times). Defaults to both.",
-     )
-     return parser.parse_args()
-
-
- def main() -> None:
-     args = parse_args()
-     taxonomies = args.taxonomy or ["ssyk2012", "ssyk96"]
-     results = run_pipeline(taxonomies)
-
-     print("\nDAIOE datasets refreshed:\n" + "-" * 40)
-     for item in results:
-         print(f"Taxonomy: {item['taxonomy']}")
-         print(f"  SCB weights: {item['scb']}")
-         print(f"  Employment-weighted: {item['weighted']}")
-         print(f"  Simple-average: {item['simple']}\n")
-     print("Outputs are ready under data/03_daioe_aggregated for app.py")
-
-
- if __name__ == "__main__":
-     main()
requirements copy.txt ADDED
@@ -0,0 +1,77 @@
+ anyio==4.12.0
+ anywidget==0.9.21
+ asgiref==3.11.0
+ asttokens==3.0.1
+ certifi==2025.11.12
+ charset-normalizer==3.4.4
+ click==8.3.1
+ comm==0.2.3
+ contourpy==1.3.3
+ cycler==0.12.1
+ decorator==5.2.1
+ et-xmlfile==2.0.0
+ executing==2.2.1
+ fonttools==4.61.0
+ h11==0.16.0
+ htmltools==0.6.0
+ idna==3.11
+ ipython==9.8.0
+ ipython-pygments-lexers==1.1.1
+ ipywidgets==8.1.8
+ jedi==0.19.2
+ jupyter-core==5.9.1
+ jupyterlab-widgets==3.0.16
+ kiwisolver==1.4.9
+ linkify-it-py==2.0.3
+ markdown-it-py==4.0.0
+ matplotlib==3.10.7
+ matplotlib-inline==0.2.1
+ mdit-py-plugins==0.5.0
+ mdurl==0.1.2
+ mizani==0.14.3
+ narwhals==2.13.0
+ numpy==2.3.5
+ openpyxl==3.1.5
+ orjson==3.11.5
+ packaging==25.0
+ pandas==2.3.3
+ parso==0.8.5
+ pathlib==1.0.1
+ patsy==1.0.2
+ pexpect==4.9.0
+ pillow==12.0.0
+ platformdirs==4.5.1
+ plotly==6.5.0
+ plotnine==0.15.1
+ prompt-toolkit==3.0.52
+ psygnal==0.15.0
+ ptyprocess==0.7.0
+ pure-eval==0.2.3
+ pygments==2.19.2
+ pyparsing==3.2.5
+ pyscbwrapper==0.1.2
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.2
+ questionary==2.1.1
+ requests==2.32.5
+ ruff==0.14.9
+ scipy==1.16.3
+ setuptools==80.9.0
+ shiny==1.5.1
+ shinychat==0.2.8
+ shinywidgets==0.7.0
+ six==1.17.0
+ stack-data==0.6.3
+ starlette==0.50.0
+ statsmodels==0.14.6
+ traitlets==5.14.3
+ typing-extensions==4.15.0
+ tzdata==2025.2
+ uc-micro-py==1.0.3
+ urllib3==2.6.1
+ uvicorn==0.38.0
+ watchfiles==1.1.1
+ wcwidth==0.2.14
+ websockets==15.0.1
+ widgetsnbextension==4.0.15
requirements.txt CHANGED
@@ -1,154 +1,75 @@
- annotated-types==0.7.0
- anyio==4.11.0
+ anyio==4.12.0
  anywidget==0.9.21
- argon2-cffi==25.1.0
- argon2-cffi-bindings==25.1.0
- arrow==1.4.0
- asgiref==3.10.0
- asttokens==3.0.0
- async-lru==2.0.5
- attrs==25.4.0
- babel==2.17.0
- beautifulsoup4==4.14.2
- bleach==6.3.0
- brand-yml==0.1.1
+ asgiref==3.11.0
+ asttokens==3.0.1
  certifi==2025.11.12
- cffi==2.0.0
  charset-normalizer==3.4.4
- click==8.3.0
+ click==8.3.1
  comm==0.2.3
  contourpy==1.3.3
  cycler==0.12.1
- debugpy==1.8.17
  decorator==5.2.1
- defusedxml==0.7.1
- et-xmlfile==2.0.0
- eval-type-backport==0.3.0
  executing==2.2.1
- fastjsonschema==2.21.2
- fonttools==4.60.1
- fqdn==1.5.1
- git-filter-repo==2.47.0
+ fonttools==4.61.0
  h11==0.16.0
  htmltools==0.6.0
- httpcore==1.0.9
- httpx==0.28.1
  idna==3.11
- ipykernel==7.1.0
- ipython==9.7.0
+ ipython==9.8.0
  ipython-pygments-lexers==1.1.1
  ipywidgets==8.1.8
- isoduration==20.11.0
- itables==2.5.2
  jedi==0.19.2
- jinja2==3.1.6
- json5==0.12.1
- jsonpointer==3.0.0
- jsonschema==4.25.1
- jsonschema-specifications==2025.9.1
- jupyter==1.1.1
- jupyter-client==8.6.3
- jupyter-console==6.6.3
  jupyter-core==5.9.1
- jupyter-events==0.12.0
- jupyter-lsp==2.3.0
- jupyter-server==2.17.0
- jupyter-server-terminals==0.5.3
- jupyterlab==4.4.10
- jupyterlab-pygments==0.3.0
- jupyterlab-server==2.28.0
  jupyterlab-widgets==3.0.16
  kiwisolver==1.4.9
- lark==1.3.1
- libsass==0.23.0
  linkify-it-py==2.0.3
  markdown-it-py==4.0.0
- markupsafe==3.0.3
  matplotlib==3.10.7
  matplotlib-inline==0.2.1
  mdit-py-plugins==0.5.0
  mdurl==0.1.2
- mistune==3.1.4
  mizani==0.14.3
- narwhals==2.11.0
- nbclient==0.10.2
- nbconvert==7.16.6
- nbformat==5.10.4
- nest-asyncio==1.6.0
- notebook==7.4.7
- notebook-shim==0.2.4
- numpy==2.3.4
- openpyxl==3.1.5
- orjson==3.11.4
+ narwhals==2.13.0
+ numpy==2.3.5
+ orjson==3.11.5
  packaging==25.0
- palmerpenguins==0.1.4
  pandas==2.3.3
- pandocfilters==1.5.1
  parso==0.8.5
+ pathlib==1.0.1
  patsy==1.0.2
- penguins==0.5.2
  pexpect==4.9.0
  pillow==12.0.0
- platformdirs==4.5.0
- plotly==6.4.0
- plotly-express==0.4.1
+ platformdirs==4.5.1
+ plotly==6.5.0
  plotnine==0.15.1
- prometheus-client==0.23.1
  prompt-toolkit==3.0.52
- psutil==7.1.3
  psygnal==0.15.0
  ptyprocess==0.7.0
  pure-eval==0.2.3
- pycparser==2.23
- pydantic==2.12.4
- pydantic-core==2.41.5
  pygments==2.19.2
  pyparsing==3.2.5
  pyscbwrapper==0.1.2
  python-dateutil==2.9.0.post0
- python-json-logger==4.0.0
  python-multipart==0.0.20
  pytz==2025.2
- pyyaml==6.0.3
- pyzmq==27.1.0
  questionary==2.1.1
- referencing==0.37.0
  requests==2.32.5
- rfc3339-validator==0.1.4
- rfc3986-validator==0.1.1
- rfc3987-syntax==1.1.0
- rpds-py==0.28.0
- ruamel-yaml==0.18.16
- ruamel-yaml-clib==0.2.15
  scipy==1.16.3
- seaborn==0.13.2
- send2trash==1.8.3
  setuptools==80.9.0
- shiny==1.5.0
+ shiny==1.5.1
  shinychat==0.2.8
  shinyswatch==0.9.0
  shinywidgets==0.7.0
  six==1.17.0
- sniffio==1.3.1
- soupsieve==2.8
  stack-data==0.6.3
  starlette==0.50.0
- statsmodels==0.14.5
- terminado==0.18.1
- tinycss2==1.4.0
- tornado==6.5.2
+ statsmodels==0.14.6
  traitlets==5.14.3
  typing-extensions==4.15.0
- typing-inspection==0.4.2
  tzdata==2025.2
  uc-micro-py==1.0.3
- uri-template==1.3.0
- urllib3==2.5.0
+ urllib3==2.6.1
  uvicorn==0.38.0
  watchfiles==1.1.1
  wcwidth==0.2.14
- webcolors==25.10.0
- webencodings==0.5.1
- websocket-client==1.9.0
  websockets==15.0.1
  widgetsnbextension==4.0.15
scripts/01_scbPull.py DELETED
@@ -1,129 +0,0 @@
- from __future__ import annotations
-
- import argparse
- from pathlib import Path
- from typing import Literal
-
- import pandas as pd
- from pyscbwrapper import SCB
-
- Taxonomy = Literal["ssyk2012", "ssyk96"]
-
-
- try:
-     ROOT = Path(__file__).resolve().parents[1]
- except NameError:  # pragma: no cover - interactive fallback
-     ROOT = Path.cwd().resolve()
-
- DATA_DIR = ROOT / "data"
- SCB_DIR = DATA_DIR / "02_scb_data"
-
- TABLES = {
-     "ssyk2012": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"),
-     "ssyk96": ("en", "AM", "AM0208", "AM0208E", "YREG33"),
- }
-
-
- def coerce_year(value: str | int | None) -> int | None:
-     try:
-         return int(value) if value is not None else None
-     except (TypeError, ValueError):
-         return None
-
-
- def latest_year(var_block: dict) -> str:
-     years = [coerce_year(year) for year in var_block.get("year", [])]
-     valid = [year for year in years if year is not None]
-     if not valid:
-         raise ValueError("SCB variable metadata did not provide any valid years")
-     return str(max(valid))
-
-
- def fetch_taxonomy_dataframe(taxonomy: Taxonomy) -> tuple[pd.DataFrame, str]:
-     if taxonomy not in TABLES:
-         raise KeyError(f"Unknown taxonomy '{taxonomy}'")
-
-     scb = SCB(*TABLES[taxonomy])
-     var_block = scb.get_variables()
-     occupations_key, occupations = next(iter(var_block.items()))
-     clean_key = occupations_key.replace(" ", "")
-
-     year = latest_year(var_block)
-     scb.set_query(**{clean_key: occupations, "year": [year]})
-     scb_fetch = scb.get_data()["data"]
-
-     codes = scb.get_query()["query"][0]["selection"]["values"]
-     occ_dict = dict(zip(codes, occupations))
-
-     records = []
-     for record in scb_fetch:
-         code, obs_year = record["key"][:2]
-         if code == "0002":
-             continue  # drop unspecified bucket
-         value = int(record["values"][0])
-         records.append(
-             {
-                 "code_4": str(code).zfill(4),
-                 "code_3": str(code).zfill(4)[:3],
-                 "code_2": str(code).zfill(4)[:2],
-                 "code_1": str(code).zfill(4)[:1],
-                 "year": obs_year,
-                 "value": value,
-             }
-         )
-
-     df = pd.DataFrame(records)
-     if df.empty:
-         raise RuntimeError(f"SCB returned no data for taxonomy '{taxonomy}'")
-
-     level_map = {4: "code_4", 3: "code_3", 2: "code_2", 1: "code_1"}
-     frames = []
-     for level, column in level_map.items():
-         level_df = (
-             df.groupby(["year", column], as_index=False)["value"]
-             .sum()
-             .rename(columns={column: "code"})
-         )
-         level_df["level"] = level
-         frames.append(level_df)
-
-     stacked = (
-         pd.concat(frames, ignore_index=True)
-         .assign(taxonomy=taxonomy)[["taxonomy", "year", "level", "code", "value"]]
-         .sort_values(["year", "level", "code"], ignore_index=True)
-     )
-
-     return stacked, year
-
-
- def write_taxonomy_csv(df: pd.DataFrame, taxonomy: Taxonomy, year: str) -> Path:
-     SCB_DIR.mkdir(parents=True, exist_ok=True)
-     out_path = SCB_DIR / f"{taxonomy}_en_{year}.csv"
-     df.to_csv(out_path, index=False)
-     return out_path
-
-
- def pull_taxonomy(taxonomy: Taxonomy) -> Path:
-     df, year = fetch_taxonomy_dataframe(taxonomy)
-     return write_taxonomy_csv(df, taxonomy, year)
-
-
- def parse_args() -> argparse.Namespace:
-     parser = argparse.ArgumentParser(description="Pull SCB weights for a taxonomy")
-     parser.add_argument(
-         "--taxonomy",
-         default="ssyk2012",
-         choices=["ssyk2012", "ssyk96"],
-         help="Taxonomy to download (default: ssyk2012)",
-     )
-     return parser.parse_args()
-
-
- def main() -> None:
-     args = parse_args()
-     path = pull_taxonomy(args.taxonomy)
-     print(f"Wrote {path}")
-
-
- if __name__ == "__main__":
-     main()
scripts/02_weighting.py DELETED
@@ -1,258 +0,0 @@
- from __future__ import annotations
-
- import argparse
- from pathlib import Path
- from typing import Literal
-
- import pandas as pd
-
- Taxonomy = Literal["ssyk2012", "ssyk96"]
-
- try:
-     ROOT = Path(__file__).resolve().parents[1]
- except NameError:  # pragma: no cover - interactive fallback
-     ROOT = Path.cwd()
-
- DATA_DIR = ROOT / "data"
-
-
- def data_path(*parts: str | Path) -> Path:
-     return DATA_DIR.joinpath(*parts)
-
-
- def latest_file(directory: Path, pattern: str) -> Path:
-     files = sorted(directory.glob(pattern))
-     if not files:
-         raise FileNotFoundError(f"No files matching '{pattern}' in {directory}")
-     return files[-1]
-
-
- def load_daioe_raw(taxonomy: Taxonomy, sep: str = "\t") -> pd.DataFrame:
-     return pd.read_csv(data_path("01_daioe_raw", f"daioe_{taxonomy}.csv"), sep=sep)
-
-
- def load_scb_employment(taxonomy: Taxonomy) -> pd.DataFrame:
-     scb_path = latest_file(data_path("02_scb_data"), f"{taxonomy}*.csv")
-     return pd.read_csv(scb_path).drop(columns=["year"], errors="ignore")
-
-
- def ensure_columns(df: pd.DataFrame, required: list[str]) -> None:
-     missing = [col for col in required if col not in df.columns]
-     if missing:
-         raise KeyError(f"Missing expected columns: {missing}")
-
-
- def split_code_label(series: pd.Series) -> tuple[pd.Series, pd.Series]:
-     parts = series.astype(str).str.split(" ", n=1, expand=True)
-     parts = parts.fillna({0: "", 1: ""})
-     return parts[0], parts[1]
-
-
- def prepare_raw_dataframe(raw: pd.DataFrame, taxonomy: Taxonomy) -> tuple[pd.DataFrame, list[str]]:
-     df = raw.drop(columns=["Unnamed: 0"], errors="ignore").copy()
-     ensure_columns(df, ["year"])
-
-     daioe_cols = [col for col in df.columns if col.startswith("daioe_")]
-     if not daioe_cols:
-         raise KeyError("Expected at least one 'daioe_*' column in DAIOE raw file.")
-
-     code_cols = {
-         4: f"{taxonomy}_4",
-         3: f"{taxonomy}_3",
-         2: f"{taxonomy}_2",
-         1: f"{taxonomy}_1",
-     }
-     ensure_columns(df, list(code_cols.values()))
-
-     for level, col in code_cols.items():
-         codes, labels = split_code_label(df[col])
-         df[f"code{level}"] = codes
-         df[f"label{level}"] = labels
-
-     df["code4"] = df["code4"].str.zfill(4)
-     for level in (1, 2, 3):
-         df[f"code{level}"] = df[f"code{level}"].str.lstrip("0")
-
-     return df, daioe_cols
-
-
- def attach_employment(df: pd.DataFrame, scb: pd.DataFrame) -> pd.DataFrame:
-     scb_lvl4 = scb[scb["level"] == 4].copy()
-     if scb_lvl4.empty:
-         raise ValueError("SCB data must contain level-4 rows for weighting.")
-
-     scb_lvl4["code4"] = scb_lvl4["code"].astype(str).str.zfill(4)
-     merged = df.merge(
-         scb_lvl4[["code4", "value"]],
-         on="code4",
-         how="left",
-         validate="many_to_one",
-     )
-     return merged.rename(columns={"value": "emp"})
-
-
- def compute_children_maps(df: pd.DataFrame) -> dict[int, pd.DataFrame]:
-     counts = {
-         1: df.groupby(["year", "code1"])["code2"].nunique().reset_index(name="n_children"),
-         2: df.groupby(["year", "code2"])["code3"].nunique().reset_index(name="n_children"),
-         3: df.groupby(["year", "code3"])["code4"].nunique().reset_index(name="n_children"),
-     }
-     lvl4 = df.groupby(["year", "code4"]).size().reset_index(name="n_children")
-     lvl4["n_children"] = 1
-     counts[4] = lvl4
-     return counts
-
-
- def aggregate_level(
-     df: pd.DataFrame,
-     *,
-     daioe_cols: list[str],
-     n_children: dict[int, pd.DataFrame],
-     taxonomy: Taxonomy,
-     level: int,
-     method: Literal["weighted", "simple"],
- ) -> pd.DataFrame:
-     if level not in (1, 2, 3):
-         raise ValueError("Only levels 1–3 can be aggregated from level 4.")
-
-     code_col, label_col = f"code{level}", f"label{level}"
-     group_cols = ["year", code_col, label_col]
-
-     if method == "weighted":
-         tmp = df[group_cols + ["emp"] + daioe_cols].copy()
-         for metric in daioe_cols:
-             mask = tmp[metric].notna()
-             tmp[f"{metric}_wx"] = tmp[metric].where(mask, 0) * tmp["emp"].where(mask, 0)
-             tmp[f"{metric}_w"] = tmp["emp"].where(mask, 0)
-         agg_cols = {f"{metric}_wx": "sum" for metric in daioe_cols}
-         agg_cols.update({f"{metric}_w": "sum" for metric in daioe_cols})
-         grouped = tmp.groupby(group_cols, as_index=False).agg(agg_cols)
-         for metric in daioe_cols:
-             denom = grouped[f"{metric}_w"].replace(0, pd.NA)
-             grouped[metric] = grouped[f"{metric}_wx"] / denom
-             grouped.drop(columns=[f"{metric}_wx", f"{metric}_w"], inplace=True)
-     else:
-         grouped = df[group_cols + daioe_cols].groupby(group_cols, as_index=False).mean()
-
-     grouped = grouped.merge(
-         n_children[level],
-         left_on=["year", code_col],
-         right_on=["year", code_col],
-         how="left",
-     )
-
-     out = grouped[["year", code_col, label_col, "n_children"] + daioe_cols].copy()
-     out["taxonomy"] = taxonomy
-     out["level"] = level
-     out = out.rename(columns={code_col: "code", label_col: "label"})
-     out["code"] = out["code"].astype(str)
-     return out
-
-
- def base_level_four(df: pd.DataFrame, daioe_cols: list[str], taxonomy: Taxonomy, n_children: pd.DataFrame) -> pd.DataFrame:
-     base = df[["year", "code4", "label4"] + daioe_cols].copy()
-     base = base.merge(n_children, on=["year", "code4"], how="left")
-     base["taxonomy"] = taxonomy
-     base["level"] = 4
-     base = base.rename(columns={"code4": "code", "label4": "label"})
-     base["code"] = base["code"].astype(str)
-     return base
-
-
- def add_percentiles(df: pd.DataFrame, metrics: list[str]) -> list[str]:
-     pct_cols: list[str] = []
-     for metric in metrics:
-         suffix = metric.removeprefix("daioe_")
-         rank_col = f"pct_rank_{suffix}"
-         df[rank_col] = df.groupby(["year", "level"])[metric].rank(pct=True)
-         pct_cols.append(rank_col)
-     return pct_cols
-
-
- def build_pipeline(
-     df: pd.DataFrame,
-     *,
-     daioe_cols: list[str],
-     taxonomy: Taxonomy,
-     n_children: dict[int, pd.DataFrame],
-     method: Literal["weighted", "simple"],
- ) -> pd.DataFrame:
-     lvl4 = base_level_four(df, daioe_cols, taxonomy, n_children[4])
-     lvl1 = aggregate_level(df, daioe_cols=daioe_cols, n_children=n_children, taxonomy=taxonomy, level=1, method=method)
-     lvl2 = aggregate_level(df, daioe_cols=daioe_cols, n_children=n_children, taxonomy=taxonomy, level=2, method=method)
-     lvl3 = aggregate_level(df, daioe_cols=daioe_cols, n_children=n_children, taxonomy=taxonomy, level=3, method=method)
-
-     combined = pd.concat([lvl1, lvl2, lvl3, lvl4], ignore_index=True)
-     pct_cols = add_percentiles(combined, daioe_cols)
-     ordered = [
-         "taxonomy",
-         "level",
-         "code",
-         "label",
-         "year",
-         "n_children",
-         *daioe_cols,
-         *pct_cols,
-     ]
-     return combined[ordered].sort_values(["level", "code", "year"], ignore_index=True)
-
-
- def write_outputs(taxonomy: Taxonomy, weighted: pd.DataFrame, simple: pd.DataFrame) -> tuple[Path, Path]:
-     out_dir = data_path("03_daioe_aggregated")
-     out_dir.mkdir(parents=True, exist_ok=True)
-     weighted_path = out_dir / f"daioe_{taxonomy}_emp_weighted.csv"
-     simple_path = out_dir / f"daioe_{taxonomy}_simple_avg.csv"
-     weighted.to_csv(weighted_path, index=False)
-     simple.to_csv(simple_path, index=False)
-     return weighted_path, simple_path
-
-
- def run_weighting(taxonomy: Taxonomy, sep: str = "\t") -> tuple[Path, Path]:
-     raw = load_daioe_raw(taxonomy, sep=sep)
-     scb = load_scb_employment(taxonomy)
-     prepared, daioe_cols = prepare_raw_dataframe(raw, taxonomy)
-     prepared = attach_employment(prepared, scb)
-     n_children = compute_children_maps(prepared)
-
-     weighted = build_pipeline(
-         prepared,
-         daioe_cols=daioe_cols,
-         taxonomy=taxonomy,
-         n_children=n_children,
-         method="weighted",
-     )
-     simple = build_pipeline(
-         prepared,
-         daioe_cols=daioe_cols,
-         taxonomy=taxonomy,
-         n_children=n_children,
-         method="simple",
-     )
-     return write_outputs(taxonomy, weighted, simple)
-
-
- def parse_args() -> argparse.Namespace:
-     parser = argparse.ArgumentParser(description="Run DAIOE weighting pipeline")
-     parser.add_argument(
-         "--taxonomy",
-         default="ssyk2012",
-         choices=["ssyk2012", "ssyk96"],
-         help="Taxonomy to process (default: ssyk2012)",
-     )
-     parser.add_argument(
-         "--sep",
-         default="\t",
-         help="Delimiter used in DAIOE raw files (default: tab)",
-     )
-     return parser.parse_args()
-
-
- def main() -> None:
-     args = parse_args()
-     weighted_path, simple_path = run_weighting(args.taxonomy, sep=args.sep)
-     print("Written employment-weighted file:", weighted_path)
-     print("Written simple-average file: ", simple_path)
-
-
- if __name__ == "__main__":
-     main()
scripts/04_occ.py DELETED
@@ -1,109 +0,0 @@
- import pandas as pd
- from pyscbwrapper import SCB
- from pathlib import Path
-
-
- # Optional: project root if you need it elsewhere
- ROOT = Path(__file__).resolve().parent
-
-
- TAX_ID = "ssyk2012"
-
- TABLES = {
-     "ssyk2012_tab": ("en", "AM", "AM0208", "AM0208B", "YREG61BAS"),
-     # "ssyk96_tab": ("en", "AM", "AM0208", "AM0208E", "YREG33"),
- }
-
-
- def fetch_scb_aku_occupations(tax_id: str = TAX_ID) -> pd.DataFrame:
-     """
-     Fetch SCB AKU employment by occupation (SSYK 2012), age and year,
-     and return a cleaned DataFrame at the SSYK3 level (string codes).
-
-     Columns:
-     - code_3 (SSYK code as returned by SCB; can be 2–4 digits)
-     - occupation (text label from SCB)
-     - age
-     - year
-     - value (string as provided by SCB)
-     """
-
-     # ---- 1) Init SCB table ----
-     scb = SCB(*TABLES[f"{tax_id}_tab"])
-     var_ = scb.get_variables()
-
-     # First variable is the occupation variable (as in your original code)
-     occupations_key, occupations = next(iter(var_.items()))
-     clean_key = occupations_key.replace(" ", "")
-
-     # ---- 2) Years: coerce to int, use all valid years ----
-     def coerce_year(y):
-         try:
-             return int(y)
-         except Exception:
-             return None
-
-     years = [coerce_year(y) for y in var_["year"]]
-     years = [y for y in years if y is not None]
-     if not years:
-         raise ValueError("No valid years found in SCB variables")
-
-     years_sorted = sorted(set(years))
-     year_values = [str(y) for y in years_sorted]
-
-     # ---- 3) All ages as provided by SCB ----
-     age_values = var_["age"]
-
-     # ---- 4) Build and send query ----
-     scb.set_query(
-         **{
-             clean_key: occupations,
-             "year": year_values,  # all years
-             "age": age_values,  # all ages
-         }
-     )
-
-     scb_data = scb.get_data()
-     scb_fetch = scb_data["data"]
-
-     # Map occupation codes to their labels
-     codes = scb.get_query()["query"][0]["selection"]["values"]
-     occ_dict = dict(zip(codes, occupations))
-
-     # ---- 5) Build DataFrame ----
-     records = []
-     for r in scb_fetch:
-         # The order follows the SCB query; your original code assumed:
-         # occupation code, age, year
-         code, age, year = r["key"]
-         name = occ_dict.get(code, code)
-         value = r["values"][0]  # raw string
-         records.append(
-             {
-                 "code_3": code,
-                 "occupation": name,
-                 "age": age,
-                 "year": year,
-                 "value": value,
-             }
-         )
-
-     df = pd.DataFrame(records)
-
-     # Remove unidentified group 002 (as in your original code)
-     df = df[df["code_3"] != "002"].reset_index(drop=True)
-
-     return df
-
-
- def main() -> pd.DataFrame:
-     """Entry point when run as a script; returns the DataFrame."""
-     df = fetch_scb_aku_occupations()
-     # Optional: quick check
-     print(df.head())
-     print(f"\nRows: {len(df)}, columns: {list(df.columns)}")
-     return df
-
-
- if __name__ == "__main__":
-     main()
scripts/__init__.py DELETED
File without changes
src/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """src package initializer.
+
+ This package contains the core SCB employment data pipeline modules.
+ Modules include data loading, caching and aggregation helpers. See
+ individual module docstrings for details.
+ """
src/config.py ADDED
@@ -0,0 +1,53 @@
+ """
+ Configuration constants for the SCB-only employment data pipeline.
+ """
+
+ from typing import Dict, List, Literal, Tuple
+
+ # ======================================================
+ # DATA SOURCES / CONSTANTS
+ # ======================================================
+ TAXONOMY: Literal["ssyk2012"] = "ssyk2012"
+
+ TRANSLATION_URL: str = (
+     "https://raw.githubusercontent.com/joseph-data/07_translate_ssyk/main/"
+     "02_translation_files/ssyk2012_en.xlsx"
+ )
+
+ # SCB table definitions
+ TABLES: Dict[str, Tuple[str, str, str, str, str]] = {
+     "14_to_18": ("en", "AM", "AM0208", "AM0208E", "YREG51"),
+     "19_to_21": ("en", "AM", "AM0208", "AM0208E", "YREG51N"),
+     "20_to_23": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"),
+ }
+
+ AGE_EXCLUSIONS: List[str] = ["65-69 years"]
+ EXCLUDED_CODES: List[str] = ["0002", "0000"]
+
+ # ======================================================
+ # UI DEFAULTS
+ # ======================================================
+ LEVEL_OPTIONS: List[Tuple[str, str]] = [
+     ("Level 4 (4-digit)", "4"),
+     ("Level 3 (3-digit)", "3"),
+     ("Level 2 (2-digit)", "2"),
+     ("Level 1 (1-digit)", "1"),
+ ]
+
+ DEFAULT_LEVEL: str = "3"
+
+ GLOBAL_YEAR_MIN: int = 2014
+ GLOBAL_YEAR_MAX: int = 2023
+ DEFAULT_YEAR_RANGE: Tuple[int, int] = (GLOBAL_YEAR_MIN, GLOBAL_YEAR_MAX)
+
+ AGE_ORDER: List[str] = [
+     "16-24",
+     "25-29",
+     "30-34",
+     "35-39",
+     "40-44",
+     "45-49",
+     "50-54",
+     "55-59",
+     "60-64",
+ ]
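
For orientation, the UI constants above are consumed by the new app.py earlier in this diff; a tiny illustrative restatement of that wiring (not new functionality):

# How app.py consumes the UI constants above (see the app.py hunk in this commit).
from src.config import LEVEL_OPTIONS, DEFAULT_YEAR_RANGE

# LEVEL_OPTIONS stores (label, value) pairs; the select input needs value -> label
LEVEL_CHOICES = {value: label for label, value in LEVEL_OPTIONS}
assert LEVEL_CHOICES["3"] == "Level 3 (3-digit)"

# inclusive list of years spanning the default range
YEAR_RANGE_DEFAULT = list(range(DEFAULT_YEAR_RANGE[0], DEFAULT_YEAR_RANGE[1] + 1))
assert YEAR_RANGE_DEFAULT[0] == 2014 and YEAR_RANGE_DEFAULT[-1] == 2023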
src/data_manager.py ADDED
@@ -0,0 +1,140 @@
+ """Data manager for loading and caching SCB employment pipeline results.
+
+ This module encapsulates the logic for computing the SCB-only
+ transformations in ``pipeline.py`` and persisting the result to disk.
+ It adds a small amount of resilience around caching and uses
+ ``logging`` instead of printing directly to stdout. The cache file
+ includes a version tag to make it easy to invalidate caches when
+ fundamental changes are made to the pipeline logic.
+ """
+
+ import os
+ import tempfile
+ import logging
+ from pathlib import Path
+ from functools import lru_cache
+
+ import pandas as pd
+
+ from . import pipeline
+
+ logger = logging.getLogger(__name__)
+
+ # ---------------------------------------------------------------------------
+ # Cache setup
+ # ---------------------------------------------------------------------------
+ # A version tag to embed into the cache filenames. Bump this value
+ # whenever the underlying ``pipeline`` logic changes in a way that
+ # invalidates existing caches.
+ CACHE_VERSION: str = "v1"
+
+
+ def _resolve_cache_dir() -> Path:
+     """Select a writable directory for caching.
+
+     The lookup order is:
+
+     1. The ``DATA_CACHE_DIR`` environment variable, if set.
+     2. A ``data`` folder at the repository root.
+     3. A temporary directory in ``/tmp``.
+
+     Each candidate path is tested for writability by attempting to
+     create and delete a sentinel file. The first path that succeeds
+     is returned. If none succeed, a final fallback directory in ``/tmp``
+     is created and returned.
+     """
+     candidates: list[Path] = []
+     env = os.getenv("DATA_CACHE_DIR")
+     if env:
+         # Expand relative or user paths to absolute
+         candidates.append(Path(env).expanduser().resolve())
+
+     # Repo root /data (two levels up from this file)
+     candidates.append(Path(__file__).resolve().parent.parent / "data")
+     # Temp fallback
+     candidates.append(Path(tempfile.gettempdir()) / "employment_ai_cache")
+
+     for path in candidates:
+         try:
+             path.mkdir(parents=True, exist_ok=True)
+             test_file = path / ".write_test"
+             test_file.write_text("ok", encoding="utf-8")
+             test_file.unlink()
+             return path
+         except Exception:
+             continue
+
+     # Final fallback: ensure the last candidate exists
+     fallback = Path(tempfile.gettempdir()) / "employment_ai_cache"
+     fallback.mkdir(parents=True, exist_ok=True)
+     return fallback
+
+
+ # Resolve the directory once at import time
+ DATA_DIR: Path = _resolve_cache_dir()
+
+ # Single cache file for the SCB-only output DataFrame.
+ SCB_CACHE: Path = DATA_DIR / f"scb_employment_{CACHE_VERSION}.csv"
+
+
+ def _atomic_to_csv(df: pd.DataFrame, path: Path) -> None:
+     """Write a DataFrame to CSV atomically.
+
+     The CSV is first written to a temporary file in the same directory
+     and then renamed to the final location. This avoids leaving a
+     partially written file if the process is interrupted mid-write.
+     """
+     path.parent.mkdir(parents=True, exist_ok=True)
+     tmp_path = path.with_suffix(path.suffix + ".tmp")
+     df.to_csv(tmp_path, index=False)
+     tmp_path.replace(path)
+
+
+ @lru_cache(maxsize=1)
+ def _compute_pipeline_payload() -> pd.DataFrame:
+     """Runs the SCB-only pipeline calculation."""
+     return pipeline.run_pipeline()
+
+
+ def load_payload(force_recompute: bool = False) -> pd.DataFrame:
+     """
+     Load employment data from disk cache if available, otherwise compute and save.
+
+     Parameters
+     ----------
+     force_recompute : bool, optional
+         If ``True``, recompute the pipeline even if cache files exist.
+
+     Returns
+     -------
+     pd.DataFrame
+         The SCB employment data with hierarchy levels, age groups and totals.
+     """
+     # If a cached payload exists and recomputation is not forced, return it
+     if not force_recompute and SCB_CACHE.exists():
+         logger.info("Loading pipeline output from cache directory %s", DATA_DIR)
+         try:
+             return pd.read_csv(SCB_CACHE)
+         except Exception as exc:
+             # If reading the cache fails, fall back to recomputing
+             logger.warning(
+                 "Error reading cache file %s: %s; falling back to recompute",
+                 SCB_CACHE,
+                 exc,
+             )
+
+     if force_recompute:
+         # Clear the LRU cache before recomputing
+         _compute_pipeline_payload.cache_clear()
+
+     logger.info("Computing SCB employment data – this may take a while…")
+     payload = _compute_pipeline_payload()
+
+     # Persist to disk atomically
+     try:
+         _atomic_to_csv(payload, SCB_CACHE)
+         logger.info("Cache updated: %s", SCB_CACHE.name)
+     except Exception as exc:
+         logger.warning("Could not write cache file: %s", exc)
+
+     return payload
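
Typical usage of the loader above; a sketch under the assumption that `src.pipeline.run_pipeline` is importable in your environment. Note the cache directory is resolved once at import time, so any `DATA_CACHE_DIR` override must be set before the module is imported:

# Illustrative usage of src/data_manager.py (not part of the commit).
import os

# Optional override; hypothetical path. Set before importing the module,
# because _resolve_cache_dir() runs at import time.
os.environ["DATA_CACHE_DIR"] = "/tmp/scb_cache"

from src.data_manager import load_payload

df = load_payload()                          # disk cache hit if the CSV already exists
fresh = load_payload(force_recompute=True)   # clears the lru_cache and rewrites the CSV
print(df.shape, fresh.shape)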
src/label_enrichment.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
+ Utilities to add English occupation labels to pipeline output using the
+ published SSYK2012 translation workbook.
+
+ The translation file is read directly from:
+ https://github.com/joseph-data/07_translate_ssyk/blob/main/02_translation_files/ssyk2012_en.xlsx
+ """
+
+ from __future__ import annotations
+
+ from typing import Dict
+
+ import pandas as pd
+
+ from .config import TRANSLATION_URL
+
+
+ def _load_level(sheet_name: str, level: int, url: str) -> pd.DataFrame:
+     """Load a single level sheet and return columns ``code<level>``/``label<level>``."""
+     # Header row with code/name resides at index 3 (0-based)
+     df = pd.read_excel(url, sheet_name=sheet_name, header=3)
+     df = df.rename(columns=lambda c: str(c).strip())
+
+     code_col = next(c for c in df.columns if "SSYK" in str(c))
+     name_col = next(c for c in df.columns if "Name" in str(c))
+
+     df = df[[code_col, name_col]].dropna(subset=[code_col])
+     df[code_col] = df[code_col].astype(str).str.strip().str.zfill(level)
+     df[name_col] = df[name_col].astype(str).str.strip()
+
+     return df.rename(columns={code_col: f"code{level}", name_col: f"label{level}"})
+
+
+ def load_translation_tables(url: str = TRANSLATION_URL) -> Dict[int, pd.DataFrame]:
+     """Return translation tables for SSYK levels 1–4 keyed by level."""
+     tables: Dict[int, pd.DataFrame] = {}
+     for level, sheet in ((1, "1-digit"), (2, "2-digit"), (3, "3-digit"), (4, "4-digit")):
+         tables[level] = _load_level(sheet, level, url)
+     return tables
+
+
+ def apply_translations(df: pd.DataFrame, *, tables: Dict[int, pd.DataFrame] | None = None) -> pd.DataFrame:
+     """
+     Apply English labels to an aggregated SCB DataFrame with columns ``level``, ``code`` and ``label``.
+
+     The ``label`` column is replaced (when available) with the translation matching
+     the SSYK level/code combination. Rows without a translation keep their original label.
+     """
+     if tables is None:
+         tables = load_translation_tables()
+
+     label_maps = {
+         level: tbl.set_index(f"code{level}")[f"label{level}"] for level, tbl in tables.items()
+     }
+
+     out = df.copy()
+     for level, mapping in label_maps.items():
+         mask = out["level"] == level
+         if mask.any():
+             out.loc[mask, "label"] = out.loc[mask, "code"].map(mapping).fillna(
+                 out.loc[mask, "label"]
+             )
+     return out
+
+
+ if __name__ == "__main__":
+     # Example usage: enrich pipeline output with translated labels and preview
+     from .data_manager import load_payload
+
+     pipeline_df = load_payload()
+     labeled = apply_translations(pipeline_df)
+     print(labeled.head())
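
To see the fallback behaviour of ``apply_translations`` in isolation, here is a toy sketch with a hand-built, hypothetical translation table (real tables come from ``load_translation_tables``): codes with a match receive the English label, all others keep their original one.

    import pandas as pd
    from src.label_enrichment import apply_translations

    # Hypothetical level-1 table; "Professionals" is an illustrative label.
    tables = {1: pd.DataFrame({"code1": ["2"], "label1": ["Professionals"]})}
    df = pd.DataFrame({"level": [1, 1], "code": ["2", "9"], "label": ["2", "9"]})

    print(apply_translations(df, tables=tables))
    # code "2" -> "Professionals"; code "9" has no match and keeps label "9"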
src/pipeline.py ADDED
@@ -0,0 +1,180 @@
+ """Core pipeline logic for SCB employment-only data.
+
+ This module fetches employment data from Statistics Sweden (SCB),
+ derives SSYK2012 hierarchy columns from 4-digit codes, and aggregates
+ employment totals across hierarchy levels. DAIOE exposure inputs have
+ been removed so the output contains only SCB employment counts.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Dict, Optional
+
+ import pandas as pd
+
+ from .config import TAXONOMY
+ from .label_enrichment import apply_translations
+ from .scb_fetch import fetch_all_employment_data
+
+ logger = logging.getLogger(__name__)
+
+
+ def filter_years(
+     df: pd.DataFrame,
+     year_min: Optional[int],
+     year_max: Optional[int],
+     *,
+     year_col: str,
+ ) -> pd.DataFrame:
+     """Return a DataFrame filtered to the inclusive year range."""
+     if year_min is None and year_max is None:
+         return df.copy()
+     mask = pd.Series(True, index=df.index, dtype=bool)
+     if year_min is not None:
+         mask &= df[year_col] >= year_min
+     if year_max is not None:
+         mask &= df[year_col] <= year_max
+     mask = mask.fillna(False)
+     return df.loc[mask].copy()
+
+
+ def prepare_employment(
+     raw: pd.DataFrame,
+     *,
+     year_min: Optional[int] = None,
+     year_max: Optional[int] = None,
+ ) -> pd.DataFrame:
+     """Clean SCB employment data and derive SSYK hierarchy columns."""
+     if raw.empty:
+         raise ValueError("SCB fetch returned an empty DataFrame.")
+
+     emp = raw.copy()
+     emp["code4"] = emp["code_4"].astype(str).str.zfill(4)
+     emp["code3"] = emp["code4"].str[:3]
+     emp["code2"] = emp["code4"].str[:2]
+     emp["code1"] = emp["code4"].str[:1]
+
+     emp["label4"] = emp["occupation"].fillna("").str.strip()
+     emp["label3"] = emp["code3"]
+     emp["label2"] = emp["code2"]
+     emp["label1"] = emp["code1"]
+
+     emp["age"] = emp["age"].astype(str).str.strip()
+     emp["year"] = pd.to_numeric(emp["year"], errors="coerce").astype("Int64")
+     emp["employment"] = pd.to_numeric(emp["value"], errors="coerce").fillna(0)
+
+     emp = emp.dropna(subset=["year"])
+     emp = filter_years(emp, year_min, year_max, year_col="year")
+
+     ordered_cols = [
+         "year",
+         "age",
+         "code4",
+         "label4",
+         "code3",
+         "label3",
+         "code2",
+         "label2",
+         "code1",
+         "label1",
+         "employment",
+     ]
+     return emp[ordered_cols]
+
+
+ def compute_children_maps(df: pd.DataFrame) -> Dict[int, pd.DataFrame]:
+     """Count the number of descendants for each code at each hierarchy level."""
+     base = df[["year", "code4", "code3", "code2", "code1"]].drop_duplicates()
+     counts: Dict[int, pd.DataFrame] = {}
+     counts[3] = (
+         base.groupby(["year", "code3"])["code4"]
+         .nunique()
+         .reset_index(name="n_children")
+     )
+     counts[2] = (
+         base.groupby(["year", "code2"])["code3"]
+         .nunique()
+         .reset_index(name="n_children")
+     )
+     counts[1] = (
+         base.groupby(["year", "code1"])["code2"]
+         .nunique()
+         .reset_index(name="n_children")
+     )
+     lvl4 = base.groupby(["year", "code4"]).size().reset_index(name="n_children")
+     lvl4["n_children"] = 1
+     counts[4] = lvl4
+     return counts
+
+
+ def build_employment_views(emp: pd.DataFrame) -> Dict[int, Dict[str, pd.DataFrame]]:
+     """Build employment views (age and totals) for each hierarchy level."""
+     views: Dict[int, Dict[str, pd.DataFrame]] = {}
+     for level in (4, 3, 2, 1):
+         code_col, label_col = f"code{level}", f"label{level}"
+         age_view = emp.groupby(
+             ["year", "age", code_col, label_col], as_index=False
+         )["employment"].sum()
+         total_view = (
+             age_view.groupby(["year", code_col, label_col], as_index=False)["employment"]
+             .sum()
+             .rename(columns={"employment": "employment_total"})
+         )
+         views[level] = {"age": age_view, "total": total_view}
+     return views
+
+
+ def build_level_frame(
+     level: int, views: Dict[int, Dict[str, pd.DataFrame]], children: Dict[int, pd.DataFrame]
+ ) -> pd.DataFrame:
+     """Combine age-level employment, totals and child counts for a level."""
+     code_col, label_col = f"code{level}", f"label{level}"
+     age_view = views[level]["age"].copy()
+     totals = views[level]["total"]
+
+     merged = (
+         age_view.merge(totals, on=["year", code_col, label_col], how="left")
+         .merge(children[level], on=["year", code_col], how="left")
+     )
+     merged["level"] = level
+     merged["taxonomy"] = TAXONOMY
+     merged = merged.rename(columns={code_col: "code", label_col: "label"})
+
+     ordered = [
+         "taxonomy",
+         "level",
+         "code",
+         "label",
+         "year",
+         "n_children",
+         "age",
+         "employment",
+         "employment_total",
+     ]
+     return merged[ordered]
+
+
+ def run_pipeline(
+     *,
+     year_min: Optional[int] = None,
+     year_max: Optional[int] = None,
+ ) -> pd.DataFrame:
+     """Run the SCB-only pipeline and return aggregated employment data."""
+     logger.info("Starting SCB-only employment pipeline")
+     raw = fetch_all_employment_data()
+     employment = prepare_employment(raw, year_min=year_min, year_max=year_max)
+
+     if employment.empty:
+         raise ValueError("No SCB employment rows remain after filtering.")
+
+     children = compute_children_maps(employment)
+     emp_views = build_employment_views(employment)
+
+     levels = [
+         build_level_frame(level, emp_views, children) for level in (1, 2, 3, 4)
+     ]
+     combined = pd.concat(levels, ignore_index=True)
+     combined = combined.sort_values(["level", "code", "year", "age"], ignore_index=True)
+     combined = apply_translations(combined)
+     return combined
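
The child counts drive the hierarchy drill-down, so a quick toy run of ``compute_children_maps`` may help; the frame below is fabricated and carries only the five columns the function selects.

    import pandas as pd
    from src.pipeline import compute_children_maps

    emp = pd.DataFrame(
        {
            "year": [2023, 2023, 2023],
            "code4": ["2511", "2512", "2611"],
            "code3": ["251", "251", "261"],
            "code2": ["25", "25", "26"],
            "code1": ["2", "2", "2"],
        }
    )
    children = compute_children_maps(emp)
    print(children[3])  # group 251 has 2 four-digit children, 261 has 1
    print(children[1])  # major group 2 has 2 two-digit children (25 and 26)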
src/plot_helper.py ADDED
@@ -0,0 +1,107 @@
+ import pandas as pd
+ import plotly.graph_objects as go
+ import plotly.express as px
+ from plotly.subplots import make_subplots
+
+
+ def multi_plot(df: pd.DataFrame) -> go.Figure:
+     """Plot employment over time by occupation, one subplot per age group."""
+     age_groups = sorted(df["age"].dropna().unique())
+
+     occupations = sorted(df["label"].dropna().unique())
+     # Use a Plotly qualitative palette, cycling safely when there are
+     # more occupations than the palette has colours
+     palette = px.colors.qualitative.Plotly
+     occ_color_map = {
+         occ: palette[i % len(palette)] for i, occ in enumerate(occupations)
+     }
+
+     # ------------------------------------------------------------------
+     # 2. Create multi-row subplot scaffolding
+     # ------------------------------------------------------------------
+     subplot_titles = [
+         f"<b>Employed Persons Aged {age} Years by Occupation</b>" for age in age_groups
+     ]
+
+     fig = make_subplots(
+         rows=len(age_groups),
+         cols=1,
+         shared_xaxes=False,
+         vertical_spacing=0.03,
+         subplot_titles=subplot_titles,
+     )
+
+     # ------------------------------------------------------------------
+     # 3. Add traces per age group and occupation
+     # ------------------------------------------------------------------
+
+     # Pre-compute the bottom row number for the final x-axis update
+     max_row = len(age_groups)
+
+     for i, age in enumerate(age_groups, start=1):
+         df_age = df[df["age"] == age]
+
+         # Aggregate employment by year and occupation label
+         df_plot = df_age.groupby(["year", "label"], as_index=False)["employment"].sum()
+
+         for occ_title, sub in df_plot.groupby("label"):
+             fig.add_trace(
+                 go.Scatter(
+                     x=sub["year"],
+                     y=sub["employment"],
+                     mode="lines+markers",
+                     # Show the legend only in the first subplot
+                     showlegend=(i == 1),
+                     name=occ_title,
+                     line=dict(color=occ_color_map[occ_title], width=2),
+                     # Include the age group in the hover text for clarity
+                     hovertemplate=f"Age: {age}<br>Year: %{{x}}<br>Employment: %{{y:,}}<extra>{occ_title}</extra>",
+                 ),
+                 row=i,
+                 col=1,
+             )
+
+         # Y-axis update must be inside the loop to target the current row (i)
+         fig.update_yaxes(
+             title_text="Number of Employed Persons",
+             tickformat=",",
+             rangemode="tozero",
+             row=i,
+             col=1,
+         )
+
+     # X-axis update must target the bottom row (max_row)
+     fig.update_xaxes(
+         title_text="Year",
+         tickmode="linear",
+         dtick=1,
+         row=max_row,
+         col=1,
+     )
+
+     # ------------------------------------------------------------------
+     # 4. Global layout tweaks
+     # ------------------------------------------------------------------
+     fig.update_annotations(yshift=30)
+     fig.update_layout(
+         height=400 * len(age_groups),  # one subplot row per age group
+         width=1000,
+         legend_traceorder="normal",
+         legend=dict(
+             title="Occupation Title(s)",
+             orientation="v",
+             yanchor="top",
+             y=1.0,
+             xanchor="left",
+             x=1.02,
+             bordercolor="#c7c7c7",
+             borderwidth=1,
+             bgcolor="#f9f9f9",
+             font=dict(size=10),
+         ),
+         margin=dict(t=100, l=50, r=80, b=40),
+         plot_bgcolor="#f5f7fb",
+         xaxis_showgrid=True,
+     )
+
+     return fig
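
A minimal sketch of calling ``multi_plot`` on fabricated data; the column names match the four the function reads (``age``, ``label``, ``year``, ``employment``), and the occupation labels are invented.

    import pandas as pd
    from src.plot_helper import multi_plot

    df = pd.DataFrame(
        {
            "age": ["25-29"] * 4,
            "label": ["Software developers"] * 2 + ["Nurses"] * 2,
            "year": [2022, 2023, 2022, 2023],
            "employment": [100, 110, 90, 95],
        }
    )
    fig = multi_plot(df)
    fig.write_html("employment_by_age.html")  # or fig.show() in a notebook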
src/scb_fetch.py ADDED
@@ -0,0 +1,143 @@
+ """Helpers for fetching employment data from the SCB API.
+
+ This module wraps the ``pyscbwrapper`` library to download
+ occupation/employment tables from Statistics Sweden. Error handling
+ and logging are centralised here so that callers of ``fetch_all_employment_data``
+ can remain agnostic of the details.
+ """
+
+ import logging
+ from typing import Tuple
+
+ import pandas as pd
+ from pyscbwrapper import SCB
+
+ from .config import AGE_EXCLUSIONS, EXCLUDED_CODES, TABLES
+
+ logger = logging.getLogger(__name__)
+
+
+ def fetch_scb_table(
+     table_id: str, config: Tuple[str, str, str, str, str]
+ ) -> pd.DataFrame:
+     """Fetch and transform a single SCB table.
+
+     Parameters
+     ----------
+     table_id : str
+         A key identifying which table definition in ``TABLES`` to use.
+     config : Tuple[str, str, str, str, str]
+         The tuple of (language, subject, table, variable_code, filter) used
+         by ``pyscbwrapper.SCB`` to form the query.
+
+     Returns
+     -------
+     pd.DataFrame
+         A DataFrame containing one row per (4-digit occupation code, age,
+         year) combination. Returns an empty frame on error.
+     """
+     logger.info("Starting SCB fetch for table %s", table_id)
+     try:
+         scb = SCB(*config)
+         var_ = scb.get_variables()
+
+         def get_key_raw(term: str) -> str:
+             return next(k for k in var_ if term in k.lower())
+
+         # Identify variable keys from the SCB metadata
+         occ_key_raw = get_key_raw("occupation")
+         year_key_raw = get_key_raw("year")
+         age_key_raw = get_key_raw("age")
+
+         # Filter out excluded ages
+         all_ages = var_[age_key_raw]
+         filtered_ages = [age for age in all_ages if age not in AGE_EXCLUSIONS]
+
+         # Build the query: remove spaces from the occupation key because SCB
+         # uses inconsistent spacing conventions
+         query_args = {
+             occ_key_raw.replace(" ", ""): var_[occ_key_raw],
+             year_key_raw: var_[year_key_raw],
+             age_key_raw: filtered_ages,
+         }
+         scb.set_query(**query_args)
+
+         raw_data = scb.get_data()
+         scb_fetch = raw_data.get("data", [])
+
+         # Build a mapping from code to human-readable occupation name using
+         # the query metadata, falling back to the code itself if no mapping
+         # exists.
+         query_meta = scb.get_query().get("query", [])
+         occ_meta_vals = next(
+             q["selection"]["values"]
+             for q in query_meta
+             if "occupation" in q["code"].lower() or q["code"] == "Yrke2012"
+         )
+         occ_dict = dict(zip(occ_meta_vals, var_[occ_key_raw]))
+
+         records = []
+         for r in scb_fetch:
+             code, age, year = r.get("key", [])[:3]
+             records.append(
+                 {
+                     "code_4": code,
+                     "occupation": occ_dict.get(code, code),
+                     "age": age,
+                     "year": year,
+                     "value": r.get("values", [None])[0],
+                     "source_table": table_id,
+                 }
+             )
+         return pd.DataFrame.from_records(records)
+
+     except Exception as exc:
+         logger.error("Error processing SCB table %s: %s", table_id, exc)
+         return pd.DataFrame()
+
+
+ def fetch_all_employment_data() -> pd.DataFrame:
+     """Fetch and consolidate employment data across all configured SCB tables.
+
+     The configured tables in ``TABLES`` may overlap in years. When
+     overlaps occur, later tables in the dictionary take precedence over
+     earlier ones. Rows whose occupation codes are listed in
+     ``EXCLUDED_CODES`` are removed.
+
+     Returns
+     -------
+     pd.DataFrame
+         A DataFrame with one row per (code_4, age, year) combination and
+         a numeric ``value`` column containing the employment counts.
+         Returns an empty frame if no data could be retrieved.
+     """
+     logger.info("Beginning employment data collection from SCB")
+     dfs: list[pd.DataFrame] = []
+     for tab_id, config in TABLES.items():
+         df_part = fetch_scb_table(tab_id, config)
+         if not df_part.empty:
+             dfs.append(df_part)
+         else:
+             logger.warning("No data retrieved for table %s", tab_id)
+
+     # If nothing was fetched, return an empty DataFrame
+     if not dfs:
+         logger.warning("All SCB table fetches returned empty DataFrames")
+         return pd.DataFrame()
+
+     df = pd.concat(dfs, ignore_index=True)
+
+     # Resolve overlaps by table priority: later tables in TABLES win.
+     table_priority = {key: i for i, key in enumerate(TABLES.keys())}
+     df["table_priority"] = df["source_table"].map(table_priority)
+     df = (
+         df.sort_values(["code_4", "age", "year", "table_priority"])
+         .drop_duplicates(subset=["code_4", "age", "year"], keep="last")
+         .drop(columns=["table_priority"])
+     )
+
+     # Exclude specified codes and coerce the value column to numeric
+     df = df[~df["code_4"].isin(EXCLUDED_CODES)].reset_index(drop=True)
+     df["value"] = pd.to_numeric(df["value"], errors="coerce")
+
+     return df
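
The table-precedence rule in ``fetch_all_employment_data`` is easiest to verify on a toy frame. The sketch below mirrors the sort-and-deduplicate step with hypothetical table keys "t1" and "t2"; because rows are sorted by priority and ``keep="last"`` wins, the later table's value survives.

    import pandas as pd

    df = pd.DataFrame(
        {
            "code_4": ["2512", "2512"],
            "age": ["25-29", "25-29"],
            "year": ["2022", "2022"],
            "value": ["4100", "4250"],
            "source_table": ["t1", "t2"],
        }
    )
    priority = {"t1": 0, "t2": 1}
    df["table_priority"] = df["source_table"].map(priority)
    deduped = (
        df.sort_values(["code_4", "age", "year", "table_priority"])
        .drop_duplicates(subset=["code_4", "age", "year"], keep="last")
        .drop(columns=["table_priority"])
    )
    print(deduped)  # keeps the "t2" row with value 4250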