Spaces:

joseph-data
/

yearly_explorer

Running

App Files Files Community

joseph-data committed on 4 days ago

Commit

0a13764

verified ·

1 Parent(s): af128a9

Sync from GitHub via hub-sync

Browse files

Files changed (4) hide show

README.md +1 -1
src/calcs.py +152 -0
src/setup.py +256 -0
src/visuals.py +188 -0

README.md CHANGED Viewed

@@ -24,4 +24,4 @@ An interactive Shiny app for exploring AI exposure and employment trends across
 | Data | Source |
 |------|--------|
 | AI Exposure Index | [DAIOE — AI Econ Lab](https://www.ai-econlab.com/ai-exposure-daioe) |
-| Employment Statistics | [Swedish Occupational Register, SCB](https://www.scb.se/en/finding-statistics/statistics-by-subject-area/labour-market/labour-force-supply/the-swedish-occupational-register-with-statistics/).

 | Data | Source |
 |------|--------|
 | AI Exposure Index | [DAIOE — AI Econ Lab](https://www.ai-econlab.com/ai-exposure-daioe) |
+| Employment Statistics | [Swedish Occupational Register, SCB](https://www.scb.se/en/finding-statistics/statistics-by-subject-area/labour-market/labour-force-supply/the-swedish-occupational-register-with-statistics/)

src/calcs.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import polars as pl
+def get_occ_summary(lf: pl.LazyFrame, occupation: str, year: int) -> dict | None:
+    """
+    Aggregate employment count and percentage changes for one occupation and year.
+    Returns a dict with keys: employment, pct_1y, pct_3y, pct_5y, year.
+    Returns None if no data matches the filters.
+    """
+    df = (
+        lf.filter(
+            (pl.col("occupation") == occupation) & (pl.col("year") == year),
+        )
+        .select(["count", "pct_chg_1y", "pct_chg_3y", "pct_chg_5y", "year"])
+        .collect()
+    )
+    if df.is_empty():
+        return None
+    def _mean_or_none(col: str) -> float | None:
+        val = df[col].mean()
+        return None if val is None else float(val)
+    return {
+        "employment": df["count"].sum(),
+        "pct_1y": _mean_or_none("pct_chg_1y"),
+        "pct_3y": _mean_or_none("pct_chg_3y"),
+        "pct_5y": _mean_or_none("pct_chg_5y"),
+        "year": int(df["year"][0]),
+    }
+AI_WAVG_COLS = [
+    "daioe_genai_wavg",
+    "daioe_allapps_wavg",
+    "daioe_stratgames_wavg",
+    "daioe_videogames_wavg",
+    "daioe_imgrec_wavg",
+    "daioe_imgcompr_wavg",
+    "daioe_imggen_wavg",
+    "daioe_readcompr_wavg",
+    "daioe_lngmod_wavg",
+    "daioe_translat_wavg",
+    "daioe_speechrec_wavg",
+]
+AI_LABELS = {
+    "daioe_genai_wavg": "🧠 Generative AI",
+    "daioe_allapps_wavg": "📚 All Applications",
+    "daioe_stratgames_wavg": "♟️ Strategy Games",
+    "daioe_videogames_wavg": "🎮 Video Games",
+    "daioe_imgrec_wavg": "🖼️ Image Recognition",
+    "daioe_imgcompr_wavg": "🧩 Image Comprehension",
+    "daioe_imggen_wavg": "🎨 Image Generation",
+    "daioe_readcompr_wavg": "📖 Reading Comprehension",
+    "daioe_lngmod_wavg": "✍️ Language Modeling",
+    "daioe_translat_wavg": "🌐 Translation",
+    "daioe_speechrec_wavg": "🎙️ Speech Recognition",
+}
+AI_LEVEL_COLS = [c.replace("_wavg", "_Level_Exposure") for c in AI_WAVG_COLS]
+AI_PCTL_COLS = [f"pctl_{c}" for c in AI_WAVG_COLS]
+EXPOSURE_LABELS = {1: "Very Low", 2: "Low", 3: "Medium", 4: "High", 5: "Very High"}
+def get_occ_ai_exposure(
+    lf: pl.LazyFrame, occupation: str, year: int,
+) -> pl.DataFrame:
+    """
+    Return mean weighted AI exposure scores, exposure levels, and percentile ranks per sub-domain.
+    Returns a long-format DataFrame with columns: domain, score, level, level_label, percentile.
+    Used to power the ranked horizontal bar chart in Card 2.
+    """
+    select_cols = AI_WAVG_COLS + AI_LEVEL_COLS + AI_PCTL_COLS
+    df = (
+        lf.filter(
+            (pl.col("occupation") == occupation) & (pl.col("year") == year),
+        )
+        .select(select_cols)
+        .collect()
+    )
+    rows = []
+    for wavg_col, level_col, pctl_col in zip(AI_WAVG_COLS, AI_LEVEL_COLS, AI_PCTL_COLS, strict=False):
+        raw_level = df[level_col].mean()
+        level_val = round(raw_level) if raw_level is not None else None
+        rows.append({
+            "domain": AI_LABELS[wavg_col],
+            "score": df[wavg_col].mean(),
+            "level": level_val,
+            "level_label": EXPOSURE_LABELS.get(level_val, "Unknown") if level_val else "Unknown",
+            "percentile": df[pctl_col].mean(),
+        })
+    return pl.DataFrame(rows).sort("score")
+def get_occ_ai_trend(
+    lf: pl.LazyFrame, occupation: str, year_range: tuple[int, int],
+) -> pl.DataFrame:
+    """
+    Return yearly mean weighted AI exposure (All Applications) for one occupation over a year range.
+    Returns a DataFrame with columns: year, daioe_allapps_wavg.
+    Used to power the trend line in Card 2.
+    """
+    year_min, year_max = year_range
+    return (
+        lf.filter(
+            (pl.col("occupation") == occupation)
+            & (pl.col("year") >= year_min)
+            & (pl.col("year") <= year_max),
+        )
+        .group_by("year")
+        .agg(pl.col("daioe_allapps_wavg").mean())
+        .sort("year")
+        .collect()
+    )
+def get_occ_employment_by_age(
+    lf: pl.LazyFrame,
+    occupation: str,
+    year_range: tuple[int, int],
+    age_groups: list[str],
+) -> pl.DataFrame:
+    """
+    Return yearly employment counts per age group for a given occupation and year range.
+    Used to power the employment change line chart in Card 3.
+    Returns a long-format DataFrame with columns: year, age_group, count.
+    """
+    year_min, year_max = year_range
+    return (
+        lf.filter(
+            (pl.col("occupation") == occupation)
+            & (pl.col("year") >= year_min)
+            & (pl.col("year") <= year_max)
+            & (pl.col("age_group").is_in(age_groups)),
+        )
+        .group_by(["year", "age_group"])
+        .agg([
+            pl.col("count").sum(),
+            pl.col("pct_chg_1y").mean(),
+        ])
+        .sort(["age_group", "year"])
+        .collect()
+    )

src/setup.py ADDED Viewed

	@@ -0,0 +1,256 @@

+import importlib.util
+import io
+import re
+from pathlib import Path
+import pandas as pd
+import plotly.graph_objects as go
+import polars as pl
+from great_tables import GT
+from shiny import ui
+# ---------------------------------------------------
+# Markdown Files
+# ---------------------------------------------------
+# Project root: two directory levels above this file.
+BASE_DIR = Path(__file__).resolve().parent.parent
+# Intro text is read from disk once, at import time.
+INTRO_MD = (BASE_DIR / "md_files" / "intro.md").read_text(encoding="utf-8")
+# ---------------------------------------------------
+# Data Preliminaries
+# ---------------------------------------------------
+DATA_PATH = BASE_DIR / "data" / "daioe_scb_years_processed.parquet"
+# Lazy scan: nothing is materialised until a query calls .collect().
+lf = pl.scan_parquet(DATA_PATH)
+# NOTE(review): the returned schema is discarded; presumably this is here to
+# fail fast at import if the parquet file is missing or unreadable — confirm.
+lf.collect_schema()
+# ---------------------------------------------------
+# Defining Input Values
+# ---------------------------------------------------
+# 1. SSYK12 Levels
+# Sorted distinct values of the "level" column.
+LEVELS = lf.select(pl.col("level").unique().sort()).collect().to_series().to_list()
+def build_choices_by_level(
+    lf: pl.LazyFrame,
+    levels: list[str],
+) -> dict[str, dict[str, str]]:
+    out = {}
+    for lvl in levels:
+        occs = (
+            lf.filter(pl.col("level") == lvl)
+            .select(pl.col("occupation").unique().sort())
+            .collect()
+            .to_series()
+            .to_list()
+        )
+        out[lvl] = {o: o for o in occs}
+    return out
+# 2. Men and Women
+# Sorted distinct values of the "sex" column.
+SEXES = lf.select(pl.col("sex").unique().sort()).collect().to_series().to_list()
+# 3. Age groupings
+# Display order for the age-group labels.
+# NOTE(review): "Mid-Career 1" appears on both the 35-39 and 40-44 labels. These
+# strings are matched against the dataset's age_group values below, so confirm
+# against the data before renaming either one.
+AGE_ORDER = [
+    "Early Career 1 (16-24)",
+    "Early Career 2 (25-29)",
+    "Developing (30-34)",
+    "Mid-Career 1 (35-39)",
+    "Mid-Career 1 (40-44)",
+    "Mid-Career 2 (45-49)",
+    "Senior (50+)",
+]
+# Keep only the labels that occur in the data, preserving AGE_ORDER's ordering.
+present = lf.select(pl.col("age_group").unique()).collect().to_series().to_list()
+AGES = [x for x in AGE_ORDER if x in present]
+# 4. Years from the dataset
+# Sorted distinct years, plus the first and last of them.
+YEARS = lf.select(pl.col("year").unique().sort()).collect().to_series().to_list()
+YEAR_MIN, YEAR_MAX = min(YEARS), max(YEARS)
+# 5. AI Sub-Indexes
+# Metric key -> display label (emoji-prefixed).
+METRICS: dict[str, str] = {
+    "daioe_genai": "🧠 Generative AI",
+    "daioe_allapps": "📚 All Applications",
+    "daioe_stratgames": "♟️ Strategy Games",
+    "daioe_videogames": "🎮 Video Games (Real-Time)",
+    "daioe_imgrec": "🖼️🔎 Image Recognition",
+    "daioe_imgcompr": "🧩🖼️ Image Comprehension",
+    "daioe_imggen": "🖌️🖼️ Image Generation",
+    "daioe_readcompr": "📖 Reading Comprehension",
+    "daioe_lngmod": "✍️🤖 Language Modeling",
+    "daioe_translat": "🌐🔤 Translation",
+    "daioe_speechrec": "🗣️🎙️ Speech Recognition",
+}
+# NOTE(review): presumably the leading column order for table/download output;
+# not used anywhere in this view — confirm against the caller.
+first_cols = [
+    "level",
+    "ssyk_code",
+    "occupation",
+    "year",
+    "sex",
+    "age",
+    "age_group",
+    "count",
+    "weight_sum",
+    "chg_1y",
+    "chg_3y",
+    "chg_5y",
+    "pct_chg_1y",
+    "pct_chg_3y",
+    "pct_chg_5y",
+]
+# ---------------------------------------------------
+# Shared UI Helpers
+# ---------------------------------------------------
+def apply_plot_style(fig: go.Figure, brand: dict[str, str]) -> go.Figure:
+    """Apply a consistent visual style to Plotly charts."""
+    fig.update_layout(
+        paper_bgcolor="rgba(0,0,0,0)",
+        plot_bgcolor="rgba(0,0,0,0)",
+        font={"family": "Nunito Sans", "color": brand["text"]},
+        hoverlabel={"bgcolor": "white", "font_size": 12},
+        margin={"l": 20, "r": 20, "t": 40, "b": 20},
+    )
+    fig.update_xaxes(gridcolor="#E5E5E5", zeroline=False)
+    fig.update_yaxes(gridcolor="#E5E5E5", zeroline=False)
+    return fig
+def empty_figure(message: str, brand: dict[str, str]) -> go.Figure:
+    """Create a styled empty Plotly figure with a centered message."""
+    fig = go.Figure()
+    fig.add_annotation(text=message, showarrow=False, font_size=16)
+    fig.update_xaxes(visible=False)
+    fig.update_yaxes(visible=False)
+    return apply_plot_style(fig, brand)
+# ---------------------------------------------------
+# Shared Table/Label Helpers
+# ---------------------------------------------------
+def metric_display_name(metric_key: str, metrics: dict[str, str]) -> str:
+    """Return a clean human-readable metric label without leading icons."""
+    label = metrics.get(metric_key, metric_key.replace("_", " ").title())
+    return re.sub(r"^[^A-Za-z0-9]+\s*", "", label).strip()
+def readable_column_name(col: str, metrics: dict[str, str]) -> str:
+    """Convert raw dataset column names into readable table headers."""
+    exact = {
+        "ssyk_code": "SSYK Code",
+        "age_group": "Age Group",
+        "count": "Employees",
+        "year": "Year",
+        "sex": "Sex",
+        "level": "SSYK Level",
+        "occupation": "Occupation",
+        "chg_1y": "1-year Change",
+        "chg_3y": "3-year Change",
+        "chg_5y": "5-year Change",
+    }
+    if col in exact:
+        return exact[col]
+    col_l = col.lower()
+    if col_l.startswith("pctl_") and col_l.endswith("_wavg"):
+        metric_key = col[5:-5]
+        return f"{metric_display_name(metric_key, metrics)} Percentile (Weighted Avg)"
+    if col_l.endswith("_wavg"):
+        metric_key = col[:-5]
+        return f"{metric_display_name(metric_key, metrics)} (Weighted Avg)"
+    if col_l.endswith("_avg"):
+        metric_key = col[:-4]
+        return f"{metric_display_name(metric_key, metrics)} (Average)"
+    if col_l.endswith("_level_exposure"):
+        metric_key = col[: -len("_level_exposure")]
+        return f"{metric_display_name(metric_key, metrics)} Exposure Level"
+    fallback = col.replace("_", " ").title()
+    return (
+        fallback.replace("Ssyk", "SSYK").replace("Ai", "AI").replace("Daioe", "DAIOE")
+    )
+def as_great_table_html(df, metrics: dict[str, str]) -> ui.TagChild:
+    """Render a pandas DataFrame as Great Tables HTML with readable headers."""
+    if df.empty:
+        return ui.p("No data available for the selected filters.")
+    df_display = df.rename(
+        columns={c: readable_column_name(c, metrics) for c in df.columns},
+    )
+    float_cols = [
+        c
+        for c in df_display.columns
+        if c != "Year" and pd.api.types.is_float_dtype(df_display[c])
+    ]
+    gt = (
+        GT(df_display)
+        .opt_row_striping()
+        .tab_options(table_font_names=["Nunito Sans", "Arial", "sans-serif"])
+        .opt_stylize(style=2, color="blue")
+    )
+    if float_cols:
+        gt = gt.fmt_number(columns=float_cols, decimals=2)
+    return ui.HTML(gt.as_raw_html())
+# ---------------------------------------------------
+# Shared Download Helpers
+# ---------------------------------------------------
+def download_extension(fmt: str) -> str:
+    """Map selected download format to its file extension."""
+    return {"csv": "csv", "parquet": "parquet", "excel": "xlsx"}.get(fmt, "csv")
+def download_media_type(fmt: str) -> str:
+    """Return browser media type for each supported download format."""
+    if fmt == "parquet":
+        return "application/octet-stream"
+    if fmt == "excel":
+        return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    return "text/csv"
+def export_filtered_data(df, fmt: str) -> str | bytes:
+    """Export a pandas DataFrame to csv/parquet/excel payload for Shiny download."""
+    if fmt == "parquet":
+        return df.to_parquet(index=False)
+    if fmt == "excel":
+        engine = None
+        if importlib.util.find_spec("openpyxl") is not None:
+            engine = "openpyxl"
+        elif importlib.util.find_spec("xlsxwriter") is not None:
+            engine = "xlsxwriter"
+        else:
+            raise RuntimeError("Excel export requires openpyxl or xlsxwriter.")
+        buffer = io.BytesIO()
+        df.to_excel(buffer, index=False, engine=engine)
+        return buffer.getvalue()
+    return df.to_csv(index=False)

src/visuals.py ADDED Viewed

	@@ -0,0 +1,188 @@

+import faicons as fa
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from shiny import ui
+SCB_SOURCE_MD = (
+    "Source: [Swedish Occupational Register, SCB]"
+    "(https://www.scb.se/en/finding-statistics/statistics-by-subject-area/"
+    "labour-market/labour-force-supply/"
+    "the-swedish-occupational-register-with-statistics/)"
+)
+DAIOE_SOURCE_MD = "Source: [DAIOEs](https://www.ai-econlab.com/ai-exposure-daioe)"
+# Brand colours from _brand.yml
+_C_BG = "rgba(0,0,0,0)"
+_C_GRID = "#E5E5E5"
+_C_TEXT = "#1C2826"  # black
+_C_TITLE = "#0C0A3E"  # primary / blue
+_FONT_BASE = "Nunito Sans"
+_FONT_HEAD = "Montserrat"
+_BASE_LAYOUT = {
+    "paper_bgcolor": _C_BG,
+    "plot_bgcolor": _C_BG,
+    "font": {"family": _FONT_BASE, "color": _C_TEXT, "size": 13},
+    "title_font": {"family": _FONT_HEAD, "color": _C_TITLE, "size": 15},
+    "hoverlabel": {"font": {"family": _FONT_BASE, "size": 12}},
+    "margin": {"l": 20, "r": 20, "t": 45, "b": 20},
+}
+def build_value_boxes(summary: dict, occupation: str) -> ui.Tag:
+    """
+    Build the employment summary value boxes for a given occupation.
+    Returns a div containing a heading, four value boxes (employment, 1/3/5-yr
+    change), and a markdown source note.
+    """
+    def _arrow(v):
+        return "▼" if v < 0 else "▲"
+    def _theme(v):
+        return "danger" if v < 0 else "success"
+    def _fmt_pct(v):
+        return f"{_arrow(v)} {v:.0f}%" if v is not None else "N/A"
+    def _fmt_theme(v):
+        return _theme(v) if v is not None else "secondary"
+    emp = summary["employment"]
+    pct1 = summary["pct_1y"]
+    pct3 = summary["pct_3y"]
+    pct5 = summary["pct_5y"]
+    year = summary["year"]
+    return ui.div(
+        ui.h6(f"National Employment of {occupation}", class_="mt-3 mb-2 fw-semibold"),
+        ui.layout_columns(
+            ui.value_box(
+                title="Employment",
+                showcase=fa.icon_svg("users"),
+                value=f"{emp:,.0f}",
+                theme="primary",
+            ),
+            ui.value_box(
+                title="1-yr change",
+                value=_fmt_pct(pct1),
+                showcase=fa.icon_svg("arrow-trend-up" if pct1 is None or pct1 >= 0 else "arrow-trend-down"),
+                theme=_fmt_theme(pct1),
+            ),
+            ui.value_box(
+                title="3-yr change",
+                value=_fmt_pct(pct3),
+                showcase=fa.icon_svg("arrow-trend-up" if pct3 is None or pct3 >= 0 else "arrow-trend-down"),
+                theme=_fmt_theme(pct3),
+            ),
+            ui.value_box(
+                title="5-yr change",
+                value=_fmt_pct(pct5),
+                showcase=fa.icon_svg("arrow-trend-up" if pct5 is None or pct5 >= 0 else "arrow-trend-down"),
+                theme=_fmt_theme(pct5),
+            ),
+            col_widths=[3, 3, 3, 3],
+        ),
+        ui.markdown(f"Data as at **{year}**.\n\n{SCB_SOURCE_MD}"),
+    )
+def build_age_chart(df: pd.DataFrame, occupation: str) -> go.Figure:
+    """
+    Build a Plotly line chart of 1-yr employment % change by age group over time.
+    Absolute employment count is shown on hover. Returns an empty figure if df is empty.
+    """
+    if df.empty:
+        return go.Figure()
+    fig = px.line(
+        df,
+        x="year",
+        y="pct_chg_1y",
+        color="age_group",
+        markers=True,
+        custom_data=["count"],
+        labels={
+            "year": "Year",
+            "pct_chg_1y": "Employment change (%)",
+            "age_group": "Age Group",
+        },
+    )
+    fig.update_traces(
+        hovertemplate=(
+            "<b>%{fullData.name}</b><br>"
+            "Year: %{x}<br>"
+            "Change: %{y:.1f}%<br>"
+            "Employment: %{customdata[0]:,}<extra></extra>"
+        ),
+    )
+    fig.add_hline(y=0, line_color="grey", line_width=1)
+    fig.update_layout(
+        **_BASE_LAYOUT,
+        title={
+            "text": f"Annual Employment Change of {occupation} in Sweden",
+            "x": 0.01,
+            "xanchor": "left",
+        },
+        legend={"title": None},
+        yaxis={"ticksuffix": "%"},
+    )
+    fig.update_xaxes(gridcolor=_C_GRID, zeroline=False, dtick=1)
+    fig.update_yaxes(gridcolor=_C_GRID, zeroline=False)
+    return fig
+def build_ai_exposure_bar(df: pd.DataFrame, occupation: str, year: int) -> go.Figure:
+    """
+    Build a vertical bar chart of AI exposure level per sub-domain.
+    X-axis: AI sub-domains with emoji labels.
+    Y-axis: exposure level (1=Low, 2=Medium, 3=High).
+    Bar colour intensity driven by the weighted average score.
+    Hover shows exposure level label, index score, and percentile rank.
+    """
+    if df.empty:
+        return go.Figure()
+    fig = go.Figure(
+        go.Bar(
+            x=df["percentile"],
+            y=df["domain"],
+            orientation="h",
+            marker={
+                "color": df["percentile"],
+                "colorscale": "Blues",
+                "colorbar": {"title": "Percentile Rank"},
+                "showscale": True,
+                "cmin": 0,
+                "cmax": 100,
+            },
+            customdata=list(
+                zip(df["level_label"], df["level"], df["score"], strict=False)
+            ),
+            hovertemplate=(
+                "<b>%{y}</b><br>"
+                "Percentile Rank: %{x:.0f}<br>"
+                "Exposure Level: %{customdata[0]} (%{customdata[1]}/5)<br>"
+                "Index Score: %{customdata[2]:.3f}<extra></extra>"
+            ),
+        ),
+    )
+    fig.update_layout(
+        **_BASE_LAYOUT,
+        title={
+            "text": f"{occupation} Level of AI Exposure ({year})",
+            "x": 0.01,
+            "xanchor": "left",
+        },
+        xaxis={"title": "Percentile Rank", "range": [0, 100]},
+        yaxis={"title": None},
+    )
+    fig.update_xaxes(gridcolor=_C_GRID, zeroline=False)
+    fig.update_yaxes(gridcolor=_C_GRID, zeroline=False)
+    return fig