joseph-data committed on
Commit 3e12d11 · unverified · 1 Parent(s): 5ad2292

updated the app

.gitattributes ADDED
@@ -0,0 +1,4 @@
+ data/daioe_simple.csv filter=lfs diff=lfs merge=lfs -text
+ data/daioe_weighted.csv filter=lfs diff=lfs merge=lfs -text
+ data/*.csv filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
.gitattributes copy ADDED
@@ -0,0 +1,4 @@
+ data/daioe_simple.csv filter=lfs diff=lfs merge=lfs -text
+ data/daioe_weighted.csv filter=lfs diff=lfs merge=lfs -text
+ data/*.csv filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -8,6 +8,8 @@ wheels/
 
  # Virtual environments
  .venv
+ .ruff_cache
+ .vscode
 
  # Project-specific artifacts
  test_notebooks/
@@ -24,3 +26,9 @@ data/04_translation_files/
  scripts/03_translate_ssyk2012.py
  scripts/03_translate_ssyk96.py
  _brand.yml
+ data/daioe_simple.csv
+ data/daioe_weighted.csv
+
+
+ test.py
+ test2.py
app.py CHANGED
@@ -1,207 +1,321 @@
- """
- Shiny app: Employment headcount by age group for a selected SSYK3 occupation,
- indexed to 2022 = 1. Uses SCB AKU employment pulled via scripts/04_occ.py.
- """
-
- from __future__ import annotations
-
- from functools import lru_cache
  from pathlib import Path
- from typing import Dict, List
-
- import matplotlib.pyplot as plt
- import pandas as pd
- from shiny import App, render, ui
-
- ROOT = Path(__file__).resolve().parent
- OCC_PATH = ROOT / "scripts" / "04_occ.py"
-
- # Age groups available from SCB; keep order consistent for the UI and legend.
- AGE_ORDER: List[str] = [
-     "16-24",
-     "25-29",
-     "30-34",
-     "35-39",
-     "40-44",
-     "45-49",
-     "50-54",
-     "55-59",
-     "60-64",
- ]
- AGE_LABELS: Dict[str, str] = {age: f"{age} years" for age in AGE_ORDER}
-
-
- def _load_occ_module():
-     """Load the employment fetcher from scripts/04_occ.py."""
-     import importlib.util
-
-     spec = importlib.util.spec_from_file_location("scripts.occ", OCC_PATH)
-     module = importlib.util.module_from_spec(spec)
-     assert spec.loader is not None
-     spec.loader.exec_module(module)
-     return module
-
-
- @lru_cache(maxsize=1)
- def load_employment() -> pd.DataFrame:
-     """Fetch SCB AKU employment by occupation, age, and year."""
-     occ_mod = _load_occ_module()
-     df = occ_mod.fetch_scb_aku_occupations()
-     df = df.rename(columns={"code_3": "code"})
-     df["code"] = df["code"].astype(str).str.zfill(3)
-     df["year"] = df["year"].astype(int)
-     df["value"] = df["value"].astype(int)
-     df = df[df["age"].isin(AGE_ORDER)].copy()
-     return df
-
-
- @lru_cache(maxsize=1)
- def profession_choices() -> Dict[str, str]:
-     """
-     Build a mapping of SSYK3 codes to display labels.
-     Uses the most frequent occupation label observed for each code.
-     """
-     df = load_employment()
-     df = df[df["code"].str.len() == 3].copy()
-     df = df.dropna(subset=["occupation"])
-
-     def pick_label(group: pd.Series) -> str:
-         return group.mode().iat[0] if not group.mode().empty else group.iloc[0]
-
-     labels = (
-         df.groupby("code")["occupation"]
-         .apply(pick_label)
-         .reset_index()
-         .sort_values("code")
-     )
-     return {row.code: f"{row.code} - {row.occupation}" for row in labels.itertuples()}
-
-
- @lru_cache(maxsize=1)
- def available_years() -> List[int]:
-     """Years present in the employment series, sorted ascending."""
-     df = load_employment()
-     return sorted(df["year"].unique().tolist())
-
-
- def build_headcount(code: str, ages: List[str], base_year: int | None) -> pd.DataFrame:
-     """
-     Filter employment to a single SSYK3 code and selected age groups.
-     Optionally index each age group to the selected base year.
-     """
-     emp = load_employment()
-     filtered = emp[(emp["code"] == code) & (emp["age"].isin(ages))].copy()
-     if filtered.empty:
-         return filtered
-
-     if base_year is not None:
-         base = (
-             filtered[filtered["year"] == base_year][["age", "value"]]
-             .rename(columns={"value": "base_value"})
-             .set_index("age")
-         )
-         filtered["base_value"] = filtered["age"].map(base["base_value"])
-         filtered = filtered[filtered["base_value"].notna()].copy()
-         if filtered.empty:
-             return filtered
-         filtered["metric"] = filtered["value"] / filtered["base_value"]
-     else:
-         filtered["metric"] = filtered["value"]
-
-     filtered["age_label"] = filtered["age"].map(AGE_LABELS)
-     filtered = filtered.sort_values(["age", "year"])
-     return filtered
-
-
- def make_headcount_plot(df: pd.DataFrame, title: str, base_year: int | None):
-     """Create a line plot of headcount by age group for one occupation."""
-     fig, ax = plt.subplots(figsize=(10, 6))
-
-     palette = [
-         "#0072B2",
-         "#009E73",
-         "#E69F00",
-         "#D55E00",
-         "#CC79A7",
-         "#56B4E9",
-         "#999999",
-         "#F0E442",
-         "#8C564B",
-     ]
-
-     for idx, (age, group) in enumerate(df.groupby("age_label")):
-         ax.plot(group["year"], group["metric"], label=age, color=palette[idx % len(palette)], linewidth=2)
-
-     if base_year is not None:
-         ax.axvline(base_year, color="#555555", linestyle="--", linewidth=1, alpha=0.7)
-     ax.set_xlabel("Year")
-     ylabel = f"Normalized headcount (base={base_year})" if base_year is not None else "Headcount"
-     ax.set_ylabel(ylabel)
-     ax.set_title(f"Headcount over time by age group\n{title}")
-     ax.legend(title="Age group", loc="upper left")
-     ax.grid(True, linestyle="--", alpha=0.2)
-     fig.tight_layout()
-     return fig
-
-
- profession_map = profession_choices()
- default_code = next(iter(profession_map.keys()), "")
-
- app_ui = ui.page_fluid(
-     ui.h2("Headcount over time by age group"),
-     ui.input_select(
-         "profession",
-         "SSYK 3-digit occupation",
-         choices=profession_map,
-         selected=default_code,
-     ),
-     ui.input_select(
-         "base_year",
-         "Base year (optional)",
-         choices={"": "No indexing (show raw values)", **{str(y): str(y) for y in available_years()}},
-         selected="",
-     ),
-     ui.input_checkbox_group(
-         "age_groups",
-         "Age groups",
-         choices={age: AGE_LABELS[age] for age in AGE_ORDER},
-         selected=AGE_ORDER,
-         inline=True,
-     ),
-     ui.output_plot("headcount_plot", width="100%", height="650px"),
-     ui.markdown(
-         "Data: SCB AKU employment. Select a base year to normalize, or leave blank to see raw headcount."
-     ),
  )


- def server(input, output, session):
-     @render.plot
-     def headcount_plot():
-         code = input.profession()
-         ages = input.age_groups()
-         base_year_raw = input.base_year()
-         base_year = int(base_year_raw) if base_year_raw else None
-         if not code or not ages:
-             fig, ax = plt.subplots(figsize=(8, 3))
-             ax.text(0.5, 0.5, "Select an occupation and at least one age group.", ha="center", va="center")
-             ax.axis("off")
-             return fig

-         df = build_headcount(code, ages, base_year)
-         if df.empty:
-             fig, ax = plt.subplots(figsize=(8, 3))
-             ax.text(0.5, 0.5, "No data available for this selection.", ha="center", va="center")
-             ax.axis("off")
-             return fig

-         title = profession_map.get(code, code)
-         return make_headcount_plot(df, title, base_year)


- app = App(app_ui, server)


- if __name__ == "__main__":
-     # Run with: shiny run --reload app_headcount_age.py
-     app.run()
  from pathlib import Path
+
+ import plotly.graph_objects as go
+ import plotly.express as px
+ from plotly.subplots import make_subplots
+
+ from shiny import reactive
+ from shiny.express import input, ui
+ from shinywidgets import render_plotly, output_widget
+ from src.config import (
+     DEFAULT_LEVEL,
+     DEFAULT_YEAR_RANGE,
+     LEVEL_OPTIONS,
+     GLOBAL_YEAR_MIN,
+     GLOBAL_YEAR_MAX,
  )

+ from src.data_manager import load_payload


+ # Helpers for UI mapping
+ LEVEL_CHOICES = {value: label for label, value in LEVEL_OPTIONS}
+ YEAR_RANGE_DEFAULT = list(range(DEFAULT_YEAR_RANGE[0], DEFAULT_YEAR_RANGE[1] + 1))

+ # ======================================================
+ # UI LAYOUT
+ # ======================================================
+ css_file = Path(__file__).parent / "css" / "theme.css"

+ ui.include_css(css_file)

+ ui.page_opts(
+     fillable=False,
+     fillable_mobile=True,
+     full_width=True,
+     id="page",
+     lang="en",
+ )


+ with ui.sidebar(open="desktop", position="right"):
+     ui.input_select(
+         "level", "Select Occupation level", LEVEL_CHOICES, selected=DEFAULT_LEVEL
+     )
+     ui.input_selectize(
+         "selectize",
+         "Select Occupation title(s)",
+         {},
+         multiple=True,
+         options=(
+             {
+                 "placeholder": "Statisticians...",
+                 "create": False,
+                 "plugins": ["clear_button"],
+             }
+         ),
+     )
+     # ui.input_radio_buttons(
+     #     "count_mode",
+     #     "Employed persons display",
+     #     {"raw": "Raw counts", "index": "Index to base year"},
+     #     selected="raw",
+     # )
+     # with ui.panel_conditional("input.count_mode == 'index'"):
+     #     ui.input_select(
+     #         "base_year",
+     #         "Base year",
+     #         YEAR_RANGE_DEFAULT,
+     #         selected=2022,
+     #     )
+
+     ui.input_slider(
+         "year_range",
+         "Year range",
+         min=GLOBAL_YEAR_MIN,
+         max=GLOBAL_YEAR_MAX,
+         value=DEFAULT_YEAR_RANGE,
+         step=1,
+         sep="",
+     )
+     ui.input_action_button("refresh_data", "Refresh data", class_="btn-primary")
+
+
+ # ======================================================
+ # REACTIVE STATE
+ # ======================================================
+
+ # Reactive value to store the loaded payload
+ payload_store = reactive.Value(load_payload())
+
+
+ @reactive.effect
+ @reactive.event(input.refresh_data)
+ def _refresh_payload():
+     with ui.Progress() as progress:
+         progress.set(message="Refreshing data...", value=0.1)
+         # Force recompute in data manager
+         updated = load_payload(force_recompute=True)
+         progress.set(message="Updating UI...", value=0.8)
+         payload_store.set(updated)
+         progress.set(message="Done", value=1.0)
+
+
+ # Build Selectize choices per selected level
+ @reactive.calc
+ def level_label_choices():
+     df = payload_store()
+     lvl = int(input.level())
+     subset = df[df["level"] == lvl][["code", "label"]].dropna().drop_duplicates()
+     choices_list = []
+     for _, row in subset.iterrows():
+         key = row["label"]
+         value = f"{row['code']} - {row['label']}"
+         choices_list.append((key, value))
+
+     # Sort by the code (extract code from display value)
+     choices_list.sort(key=lambda x: x[1].split(" - ")[0])
+
+     # Convert to dictionary while maintaining order
+     return {key: value for key, value in choices_list}
+
+
+ # keep selectize choices in sync with level selection
+ @reactive.effect
+ def _sync_selectize_choices():
+     choices = level_label_choices()
+     current = input.selectize() or []
+
+     # only keep items still valid
+     valid_selected = [s for s in current if s in choices]
+
+     # apply a default when nothing valid remains
+     if not valid_selected and choices:
+         # pick the first option (or slice for multiple defaults)
+         valid_selected = [next(iter(choices))]
+
+     ui.update_selectize("selectize", choices=choices, selected=valid_selected)
+
+
+ # Filtered data based on UI inputs
+ @reactive.calc
+ def filtered_data():
+     df = payload_store()
+     level = int(input.level())
+     year_min, year_max = input.year_range()
+     selected_titles = input.selectize()
+
+     idx_level = df["level"] == level
+     idx_year = df["year"].between(year_min, year_max)
+
+     # If no titles selected, return empty dataframe
+     if not selected_titles:
+         return df[idx_level & idx_year & (df["label"] == "")].copy()  # Empty result
+
+     idx_title = df["label"].isin(selected_titles)
+     filtered_df = df[idx_level & idx_year & idx_title]
+
+     return filtered_df
+
+
+ # # Warning message for no selections
+ # with ui.div(style="margin: 20px;"):
+
+ #     @render.ui
+ #     def selection_status():
+ #         if not input.selectize():
+ #             return ui.div(
+ #                 ui.tags.div(
+ #                     "⚠️ Please select at least one occupation title to view data.",
+ #                     style="background-color: #fff3cd; color: #856404; padding: 15px; border: 1px solid #ffeaa7; border-radius: 5px; text-align: center; font-weight: bold;",
+ #                 )
+ #             )
+ #         else:
+ #             return ui.div()  # Return empty div when selections exist
+
+
+ # @render_plotly
+ # def data_table():
+ #     df = filtered_data()
+
+ #     # Show message if no data available
+ #     if df.empty:
+ #         fig = go.Figure()
+ #         fig.add_annotation(
+ #             text="No data available. Please select occupation titles.",
+ #             xref="paper",
+ #             yref="paper",
+ #             x=0.5,
+ #             y=0.5,
+ #             showarrow=False,
+ #             font=dict(size=16),
+ #         )
+ #         fig.update_layout(
+ #             xaxis=dict(visible=False), yaxis=dict(visible=False), plot_bgcolor="white"
+ #         )
+ #         return fig
+
+ #     fig = go.Figure(
+ #         data=go.Table(
+ #             header=dict(
+ #                 values=list(df.columns), fill_color="paleturquoise", align="left"
+ #             ),
+ #             cells=dict(
+ #                 values=[df[col] for col in df.columns],
+ #                 fill_color="lavender",
+ #                 align="left",
+ #             ),
+ #         )
+ #     )
+ #     return fig
+
+
+ with ui.div(style="display:flex; justify-content:center;"):
+     output_widget("employment_plot")
+
+ @render_plotly
+ def employment_plot():  # the function name must match the output_widget id above
+     df = filtered_data()
+
+     age_groups = sorted(df["age"].dropna().unique())
+
+     occupations = sorted(df["label"].dropna().unique())
+     # Use a Plotly qualitative palette
+     palette = px.colors.qualitative.Plotly
+     # Cycle safely if occupations > palette length
+     occ_color_map = {
+         occ: palette[i % len(palette)] for i, occ in enumerate(occupations)
+     }
+
+     # ------------------------------------------------------------------
+     # 2. Create multi-row subplot scaffolding
+     # ------------------------------------------------------------------
+     subplot_titles = [
+         f"<b>Employed Persons Aged {age} Years by Occupation</b>" for age in age_groups
+     ]
+
+     fig = make_subplots(
+         rows=len(age_groups),
+         cols=1,
+         shared_xaxes=False,
+         vertical_spacing=0.03,
+         subplot_titles=subplot_titles,
+     )
+
+     # ------------------------------------------------------------------
+     # 3. Add traces per age group and exposure level
+     # ------------------------------------------------------------------
+
+     # Need to pre-define the max row number for the final x-axis update
+
+     for i, age in enumerate(age_groups, start=1):
+         df_age = df[df["age"] == age]
+
+         # Aggregate by Year and Label
+         df_plot = df_age.groupby(["year", "label"], as_index=False)[
+             "employment"
+         ].sum()
+
+         for occ_title, sub in df_plot.groupby("label"):
+             fig.add_trace(
+                 go.Scatter(
+                     x=sub["year"],
+                     y=sub["employment"],
+                     mode="lines+markers",
+                     showlegend=True
+                     if i == 1
+                     else False,  # Show legend only in the first subplot
+                     name=occ_title,
+                     line=dict(color=occ_color_map[occ_title], width=3),
+                     # Add group/age info to the hover template for debugging/clarity
+                     hovertemplate=f"Age: {age}<br>Year: %{{x}}<br>Employment: %{{y:,}}<extra>{occ_title}</extra>",
+                 ),
+                 row=i,
+                 col=1,
+             )
+
+         # Y-axis update must be inside the loop to target the current row (i)
+         fig.update_yaxes(
+             title_text="Number of Employed Persons",
+             tickformat=",",
+             rangemode="tozero",
+             row=i,
+             col=1,
+         )
+
+         # X-axis update must be inside the loop to target the current row (i)
+         fig.update_xaxes(
+             title_text="Year",
+             tickmode="linear",
+             dtick=1,
+             row=i,
+             col=1,
+         )
+
+     # ------------------------------------------------------------------
+     # 4. Global layout tweaks
+     # ------------------------------------------------------------------
+     fig.update_annotations(yshift=30)
+     fig.update_layout(
+         height=700 * len(age_groups),
+         width=1200,
+         legend_traceorder="normal",
+         legend=dict(
+             title="Occupation Title(s)",
+             orientation="v",
+             yanchor="top",
+             y=1.0,
+             xanchor="left",
+             x=-0.5,
+             bordercolor="#c7c7c7",
+             borderwidth=2,
+             bgcolor="#f9f9f9",
+             font=dict(size=10),
+         ),
+         margin=dict(t=100, l=50, r=80, b=40),
+         plot_bgcolor="#f5f7fb",
+         xaxis_showgrid=True,
+     )
+
+     return fig
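
Note on the shinywidgets wiring above: with explicit placement, the `output_widget` id must equal the name of the decorated render function, so the renderer is listed as `employment_plot` (the raw commit named it `employment_plot2`, which would have left the `employment_plot` placeholder empty). A minimal self-contained sketch of the same pattern, assuming Shiny Express with shinywidgets; the file name and data below are made up for illustration:

# sketch.py -- illustrative only; run with: shiny run sketch.py
import plotly.express as px
from shiny.express import ui
from shinywidgets import output_widget, render_plotly

with ui.div(style="display:flex; justify-content:center;"):
    output_widget("demo_plot")  # placeholder: id must equal the renderer's name

@render_plotly
def demo_plot():
    # the function name "demo_plot" supplies the widget id that fills the placeholder
    return px.line(x=[2014, 2015, 2016], y=[10, 12, 9])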
css/theme.css ADDED
@@ -0,0 +1,44 @@
+ /*-- scss:defaults --*/
+ $link-color: #39729E;
+ $text-muted: #6a737b;
+
+ /*-- scss:rules --*/
+
+ .layout-example {
+   background: $gray-500;
+   color: $white;
+   text-align: center;
+   margin-bottom: 1em;
+   font-family: $font-family-monospace;
+   font-size: .875em;
+   font-weight: 600;
+   padding-top: 1em;
+   border-radius: 3px;
+ }
+
+ .left {
+   text-align: left;
+   padding-left: 1em;
+ }
+
+ .right {
+   text-align: right;
+   padding-right: 1em;
+ }
+
+ .hello-quarto-banner h1 {
+   margin-top: 0;
+   margin-bottom: 0.5rem;
+ }
+
+ #quarto-announcement {
+   padding: 1em;
+   font-size: 1em;
+   font-weight: bold;
+   color: $white;
+   background-color: #447099;
+ }
+
+ #quarto-announcement a {
+   color: $white;
+ }
data/01_translation_files/ssyk96_en.xlsx DELETED
Binary file (19.9 kB)
 
data/03_daioe_aggregated/daioe_ssyk2012_emp_weighted.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/03_daioe_aggregated/daioe_ssyk2012_simple_avg.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/03_daioe_aggregated/daioe_ssyk96_emp_weighted.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/03_daioe_aggregated/daioe_ssyk96_simple_avg.csv DELETED
The diff for this file is too large to render. See raw diff
 
data/scb_employment_v1.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f9eb2919a2a005828571797bd3c3005300e5c32a50c169c304787f97e998c5b
+ size 4277339
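
The three lines above are not the CSV itself but a Git LFS pointer, produced by the `*.csv filter=lfs` rules added in .gitattributes: the real 4.3 MB file lives in LFS storage, keyed by the sha256 oid. A small illustrative parser for such pointer files (hypothetical helper, not part of the commit):

# Parse the version/oid/size fields of a Git LFS pointer file (illustrative only).
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict[str, str]:
    fields = {}
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        key, _, value = line.partition(" ")  # each line is "<key> <value>"
        fields[key] = value
    return fields

# e.g. parse_lfs_pointer("data/scb_employment_v1.csv")["size"] -> "4277339"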
main.py DELETED
@@ -1,72 +0,0 @@
- from __future__ import annotations
-
- import argparse
- import importlib.util
- from pathlib import Path
- from typing import Iterable
-
-
- PROJECT_ROOT = Path(__file__).resolve().parent
- SCRIPTS_DIR = PROJECT_ROOT / "scripts"
-
-
- def load_module(name: str, filename: str):
-     """Import a script with a numeric prefix via importlib."""
-     spec = importlib.util.spec_from_file_location(name, SCRIPTS_DIR / filename)
-     module = importlib.util.module_from_spec(spec)
-     if spec.loader is None:  # pragma: no cover - defensive
-         raise ImportError(f"Could not load module '{name}' from {filename}")
-     spec.loader.exec_module(module)
-     return module
-
-
- SCB_PULL = load_module("scb_pull", "01_scbPull.py")
- WEIGHTING = load_module("weighting", "02_weighting.py")
-
-
- def run_pipeline(taxonomies: Iterable[WEIGHTING.Taxonomy]):
-     """Run SCB pull + weighting for each taxonomy and collect output paths."""
-     summary = []
-     for taxonomy in taxonomies:
-         scb_path = SCB_PULL.pull_taxonomy(taxonomy)
-         weighted_path, simple_path = WEIGHTING.run_weighting(taxonomy)
-         summary.append(
-             {
-                 "taxonomy": taxonomy,
-                 "scb": scb_path,
-                 "weighted": weighted_path,
-                 "simple": simple_path,
-             }
-         )
-     return summary
-
-
- def parse_args() -> argparse.Namespace:
-     parser = argparse.ArgumentParser(
-         description="Pull SCB data and build employment-weighted DAIOE aggregates",
-     )
-     parser.add_argument(
-         "--taxonomy",
-         action="append",
-         choices=["ssyk2012", "ssyk96"],
-         help="Taxonomy to refresh (can be provided multiple times). Defaults to both.",
-     )
-     return parser.parse_args()
-
-
- def main() -> None:
-     args = parse_args()
-     taxonomies = args.taxonomy or ["ssyk2012", "ssyk96"]
-     results = run_pipeline(taxonomies)
-
-     print("\nDAIOE datasets refreshed:\n" + "-" * 40)
-     for item in results:
-         print(f"Taxonomy: {item['taxonomy']}")
-         print(f"  SCB weights: {item['scb']}")
-         print(f"  Employment-weighted: {item['weighted']}")
-         print(f"  Simple-average: {item['simple']}\n")
-     print("Outputs are ready under data/03_daioe_aggregated for app.py")
-
-
- if __name__ == "__main__":
-     main()
requirements copy.txt ADDED
@@ -0,0 +1,77 @@
+ anyio==4.12.0
+ anywidget==0.9.21
+ asgiref==3.11.0
+ asttokens==3.0.1
+ certifi==2025.11.12
+ charset-normalizer==3.4.4
+ click==8.3.1
+ comm==0.2.3
+ contourpy==1.3.3
+ cycler==0.12.1
+ decorator==5.2.1
+ et-xmlfile==2.0.0
+ executing==2.2.1
+ fonttools==4.61.0
+ h11==0.16.0
+ htmltools==0.6.0
+ idna==3.11
+ ipython==9.8.0
+ ipython-pygments-lexers==1.1.1
+ ipywidgets==8.1.8
+ jedi==0.19.2
+ jupyter-core==5.9.1
+ jupyterlab-widgets==3.0.16
+ kiwisolver==1.4.9
+ linkify-it-py==2.0.3
+ markdown-it-py==4.0.0
+ matplotlib==3.10.7
+ matplotlib-inline==0.2.1
+ mdit-py-plugins==0.5.0
+ mdurl==0.1.2
+ mizani==0.14.3
+ narwhals==2.13.0
+ numpy==2.3.5
+ openpyxl==3.1.5
+ orjson==3.11.5
+ packaging==25.0
+ pandas==2.3.3
+ parso==0.8.5
+ pathlib==1.0.1
+ patsy==1.0.2
+ pexpect==4.9.0
+ pillow==12.0.0
+ platformdirs==4.5.1
+ plotly==6.5.0
+ plotnine==0.15.1
+ prompt-toolkit==3.0.52
+ psygnal==0.15.0
+ ptyprocess==0.7.0
+ pure-eval==0.2.3
+ pygments==2.19.2
+ pyparsing==3.2.5
+ pyscbwrapper==0.1.2
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.2
+ questionary==2.1.1
+ requests==2.32.5
+ ruff==0.14.9
+ scipy==1.16.3
+ setuptools==80.9.0
+ shiny==1.5.1
+ shinychat==0.2.8
+ shinywidgets==0.7.0
+ six==1.17.0
+ stack-data==0.6.3
+ starlette==0.50.0
+ statsmodels==0.14.6
+ traitlets==5.14.3
+ typing-extensions==4.15.0
+ tzdata==2025.2
+ uc-micro-py==1.0.3
+ urllib3==2.6.1
+ uvicorn==0.38.0
+ watchfiles==1.1.1
+ wcwidth==0.2.14
+ websockets==15.0.1
+ widgetsnbextension==4.0.15
requirements.txt CHANGED
@@ -1,154 +1,75 @@
- annotated-types==0.7.0
- anyio==4.11.0
+ anyio==4.12.0
  anywidget==0.9.21
- argon2-cffi==25.1.0
- argon2-cffi-bindings==25.1.0
- arrow==1.4.0
- asgiref==3.10.0
- asttokens==3.0.0
- async-lru==2.0.5
- attrs==25.4.0
- babel==2.17.0
- beautifulsoup4==4.14.2
- bleach==6.3.0
- brand-yml==0.1.1
+ asgiref==3.11.0
+ asttokens==3.0.1
  certifi==2025.11.12
- cffi==2.0.0
  charset-normalizer==3.4.4
- click==8.3.0
+ click==8.3.1
  comm==0.2.3
  contourpy==1.3.3
  cycler==0.12.1
- debugpy==1.8.17
  decorator==5.2.1
- defusedxml==0.7.1
- et-xmlfile==2.0.0
- eval-type-backport==0.3.0
  executing==2.2.1
- fastjsonschema==2.21.2
- fonttools==4.60.1
- fqdn==1.5.1
- git-filter-repo==2.47.0
+ fonttools==4.61.0
  h11==0.16.0
  htmltools==0.6.0
- httpcore==1.0.9
- httpx==0.28.1
  idna==3.11
- ipykernel==7.1.0
- ipython==9.7.0
+ ipython==9.8.0
  ipython-pygments-lexers==1.1.1
  ipywidgets==8.1.8
- isoduration==20.11.0
- itables==2.5.2
  jedi==0.19.2
- jinja2==3.1.6
- json5==0.12.1
- jsonpointer==3.0.0
- jsonschema==4.25.1
- jsonschema-specifications==2025.9.1
- jupyter==1.1.1
- jupyter-client==8.6.3
- jupyter-console==6.6.3
  jupyter-core==5.9.1
- jupyter-events==0.12.0
- jupyter-lsp==2.3.0
- jupyter-server==2.17.0
- jupyter-server-terminals==0.5.3
- jupyterlab==4.4.10
- jupyterlab-pygments==0.3.0
- jupyterlab-server==2.28.0
  jupyterlab-widgets==3.0.16
  kiwisolver==1.4.9
- lark==1.3.1
- libsass==0.23.0
  linkify-it-py==2.0.3
  markdown-it-py==4.0.0
- markupsafe==3.0.3
  matplotlib==3.10.7
  matplotlib-inline==0.2.1
  mdit-py-plugins==0.5.0
  mdurl==0.1.2
- mistune==3.1.4
  mizani==0.14.3
- narwhals==2.11.0
- nbclient==0.10.2
- nbconvert==7.16.6
- nbformat==5.10.4
- nest-asyncio==1.6.0
- notebook==7.4.7
- notebook-shim==0.2.4
- numpy==2.3.4
- openpyxl==3.1.5
- orjson==3.11.4
+ narwhals==2.13.0
+ numpy==2.3.5
+ orjson==3.11.5
  packaging==25.0
- palmerpenguins==0.1.4
  pandas==2.3.3
- pandocfilters==1.5.1
  parso==0.8.5
+ pathlib==1.0.1
  patsy==1.0.2
- penguins==0.5.2
  pexpect==4.9.0
  pillow==12.0.0
- platformdirs==4.5.0
- plotly==6.4.0
- plotly-express==0.4.1
+ platformdirs==4.5.1
+ plotly==6.5.0
  plotnine==0.15.1
- prometheus-client==0.23.1
  prompt-toolkit==3.0.52
- psutil==7.1.3
  psygnal==0.15.0
  ptyprocess==0.7.0
  pure-eval==0.2.3
- pycparser==2.23
- pydantic==2.12.4
- pydantic-core==2.41.5
  pygments==2.19.2
  pyparsing==3.2.5
  pyscbwrapper==0.1.2
  python-dateutil==2.9.0.post0
- python-json-logger==4.0.0
  python-multipart==0.0.20
  pytz==2025.2
- pyyaml==6.0.3
- pyzmq==27.1.0
  questionary==2.1.1
- referencing==0.37.0
  requests==2.32.5
- rfc3339-validator==0.1.4
- rfc3986-validator==0.1.1
- rfc3987-syntax==1.1.0
- rpds-py==0.28.0
- ruamel-yaml==0.18.16
- ruamel-yaml-clib==0.2.15
  scipy==1.16.3
- seaborn==0.13.2
- send2trash==1.8.3
  setuptools==80.9.0
- shiny==1.5.0
+ shiny==1.5.1
  shinychat==0.2.8
  shinyswatch==0.9.0
  shinywidgets==0.7.0
  six==1.17.0
- sniffio==1.3.1
- soupsieve==2.8
  stack-data==0.6.3
  starlette==0.50.0
- statsmodels==0.14.5
- terminado==0.18.1
- tinycss2==1.4.0
- tornado==6.5.2
+ statsmodels==0.14.6
  traitlets==5.14.3
  typing-extensions==4.15.0
- typing-inspection==0.4.2
  tzdata==2025.2
  uc-micro-py==1.0.3
- uri-template==1.3.0
- urllib3==2.5.0
+ urllib3==2.6.1
  uvicorn==0.38.0
  watchfiles==1.1.1
  wcwidth==0.2.14
- webcolors==25.10.0
- webencodings==0.5.1
- websocket-client==1.9.0
  websockets==15.0.1
  widgetsnbextension==4.0.15
scripts/01_scbPull.py DELETED
@@ -1,129 +0,0 @@
- from __future__ import annotations
-
- import argparse
- from pathlib import Path
- from typing import Literal
-
- import pandas as pd
- from pyscbwrapper import SCB
-
- Taxonomy = Literal["ssyk2012", "ssyk96"]
-
-
- try:
-     ROOT = Path(__file__).resolve().parents[1]
- except NameError:  # pragma: no cover - interactive fallback
-     ROOT = Path.cwd().resolve()
-
- DATA_DIR = ROOT / "data"
- SCB_DIR = DATA_DIR / "02_scb_data"
-
- TABLES = {
-     "ssyk2012": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"),
-     "ssyk96": ("en", "AM", "AM0208", "AM0208E", "YREG33"),
- }
-
-
- def coerce_year(value: str | int | None) -> int | None:
-     try:
-         return int(value) if value is not None else None
-     except (TypeError, ValueError):
-         return None
-
-
- def latest_year(var_block: dict) -> str:
-     years = [coerce_year(year) for year in var_block.get("year", [])]
-     valid = [year for year in years if year is not None]
-     if not valid:
-         raise ValueError("SCB variable metadata did not provide any valid years")
-     return str(max(valid))
-
-
- def fetch_taxonomy_dataframe(taxonomy: Taxonomy) -> tuple[pd.DataFrame, str]:
-     if taxonomy not in TABLES:
-         raise KeyError(f"Unknown taxonomy '{taxonomy}'")
-
-     scb = SCB(*TABLES[taxonomy])
-     var_block = scb.get_variables()
-     occupations_key, occupations = next(iter(var_block.items()))
-     clean_key = occupations_key.replace(" ", "")
-
-     year = latest_year(var_block)
-     scb.set_query(**{clean_key: occupations, "year": [year]})
-     scb_fetch = scb.get_data()["data"]
-
-     codes = scb.get_query()["query"][0]["selection"]["values"]
-     occ_dict = dict(zip(codes, occupations))
-
-     records = []
-     for record in scb_fetch:
-         code, obs_year = record["key"][:2]
-         if code == "0002":
-             continue  # drop unspecified bucket
-         value = int(record["values"][0])
-         records.append(
-             {
-                 "code_4": str(code).zfill(4),
-                 "code_3": str(code).zfill(4)[:3],
-                 "code_2": str(code).zfill(4)[:2],
-                 "code_1": str(code).zfill(4)[:1],
-                 "year": obs_year,
-                 "value": value,
-             }
-         )
-
-     df = pd.DataFrame(records)
-     if df.empty:
-         raise RuntimeError(f"SCB returned no data for taxonomy '{taxonomy}'")
-
-     level_map = {4: "code_4", 3: "code_3", 2: "code_2", 1: "code_1"}
-     frames = []
-     for level, column in level_map.items():
-         level_df = (
-             df.groupby(["year", column], as_index=False)["value"]
-             .sum()
-             .rename(columns={column: "code"})
-         )
-         level_df["level"] = level
-         frames.append(level_df)
-
-     stacked = (
-         pd.concat(frames, ignore_index=True)
-         .assign(taxonomy=taxonomy)[["taxonomy", "year", "level", "code", "value"]]
-         .sort_values(["year", "level", "code"], ignore_index=True)
-     )
-
-     return stacked, year
-
-
- def write_taxonomy_csv(df: pd.DataFrame, taxonomy: Taxonomy, year: str) -> Path:
-     SCB_DIR.mkdir(parents=True, exist_ok=True)
-     out_path = SCB_DIR / f"{taxonomy}_en_{year}.csv"
-     df.to_csv(out_path, index=False)
-     return out_path
-
-
- def pull_taxonomy(taxonomy: Taxonomy) -> Path:
-     df, year = fetch_taxonomy_dataframe(taxonomy)
-     return write_taxonomy_csv(df, taxonomy, year)
-
-
- def parse_args() -> argparse.Namespace:
-     parser = argparse.ArgumentParser(description="Pull SCB weights for a taxonomy")
-     parser.add_argument(
-         "--taxonomy",
-         default="ssyk2012",
-         choices=["ssyk2012", "ssyk96"],
-         help="Taxonomy to download (default: ssyk2012)",
-     )
-     return parser.parse_args()
-
-
- def main() -> None:
-     args = parse_args()
-     path = pull_taxonomy(args.taxonomy)
-     print(f"Wrote {path}")
-
-
- if __name__ == "__main__":
-     main()
scripts/02_weighting.py DELETED
@@ -1,258 +0,0 @@
- from __future__ import annotations
-
- import argparse
- from pathlib import Path
- from typing import Literal
-
- import pandas as pd
-
- Taxonomy = Literal["ssyk2012", "ssyk96"]
-
- try:
-     ROOT = Path(__file__).resolve().parents[1]
- except NameError:  # pragma: no cover - interactive fallback
-     ROOT = Path.cwd()
-
- DATA_DIR = ROOT / "data"
-
-
- def data_path(*parts: str | Path) -> Path:
-     return DATA_DIR.joinpath(*parts)
-
-
- def latest_file(directory: Path, pattern: str) -> Path:
-     files = sorted(directory.glob(pattern))
-     if not files:
-         raise FileNotFoundError(f"No files matching '{pattern}' in {directory}")
-     return files[-1]
-
-
- def load_daioe_raw(taxonomy: Taxonomy, sep: str = "\t") -> pd.DataFrame:
-     return pd.read_csv(data_path("01_daioe_raw", f"daioe_{taxonomy}.csv"), sep=sep)
-
-
- def load_scb_employment(taxonomy: Taxonomy) -> pd.DataFrame:
-     scb_path = latest_file(data_path("02_scb_data"), f"{taxonomy}*.csv")
-     return pd.read_csv(scb_path).drop(columns=["year"], errors="ignore")
-
-
- def ensure_columns(df: pd.DataFrame, required: list[str]) -> None:
-     missing = [col for col in required if col not in df.columns]
-     if missing:
-         raise KeyError(f"Missing expected columns: {missing}")
-
-
- def split_code_label(series: pd.Series) -> tuple[pd.Series, pd.Series]:
-     parts = series.astype(str).str.split(" ", n=1, expand=True)
-     parts = parts.fillna({0: "", 1: ""})
-     return parts[0], parts[1]
-
-
- def prepare_raw_dataframe(raw: pd.DataFrame, taxonomy: Taxonomy) -> tuple[pd.DataFrame, list[str]]:
-     df = raw.drop(columns=["Unnamed: 0"], errors="ignore").copy()
-     ensure_columns(df, ["year"])
-
-     daioe_cols = [col for col in df.columns if col.startswith("daioe_")]
-     if not daioe_cols:
-         raise KeyError("Expected at least one 'daioe_*' column in DAIOE raw file.")
-
-     code_cols = {
-         4: f"{taxonomy}_4",
-         3: f"{taxonomy}_3",
-         2: f"{taxonomy}_2",
-         1: f"{taxonomy}_1",
-     }
-     ensure_columns(df, list(code_cols.values()))
-
-     for level, col in code_cols.items():
-         codes, labels = split_code_label(df[col])
-         df[f"code{level}"] = codes
-         df[f"label{level}"] = labels
-
-     df["code4"] = df["code4"].str.zfill(4)
-     for level in (1, 2, 3):
-         df[f"code{level}"] = df[f"code{level}"].str.lstrip("0")
-
-     return df, daioe_cols
-
-
- def attach_employment(df: pd.DataFrame, scb: pd.DataFrame) -> pd.DataFrame:
-     scb_lvl4 = scb[scb["level"] == 4].copy()
-     if scb_lvl4.empty:
-         raise ValueError("SCB data must contain level-4 rows for weighting.")
-
-     scb_lvl4["code4"] = scb_lvl4["code"].astype(str).str.zfill(4)
-     merged = df.merge(
-         scb_lvl4[["code4", "value"]],
-         on="code4",
-         how="left",
-         validate="many_to_one",
-     )
-     return merged.rename(columns={"value": "emp"})
-
-
- def compute_children_maps(df: pd.DataFrame) -> dict[int, pd.DataFrame]:
-     counts = {
-         1: df.groupby(["year", "code1"])["code2"].nunique().reset_index(name="n_children"),
-         2: df.groupby(["year", "code2"])["code3"].nunique().reset_index(name="n_children"),
-         3: df.groupby(["year", "code3"])["code4"].nunique().reset_index(name="n_children"),
-     }
-     lvl4 = df.groupby(["year", "code4"]).size().reset_index(name="n_children")
-     lvl4["n_children"] = 1
-     counts[4] = lvl4
-     return counts
-
-
- def aggregate_level(
-     df: pd.DataFrame,
-     *,
-     daioe_cols: list[str],
-     n_children: dict[int, pd.DataFrame],
-     taxonomy: Taxonomy,
-     level: int,
-     method: Literal["weighted", "simple"],
- ) -> pd.DataFrame:
-     if level not in (1, 2, 3):
-         raise ValueError("Only levels 1–3 can be aggregated from level 4.")
-
-     code_col, label_col = f"code{level}", f"label{level}"
-     group_cols = ["year", code_col, label_col]
-
-     if method == "weighted":
-         tmp = df[group_cols + ["emp"] + daioe_cols].copy()
-         for metric in daioe_cols:
-             mask = tmp[metric].notna()
-             tmp[f"{metric}_wx"] = tmp[metric].where(mask, 0) * tmp["emp"].where(mask, 0)
-             tmp[f"{metric}_w"] = tmp["emp"].where(mask, 0)
-         agg_cols = {f"{metric}_wx": "sum" for metric in daioe_cols}
-         agg_cols.update({f"{metric}_w": "sum" for metric in daioe_cols})
-         grouped = tmp.groupby(group_cols, as_index=False).agg(agg_cols)
-         for metric in daioe_cols:
-             denom = grouped[f"{metric}_w"].replace(0, pd.NA)
-             grouped[metric] = grouped[f"{metric}_wx"] / denom
-             grouped.drop(columns=[f"{metric}_wx", f"{metric}_w"], inplace=True)
-     else:
-         grouped = df[group_cols + daioe_cols].groupby(group_cols, as_index=False).mean()
-
-     grouped = grouped.merge(
-         n_children[level],
-         left_on=["year", code_col],
-         right_on=["year", code_col],
-         how="left",
-     )
-
-     out = grouped[["year", code_col, label_col, "n_children"] + daioe_cols].copy()
-     out["taxonomy"] = taxonomy
-     out["level"] = level
-     out = out.rename(columns={code_col: "code", label_col: "label"})
-     out["code"] = out["code"].astype(str)
-     return out
-
-
- def base_level_four(df: pd.DataFrame, daioe_cols: list[str], taxonomy: Taxonomy, n_children: pd.DataFrame) -> pd.DataFrame:
-     base = df[["year", "code4", "label4"] + daioe_cols].copy()
-     base = base.merge(n_children, on=["year", "code4"], how="left")
-     base["taxonomy"] = taxonomy
-     base["level"] = 4
-     base = base.rename(columns={"code4": "code", "label4": "label"})
-     base["code"] = base["code"].astype(str)
-     return base
-
-
- def add_percentiles(df: pd.DataFrame, metrics: list[str]) -> list[str]:
-     pct_cols: list[str] = []
-     for metric in metrics:
-         suffix = metric.removeprefix("daioe_")
-         rank_col = f"pct_rank_{suffix}"
-         df[rank_col] = df.groupby(["year", "level"])[metric].rank(pct=True)
-         pct_cols.append(rank_col)
-     return pct_cols
-
-
- def build_pipeline(
-     df: pd.DataFrame,
-     *,
-     daioe_cols: list[str],
-     taxonomy: Taxonomy,
-     n_children: dict[int, pd.DataFrame],
-     method: Literal["weighted", "simple"],
- ) -> pd.DataFrame:
-     lvl4 = base_level_four(df, daioe_cols, taxonomy, n_children[4])
-     lvl1 = aggregate_level(df, daioe_cols=daioe_cols, n_children=n_children, taxonomy=taxonomy, level=1, method=method)
-     lvl2 = aggregate_level(df, daioe_cols=daioe_cols, n_children=n_children, taxonomy=taxonomy, level=2, method=method)
-     lvl3 = aggregate_level(df, daioe_cols=daioe_cols, n_children=n_children, taxonomy=taxonomy, level=3, method=method)
-
-     combined = pd.concat([lvl1, lvl2, lvl3, lvl4], ignore_index=True)
-     pct_cols = add_percentiles(combined, daioe_cols)
-     ordered = [
-         "taxonomy",
-         "level",
-         "code",
-         "label",
-         "year",
-         "n_children",
-         *daioe_cols,
-         *pct_cols,
-     ]
-     return combined[ordered].sort_values(["level", "code", "year"], ignore_index=True)
-
-
- def write_outputs(taxonomy: Taxonomy, weighted: pd.DataFrame, simple: pd.DataFrame) -> tuple[Path, Path]:
-     out_dir = data_path("03_daioe_aggregated")
-     out_dir.mkdir(parents=True, exist_ok=True)
-     weighted_path = out_dir / f"daioe_{taxonomy}_emp_weighted.csv"
-     simple_path = out_dir / f"daioe_{taxonomy}_simple_avg.csv"
-     weighted.to_csv(weighted_path, index=False)
-     simple.to_csv(simple_path, index=False)
-     return weighted_path, simple_path
-
-
- def run_weighting(taxonomy: Taxonomy, sep: str = "\t") -> tuple[Path, Path]:
-     raw = load_daioe_raw(taxonomy, sep=sep)
-     scb = load_scb_employment(taxonomy)
-     prepared, daioe_cols = prepare_raw_dataframe(raw, taxonomy)
-     prepared = attach_employment(prepared, scb)
-     n_children = compute_children_maps(prepared)
-
-     weighted = build_pipeline(
-         prepared,
-         daioe_cols=daioe_cols,
-         taxonomy=taxonomy,
-         n_children=n_children,
-         method="weighted",
-     )
-     simple = build_pipeline(
-         prepared,
-         daioe_cols=daioe_cols,
-         taxonomy=taxonomy,
-         n_children=n_children,
-         method="simple",
-     )
-     return write_outputs(taxonomy, weighted, simple)
-
-
- def parse_args() -> argparse.Namespace:
-     parser = argparse.ArgumentParser(description="Run DAIOE weighting pipeline")
-     parser.add_argument(
-         "--taxonomy",
-         default="ssyk2012",
-         choices=["ssyk2012", "ssyk96"],
-         help="Taxonomy to process (default: ssyk2012)",
-     )
-     parser.add_argument(
-         "--sep",
-         default="\t",
-         help="Delimiter used in DAIOE raw files (default: tab)",
-     )
-     return parser.parse_args()
-
-
- def main() -> None:
-     args = parse_args()
-     weighted_path, simple_path = run_weighting(args.taxonomy, sep=args.sep)
-     print("Written employment-weighted file:", weighted_path)
-     print("Written simple-average file: ", simple_path)
-
-
- if __name__ == "__main__":
-     main()
scripts/04_occ.py DELETED
@@ -1,109 +0,0 @@
- import pandas as pd
- from pyscbwrapper import SCB
- from pathlib import Path
-
-
- # Optional: project root if you need it elsewhere
- ROOT = Path(__file__).resolve().parent
-
-
- TAX_ID = "ssyk2012"
-
- TABLES = {
-     "ssyk2012_tab": ("en", "AM", "AM0208", "AM0208B", "YREG61BAS"),
-     # "ssyk96_tab": ("en", "AM", "AM0208", "AM0208E", "YREG33"),
- }
-
-
- def fetch_scb_aku_occupations(tax_id: str = TAX_ID) -> pd.DataFrame:
-     """
-     Fetch SCB AKU employment by occupation (SSYK 2012), age and year,
-     and return a cleaned DataFrame at the SSYK3 level (string codes).
-
-     Columns:
-     - code_3 (SSYK code as returned by SCB; can be 2–4 digits)
-     - occupation (text label from SCB)
-     - age
-     - year
-     - value (string as provided by SCB)
-     """
-
-     # ---- 1) Init SCB table ----
-     scb = SCB(*TABLES[f"{tax_id}_tab"])
-     var_ = scb.get_variables()
-
-     # First variable is the occupation variable (as in your original code)
-     occupations_key, occupations = next(iter(var_.items()))
-     clean_key = occupations_key.replace(" ", "")
-
-     # ---- 2) Years: coerce to int, use all valid years ----
-     def coerce_year(y):
-         try:
-             return int(y)
-         except Exception:
-             return None
-
-     years = [coerce_year(y) for y in var_["year"]]
-     years = [y for y in years if y is not None]
-     if not years:
-         raise ValueError("No valid years found in SCB variables")
-
-     years_sorted = sorted(set(years))
-     year_values = [str(y) for y in years_sorted]
-
-     # ---- 3) All ages as provided by SCB ----
-     age_values = var_["age"]
-
-     # ---- 4) Build and send query ----
-     scb.set_query(
-         **{
-             clean_key: occupations,
-             "year": year_values,  # all years
-             "age": age_values,  # all ages
-         }
-     )
-
-     scb_data = scb.get_data()
-     scb_fetch = scb_data["data"]
-
-     # Map occupation codes to their labels
-     codes = scb.get_query()["query"][0]["selection"]["values"]
-     occ_dict = dict(zip(codes, occupations))
-
-     # ---- 5) Build DataFrame ----
-     records = []
-     for r in scb_fetch:
-         # The order follows the SCB query; your original code assumed:
-         # occupation code, age, year
-         code, age, year = r["key"]
-         name = occ_dict.get(code, code)
-         value = r["values"][0]  # raw string
-         records.append(
-             {
-                 "code_3": code,
-                 "occupation": name,
-                 "age": age,
-                 "year": year,
-                 "value": value,
-             }
-         )
-
-     df = pd.DataFrame(records)
-
-     # Remove unidentified group 002 (as in your original code)
-     df = df[df["code_3"] != "002"].reset_index(drop=True)
-
-     return df
-
-
- def main() -> pd.DataFrame:
-     """Entry point when run as a script; returns the DataFrame."""
-     df = fetch_scb_aku_occupations()
-     # Optional: quick check
-     print(df.head())
-     print(f"\nRows: {len(df)}, columns: {list(df.columns)}")
-     return df
-
-
- if __name__ == "__main__":
-     main()
scripts/__init__.py DELETED
File without changes
src/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """src package initializer.
+
+ This package contains the core SCB employment data pipeline modules.
+ Modules include data loading, caching and aggregation helpers. See
+ individual module docstrings for details.
+ """
src/config.py ADDED
@@ -0,0 +1,53 @@
+ """
+ Configuration constants for the SCB-only employment data pipeline.
+ """
+
+ from typing import Dict, List, Literal, Tuple
+
+ # ======================================================
+ # DATA SOURCES / CONSTANTS
+ # ======================================================
+ TAXONOMY: Literal["ssyk2012"] = "ssyk2012"
+
+ TRANSLATION_URL: str = (
+     "https://raw.githubusercontent.com/joseph-data/07_translate_ssyk/main/"
+     "02_translation_files/ssyk2012_en.xlsx"
+ )
+
+ # SCB table definitions
+ TABLES: Dict[str, Tuple[str, str, str, str, str]] = {
+     "14_to_18": ("en", "AM", "AM0208", "AM0208E", "YREG51"),
+     "19_to_21": ("en", "AM", "AM0208", "AM0208E", "YREG51N"),
+     "20_to_23": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"),
+ }
+
+ AGE_EXCLUSIONS: List[str] = ["65-69 years"]
+ EXCLUDED_CODES: List[str] = ["0002", "0000"]
+
+ # ======================================================
+ # UI DEFAULTS
+ # ======================================================
+ LEVEL_OPTIONS: List[Tuple[str, str]] = [
+     ("Level 4 (4-digit)", "4"),
+     ("Level 3 (3-digit)", "3"),
+     ("Level 2 (2-digit)", "2"),
+     ("Level 1 (1-digit)", "1"),
+ ]
+
+ DEFAULT_LEVEL: str = "3"
+
+ GLOBAL_YEAR_MIN: int = 2014
+ GLOBAL_YEAR_MAX: int = 2023
+ DEFAULT_YEAR_RANGE: Tuple[int, int] = (GLOBAL_YEAR_MIN, GLOBAL_YEAR_MAX)
+
+ AGE_ORDER: List[str] = [
+     "16-24",
+     "25-29",
+     "30-34",
+     "35-39",
+     "40-44",
+     "45-49",
+     "50-54",
+     "55-59",
+     "60-64",
+ ]
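
For orientation, the UI constants above are consumed by the new app.py earlier in this diff; a tiny illustrative restatement of that wiring (not new functionality):

# How app.py consumes the UI constants above (see the app.py hunk in this commit).
from src.config import LEVEL_OPTIONS, DEFAULT_YEAR_RANGE

# LEVEL_OPTIONS stores (label, value) pairs; the select input needs value -> label
LEVEL_CHOICES = {value: label for label, value in LEVEL_OPTIONS}
assert LEVEL_CHOICES["3"] == "Level 3 (3-digit)"

# inclusive list of years spanning the default range
YEAR_RANGE_DEFAULT = list(range(DEFAULT_YEAR_RANGE[0], DEFAULT_YEAR_RANGE[1] + 1))
assert YEAR_RANGE_DEFAULT[0] == 2014 and YEAR_RANGE_DEFAULT[-1] == 2023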
src/data_manager.py ADDED
@@ -0,0 +1,140 @@
+ """Data manager for loading and caching SCB employment pipeline results.
+
+ This module encapsulates the logic for computing the SCB-only
+ transformations in ``pipeline.py`` and persisting the result to disk.
+ It adds a small amount of resilience around caching and uses
+ ``logging`` instead of printing directly to stdout. The cache file
+ includes a version tag to make it easy to invalidate caches when
+ fundamental changes are made to the pipeline logic.
+ """
+
+ import os
+ import tempfile
+ import logging
+ from pathlib import Path
+ from functools import lru_cache
+
+ import pandas as pd
+
+ from . import pipeline
+
+ logger = logging.getLogger(__name__)
+
+ # ---------------------------------------------------------------------------
+ # Cache setup
+ # ---------------------------------------------------------------------------
+ # A version tag to embed into the cache filenames. Bump this value
+ # whenever the underlying ``pipeline`` logic changes in a way that
+ # invalidates existing caches.
+ CACHE_VERSION: str = "v1"
+
+
+ def _resolve_cache_dir() -> Path:
+     """Select a writable directory for caching.
+
+     The lookup order is:
+
+     1. The ``DATA_CACHE_DIR`` environment variable, if set.
+     2. A ``data`` folder at the repository root.
+     3. A temporary directory in ``/tmp``.
+
+     Each candidate path is tested for writability by attempting to
+     create and delete a sentinel file. The first path that succeeds
+     is returned. If none succeed, a final fallback directory in ``/tmp``
+     is created and returned.
+     """
+     candidates: list[Path] = []
+     env = os.getenv("DATA_CACHE_DIR")
+     if env:
+         # Expand relative or user paths to absolute
+         candidates.append(Path(env).expanduser().resolve())
+
+     # Repo root /data (two levels up from this file)
+     candidates.append(Path(__file__).resolve().parent.parent / "data")
+     # Temp fallback
+     candidates.append(Path(tempfile.gettempdir()) / "employment_ai_cache")
+
+     for path in candidates:
+         try:
+             path.mkdir(parents=True, exist_ok=True)
+             test_file = path / ".write_test"
+             test_file.write_text("ok", encoding="utf-8")
+             test_file.unlink()
+             return path
+         except Exception:
+             continue
+
+     # Final fallback: ensure the last candidate exists
+     fallback = Path(tempfile.gettempdir()) / "employment_ai_cache"
+     fallback.mkdir(parents=True, exist_ok=True)
+     return fallback
+
+
+ # Resolve the directory once at import time
+ DATA_DIR: Path = _resolve_cache_dir()
+
+ # Single cache file for the SCB-only output DataFrame.
+ SCB_CACHE: Path = DATA_DIR / f"scb_employment_{CACHE_VERSION}.csv"
+
+
+ def _atomic_to_csv(df: pd.DataFrame, path: Path) -> None:
+     """Write a DataFrame to CSV atomically.
+
+     The CSV is first written to a temporary file in the same directory
+     and then renamed to the final location. This avoids leaving a
+     partially written file if the process is interrupted mid-write.
+     """
+     path.parent.mkdir(parents=True, exist_ok=True)
+     tmp_path = path.with_suffix(path.suffix + ".tmp")
+     df.to_csv(tmp_path, index=False)
+     tmp_path.replace(path)
+
+
+ @lru_cache(maxsize=1)
+ def _compute_pipeline_payload() -> pd.DataFrame:
+     """Runs the SCB-only pipeline calculation."""
+     return pipeline.run_pipeline()
+
+
+ def load_payload(force_recompute: bool = False) -> pd.DataFrame:
+     """
+     Load employment data from disk cache if available, otherwise compute and save.
+
+     Parameters
+     ----------
+     force_recompute : bool, optional
+         If ``True``, recompute the pipeline even if cache files exist.
+
+     Returns
+     -------
+     pd.DataFrame
+         The SCB employment data with hierarchy levels, age groups and totals.
+     """
+     # If a cached payload exists and recomputation is not forced, return it
+     if not force_recompute and SCB_CACHE.exists():
+         logger.info("Loading pipeline output from cache directory %s", DATA_DIR)
+         try:
+             return pd.read_csv(SCB_CACHE)
+         except Exception as exc:
+             # If reading the cache fails, fall back to recomputing
+             logger.warning(
+                 "Error reading cache file %s: %s; falling back to recompute",
+                 SCB_CACHE,
+                 exc,
+             )
+
+     if force_recompute:
+         # Clear the LRU cache before recomputing
+         _compute_pipeline_payload.cache_clear()
+
+     logger.info("Computing SCB employment data – this may take a while…")
+     payload = _compute_pipeline_payload()
+
+     # Persist to disk atomically
+     try:
+         _atomic_to_csv(payload, SCB_CACHE)
+         logger.info("Cache updated: %s", SCB_CACHE.name)
+     except Exception as exc:
+         logger.warning("Could not write cache file: %s", exc)
+
+     return payload
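
Typical usage of the loader above; a sketch under the assumption that `src.pipeline.run_pipeline` is importable in your environment. Note the cache directory is resolved once at import time, so any `DATA_CACHE_DIR` override must be set before the module is imported:

# Illustrative usage of src/data_manager.py (not part of the commit).
import os

# Optional override; hypothetical path. Set before importing the module,
# because _resolve_cache_dir() runs at import time.
os.environ["DATA_CACHE_DIR"] = "/tmp/scb_cache"

from src.data_manager import load_payload

df = load_payload()                          # disk cache hit if the CSV already exists
fresh = load_payload(force_recompute=True)   # clears the lru_cache and rewrites the CSV
print(df.shape, fresh.shape)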
src/label_enrichment.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
+ Utilities to add English occupation labels to pipeline output using the
+ published SSYK2012 translation workbook.
+
+ The translation file is read directly from:
+ https://github.com/joseph-data/07_translate_ssyk/blob/main/02_translation_files/ssyk2012_en.xlsx
+ """
+
+ from __future__ import annotations
+
+ from typing import Dict
+
+ import pandas as pd
+
+ from .config import TRANSLATION_URL
+
+
+ def _load_level(sheet_name: str, level: int, url: str) -> pd.DataFrame:
+     """Load a single level sheet and return columns ``code<level>``/``label<level>``."""
+     # Header row with code/name resides at index 3 (0-based)
+     df = pd.read_excel(url, sheet_name=sheet_name, header=3)
+     df = df.rename(columns=lambda c: str(c).strip())
+
+     code_col = next(c for c in df.columns if "SSYK" in str(c))
+     name_col = next(c for c in df.columns if "Name" in str(c))
+
+     df = df[[code_col, name_col]].dropna(subset=[code_col])
+     df[code_col] = df[code_col].astype(str).str.strip().str.zfill(level)
+     df[name_col] = df[name_col].astype(str).str.strip()
+
+     return df.rename(columns={code_col: f"code{level}", name_col: f"label{level}"})
+
+
+ def load_translation_tables(url: str = TRANSLATION_URL) -> Dict[int, pd.DataFrame]:
+     """Return translation tables for SSYK levels 1–4 keyed by level."""
+     tables: Dict[int, pd.DataFrame] = {}
+     for level, sheet in ((1, "1-digit"), (2, "2-digit"), (3, "3-digit"), (4, "4-digit")):
+         tables[level] = _load_level(sheet, level, url)
+     return tables
+
+
+ def apply_translations(df: pd.DataFrame, *, tables: Dict[int, pd.DataFrame] | None = None) -> pd.DataFrame:
+     """
+     Apply English labels to an aggregated SCB DataFrame with columns ``level``, ``code`` and ``label``.
+
+     The ``label`` column is replaced (when available) with the translation matching
+     the SSYK level/code combination. Rows without a translation keep their original label.
+     """
+     if tables is None:
+         tables = load_translation_tables()
+
+     label_maps = {
+         level: tbl.set_index(f"code{level}")[f"label{level}"] for level, tbl in tables.items()
+     }
+
+     out = df.copy()
+     for level, mapping in label_maps.items():
+         mask = out["level"] == level
+         if mask.any():
+             out.loc[mask, "label"] = out.loc[mask, "code"].map(mapping).fillna(
+                 out.loc[mask, "label"]
+             )
+     return out
+
+
+ if __name__ == "__main__":
+     # Example usage: enrich pipeline output with translated labels and preview
+     from .data_manager import load_payload
+
+     pipeline_df = load_payload()
+     labeled = apply_translations(pipeline_df)
+     print(labeled.head())
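
To see the fallback behaviour of ``apply_translations`` in isolation, here is a toy sketch with a hand-built, hypothetical translation table (real tables come from ``load_translation_tables``): codes with a match receive the English label, all others keep their original one.

    import pandas as pd
    from src.label_enrichment import apply_translations

    # Hypothetical level-1 table; "Professionals" is an illustrative label.
    tables = {1: pd.DataFrame({"code1": ["2"], "label1": ["Professionals"]})}
    df = pd.DataFrame({"level": [1, 1], "code": ["2", "9"], "label": ["2", "9"]})

    print(apply_translations(df, tables=tables))
    # code "2" -> "Professionals"; code "9" has no match and keeps label "9"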
src/pipeline.py ADDED
@@ -0,0 +1,180 @@
+ """Core pipeline logic for SCB employment-only data.
+
+ This module fetches employment data from Statistics Sweden (SCB),
+ derives SSYK2012 hierarchy columns from 4-digit codes, and aggregates
+ employment totals across hierarchy levels. DAIOE exposure inputs have
+ been removed so the output contains only SCB employment counts.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Dict, Optional
+
+ import pandas as pd
+
+ from .config import TAXONOMY
+ from .label_enrichment import apply_translations
+ from .scb_fetch import fetch_all_employment_data
+
+ logger = logging.getLogger(__name__)
+
+
+ def filter_years(
+     df: pd.DataFrame,
+     year_min: Optional[int],
+     year_max: Optional[int],
+     *,
+     year_col: str,
+ ) -> pd.DataFrame:
+     """Return a DataFrame filtered to the inclusive year range."""
+     if year_min is None and year_max is None:
+         return df.copy()
+     mask = pd.Series(True, index=df.index, dtype=bool)
+     if year_min is not None:
+         mask &= df[year_col] >= year_min
+     if year_max is not None:
+         mask &= df[year_col] <= year_max
+     mask = mask.fillna(False)
+     return df.loc[mask].copy()
+
+
+ def prepare_employment(
+     raw: pd.DataFrame,
+     *,
+     year_min: Optional[int] = None,
+     year_max: Optional[int] = None,
+ ) -> pd.DataFrame:
+     """Clean SCB employment data and derive SSYK hierarchy columns."""
+     if raw.empty:
+         raise ValueError("SCB fetch returned an empty DataFrame.")
+
+     emp = raw.copy()
+     emp["code4"] = emp["code_4"].astype(str).str.zfill(4)
+     emp["code3"] = emp["code4"].str[:3]
+     emp["code2"] = emp["code4"].str[:2]
+     emp["code1"] = emp["code4"].str[:1]
+
+     emp["label4"] = emp["occupation"].fillna("").str.strip()
+     emp["label3"] = emp["code3"]
+     emp["label2"] = emp["code2"]
+     emp["label1"] = emp["code1"]
+
+     emp["age"] = emp["age"].astype(str).str.strip()
+     emp["year"] = pd.to_numeric(emp["year"], errors="coerce").astype("Int64")
+     emp["employment"] = pd.to_numeric(emp["value"], errors="coerce").fillna(0)
+
+     emp = emp.dropna(subset=["year"])
+     emp = filter_years(emp, year_min, year_max, year_col="year")
+
+     ordered_cols = [
+         "year",
+         "age",
+         "code4",
+         "label4",
+         "code3",
+         "label3",
+         "code2",
+         "label2",
+         "code1",
+         "label1",
+         "employment",
+     ]
+     return emp[ordered_cols]
+
+
+ def compute_children_maps(df: pd.DataFrame) -> Dict[int, pd.DataFrame]:
+     """Count the number of descendants for each code at each hierarchy level."""
+     base = df[["year", "code4", "code3", "code2", "code1"]].drop_duplicates()
+     counts: Dict[int, pd.DataFrame] = {}
+     counts[3] = (
+         base.groupby(["year", "code3"])["code4"]
+         .nunique()
+         .reset_index(name="n_children")
+     )
+     counts[2] = (
+         base.groupby(["year", "code2"])["code3"]
+         .nunique()
+         .reset_index(name="n_children")
+     )
+     counts[1] = (
+         base.groupby(["year", "code1"])["code2"]
+         .nunique()
+         .reset_index(name="n_children")
+     )
+     lvl4 = base.groupby(["year", "code4"]).size().reset_index(name="n_children")
+     lvl4["n_children"] = 1
+     counts[4] = lvl4
+     return counts
+
+
+ def build_employment_views(emp: pd.DataFrame) -> Dict[int, Dict[str, pd.DataFrame]]:
+     """Build employment views (age and totals) for each hierarchy level."""
+     views: Dict[int, Dict[str, pd.DataFrame]] = {}
+     for level in (4, 3, 2, 1):
+         code_col, label_col = f"code{level}", f"label{level}"
+         age_view = emp.groupby(
+             ["year", "age", code_col, label_col], as_index=False
+         )["employment"].sum()
+         total_view = (
+             age_view.groupby(["year", code_col, label_col], as_index=False)["employment"]
+             .sum()
+             .rename(columns={"employment": "employment_total"})
+         )
+         views[level] = {"age": age_view, "total": total_view}
+     return views
+
+
+ def build_level_frame(
+     level: int, views: Dict[int, Dict[str, pd.DataFrame]], children: Dict[int, pd.DataFrame]
+ ) -> pd.DataFrame:
+     """Combine age-level employment, totals and child counts for a level."""
+     code_col, label_col = f"code{level}", f"label{level}"
+     age_view = views[level]["age"].copy()
+     totals = views[level]["total"]
+
+     merged = (
+         age_view.merge(totals, on=["year", code_col, label_col], how="left")
+         .merge(children[level], on=["year", code_col], how="left")
+     )
+     merged["level"] = level
+     merged["taxonomy"] = TAXONOMY
+     merged = merged.rename(columns={code_col: "code", label_col: "label"})
+
+     ordered = [
+         "taxonomy",
+         "level",
+         "code",
+         "label",
+         "year",
+         "n_children",
+         "age",
+         "employment",
+         "employment_total",
+     ]
+     return merged[ordered]
+
+
+ def run_pipeline(
+     *,
+     year_min: Optional[int] = None,
+     year_max: Optional[int] = None,
+ ) -> pd.DataFrame:
+     """Run the SCB-only pipeline and return aggregated employment data."""
+     logger.info("Starting SCB-only employment pipeline")
+     raw = fetch_all_employment_data()
+     employment = prepare_employment(raw, year_min=year_min, year_max=year_max)
+
+     if employment.empty:
+         raise ValueError("No SCB employment rows remain after filtering.")
+
+     children = compute_children_maps(employment)
+     emp_views = build_employment_views(employment)
+
+     levels = [
+         build_level_frame(level, emp_views, children) for level in (1, 2, 3, 4)
+     ]
+     combined = pd.concat(levels, ignore_index=True)
+     combined = combined.sort_values(["level", "code", "year", "age"], ignore_index=True)
+     combined = apply_translations(combined)
+     return combined
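
The child counts drive the hierarchy drill-down, so a quick toy run of ``compute_children_maps`` may help; the frame below is fabricated and carries only the five columns the function selects.

    import pandas as pd
    from src.pipeline import compute_children_maps

    emp = pd.DataFrame(
        {
            "year": [2023, 2023, 2023],
            "code4": ["2511", "2512", "2611"],
            "code3": ["251", "251", "261"],
            "code2": ["25", "25", "26"],
            "code1": ["2", "2", "2"],
        }
    )
    children = compute_children_maps(emp)
    print(children[3])  # group 251 has 2 four-digit children, 261 has 1
    print(children[1])  # major group 2 has 2 two-digit children (25 and 26)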
src/plot_helper.py ADDED
@@ -0,0 +1,107 @@
+ import pandas as pd
+ import plotly.graph_objects as go
+ import plotly.express as px
+ from plotly.subplots import make_subplots
+
+
+ def multi_plot(df: pd.DataFrame) -> go.Figure:
+     """Plot employment over time by occupation, one subplot per age group."""
+     age_groups = sorted(df["age"].dropna().unique())
+
+     occupations = sorted(df["label"].dropna().unique())
+     # Use a Plotly qualitative palette, cycling safely when there are
+     # more occupations than the palette has colours
+     palette = px.colors.qualitative.Plotly
+     occ_color_map = {
+         occ: palette[i % len(palette)] for i, occ in enumerate(occupations)
+     }
+
+     # ------------------------------------------------------------------
+     # 2. Create multi-row subplot scaffolding
+     # ------------------------------------------------------------------
+     subplot_titles = [
+         f"<b>Employed Persons Aged {age} Years by Occupation</b>" for age in age_groups
+     ]
+
+     fig = make_subplots(
+         rows=len(age_groups),
+         cols=1,
+         shared_xaxes=False,
+         vertical_spacing=0.03,
+         subplot_titles=subplot_titles,
+     )
+
+     # ------------------------------------------------------------------
+     # 3. Add traces per age group and occupation
+     # ------------------------------------------------------------------
+
+     # Pre-compute the bottom row number for the final x-axis update
+     max_row = len(age_groups)
+
+     for i, age in enumerate(age_groups, start=1):
+         df_age = df[df["age"] == age]
+
+         # Aggregate employment by year and occupation label
+         df_plot = df_age.groupby(["year", "label"], as_index=False)["employment"].sum()
+
+         for occ_title, sub in df_plot.groupby("label"):
+             fig.add_trace(
+                 go.Scatter(
+                     x=sub["year"],
+                     y=sub["employment"],
+                     mode="lines+markers",
+                     # Show the legend only in the first subplot
+                     showlegend=(i == 1),
+                     name=occ_title,
+                     line=dict(color=occ_color_map[occ_title], width=2),
+                     # Include the age group in the hover text for clarity
+                     hovertemplate=f"Age: {age}<br>Year: %{{x}}<br>Employment: %{{y:,}}<extra>{occ_title}</extra>",
+                 ),
+                 row=i,
+                 col=1,
+             )
+
+         # Y-axis update must be inside the loop to target the current row (i)
+         fig.update_yaxes(
+             title_text="Number of Employed Persons",
+             tickformat=",",
+             rangemode="tozero",
+             row=i,
+             col=1,
+         )
+
+     # X-axis update must target the bottom row (max_row)
+     fig.update_xaxes(
+         title_text="Year",
+         tickmode="linear",
+         dtick=1,
+         row=max_row,
+         col=1,
+     )
+
+     # ------------------------------------------------------------------
+     # 4. Global layout tweaks
+     # ------------------------------------------------------------------
+     fig.update_annotations(yshift=30)
+     fig.update_layout(
+         height=400 * len(age_groups),  # one subplot row per age group
+         width=1000,
+         legend_traceorder="normal",
+         legend=dict(
+             title="Occupation Title(s)",
+             orientation="v",
+             yanchor="top",
+             y=1.0,
+             xanchor="left",
+             x=1.02,
+             bordercolor="#c7c7c7",
+             borderwidth=1,
+             bgcolor="#f9f9f9",
+             font=dict(size=10),
+         ),
+         margin=dict(t=100, l=50, r=80, b=40),
+         plot_bgcolor="#f5f7fb",
+         xaxis_showgrid=True,
+     )
+
+     return fig
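
A minimal sketch of calling ``multi_plot`` on fabricated data; the column names match the four the function reads (``age``, ``label``, ``year``, ``employment``), and the occupation labels are invented.

    import pandas as pd
    from src.plot_helper import multi_plot

    df = pd.DataFrame(
        {
            "age": ["25-29"] * 4,
            "label": ["Software developers"] * 2 + ["Nurses"] * 2,
            "year": [2022, 2023, 2022, 2023],
            "employment": [100, 110, 90, 95],
        }
    )
    fig = multi_plot(df)
    fig.write_html("employment_by_age.html")  # or fig.show() in a notebook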
src/scb_fetch.py ADDED
@@ -0,0 +1,143 @@
+ """Helpers for fetching employment data from the SCB API.
+
+ This module wraps the ``pyscbwrapper`` library to download
+ occupation/employment tables from Statistics Sweden. Error handling
+ and logging are centralised here so that callers of ``fetch_all_employment_data``
+ can remain agnostic of the details.
+ """
+
+ import logging
+ from typing import Tuple
+
+ import pandas as pd
+ from pyscbwrapper import SCB
+
+ from .config import AGE_EXCLUSIONS, EXCLUDED_CODES, TABLES
+
+ logger = logging.getLogger(__name__)
+
+
+ def fetch_scb_table(
+     table_id: str, config: Tuple[str, str, str, str, str]
+ ) -> pd.DataFrame:
+     """Fetch and transform a single SCB table.
+
+     Parameters
+     ----------
+     table_id : str
+         A key identifying which table definition in ``TABLES`` to use.
+     config : Tuple[str, str, str, str, str]
+         The tuple of (language, subject, table, variable_code, filter) used
+         by ``pyscbwrapper.SCB`` to form the query.
+
+     Returns
+     -------
+     pd.DataFrame
+         A DataFrame containing one row per (4-digit occupation code, age,
+         year) combination. Returns an empty frame on error.
+     """
+     logger.info("Starting SCB fetch for table %s", table_id)
+     try:
+         scb = SCB(*config)
+         var_ = scb.get_variables()
+
+         def get_key_raw(term: str) -> str:
+             return next(k for k in var_ if term in k.lower())
+
+         # Identify variable keys from the SCB metadata
+         occ_key_raw = get_key_raw("occupation")
+         year_key_raw = get_key_raw("year")
+         age_key_raw = get_key_raw("age")
+
+         # Filter out excluded ages
+         all_ages = var_[age_key_raw]
+         filtered_ages = [age for age in all_ages if age not in AGE_EXCLUSIONS]
+
+         # Build the query: remove spaces from the occupation key because SCB
+         # uses inconsistent spacing conventions
+         query_args = {
+             occ_key_raw.replace(" ", ""): var_[occ_key_raw],
+             year_key_raw: var_[year_key_raw],
+             age_key_raw: filtered_ages,
+         }
+         scb.set_query(**query_args)
+
+         raw_data = scb.get_data()
+         scb_fetch = raw_data.get("data", [])
+
+         # Build a mapping from code to human-readable occupation name using
+         # the query metadata, falling back to the code itself if no mapping
+         # exists.
+         query_meta = scb.get_query().get("query", [])
+         occ_meta_vals = next(
+             q["selection"]["values"]
+             for q in query_meta
+             if "occupation" in q["code"].lower() or q["code"] == "Yrke2012"
+         )
+         occ_dict = dict(zip(occ_meta_vals, var_[occ_key_raw]))
+
+         records = []
+         for r in scb_fetch:
+             code, age, year = r.get("key", [])[:3]
+             records.append(
+                 {
+                     "code_4": code,
+                     "occupation": occ_dict.get(code, code),
+                     "age": age,
+                     "year": year,
+                     "value": r.get("values", [None])[0],
+                     "source_table": table_id,
+                 }
+             )
+         return pd.DataFrame.from_records(records)
+
+     except Exception as exc:
+         logger.error("Error processing SCB table %s: %s", table_id, exc)
+         return pd.DataFrame()
+
+
+ def fetch_all_employment_data() -> pd.DataFrame:
+     """Fetch and consolidate employment data across all configured SCB tables.
+
+     The configured tables in ``TABLES`` may overlap in years. When
+     overlaps occur, later tables in the dictionary take precedence over
+     earlier ones. Rows whose occupation codes are listed in
+     ``EXCLUDED_CODES`` are removed.
+
+     Returns
+     -------
+     pd.DataFrame
+         A DataFrame with one row per (code_4, age, year) combination and
+         a numeric ``value`` column containing the employment counts.
+         Returns an empty frame if no data could be retrieved.
+     """
+     logger.info("Beginning employment data collection from SCB")
+     dfs: list[pd.DataFrame] = []
+     for tab_id, config in TABLES.items():
+         df_part = fetch_scb_table(tab_id, config)
+         if not df_part.empty:
+             dfs.append(df_part)
+         else:
+             logger.warning("No data retrieved for table %s", tab_id)
+
+     # If nothing was fetched, return an empty DataFrame
+     if not dfs:
+         logger.warning("All SCB table fetches returned empty DataFrames")
+         return pd.DataFrame()
+
+     df = pd.concat(dfs, ignore_index=True)
+
+     # Resolve overlaps by table priority: later tables in TABLES win.
+     table_priority = {key: i for i, key in enumerate(TABLES.keys())}
+     df["table_priority"] = df["source_table"].map(table_priority)
+     df = (
+         df.sort_values(["code_4", "age", "year", "table_priority"])
+         .drop_duplicates(subset=["code_4", "age", "year"], keep="last")
+         .drop(columns=["table_priority"])
+     )
+
+     # Exclude specified codes and coerce the value column to numeric
+     df = df[~df["code_4"].isin(EXCLUDED_CODES)].reset_index(drop=True)
+     df["value"] = pd.to_numeric(df["value"], errors="coerce")
+
+     return df
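
The table-precedence rule in ``fetch_all_employment_data`` is easiest to verify on a toy frame. The sketch below mirrors the sort-and-deduplicate step with hypothetical table keys "t1" and "t2"; because rows are sorted by priority and ``keep="last"`` wins, the later table's value survives.

    import pandas as pd

    df = pd.DataFrame(
        {
            "code_4": ["2512", "2512"],
            "age": ["25-29", "25-29"],
            "year": ["2022", "2022"],
            "value": ["4100", "4250"],
            "source_table": ["t1", "t2"],
        }
    )
    priority = {"t1": 0, "t2": 1}
    df["table_priority"] = df["source_table"].map(priority)
    deduped = (
        df.sort_values(["code_4", "age", "year", "table_priority"])
        .drop_duplicates(subset=["code_4", "age", "year"], keep="last")
        .drop(columns=["table_priority"])
    )
    print(deduped)  # keeps the "t2" row with value 4250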