Robzy commited on
Commit
82057dc
1 Parent(s): f6be049

updating README

Browse files
Files changed (5) hide show
  1. .github/update-df.yml +28 -28
  2. README.md +45 -4
  3. app_streamlit.py +3 -2
  4. debug.ipynb +3 -310
  5. functions/figure.py +4 -1
.github/update-df.yml CHANGED
@@ -1,34 +1,34 @@
1
- # name: update-df
2
 
3
- # on:
4
- # workflow_dispatch:
5
- # schedule:
6
- # - cron: '11 6 * * *'
7
 
8
- # jobs:
9
- # schedule_pipelines:
10
- # runs-on: ubuntu-latest
11
 
12
- # permissions:
13
- # pages: write
14
- # contents: write
15
 
16
- # steps:
17
- # - name: checkout repo content
18
- # uses: actions/checkout@v4
19
 
20
- # - name: setup python
21
- # uses: actions/setup-python@v5
22
- # with:
23
- # python-version: '3.10'
24
- # cache: 'pip'
25
- # - name: install python packages
26
- # run: |
27
- # python -m pip install --upgrade pip
28
- # pip install -r requirements.txt
29
 
30
- # - name: execute python workflows from bash script
31
- # env:
32
- # HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }}
33
- # run: |
34
- # python infer.py
 
1
+ name: update-df
2
 
3
+ on:
4
+ workflow_dispatch:
5
+ schedule:
6
+ - cron: '11 6 * * *'
7
 
8
+ jobs:
9
+ schedule_pipelines:
10
+ runs-on: ubuntu-latest
11
 
12
+ permissions:
13
+ pages: write
14
+ contents: write
15
 
16
+ steps:
17
+ - name: checkout repo content
18
+ uses: actions/checkout@v4
19
 
20
+ - name: setup python
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: '3.10'
24
+ cache: 'pip'
25
+ - name: install python packages
26
+ run: |
27
+ python -m pip install --upgrade pip
28
+ pip install -r requirements.txt
29
 
30
+ - name: execute python workflows from bash script
31
+ env:
32
+ HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }}
33
+ run: |
34
+ python infer.py
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Hbg Weather
3
  emoji: 😻
4
  colorFrom: blue
5
  colorTo: purple
@@ -7,9 +7,50 @@ sdk: streamlit
7
  sdk_version: 1.25.0
8
  app_file: app_streamlit.py
9
  pinned: false
10
- short_description: Weather prediction for Helsingborg, Sweden!
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
 
15
- gradio version: 5.6.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Air Quality Forecast
3
  emoji: 😻
4
  colorFrom: blue
5
  colorTo: purple
 
7
  sdk_version: 1.25.0
8
  app_file: app_streamlit.py
9
  pinned: false
10
+ short_description: Air quality forecasting for Lahore, Pakistan!
11
  ---
12
 
13
+ # Air Quality Monitoring for Lahore, Pakistan
14
 
15
+ ### Dashboard link: https://huggingface.co/spaces/Robzy/hbg-weather
16
+
17
+ # Architecture & pipeline
18
+
19
+ ### 1. Data sourcing
20
+
21
+ * Historical air quality measurements are collected from World Air Quality Index in .csv form.
22
+ * Historical weather data is collected with the Open-Meteo API client and loaded as a Pandas DataFrame.
23
+
24
+ Weather data features:
25
+
26
+ * Temperature (average over the day)
27
+ * Precipitation (the total over the day)
28
+ * Wind speed (maximum over the day, i.e. the `wind_speed_10m_max` feature)
29
+ * Wind direction (the most dominant direction over the day)
30
+
31
+
32
+ ### 2. Backfill
33
+
34
+ * Two feature groups are created on Hopsworks: `air_quality` and `weather`.
35
+ * Data is pre-processed and uploaded to the respective feature groups.
36
+
37
+ ### 3. Feature pipeline
38
+
39
+ * Daily weather and air quality data is fetched from the Open-Meteo and World Air Quality Index APIs respectively, then uploaded into the feature groups.
40
+
41
+ ### 4. Training & model
42
+
43
+ * A feature view `air_quality_fv` is created on Hopsworks, which is an input/output API schema for a model.
44
+ * We train an XGBoost regression model `air_quality_xgboost_model` and save it to our model registry.
45
+
46
+ ### 5. Inference pipeline
47
+
48
+ * A new feature group `aq_predictions` is created on Hopsworks.
49
+ * Upon an inference request, the forecasted features and the model are retrieved from the feature view and model registry respectively.
50
+ * Air quality predictions are made by inputting the forecasted features into the model, and then uploaded into the `aq_predictions` feature group.
51
+
52
+ # Dashboard & scheduling
53
+
54
+ HuggingFace's Streamlit Spaces is used to display the hindcast, forecast, and real air quality using an interactive line graph. GitHub Actions is used to run the feature and inference pipelines daily by leveraging scheduled workflows.
55
+
56
+ Note that backfilling, feature group creation and model training are only performed once.
app_streamlit.py CHANGED
@@ -19,8 +19,9 @@ st.set_page_config(
19
  }
20
  )
21
 
22
- st.title('Lahore Air Quality!')
23
- st.subheader('Particle matter, diameter < 2.5 micrometers (PM2.5)')
 
24
 
25
  #pickle_file_path = 'air_quality_df.pkl'
26
  pickle_file_path = 'outcome_df.pkl'
 
19
  }
20
  )
21
 
22
+ st.title('Lahore Air Quality')
23
+ st.subheader('Forecast and hindcast')
24
+ st.subheader('Unit: PM25 - particle matter of diameter < 2.5 micrometers')
25
 
26
  #pickle_file_path = 'air_quality_df.pkl'
27
  pickle_file_path = 'outcome_df.pkl'
debug.ipynb CHANGED
@@ -51,7 +51,7 @@
51
  },
52
  {
53
  "cell_type": "code",
54
- "execution_count": 2,
55
  "metadata": {},
56
  "outputs": [
57
  {
@@ -90,136 +90,12 @@
90
  ")\n",
91
  "\n",
92
  "today_timestamp = pd.to_datetime(today)\n",
93
- "batch_data = weather_fg.filter(weather_fg.date >= today_timestamp ).read()"
94
- ]
95
- },
96
- {
97
- "cell_type": "code",
98
- "execution_count": 17,
99
- "metadata": {},
100
- "outputs": [
101
- {
102
- "data": {
103
- "text/html": [
104
- "<div>\n",
105
- "<style scoped>\n",
106
- " .dataframe tbody tr th:only-of-type {\n",
107
- " vertical-align: middle;\n",
108
- " }\n",
109
- "\n",
110
- " .dataframe tbody tr th {\n",
111
- " vertical-align: top;\n",
112
- " }\n",
113
- "\n",
114
- " .dataframe thead th {\n",
115
- " text-align: right;\n",
116
- " }\n",
117
- "</style>\n",
118
- "<table border=\"1\" class=\"dataframe\">\n",
119
- " <thead>\n",
120
- " <tr style=\"text-align: right;\">\n",
121
- " <th></th>\n",
122
- " <th>date</th>\n",
123
- " <th>temperature_2m_mean</th>\n",
124
- " <th>precipitation_sum</th>\n",
125
- " <th>wind_speed_10m_max</th>\n",
126
- " <th>wind_direction_10m_dominant</th>\n",
127
- " <th>city</th>\n",
128
- " <th>predicted_pm25</th>\n",
129
- " <th>street</th>\n",
130
- " <th>country</th>\n",
131
- " <th>days_before_forecast_day</th>\n",
132
- " </tr>\n",
133
- " </thead>\n",
134
- " <tbody>\n",
135
- " <tr>\n",
136
- " <th>0</th>\n",
137
- " <td>2024-11-21 00:00:00+00:00</td>\n",
138
- " <td>3.40</td>\n",
139
- " <td>0.2</td>\n",
140
- " <td>19.995398</td>\n",
141
- " <td>246.665939</td>\n",
142
- " <td>Helsingborg</td>\n",
143
- " <td>39.168438</td>\n",
144
- " <td>Drottninggatan</td>\n",
145
- " <td>Sweden</td>\n",
146
- " <td>1</td>\n",
147
- " </tr>\n",
148
- " <tr>\n",
149
- " <th>3</th>\n",
150
- " <td>2024-11-22 00:00:00+00:00</td>\n",
151
- " <td>4.05</td>\n",
152
- " <td>0.7</td>\n",
153
- " <td>23.540806</td>\n",
154
- " <td>246.571289</td>\n",
155
- " <td>Helsingborg</td>\n",
156
- " <td>20.740093</td>\n",
157
- " <td>Drottninggatan</td>\n",
158
- " <td>Sweden</td>\n",
159
- " <td>4</td>\n",
160
- " </tr>\n",
161
- " <tr>\n",
162
- " <th>2</th>\n",
163
- " <td>2024-11-23 00:00:00+00:00</td>\n",
164
- " <td>5.45</td>\n",
165
- " <td>0.0</td>\n",
166
- " <td>30.631746</td>\n",
167
- " <td>240.422256</td>\n",
168
- " <td>Helsingborg</td>\n",
169
- " <td>46.448105</td>\n",
170
- " <td>Drottninggatan</td>\n",
171
- " <td>Sweden</td>\n",
172
- " <td>3</td>\n",
173
- " </tr>\n",
174
- " <tr>\n",
175
- " <th>1</th>\n",
176
- " <td>2024-11-24 00:00:00+00:00</td>\n",
177
- " <td>5.60</td>\n",
178
- " <td>0.0</td>\n",
179
- " <td>13.755580</td>\n",
180
- " <td>276.008911</td>\n",
181
- " <td>Helsingborg</td>\n",
182
- " <td>61.713448</td>\n",
183
- " <td>Drottninggatan</td>\n",
184
- " <td>Sweden</td>\n",
185
- " <td>2</td>\n",
186
- " </tr>\n",
187
- " </tbody>\n",
188
- "</table>\n",
189
- "</div>"
190
- ],
191
- "text/plain": [
192
- " date temperature_2m_mean precipitation_sum \\\n",
193
- "0 2024-11-21 00:00:00+00:00 3.40 0.2 \n",
194
- "3 2024-11-22 00:00:00+00:00 4.05 0.7 \n",
195
- "2 2024-11-23 00:00:00+00:00 5.45 0.0 \n",
196
- "1 2024-11-24 00:00:00+00:00 5.60 0.0 \n",
197
- "\n",
198
- " wind_speed_10m_max wind_direction_10m_dominant city \\\n",
199
- "0 19.995398 246.665939 Helsingborg \n",
200
- "3 23.540806 246.571289 Helsingborg \n",
201
- "2 30.631746 240.422256 Helsingborg \n",
202
- "1 13.755580 276.008911 Helsingborg \n",
203
- "\n",
204
- " predicted_pm25 street country days_before_forecast_day \n",
205
- "0 39.168438 Drottninggatan Sweden 1 \n",
206
- "3 20.740093 Drottninggatan Sweden 4 \n",
207
- "2 46.448105 Drottninggatan Sweden 3 \n",
208
- "1 61.713448 Drottninggatan Sweden 2 "
209
- ]
210
- },
211
- "execution_count": 17,
212
- "metadata": {},
213
- "output_type": "execute_result"
214
- }
215
- ],
216
- "source": [
217
- "batch_data.sort_values(by=['date'])"
218
  ]
219
  },
220
  {
221
  "cell_type": "code",
222
- "execution_count": null,
223
  "metadata": {},
224
  "outputs": [
225
  {
@@ -313,189 +189,6 @@
313
  "plt = util.plot_air_quality_forecast(city, street, hindcast_df, file_path=\"./img/pm25_hindcast_1day.png\", hindcast=True)"
314
  ]
315
  },
316
- {
317
- "cell_type": "code",
318
- "execution_count": 13,
319
- "metadata": {},
320
- "outputs": [
321
- {
322
- "data": {
323
- "text/html": [
324
- "<div>\n",
325
- "<style scoped>\n",
326
- " .dataframe tbody tr th:only-of-type {\n",
327
- " vertical-align: middle;\n",
328
- " }\n",
329
- "\n",
330
- " .dataframe tbody tr th {\n",
331
- " vertical-align: top;\n",
332
- " }\n",
333
- "\n",
334
- " .dataframe thead th {\n",
335
- " text-align: right;\n",
336
- " }\n",
337
- "</style>\n",
338
- "<table border=\"1\" class=\"dataframe\">\n",
339
- " <thead>\n",
340
- " <tr style=\"text-align: right;\">\n",
341
- " <th></th>\n",
342
- " <th>date</th>\n",
343
- " <th>pm25</th>\n",
344
- " </tr>\n",
345
- " </thead>\n",
346
- " <tbody>\n",
347
- " <tr>\n",
348
- " <th>478</th>\n",
349
- " <td>2020-06-01</td>\n",
350
- " <td>12.0</td>\n",
351
- " </tr>\n",
352
- " <tr>\n",
353
- " <th>746</th>\n",
354
- " <td>2020-06-02</td>\n",
355
- " <td>14.0</td>\n",
356
- " </tr>\n",
357
- " <tr>\n",
358
- " <th>1211</th>\n",
359
- " <td>2020-06-03</td>\n",
360
- " <td>20.0</td>\n",
361
- " </tr>\n",
362
- " <tr>\n",
363
- " <th>1536</th>\n",
364
- " <td>2020-06-04</td>\n",
365
- " <td>21.0</td>\n",
366
- " </tr>\n",
367
- " <tr>\n",
368
- " <th>1470</th>\n",
369
- " <td>2020-06-05</td>\n",
370
- " <td>18.0</td>\n",
371
- " </tr>\n",
372
- " <tr>\n",
373
- " <th>...</th>\n",
374
- " <td>...</td>\n",
375
- " <td>...</td>\n",
376
- " </tr>\n",
377
- " <tr>\n",
378
- " <th>764</th>\n",
379
- " <td>2024-11-12</td>\n",
380
- " <td>26.0</td>\n",
381
- " </tr>\n",
382
- " <tr>\n",
383
- " <th>1265</th>\n",
384
- " <td>2024-11-13</td>\n",
385
- " <td>35.0</td>\n",
386
- " </tr>\n",
387
- " <tr>\n",
388
- " <th>1315</th>\n",
389
- " <td>2024-11-14</td>\n",
390
- " <td>22.0</td>\n",
391
- " </tr>\n",
392
- " <tr>\n",
393
- " <th>301</th>\n",
394
- " <td>2024-11-15</td>\n",
395
- " <td>7.0</td>\n",
396
- " </tr>\n",
397
- " <tr>\n",
398
- " <th>1588</th>\n",
399
- " <td>2024-11-16</td>\n",
400
- " <td>34.0</td>\n",
401
- " </tr>\n",
402
- " </tbody>\n",
403
- "</table>\n",
404
- "<p>1589 rows × 2 columns</p>\n",
405
- "</div>"
406
- ],
407
- "text/plain": [
408
- " date pm25\n",
409
- "478 2020-06-01 12.0\n",
410
- "746 2020-06-02 14.0\n",
411
- "1211 2020-06-03 20.0\n",
412
- "1536 2020-06-04 21.0\n",
413
- "1470 2020-06-05 18.0\n",
414
- "... ... ...\n",
415
- "764 2024-11-12 26.0\n",
416
- "1265 2024-11-13 35.0\n",
417
- "1315 2024-11-14 22.0\n",
418
- "301 2024-11-15 7.0\n",
419
- "1588 2024-11-16 34.0\n",
420
- "\n",
421
- "[1589 rows x 2 columns]"
422
- ]
423
- },
424
- "execution_count": 13,
425
- "metadata": {},
426
- "output_type": "execute_result"
427
- }
428
- ],
429
- "source": [
430
- "outcome_df.sort_values(by=['date'])"
431
- ]
432
- },
433
- {
434
- "cell_type": "code",
435
- "execution_count": 14,
436
- "metadata": {},
437
- "outputs": [
438
- {
439
- "data": {
440
- "text/html": [
441
- "<div>\n",
442
- "<style scoped>\n",
443
- " .dataframe tbody tr th:only-of-type {\n",
444
- " vertical-align: middle;\n",
445
- " }\n",
446
- "\n",
447
- " .dataframe tbody tr th {\n",
448
- " vertical-align: top;\n",
449
- " }\n",
450
- "\n",
451
- " .dataframe thead th {\n",
452
- " text-align: right;\n",
453
- " }\n",
454
- "</style>\n",
455
- "<table border=\"1\" class=\"dataframe\">\n",
456
- " <thead>\n",
457
- " <tr style=\"text-align: right;\">\n",
458
- " <th></th>\n",
459
- " <th>date</th>\n",
460
- " <th>predicted_pm25</th>\n",
461
- " </tr>\n",
462
- " </thead>\n",
463
- " <tbody>\n",
464
- " <tr>\n",
465
- " <th>1</th>\n",
466
- " <td>2024-11-15 00:00:00+00:00</td>\n",
467
- " <td>33.413746</td>\n",
468
- " </tr>\n",
469
- " <tr>\n",
470
- " <th>0</th>\n",
471
- " <td>2024-11-16 00:00:00+00:00</td>\n",
472
- " <td>34.305458</td>\n",
473
- " </tr>\n",
474
- " <tr>\n",
475
- " <th>2</th>\n",
476
- " <td>2024-11-21 00:00:00+00:00</td>\n",
477
- " <td>39.168438</td>\n",
478
- " </tr>\n",
479
- " </tbody>\n",
480
- "</table>\n",
481
- "</div>"
482
- ],
483
- "text/plain": [
484
- " date predicted_pm25\n",
485
- "1 2024-11-15 00:00:00+00:00 33.413746\n",
486
- "0 2024-11-16 00:00:00+00:00 34.305458\n",
487
- "2 2024-11-21 00:00:00+00:00 39.168438"
488
- ]
489
- },
490
- "execution_count": 14,
491
- "metadata": {},
492
- "output_type": "execute_result"
493
- }
494
- ],
495
- "source": [
496
- "preds_df.sort_values(by=['date'])"
497
- ]
498
- },
499
  {
500
  "cell_type": "code",
501
  "execution_count": 23,
 
51
  },
52
  {
53
  "cell_type": "code",
54
+ "execution_count": null,
55
  "metadata": {},
56
  "outputs": [
57
  {
 
90
  ")\n",
91
  "\n",
92
  "today_timestamp = pd.to_datetime(today)\n",
93
+ "batch_data = weather_fg.filter(weather_fg.date >= today_timestamp ).read().sort_values(by=['date'])"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  ]
95
  },
96
  {
97
  "cell_type": "code",
98
+ "execution_count": 18,
99
  "metadata": {},
100
  "outputs": [
101
  {
 
189
  "plt = util.plot_air_quality_forecast(city, street, hindcast_df, file_path=\"./img/pm25_hindcast_1day.png\", hindcast=True)"
190
  ]
191
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  {
193
  "cell_type": "code",
194
  "execution_count": 23,
functions/figure.py CHANGED
@@ -84,10 +84,13 @@ def plot(df, n=10):
84
  line=dict(width=3) # Make the line thicker
85
  )
86
 
 
 
 
87
  fig.update_layout(
88
  shapes=shapes, # Add the background rectangles
89
  xaxis=dict(
90
- range=[x_values.iloc[-n], x_values.iloc[-1]], # Dynamically set the range
91
  title=dict(
92
  text="Date", # Set x-axis label
93
  font=dict(size=label_font_size) # Increase font size for the x-axis label
 
84
  line=dict(width=3) # Make the line thicker
85
  )
86
 
87
+ # x range start BEFORE today
88
+ k = 4
89
+
90
  fig.update_layout(
91
  shapes=shapes, # Add the background rectangles
92
  xaxis=dict(
93
+ range=[x_values.iloc[-n-k], x_values.iloc[-1]], # Dynamically set the range
94
  title=dict(
95
  text="Date", # Set x-axis label
96
  font=dict(size=label_font_size) # Increase font size for the x-axis label