Annikaijak committed on
Commit a1aa76f
1 Parent(s): ff9a039

Upload 37 files

Files changed (38)
  1. .gitattributes +1 -0
  2. air_quality-main/.github/workflows/feature_pipeline_seattle.yml +29 -0
  3. air_quality-main/.gitignore +160 -0
  4. air_quality-main/1_backfill_feature_groups.ipynb +1535 -0
  5. air_quality-main/2_feature_pipeline.py +108 -0
  6. air_quality-main/3_training_dataset_and_modeling.ipynb +0 -0
  7. air_quality-main/LICENSE +201 -0
  8. air_quality-main/README.md +60 -0
  9. air_quality-main/air_quality_model/residplot.png +0 -0
  10. air_quality-main/air_quality_model/xgboost_pipeline.pkl +3 -0
  11. air_quality-main/app.py +197 -0
  12. air_quality-main/data/backfill_pm2_5.csv +0 -0
  13. air_quality-main/data/backfill_pm2_5_eu.csv +0 -0
  14. air_quality-main/data/backfill_pm2_5_seattle.csv +0 -0
  15. air_quality-main/data/backfill_pm2_5_us.csv +0 -0
  16. air_quality-main/data/backfill_weather.csv +3 -0
  17. air_quality-main/data/seattle_pm25_2013.csv +0 -0
  18. air_quality-main/data/seattle_pm25_2014.csv +0 -0
  19. air_quality-main/data/seattle_pm25_2015.csv +0 -0
  20. air_quality-main/data/seattle_pm25_2016.csv +0 -0
  21. air_quality-main/data/seattle_pm25_2017.csv +0 -0
  22. air_quality-main/data/seattle_pm25_2018.csv +0 -0
  23. air_quality-main/data/seattle_pm25_2019.csv +0 -0
  24. air_quality-main/data/seattle_pm25_2020.csv +0 -0
  25. air_quality-main/data/seattle_pm25_2021.csv +0 -0
  26. air_quality-main/data/seattle_pm25_2022.csv +0 -0
  27. air_quality-main/data/seattle_pm25_2023.csv +0 -0
  28. air_quality-main/functions.py +385 -0
  29. air_quality-main/hopsworks-login.sh.example +9 -0
  30. air_quality-main/images/1.png +0 -0
  31. air_quality-main/images/2.png +0 -0
  32. air_quality-main/images/3.png +0 -0
  33. air_quality-main/images/4.png +0 -0
  34. air_quality-main/images/5.png +0 -0
  35. air_quality-main/images/6.png +0 -0
  36. air_quality-main/requirements.txt +8 -0
  37. air_quality-main/scripts/run-feature-pipeline.sh +5 -0
  38. air_quality-main/target_cities.json +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ air_quality-main/data/backfill_weather.csv filter=lfs diff=lfs merge=lfs -text
air_quality-main/.github/workflows/feature_pipeline_seattle.yml ADDED
@@ -0,0 +1,29 @@
1
+ name: air_quality_feature_pipeline
2
+
3
+ on:
4
+   workflow_dispatch:
5
+   schedule:
6
+     - cron: '0 14 * * *'
7
+
8
+ jobs:
9
+   test_schedule:
10
+     runs-on: ubuntu-latest
11
+     steps:
12
+       - name: checkout repo content
13
+         uses: actions/checkout@v3
14
+
15
+       - name: setup python
16
+         uses: actions/setup-python@v3
17
+         with:
18
+           python-version: '3.11.5'
19
+
20
+       - name: install python packages
21
+         run: |
22
+           python -m pip install --upgrade pip
23
+           pip install -r requirements.txt
24
+
25
+       - name: execute Feature Pipeline
26
+         env:
27
+           HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }}
28
+           CONTINENT: "Seattle"
29
+         run: ./scripts/run-feature-pipeline.sh
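
Note: `hopsworks.login()` picks up the `HOPSWORKS_API_KEY` environment variable that this workflow exports, so the shell wrapper only has to invoke the pipeline script. A minimal local equivalent, assuming a valid key in your environment (the placeholder below is not a real key), would be:

import os
import hopsworks

# hopsworks.login() falls back to the HOPSWORKS_API_KEY environment variable
# when no api_key_value argument is passed -- the same mechanism the workflow uses.
os.environ.setdefault("HOPSWORKS_API_KEY", "<your-api-key>")  # placeholder
project = hopsworks.login()
print(project.name)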
air_quality-main/.gitignore ADDED
@@ -0,0 +1,160 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
air_quality-main/1_backfill_feature_groups.ipynb ADDED
@@ -0,0 +1,1535 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "73ee3ec9",
6
+ "metadata": {},
7
+ "source": [
8
+ "# <span style=\"font-width:bold; font-size: 3rem; color:#1EB182;\"><img src=\"../../images/icon102.png\" width=\"38px\"></img> **Hopsworks Feature Store** </span>\n",
9
+ "\n",
10
+ "<span style=\"font-width:bold; font-size: 3rem; color:#333;\">- Part 01: Backfill Features to the Feature Store</span>\n",
11
+ "\n",
12
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/advanced_tutorials/air_quality/1_backfill_feature_groups.ipynb)\n",
13
+ "\n",
14
+ "\n",
15
+ "## 🗒️ This notebook is divided into the following sections:\n",
16
+ "1. Fetch historical data\n",
17
+ "2. Connect to the Hopsworks feature store\n",
18
+ "3. Create feature groups and insert them to the feature store\n",
19
+ "\n",
20
+ "![tutorial-flow](../../images/01_featuregroups.png)"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "markdown",
25
+ "id": "f04d5c5e",
26
+ "metadata": {},
27
+ "source": [
28
+ "### <span style='color:#ff5f27'> 📝 Imports"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "id": "f65f0db4-1e4b-4f28-a17c-eadcb0d0f016",
35
+ "metadata": {
36
+ "tags": []
37
+ },
38
+ "outputs": [],
39
+ "source": [
40
+ "%pip install geopy folium streamlit-folium geopy --q"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 3,
46
+ "id": "cd165941",
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "import datetime\n",
51
+ "import time\n",
52
+ "import requests\n",
53
+ "from urllib.request import urlopen\n",
54
+ "import json\n",
55
+ "import pandas as pd\n",
56
+ "import folium\n",
57
+ "from functions import *\n",
58
+ "import warnings\n",
59
+ "warnings.filterwarnings(\"ignore\")"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "markdown",
64
+ "id": "ba9903fc",
65
+ "metadata": {},
66
+ "source": [
67
+ "---"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "markdown",
72
+ "id": "b7a1965a-0da7-4263-a68a-8b2e8cb753f1",
73
+ "metadata": {},
74
+ "source": [
75
+ "## <span style='color:#ff5f27'> 🌍 Representing the Target cities </span>"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 4,
81
+ "id": "bd578db1-69e7-4230-b3f2-807b8056283a",
82
+ "metadata": {
83
+ "tags": []
84
+ },
85
+ "outputs": [],
86
+ "source": [
87
+ "target_url='https://repo.hops.works/dev/jdowling/target_cities.json'\n",
88
+ "response = urlopen(target_url)\n",
89
+ "target_cities = json.loads(response.read())\n"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "markdown",
94
+ "id": "2246ca9d",
95
+ "metadata": {},
96
+ "source": [
97
+ "## <span style='color:#ff5f27'> 🌫 Processing Air Quality data</span>"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "markdown",
102
+ "id": "b4a1c5d1",
103
+ "metadata": {},
104
+ "source": [
105
+ "### [🇪🇺 EEA](https://discomap.eea.europa.eu/map/fme/AirQualityExport.htm)\n",
106
+ "#### EEA means European Environmental Agency"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 5,
112
+ "id": "96b8be01-6286-4886-8043-56e0e49b314e",
113
+ "metadata": {
114
+ "tags": []
115
+ },
116
+ "outputs": [
117
+ {
118
+ "data": {
119
+ "text/plain": [
120
+ "{'Amsterdam': [52.37, 4.89],\n",
121
+ " 'Athina': [37.98, 23.73],\n",
122
+ " 'Berlin': [52.52, 13.39],\n",
123
+ " 'Gdansk': [54.37, 18.61],\n",
124
+ " 'Kraków': [50.06, 19.94],\n",
125
+ " 'London': [51.51, -0.13],\n",
126
+ " 'Madrid': [40.42, -3.7],\n",
127
+ " 'Marseille': [43.3, 5.37],\n",
128
+ " 'Milano': [45.46, 9.19],\n",
129
+ " 'München': [48.14, 11.58],\n",
130
+ " 'Napoli': [40.84, 14.25],\n",
131
+ " 'Paris': [48.85, 2.35],\n",
132
+ " 'Sevilla': [37.39, -6.0],\n",
133
+ " 'Stockholm': [59.33, 18.07],\n",
134
+ " 'Tallinn': [59.44, 24.75],\n",
135
+ " 'Varna': [43.21, 27.92],\n",
136
+ " 'Wien': [48.21, 16.37]}"
137
+ ]
138
+ },
139
+ "execution_count": 5,
140
+ "metadata": {},
141
+ "output_type": "execute_result"
142
+ }
143
+ ],
144
+ "source": [
145
+ "target_cities[\"EU\"]"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 6,
151
+ "id": "5bb2a868-5f3a-4065-b651-318c24826b97",
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "df_eu = pd.read_csv(\"data/backfill_pm2_5_eu.csv\")"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 7,
161
+ "id": "5620df22-f744-4550-a81a-7e5d71aae542",
162
+ "metadata": {
163
+ "tags": []
164
+ },
165
+ "outputs": [
166
+ {
167
+ "data": {
168
+ "text/plain": [
169
+ "0"
170
+ ]
171
+ },
172
+ "execution_count": 7,
173
+ "metadata": {},
174
+ "output_type": "execute_result"
175
+ }
176
+ ],
177
+ "source": [
178
+ "df_eu.isna().sum().sum()"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": 8,
184
+ "id": "b0e23728-a01d-45bc-bf25-4a9c77f21d66",
185
+ "metadata": {
186
+ "tags": []
187
+ },
188
+ "outputs": [
189
+ {
190
+ "name": "stdout",
191
+ "output_type": "stream",
192
+ "text": [
193
+ "Size of this dataframe: (63548, 3)\n"
194
+ ]
195
+ },
196
+ {
197
+ "data": {
198
+ "text/html": [
199
+ "<div>\n",
200
+ "<style scoped>\n",
201
+ " .dataframe tbody tr th:only-of-type {\n",
202
+ " vertical-align: middle;\n",
203
+ " }\n",
204
+ "\n",
205
+ " .dataframe tbody tr th {\n",
206
+ " vertical-align: top;\n",
207
+ " }\n",
208
+ "\n",
209
+ " .dataframe thead th {\n",
210
+ " text-align: right;\n",
211
+ " }\n",
212
+ "</style>\n",
213
+ "<table border=\"1\" class=\"dataframe\">\n",
214
+ " <thead>\n",
215
+ " <tr style=\"text-align: right;\">\n",
216
+ " <th></th>\n",
217
+ " <th>city_name</th>\n",
218
+ " <th>date</th>\n",
219
+ " <th>pm2_5</th>\n",
220
+ " </tr>\n",
221
+ " </thead>\n",
222
+ " <tbody>\n",
223
+ " <tr>\n",
224
+ " <th>16477</th>\n",
225
+ " <td>Kraków</td>\n",
226
+ " <td>2017-01-05</td>\n",
227
+ " <td>16.0</td>\n",
228
+ " </tr>\n",
229
+ " <tr>\n",
230
+ " <th>12612</th>\n",
231
+ " <td>Gdansk</td>\n",
232
+ " <td>2016-09-15</td>\n",
233
+ " <td>10.0</td>\n",
234
+ " </tr>\n",
235
+ " <tr>\n",
236
+ " <th>58456</th>\n",
237
+ " <td>Varna</td>\n",
238
+ " <td>2018-12-03</td>\n",
239
+ " <td>11.0</td>\n",
240
+ " </tr>\n",
241
+ " </tbody>\n",
242
+ "</table>\n",
243
+ "</div>"
244
+ ],
245
+ "text/plain": [
246
+ " city_name date pm2_5\n",
247
+ "16477 Kraków 2017-01-05 16.0\n",
248
+ "12612 Gdansk 2016-09-15 10.0\n",
249
+ "58456 Varna 2018-12-03 11.0"
250
+ ]
251
+ },
252
+ "execution_count": 8,
253
+ "metadata": {},
254
+ "output_type": "execute_result"
255
+ }
256
+ ],
257
+ "source": [
258
+ "print(\"Size of this dataframe:\", df_eu.shape)\n",
259
+ "\n",
260
+ "df_eu.sample(3)"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "markdown",
265
+ "id": "c2e45567-dd6b-4e5e-a153-82a2f4f32fbc",
266
+ "metadata": {},
267
+ "source": [
268
+ "### [🇺🇸 USEPA](https://aqs.epa.gov/aqsweb/documents/data_api.html#daily)\n",
269
+ "#### USEPA means United States Environmental Protection Agency\n",
270
+ "[Manual downloading](https://www.epa.gov/outdoor-air-quality-data/download-daily-data)\n",
271
+ "\n"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 9,
277
+ "id": "c4952759-0fb9-4229-8b78-2e37cffb144d",
278
+ "metadata": {
279
+ "tags": []
280
+ },
281
+ "outputs": [
282
+ {
283
+ "data": {
284
+ "text/plain": [
285
+ "{'Albuquerque': [35.08, -106.65],\n",
286
+ " 'Atlanta': [33.75, -84.39],\n",
287
+ " 'Chicago': [41.88, -87.62],\n",
288
+ " 'Columbus': [39.96, -83.0],\n",
289
+ " 'Dallas': [32.78, -96.8],\n",
290
+ " 'Denver': [39.74, -104.98],\n",
291
+ " 'Houston': [29.76, -95.37],\n",
292
+ " 'Los Angeles': [34.05, -118.24],\n",
293
+ " 'New York': [40.71, -74.01],\n",
294
+ " 'Phoenix-Mesa': [33.66, -112.04],\n",
295
+ " 'Salt Lake City': [40.76, -111.89],\n",
296
+ " 'San Francisco': [37.78, -122.42],\n",
297
+ " 'Tampa': [27.95, -82.46]}"
298
+ ]
299
+ },
300
+ "execution_count": 9,
301
+ "metadata": {},
302
+ "output_type": "execute_result"
303
+ }
304
+ ],
305
+ "source": [
306
+ "target_cities[\"US\"]"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": 10,
312
+ "id": "c6aceaee-9431-48fd-818a-41fbdd07575c",
313
+ "metadata": {
314
+ "tags": []
315
+ },
316
+ "outputs": [],
317
+ "source": [
318
+ "df_us = pd.read_csv(\"data/backfill_pm2_5_us.csv\")"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": 11,
324
+ "id": "4e7ff20e-8a1a-4fa3-b801-71beead7b5f2",
325
+ "metadata": {
326
+ "tags": []
327
+ },
328
+ "outputs": [
329
+ {
330
+ "data": {
331
+ "text/plain": [
332
+ "0"
333
+ ]
334
+ },
335
+ "execution_count": 11,
336
+ "metadata": {},
337
+ "output_type": "execute_result"
338
+ }
339
+ ],
340
+ "source": [
341
+ "df_us.isna().sum().sum()"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 12,
347
+ "id": "3818e3e1-8674-4634-9023-92be8410fba5",
348
+ "metadata": {
349
+ "tags": []
350
+ },
351
+ "outputs": [
352
+ {
353
+ "name": "stdout",
354
+ "output_type": "stream",
355
+ "text": [
356
+ "Size of this dataframe: (46037, 3)\n"
357
+ ]
358
+ },
359
+ {
360
+ "data": {
361
+ "text/html": [
362
+ "<div>\n",
363
+ "<style scoped>\n",
364
+ " .dataframe tbody tr th:only-of-type {\n",
365
+ " vertical-align: middle;\n",
366
+ " }\n",
367
+ "\n",
368
+ " .dataframe tbody tr th {\n",
369
+ " vertical-align: top;\n",
370
+ " }\n",
371
+ "\n",
372
+ " .dataframe thead th {\n",
373
+ " text-align: right;\n",
374
+ " }\n",
375
+ "</style>\n",
376
+ "<table border=\"1\" class=\"dataframe\">\n",
377
+ " <thead>\n",
378
+ " <tr style=\"text-align: right;\">\n",
379
+ " <th></th>\n",
380
+ " <th>date</th>\n",
381
+ " <th>city_name</th>\n",
382
+ " <th>pm2_5</th>\n",
383
+ " </tr>\n",
384
+ " </thead>\n",
385
+ " <tbody>\n",
386
+ " <tr>\n",
387
+ " <th>39995</th>\n",
388
+ " <td>2016-05-09</td>\n",
389
+ " <td>San Francisco</td>\n",
390
+ " <td>7.3</td>\n",
391
+ " </tr>\n",
392
+ " <tr>\n",
393
+ " <th>18276</th>\n",
394
+ " <td>2016-04-10</td>\n",
395
+ " <td>Denver</td>\n",
396
+ " <td>3.1</td>\n",
397
+ " </tr>\n",
398
+ " <tr>\n",
399
+ " <th>32122</th>\n",
400
+ " <td>2014-10-17</td>\n",
401
+ " <td>Phoenix-Mesa</td>\n",
402
+ " <td>11.7</td>\n",
403
+ " </tr>\n",
404
+ " </tbody>\n",
405
+ "</table>\n",
406
+ "</div>"
407
+ ],
408
+ "text/plain": [
409
+ " date city_name pm2_5\n",
410
+ "39995 2016-05-09 San Francisco 7.3\n",
411
+ "18276 2016-04-10 Denver 3.1\n",
412
+ "32122 2014-10-17 Phoenix-Mesa 11.7"
413
+ ]
414
+ },
415
+ "execution_count": 12,
416
+ "metadata": {},
417
+ "output_type": "execute_result"
418
+ }
419
+ ],
420
+ "source": [
421
+ "print(\"Size of this dataframe:\", df_us.shape)\n",
422
+ "\n",
423
+ "df_us.sample(3)"
424
+ ]
425
+ },
426
+ {
427
+ "cell_type": "markdown",
428
+ "id": "25557752-31c8-4da9-a52c-4415c4d20ae3",
429
+ "metadata": {},
430
+ "source": [
431
+ "### <span style=\"color:#ff5f27;\">🏢 Processing special city - `Seattle`</span>\n",
432
+ "#### We need different stations across the Seattle. \n",
433
+ "I downloaded daily `PM2.5` data manually [here](https://www.epa.gov/outdoor-air-quality-data/download-daily-data)"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": 13,
439
+ "id": "2f54d2cb-991c-47cb-a686-76c9f7a87170",
440
+ "metadata": {
441
+ "tags": []
442
+ },
443
+ "outputs": [
444
+ {
445
+ "data": {
446
+ "text/plain": [
447
+ "{'Bellevue-SE 12th St': [47.60086, -122.1484],\n",
448
+ " 'DARRINGTON - FIR ST (Darrington High School)': [48.2469, -121.6031],\n",
449
+ " 'KENT - JAMES & CENTRAL': [47.38611, -122.23028],\n",
450
+ " 'LAKE FOREST PARK TOWNE CENTER': [47.755, -122.2806],\n",
451
+ " 'MARYSVILLE - 7TH AVE (Marysville Junior High)': [48.05432, -122.17153],\n",
452
+ " 'NORTH BEND - NORTH BEND WAY': [47.49022, -121.77278],\n",
453
+ " 'SEATTLE - BEACON HILL': [47.56824, -122.30863],\n",
454
+ " 'SEATTLE - DUWAMISH': [47.55975, -122.33827],\n",
455
+ " 'SEATTLE - SOUTH PARK #2': [47.53091, -122.3208],\n",
456
+ " 'Seattle-10th & Weller': [47.59722, -122.31972],\n",
457
+ " 'TACOMA - ALEXANDER AVE': [47.2656, -122.3858],\n",
458
+ " 'TACOMA - L STREET': [47.1864, -122.4517],\n",
459
+ " 'Tacoma-S 36th St': [47.22634, -122.46256],\n",
460
+ " 'Tukwila Allentown': [47.49854, -122.27839],\n",
461
+ " 'Tulalip-Totem Beach Rd': [48.06534, -122.28519]}"
462
+ ]
463
+ },
464
+ "execution_count": 13,
465
+ "metadata": {},
466
+ "output_type": "execute_result"
467
+ }
468
+ ],
469
+ "source": [
470
+ "target_cities[\"Seattle\"]"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": 14,
476
+ "id": "31c8505d-68bc-40b6-be0f-42d8532dbd48",
477
+ "metadata": {
478
+ "tags": []
479
+ },
480
+ "outputs": [],
481
+ "source": [
482
+ "df_seattle = pd.read_csv(\"data/backfill_pm2_5_seattle.csv\")"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "code",
487
+ "execution_count": 15,
488
+ "id": "2f6583c9-3b2a-41c6-a020-aeede88c4867",
489
+ "metadata": {
490
+ "tags": []
491
+ },
492
+ "outputs": [
493
+ {
494
+ "data": {
495
+ "text/plain": [
496
+ "0"
497
+ ]
498
+ },
499
+ "execution_count": 15,
500
+ "metadata": {},
501
+ "output_type": "execute_result"
502
+ }
503
+ ],
504
+ "source": [
505
+ "df_seattle.isna().sum().sum()"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 16,
511
+ "id": "065a5b03-28f7-475c-9c6a-4340388157d8",
512
+ "metadata": {
513
+ "tags": []
514
+ },
515
+ "outputs": [
516
+ {
517
+ "name": "stdout",
518
+ "output_type": "stream",
519
+ "text": [
520
+ "Size of this dataframe: (46479, 3)\n"
521
+ ]
522
+ },
523
+ {
524
+ "data": {
525
+ "text/html": [
526
+ "<div>\n",
527
+ "<style scoped>\n",
528
+ " .dataframe tbody tr th:only-of-type {\n",
529
+ " vertical-align: middle;\n",
530
+ " }\n",
531
+ "\n",
532
+ " .dataframe tbody tr th {\n",
533
+ " vertical-align: top;\n",
534
+ " }\n",
535
+ "\n",
536
+ " .dataframe thead th {\n",
537
+ " text-align: right;\n",
538
+ " }\n",
539
+ "</style>\n",
540
+ "<table border=\"1\" class=\"dataframe\">\n",
541
+ " <thead>\n",
542
+ " <tr style=\"text-align: right;\">\n",
543
+ " <th></th>\n",
544
+ " <th>city_name</th>\n",
545
+ " <th>date</th>\n",
546
+ " <th>pm2_5</th>\n",
547
+ " </tr>\n",
548
+ " </thead>\n",
549
+ " <tbody>\n",
550
+ " <tr>\n",
551
+ " <th>3345</th>\n",
552
+ " <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
553
+ " <td>2013-05-03</td>\n",
554
+ " <td>5.3</td>\n",
555
+ " </tr>\n",
556
+ " <tr>\n",
557
+ " <th>22979</th>\n",
558
+ " <td>TACOMA - L STREET</td>\n",
559
+ " <td>2018-08-13</td>\n",
560
+ " <td>19.2</td>\n",
561
+ " </tr>\n",
562
+ " <tr>\n",
563
+ " <th>14456</th>\n",
564
+ " <td>DARRINGTON - FIR ST (Darrington High School)</td>\n",
565
+ " <td>2016-11-09</td>\n",
566
+ " <td>8.4</td>\n",
567
+ " </tr>\n",
568
+ " </tbody>\n",
569
+ "</table>\n",
570
+ "</div>"
571
+ ],
572
+ "text/plain": [
573
+ " city_name date pm2_5\n",
574
+ "3345 MARYSVILLE - 7TH AVE (Marysville Junior High) 2013-05-03 5.3\n",
575
+ "22979 TACOMA - L STREET 2018-08-13 19.2\n",
576
+ "14456 DARRINGTON - FIR ST (Darrington High School) 2016-11-09 8.4"
577
+ ]
578
+ },
579
+ "execution_count": 16,
580
+ "metadata": {},
581
+ "output_type": "execute_result"
582
+ }
583
+ ],
584
+ "source": [
585
+ "print(\"Size of this dataframe:\", df_seattle.shape)\n",
586
+ "\n",
587
+ "df_seattle.sample(3)"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": 17,
593
+ "id": "e3b17ca4-0e9d-4207-ad62-90ea9c157def",
594
+ "metadata": {
595
+ "tags": []
596
+ },
597
+ "outputs": [
598
+ {
599
+ "data": {
600
+ "text/plain": [
601
+ "city_name\n",
602
+ "NORTH BEND - NORTH BEND WAY 3705\n",
603
+ "TACOMA - L STREET 3696\n",
604
+ "SEATTLE - BEACON HILL 3691\n",
605
+ "MARYSVILLE - 7TH AVE (Marysville Junior High) 3648\n",
606
+ "DARRINGTON - FIR ST (Darrington High School) 3614\n",
607
+ "SEATTLE - SOUTH PARK #2 3577\n",
608
+ "TACOMA - ALEXANDER AVE 3569\n",
609
+ "KENT - JAMES & CENTRAL 3556\n",
610
+ "SEATTLE - DUWAMISH 3439\n",
611
+ "Seattle-10th & Weller 3097\n",
612
+ "LAKE FOREST PARK TOWNE CENTER 2999\n",
613
+ "Tacoma-S 36th St 2574\n",
614
+ "Bellevue-SE 12th St 2172\n",
615
+ "Tukwila Allentown 2074\n",
616
+ "Tulalip-Totem Beach Rd 1068\n",
617
+ "Name: count, dtype: int64"
618
+ ]
619
+ },
620
+ "execution_count": 17,
621
+ "metadata": {},
622
+ "output_type": "execute_result"
623
+ }
624
+ ],
625
+ "source": [
626
+ "df_seattle.city_name.value_counts()"
627
+ ]
628
+ },
629
+ {
630
+ "cell_type": "markdown",
631
+ "id": "c278a55d-f083-4f95-b292-92e545b9c408",
632
+ "metadata": {},
633
+ "source": [
634
+ "### <span style=\"color:#ff5f27;\">🌟 All together</span>"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": 18,
640
+ "id": "0d55ae92-4bf9-43ae-8841-6767f5f68bec",
641
+ "metadata": {
642
+ "tags": []
643
+ },
644
+ "outputs": [],
645
+ "source": [
646
+ "df_air_quality = pd.concat([df_eu, df_us, df_seattle]).reset_index(drop=True)"
647
+ ]
648
+ },
649
+ {
650
+ "cell_type": "code",
651
+ "execution_count": 19,
652
+ "id": "d5df39e2-2ce6-48df-9063-9827da8e7317",
653
+ "metadata": {
654
+ "tags": []
655
+ },
656
+ "outputs": [
657
+ {
658
+ "data": {
659
+ "text/html": [
660
+ "<div>\n",
661
+ "<style scoped>\n",
662
+ " .dataframe tbody tr th:only-of-type {\n",
663
+ " vertical-align: middle;\n",
664
+ " }\n",
665
+ "\n",
666
+ " .dataframe tbody tr th {\n",
667
+ " vertical-align: top;\n",
668
+ " }\n",
669
+ "\n",
670
+ " .dataframe thead th {\n",
671
+ " text-align: right;\n",
672
+ " }\n",
673
+ "</style>\n",
674
+ "<table border=\"1\" class=\"dataframe\">\n",
675
+ " <thead>\n",
676
+ " <tr style=\"text-align: right;\">\n",
677
+ " <th></th>\n",
678
+ " <th>city_name</th>\n",
679
+ " <th>date</th>\n",
680
+ " <th>pm2_5</th>\n",
681
+ " </tr>\n",
682
+ " </thead>\n",
683
+ " <tbody>\n",
684
+ " <tr>\n",
685
+ " <th>155596</th>\n",
686
+ " <td>Tacoma-S 36th St</td>\n",
687
+ " <td>2023-03-12</td>\n",
688
+ " <td>13.9</td>\n",
689
+ " </tr>\n",
690
+ " <tr>\n",
691
+ " <th>72851</th>\n",
692
+ " <td>Chicago</td>\n",
693
+ " <td>2018-07-04</td>\n",
694
+ " <td>10.3</td>\n",
695
+ " </tr>\n",
696
+ " <tr>\n",
697
+ " <th>150716</th>\n",
698
+ " <td>Bellevue-SE 12th St</td>\n",
699
+ " <td>2022-12-07</td>\n",
700
+ " <td>1.8</td>\n",
701
+ " </tr>\n",
702
+ " <tr>\n",
703
+ " <th>88999</th>\n",
704
+ " <td>Los Angeles</td>\n",
705
+ " <td>2016-07-11</td>\n",
706
+ " <td>10.5</td>\n",
707
+ " </tr>\n",
708
+ " <tr>\n",
709
+ " <th>127366</th>\n",
710
+ " <td>Tacoma-S 36th St</td>\n",
711
+ " <td>2017-12-01</td>\n",
712
+ " <td>4.6</td>\n",
713
+ " </tr>\n",
714
+ " </tbody>\n",
715
+ "</table>\n",
716
+ "</div>"
717
+ ],
718
+ "text/plain": [
719
+ " city_name date pm2_5\n",
720
+ "155596 Tacoma-S 36th St 2023-03-12 13.9\n",
721
+ "72851 Chicago 2018-07-04 10.3\n",
722
+ "150716 Bellevue-SE 12th St 2022-12-07 1.8\n",
723
+ "88999 Los Angeles 2016-07-11 10.5\n",
724
+ "127366 Tacoma-S 36th St 2017-12-01 4.6"
725
+ ]
726
+ },
727
+ "execution_count": 19,
728
+ "metadata": {},
729
+ "output_type": "execute_result"
730
+ }
731
+ ],
732
+ "source": [
733
+ "df_air_quality.sample(5)"
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "code",
738
+ "execution_count": 20,
739
+ "id": "794c30fe-fb54-4fa0-a34c-5cef68f52473",
740
+ "metadata": {
741
+ "tags": []
742
+ },
743
+ "outputs": [
744
+ {
745
+ "data": {
746
+ "text/plain": [
747
+ "(156064, 3)"
748
+ ]
749
+ },
750
+ "execution_count": 20,
751
+ "metadata": {},
752
+ "output_type": "execute_result"
753
+ }
754
+ ],
755
+ "source": [
756
+ "df_air_quality.shape"
757
+ ]
758
+ },
759
+ {
760
+ "cell_type": "code",
761
+ "execution_count": 21,
762
+ "id": "ed9bc7f1-d62e-4b1f-97af-6ecd30fe4b67",
763
+ "metadata": {
764
+ "tags": []
765
+ },
766
+ "outputs": [
767
+ {
768
+ "data": {
769
+ "text/plain": [
770
+ "Index(['city_name', 'date', 'pm2_5'], dtype='object')"
771
+ ]
772
+ },
773
+ "execution_count": 21,
774
+ "metadata": {},
775
+ "output_type": "execute_result"
776
+ }
777
+ ],
778
+ "source": [
779
+ "df_air_quality.columns"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "markdown",
784
+ "id": "88a9e0ef-e9d2-4e3c-91af-c4e619b8c906",
785
+ "metadata": {},
786
+ "source": [
787
+ "---"
788
+ ]
789
+ },
790
+ {
791
+ "cell_type": "markdown",
792
+ "id": "4687e802",
793
+ "metadata": {
794
+ "tags": []
795
+ },
796
+ "source": [
797
+ "## <span style='color:#ff5f27'> 🌦 Loading Weather Data from [Open Meteo](https://open-meteo.com/en/docs)"
798
+ ]
799
+ },
800
+ {
801
+ "cell_type": "code",
802
+ "execution_count": 22,
803
+ "id": "c46283b4",
804
+ "metadata": {},
805
+ "outputs": [],
806
+ "source": [
807
+ "df_weather = pd.read_csv(\"data/backfill_weather.csv\")"
808
+ ]
809
+ },
810
+ {
811
+ "cell_type": "code",
812
+ "execution_count": 23,
813
+ "id": "1921b61c-d002-417e-88a6-9fe1cad0a7d4",
814
+ "metadata": {
815
+ "tags": []
816
+ },
817
+ "outputs": [
818
+ {
819
+ "data": {
820
+ "text/plain": [
821
+ "city_name\n",
822
+ "Amsterdam 3767\n",
823
+ "Athina 3767\n",
824
+ "Berlin 3767\n",
825
+ "Gdansk 3767\n",
826
+ "Kraków 3767\n",
827
+ "London 3767\n",
828
+ "Madrid 3767\n",
829
+ "Marseille 3767\n",
830
+ "Milano 3767\n",
831
+ "München 3767\n",
832
+ "Napoli 3767\n",
833
+ "Paris 3767\n",
834
+ "Sevilla 3767\n",
835
+ "Stockholm 3767\n",
836
+ "Tallinn 3767\n",
837
+ "Varna 3767\n",
838
+ "Wien 3767\n",
839
+ "Albuquerque 3767\n",
840
+ "Atlanta 3767\n",
841
+ "Chicago 3767\n",
842
+ "Columbus 3767\n",
843
+ "Dallas 3767\n",
844
+ "Denver 3767\n",
845
+ "Houston 3767\n",
846
+ "Los Angeles 3767\n",
847
+ "New York 3767\n",
848
+ "Phoenix-Mesa 3767\n",
849
+ "Salt Lake City 3767\n",
850
+ "San Francisco 3767\n",
851
+ "Tampa 3767\n",
852
+ "Bellevue-SE 12th St 3767\n",
853
+ "DARRINGTON - FIR ST (Darrington High School) 3767\n",
854
+ "KENT - JAMES & CENTRAL 3767\n",
855
+ "LAKE FOREST PARK TOWNE CENTER 3767\n",
856
+ "MARYSVILLE - 7TH AVE (Marysville Junior High) 3767\n",
857
+ "NORTH BEND - NORTH BEND WAY 3767\n",
858
+ "SEATTLE - BEACON HILL 3767\n",
859
+ "SEATTLE - DUWAMISH 3767\n",
860
+ "SEATTLE - SOUTH PARK #2 3767\n",
861
+ "Seattle-10th & Weller 3767\n",
862
+ "TACOMA - ALEXANDER AVE 3767\n",
863
+ "TACOMA - L STREET 3767\n",
864
+ "Tacoma-S 36th St 3767\n",
865
+ "Tukwila Allentown 3767\n",
866
+ "Tulalip-Totem Beach Rd 3767\n",
867
+ "Name: count, dtype: int64"
868
+ ]
869
+ },
870
+ "execution_count": 23,
871
+ "metadata": {},
872
+ "output_type": "execute_result"
873
+ }
874
+ ],
875
+ "source": [
876
+ "df_weather.city_name.value_counts()"
877
+ ]
878
+ },
879
+ {
880
+ "cell_type": "code",
881
+ "execution_count": 24,
882
+ "id": "8d5dcd0a",
883
+ "metadata": {},
884
+ "outputs": [
885
+ {
886
+ "data": {
887
+ "text/html": [
888
+ "<div>\n",
889
+ "<style scoped>\n",
890
+ " .dataframe tbody tr th:only-of-type {\n",
891
+ " vertical-align: middle;\n",
892
+ " }\n",
893
+ "\n",
894
+ " .dataframe tbody tr th {\n",
895
+ " vertical-align: top;\n",
896
+ " }\n",
897
+ "\n",
898
+ " .dataframe thead th {\n",
899
+ " text-align: right;\n",
900
+ " }\n",
901
+ "</style>\n",
902
+ "<table border=\"1\" class=\"dataframe\">\n",
903
+ " <thead>\n",
904
+ " <tr style=\"text-align: right;\">\n",
905
+ " <th></th>\n",
906
+ " <th>city_name</th>\n",
907
+ " <th>date</th>\n",
908
+ " <th>temperature_max</th>\n",
909
+ " <th>temperature_min</th>\n",
910
+ " <th>precipitation_sum</th>\n",
911
+ " <th>rain_sum</th>\n",
912
+ " <th>snowfall_sum</th>\n",
913
+ " <th>precipitation_hours</th>\n",
914
+ " <th>wind_speed_max</th>\n",
915
+ " <th>wind_gusts_max</th>\n",
916
+ " <th>wind_direction_dominant</th>\n",
917
+ " </tr>\n",
918
+ " </thead>\n",
919
+ " <tbody>\n",
920
+ " <tr>\n",
921
+ " <th>56824</th>\n",
922
+ " <td>Varna</td>\n",
923
+ " <td>2014-03-01</td>\n",
924
+ " <td>9.4</td>\n",
925
+ " <td>5.5</td>\n",
926
+ " <td>2.6</td>\n",
927
+ " <td>2.6</td>\n",
928
+ " <td>0.00</td>\n",
929
+ " <td>7.0</td>\n",
930
+ " <td>13.2</td>\n",
931
+ " <td>22.7</td>\n",
932
+ " <td>150</td>\n",
933
+ " </tr>\n",
934
+ " <tr>\n",
935
+ " <th>146508</th>\n",
936
+ " <td>SEATTLE - SOUTH PARK #2</td>\n",
937
+ " <td>2022-12-08</td>\n",
938
+ " <td>5.6</td>\n",
939
+ " <td>1.8</td>\n",
940
+ " <td>7.9</td>\n",
941
+ " <td>7.6</td>\n",
942
+ " <td>0.21</td>\n",
943
+ " <td>15.0</td>\n",
944
+ " <td>18.1</td>\n",
945
+ " <td>38.9</td>\n",
946
+ " <td>285</td>\n",
947
+ " </tr>\n",
948
+ " <tr>\n",
949
+ " <th>53035</th>\n",
950
+ " <td>Tallinn</td>\n",
951
+ " <td>2014-01-31</td>\n",
952
+ " <td>-8.6</td>\n",
953
+ " <td>-17.0</td>\n",
954
+ " <td>1.0</td>\n",
955
+ " <td>0.0</td>\n",
956
+ " <td>0.98</td>\n",
957
+ " <td>3.0</td>\n",
958
+ " <td>29.6</td>\n",
959
+ " <td>55.8</td>\n",
960
+ " <td>158</td>\n",
961
+ " </tr>\n",
962
+ " </tbody>\n",
963
+ "</table>\n",
964
+ "</div>"
965
+ ],
966
+ "text/plain": [
967
+ " city_name date temperature_max temperature_min \\\n",
968
+ "56824 Varna 2014-03-01 9.4 5.5 \n",
969
+ "146508 SEATTLE - SOUTH PARK #2 2022-12-08 5.6 1.8 \n",
970
+ "53035 Tallinn 2014-01-31 -8.6 -17.0 \n",
971
+ "\n",
972
+ " precipitation_sum rain_sum snowfall_sum precipitation_hours \\\n",
973
+ "56824 2.6 2.6 0.00 7.0 \n",
974
+ "146508 7.9 7.6 0.21 15.0 \n",
975
+ "53035 1.0 0.0 0.98 3.0 \n",
976
+ "\n",
977
+ " wind_speed_max wind_gusts_max wind_direction_dominant \n",
978
+ "56824 13.2 22.7 150 \n",
979
+ "146508 18.1 38.9 285 \n",
980
+ "53035 29.6 55.8 158 "
981
+ ]
982
+ },
983
+ "execution_count": 24,
984
+ "metadata": {},
985
+ "output_type": "execute_result"
986
+ }
987
+ ],
988
+ "source": [
989
+ "df_weather.sample(3)"
990
+ ]
991
+ },
992
+ {
993
+ "cell_type": "markdown",
994
+ "id": "cc9b7ad6",
995
+ "metadata": {},
996
+ "source": [
997
+ "---"
998
+ ]
999
+ },
1000
+ {
1001
+ "cell_type": "code",
1002
+ "execution_count": 25,
1003
+ "id": "a8f886c3-a5ac-4370-a6a2-22838ab7409e",
1004
+ "metadata": {
1005
+ "tags": []
1006
+ },
1007
+ "outputs": [],
1008
+ "source": [
1009
+ "df_air_quality.date = pd.to_datetime(df_air_quality.date)\n",
1010
+ "df_weather.date = pd.to_datetime(df_weather.date)\n",
1011
+ "\n",
1012
+ "df_air_quality[\"unix_time\"] = df_air_quality[\"date\"].apply(convert_date_to_unix)\n",
1013
+ "df_weather[\"unix_time\"] = df_weather[\"date\"].apply(convert_date_to_unix)"
1014
+ ]
1015
+ },
1016
+ {
1017
+ "cell_type": "code",
1018
+ "execution_count": 26,
1019
+ "id": "1b6af890-87a3-4468-8eda-576c2dd75464",
1020
+ "metadata": {
1021
+ "tags": []
1022
+ },
1023
+ "outputs": [],
1024
+ "source": [
1025
+ "df_air_quality.date = df_air_quality.date.astype(str)\n",
1026
+ "df_weather.date = df_weather.date.astype(str)"
1027
+ ]
1028
+ },
1029
+ {
1030
+ "cell_type": "code",
1031
+ "execution_count": 27,
1032
+ "id": "2ad5ea08",
1033
+ "metadata": {},
1034
+ "outputs": [
1035
+ {
1036
+ "data": {
1037
+ "text/html": [
1038
+ "<div>\n",
1039
+ "<style scoped>\n",
1040
+ " .dataframe tbody tr th:only-of-type {\n",
1041
+ " vertical-align: middle;\n",
1042
+ " }\n",
1043
+ "\n",
1044
+ " .dataframe tbody tr th {\n",
1045
+ " vertical-align: top;\n",
1046
+ " }\n",
1047
+ "\n",
1048
+ " .dataframe thead th {\n",
1049
+ " text-align: right;\n",
1050
+ " }\n",
1051
+ "</style>\n",
1052
+ "<table border=\"1\" class=\"dataframe\">\n",
1053
+ " <thead>\n",
1054
+ " <tr style=\"text-align: right;\">\n",
1055
+ " <th></th>\n",
1056
+ " <th>city_name</th>\n",
1057
+ " <th>date</th>\n",
1058
+ " <th>pm2_5</th>\n",
1059
+ " <th>unix_time</th>\n",
1060
+ " </tr>\n",
1061
+ " </thead>\n",
1062
+ " <tbody>\n",
1063
+ " <tr>\n",
1064
+ " <th>0</th>\n",
1065
+ " <td>Amsterdam</td>\n",
1066
+ " <td>2013-01-01</td>\n",
1067
+ " <td>14.0</td>\n",
1068
+ " <td>1356994800000</td>\n",
1069
+ " </tr>\n",
1070
+ " <tr>\n",
1071
+ " <th>1</th>\n",
1072
+ " <td>Amsterdam</td>\n",
1073
+ " <td>2013-01-02</td>\n",
1074
+ " <td>8.0</td>\n",
1075
+ " <td>1357081200000</td>\n",
1076
+ " </tr>\n",
1077
+ " <tr>\n",
1078
+ " <th>2</th>\n",
1079
+ " <td>Amsterdam</td>\n",
1080
+ " <td>2013-01-03</td>\n",
1081
+ " <td>12.0</td>\n",
1082
+ " <td>1357167600000</td>\n",
1083
+ " </tr>\n",
1084
+ " <tr>\n",
1085
+ " <th>3</th>\n",
1086
+ " <td>Amsterdam</td>\n",
1087
+ " <td>2013-01-04</td>\n",
1088
+ " <td>12.0</td>\n",
1089
+ " <td>1357254000000</td>\n",
1090
+ " </tr>\n",
1091
+ " <tr>\n",
1092
+ " <th>4</th>\n",
1093
+ " <td>Amsterdam</td>\n",
1094
+ " <td>2013-01-05</td>\n",
1095
+ " <td>14.0</td>\n",
1096
+ " <td>1357340400000</td>\n",
1097
+ " </tr>\n",
1098
+ " <tr>\n",
1099
+ " <th>...</th>\n",
1100
+ " <td>...</td>\n",
1101
+ " <td>...</td>\n",
1102
+ " <td>...</td>\n",
1103
+ " <td>...</td>\n",
1104
+ " </tr>\n",
1105
+ " <tr>\n",
1106
+ " <th>156059</th>\n",
1107
+ " <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
1108
+ " <td>2023-03-30</td>\n",
1109
+ " <td>7.9</td>\n",
1110
+ " <td>1680127200000</td>\n",
1111
+ " </tr>\n",
1112
+ " <tr>\n",
1113
+ " <th>156060</th>\n",
1114
+ " <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
1115
+ " <td>2023-03-31</td>\n",
1116
+ " <td>3.7</td>\n",
1117
+ " <td>1680213600000</td>\n",
1118
+ " </tr>\n",
1119
+ " <tr>\n",
1120
+ " <th>156061</th>\n",
1121
+ " <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
1122
+ " <td>2023-04-01</td>\n",
1123
+ " <td>3.4</td>\n",
1124
+ " <td>1680300000000</td>\n",
1125
+ " </tr>\n",
1126
+ " <tr>\n",
1127
+ " <th>156062</th>\n",
1128
+ " <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
1129
+ " <td>2023-04-02</td>\n",
1130
+ " <td>3.1</td>\n",
1131
+ " <td>1680386400000</td>\n",
1132
+ " </tr>\n",
1133
+ " <tr>\n",
1134
+ " <th>156063</th>\n",
1135
+ " <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
1136
+ " <td>2023-04-03</td>\n",
1137
+ " <td>4.4</td>\n",
1138
+ " <td>1680472800000</td>\n",
1139
+ " </tr>\n",
1140
+ " </tbody>\n",
1141
+ "</table>\n",
1142
+ "<p>156064 rows × 4 columns</p>\n",
1143
+ "</div>"
1144
+ ],
1145
+ "text/plain": [
1146
+ " city_name date pm2_5 \\\n",
1147
+ "0 Amsterdam 2013-01-01 14.0 \n",
1148
+ "1 Amsterdam 2013-01-02 8.0 \n",
1149
+ "2 Amsterdam 2013-01-03 12.0 \n",
1150
+ "3 Amsterdam 2013-01-04 12.0 \n",
1151
+ "4 Amsterdam 2013-01-05 14.0 \n",
1152
+ "... ... ... ... \n",
1153
+ "156059 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-03-30 7.9 \n",
1154
+ "156060 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-03-31 3.7 \n",
1155
+ "156061 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-04-01 3.4 \n",
1156
+ "156062 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-04-02 3.1 \n",
1157
+ "156063 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-04-03 4.4 \n",
1158
+ "\n",
1159
+ " unix_time \n",
1160
+ "0 1356994800000 \n",
1161
+ "1 1357081200000 \n",
1162
+ "2 1357167600000 \n",
1163
+ "3 1357254000000 \n",
1164
+ "4 1357340400000 \n",
1165
+ "... ... \n",
1166
+ "156059 1680127200000 \n",
1167
+ "156060 1680213600000 \n",
1168
+ "156061 1680300000000 \n",
1169
+ "156062 1680386400000 \n",
1170
+ "156063 1680472800000 \n",
1171
+ "\n",
1172
+ "[156064 rows x 4 columns]"
1173
+ ]
1174
+ },
1175
+ "execution_count": 27,
1176
+ "metadata": {},
1177
+ "output_type": "execute_result"
1178
+ }
1179
+ ],
1180
+ "source": [
1181
+ "df_air_quality"
1182
+ ]
1183
+ },
1184
+ {
1185
+ "cell_type": "markdown",
1186
+ "id": "f2ebd846-0420-4e4c-8a5b-0827fa91c693",
1187
+ "metadata": {},
1188
+ "source": [
1189
+ "---"
1190
+ ]
1191
+ },
1192
+ {
1193
+ "cell_type": "markdown",
1194
+ "id": "cb6f83ba",
1195
+ "metadata": {},
1196
+ "source": [
1197
+ "### <span style=\"color:#ff5f27;\"> 🔮 Connecting to Hopsworks Feature Store </span>"
1198
+ ]
1199
+ },
1200
+ {
1201
+ "cell_type": "code",
1202
+ "execution_count": 29,
1203
+ "id": "dd068240",
1204
+ "metadata": {},
1205
+ "outputs": [
1206
+ {
1207
+ "name": "stdout",
1208
+ "output_type": "stream",
1209
+ "text": [
1210
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
1211
+ "Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated\n",
1212
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
1213
+ "\n",
1214
+ "Multiple projects found. \n",
1215
+ "\n",
1216
+ "\t (1) annikaij\n",
1217
+ "\t (2) miknie20\n",
1218
+ "\n",
1219
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/549019\n",
1220
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
1221
+ ]
1222
+ }
1223
+ ],
1224
+ "source": [
1225
+ "import hopsworks\n",
1226
+ "\n",
1227
+ "project = hopsworks.login()\n",
1228
+ "\n",
1229
+ "fs = project.get_feature_store() "
1230
+ ]
1231
+ },
1232
+ {
1233
+ "cell_type": "code",
1234
+ "execution_count": 30,
1235
+ "id": "71db5ac1",
1236
+ "metadata": {},
1237
+ "outputs": [
1238
+ {
1239
+ "data": {
1240
+ "text/plain": [
1241
+ "{\"expectation_type\": \"expect_column_values_to_be_between\", \"kwargs\": {\"column\": \"pm2_5\", \"min_value\": 0.0, \"max_value\": 1000.0}, \"meta\": {}}"
1242
+ ]
1243
+ },
1244
+ "execution_count": 30,
1245
+ "metadata": {},
1246
+ "output_type": "execute_result"
1247
+ }
1248
+ ],
1249
+ "source": [
1250
+ "from great_expectations.core import ExpectationSuite, ExpectationConfiguration\n",
1251
+ "\n",
1252
+ "expectation_suite = ExpectationSuite(expectation_suite_name=\"pmi_data\")\n",
1253
+ "\n",
1254
+ "expectation_suite.add_expectation(\n",
1255
+ " ExpectationConfiguration(\n",
1256
+ " expectation_type=\"expect_column_values_to_be_between\",\n",
1257
+ " kwargs={\n",
1258
+ " \"column\": \"pm2_5\", \n",
1259
+ " \"min_value\": 0.0,\n",
1260
+ " \"max_value\": 1000.0,\n",
1261
+ " }\n",
1262
+ " )\n",
1263
+ ")"
1264
+ ]
1265
+ },
1266
+ {
1267
+ "cell_type": "markdown",
1268
+ "id": "63d8c3b9",
1269
+ "metadata": {},
1270
+ "source": [
1271
+ "## <span style=\"color:#ff5f27;\">🪄 Creating Feature Groups</span>"
1272
+ ]
1273
+ },
1274
+ {
1275
+ "cell_type": "markdown",
1276
+ "id": "4a2515c4",
1277
+ "metadata": {},
1278
+ "source": [
1279
+ "### <span style='color:#ff5f27'> 🌫 Air Quality Data"
1280
+ ]
1281
+ },
1282
+ {
1283
+ "cell_type": "code",
1284
+ "execution_count": 31,
1285
+ "id": "9d7088a8",
1286
+ "metadata": {
1287
+ "scrolled": true,
1288
+ "tags": []
1289
+ },
1290
+ "outputs": [],
1291
+ "source": [
1292
+ "air_quality_fg = fs.get_or_create_feature_group(\n",
1293
+ " name='air_quality',\n",
1294
+ " description='Air Quality characteristics of each day',\n",
1295
+ " version=1,\n",
1296
+ " primary_key=['city_name'], #'unix_time',\n",
1297
+ " online_enabled=False,\n",
1298
+ " expectation_suite = expectation_suite,\n",
1299
+ " event_time=\"unix_time\"\n",
1300
+ ") "
1301
+ ]
1302
+ },
1303
+ {
1304
+ "cell_type": "code",
1305
+ "execution_count": 32,
1306
+ "id": "7e04a975-bb58-42e2-9abd-90e68ae37864",
1307
+ "metadata": {},
1308
+ "outputs": [
1309
+ {
1310
+ "name": "stdout",
1311
+ "output_type": "stream",
1312
+ "text": [
1313
+ "Feature Group created successfully, explore it at \n",
1314
+ "https://c.app.hopsworks.ai:443/p/549019/fs/544841/fg/758117\n",
1315
+ "Validation failed.\n",
1316
+ "Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/549019/fs/544841/fg/758117\n"
1317
+ ]
1318
+ },
1319
+ {
1320
+ "name": "stderr",
1321
+ "output_type": "stream",
1322
+ "text": [
1323
+ "Uploading Dataframe: 100.00% |██████████| Rows 156064/156064 | Elapsed Time: 00:16 | Remaining Time: 00:00\n"
1324
+ ]
1325
+ },
1326
+ {
1327
+ "name": "stdout",
1328
+ "output_type": "stream",
1329
+ "text": [
1330
+ "Launching job: air_quality_1_offline_fg_materialization\n",
1331
+ "Job started successfully, you can follow the progress at \n",
1332
+ "https://c.app.hopsworks.ai/p/549019/jobs/named/air_quality_1_offline_fg_materialization/executions\n"
1333
+ ]
1334
+ },
1335
+ {
1336
+ "data": {
1337
+ "text/plain": [
1338
+ "(<hsfs.core.job.Job at 0x7fb0dce8c6d0>,\n",
1339
+ " {\n",
1340
+ " \"evaluation_parameters\": {},\n",
1341
+ " \"success\": false,\n",
1342
+ " \"statistics\": {\n",
1343
+ " \"evaluated_expectations\": 1,\n",
1344
+ " \"successful_expectations\": 0,\n",
1345
+ " \"unsuccessful_expectations\": 1,\n",
1346
+ " \"success_percent\": 0.0\n",
1347
+ " },\n",
1348
+ " \"results\": [\n",
1349
+ " {\n",
1350
+ " \"exception_info\": {\n",
1351
+ " \"raised_exception\": false,\n",
1352
+ " \"exception_message\": null,\n",
1353
+ " \"exception_traceback\": null\n",
1354
+ " },\n",
1355
+ " \"expectation_config\": {\n",
1356
+ " \"expectation_type\": \"expect_column_values_to_be_between\",\n",
1357
+ " \"kwargs\": {\n",
1358
+ " \"column\": \"pm2_5\",\n",
1359
+ " \"min_value\": 0.0,\n",
1360
+ " \"max_value\": 1000.0\n",
1361
+ " },\n",
1362
+ " \"meta\": {\n",
1363
+ " \"expectationId\": 473089\n",
1364
+ " }\n",
1365
+ " },\n",
1366
+ " \"success\": false,\n",
1367
+ " \"result\": {\n",
1368
+ " \"element_count\": 156064,\n",
1369
+ " \"missing_count\": 0,\n",
1370
+ " \"missing_percent\": 0.0,\n",
1371
+ " \"unexpected_count\": 84,\n",
1372
+ " \"unexpected_percent\": 0.05382407217551774,\n",
1373
+ " \"unexpected_percent_total\": 0.05382407217551774,\n",
1374
+ " \"unexpected_percent_nonmissing\": 0.05382407217551774,\n",
1375
+ " \"partial_unexpected_list\": [\n",
1376
+ " -1.0,\n",
1377
+ " -1.0,\n",
1378
+ " -1.0,\n",
1379
+ " -1.0,\n",
1380
+ " -0.2,\n",
1381
+ " -0.1,\n",
1382
+ " -1.2,\n",
1383
+ " -1.2,\n",
1384
+ " -1.1,\n",
1385
+ " -0.9,\n",
1386
+ " -0.6,\n",
1387
+ " -0.2,\n",
1388
+ " -1.0,\n",
1389
+ " -0.5,\n",
1390
+ " -0.7,\n",
1391
+ " -0.1,\n",
1392
+ " -0.4,\n",
1393
+ " -0.5,\n",
1394
+ " -0.1,\n",
1395
+ " -0.2\n",
1396
+ " ]\n",
1397
+ " },\n",
1398
+ " \"meta\": {\n",
1399
+ " \"ingestionResult\": \"INGESTED\",\n",
1400
+ " \"validationTime\": \"2024-04-27T01:53:43.000307Z\"\n",
1401
+ " }\n",
1402
+ " }\n",
1403
+ " ],\n",
1404
+ " \"meta\": {\n",
1405
+ " \"great_expectations_version\": \"0.15.12\",\n",
1406
+ " \"expectation_suite_name\": \"pmi_data\",\n",
1407
+ " \"run_id\": {\n",
1408
+ " \"run_name\": null,\n",
1409
+ " \"run_time\": \"2024-04-27T13:53:43.307739+00:00\"\n",
1410
+ " },\n",
1411
+ " \"batch_kwargs\": {\n",
1412
+ " \"ge_batch_id\": \"8f57f63a-049d-11ef-9d82-e2cf145aedc8\"\n",
1413
+ " },\n",
1414
+ " \"batch_markers\": {},\n",
1415
+ " \"batch_parameters\": {},\n",
1416
+ " \"validation_time\": \"20240427T135343.307573Z\",\n",
1417
+ " \"expectation_suite_meta\": {\n",
1418
+ " \"great_expectations_version\": \"0.15.12\"\n",
1419
+ " }\n",
1420
+ " }\n",
1421
+ " })"
1422
+ ]
1423
+ },
1424
+ "execution_count": 32,
1425
+ "metadata": {},
1426
+ "output_type": "execute_result"
1427
+ }
1428
+ ],
1429
+ "source": [
1430
+ "air_quality_fg.insert(df_air_quality, write_options={\"wait_for_job\": False})"
1431
+ ]
1432
+ },
1433
+ {
1434
+ "cell_type": "markdown",
1435
+ "id": "a73a9029",
1436
+ "metadata": {},
1437
+ "source": [
1438
+ "### <span style='color:#ff5f27'> 🌦 Weather Data"
1439
+ ]
1440
+ },
1441
+ {
1442
+ "cell_type": "code",
1443
+ "execution_count": 33,
1444
+ "id": "acc2b799",
1445
+ "metadata": {},
1446
+ "outputs": [],
1447
+ "source": [
1448
+ "weather_fg = fs.get_or_create_feature_group(\n",
1449
+ " name='weather',\n",
1450
+ " description='Weather characteristics of each day',\n",
1451
+ " version=1,\n",
1452
+ " primary_key=['city_name'], #'unix_time'\n",
1453
+ " online_enabled=False,\n",
1454
+ " event_time=\"unix_time\"\n",
1455
+ ") "
1456
+ ]
1457
+ },
1458
+ {
1459
+ "cell_type": "code",
1460
+ "execution_count": 34,
1461
+ "id": "9583b4d1-e2e3-4f56-9e5d-23caa0c49457",
1462
+ "metadata": {
1463
+ "tags": []
1464
+ },
1465
+ "outputs": [
1466
+ {
1467
+ "name": "stdout",
1468
+ "output_type": "stream",
1469
+ "text": [
1470
+ "Feature Group created successfully, explore it at \n",
1471
+ "https://c.app.hopsworks.ai:443/p/549019/fs/544841/fg/760147\n"
1472
+ ]
1473
+ },
1474
+ {
1475
+ "name": "stderr",
1476
+ "output_type": "stream",
1477
+ "text": [
1478
+ "Uploading Dataframe: 100.00% |██████████| Rows 169515/169515 | Elapsed Time: 00:22 | Remaining Time: 00:00\n"
1479
+ ]
1480
+ },
1481
+ {
1482
+ "name": "stdout",
1483
+ "output_type": "stream",
1484
+ "text": [
1485
+ "Launching job: weather_1_offline_fg_materialization\n",
1486
+ "Job started successfully, you can follow the progress at \n",
1487
+ "https://c.app.hopsworks.ai/p/549019/jobs/named/weather_1_offline_fg_materialization/executions\n"
1488
+ ]
1489
+ },
1490
+ {
1491
+ "data": {
1492
+ "text/plain": [
1493
+ "(<hsfs.core.job.Job at 0x7fb0dcedaf50>, None)"
1494
+ ]
1495
+ },
1496
+ "execution_count": 34,
1497
+ "metadata": {},
1498
+ "output_type": "execute_result"
1499
+ }
1500
+ ],
1501
+ "source": [
1502
+ "weather_fg.insert(df_weather, write_options={\"wait_for_job\": False})"
1503
+ ]
1504
+ },
1505
+ {
1506
+ "cell_type": "code",
1507
+ "execution_count": null,
1508
+ "id": "b087a12f",
1509
+ "metadata": {},
1510
+ "outputs": [],
1511
+ "source": []
1512
+ }
1513
+ ],
1514
+ "metadata": {
1515
+ "kernelspec": {
1516
+ "display_name": "ucloud-sml",
1517
+ "language": "python",
1518
+ "name": "python3"
1519
+ },
1520
+ "language_info": {
1521
+ "codemirror_mode": {
1522
+ "name": "ipython",
1523
+ "version": 3
1524
+ },
1525
+ "file_extension": ".py",
1526
+ "mimetype": "text/x-python",
1527
+ "name": "python",
1528
+ "nbconvert_exporter": "python",
1529
+ "pygments_lexer": "ipython3",
1530
+ "version": "3.11.9"
1531
+ }
1532
+ },
1533
+ "nbformat": 4,
1534
+ "nbformat_minor": 5
1535
+ }
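
Both notebooks import `convert_date_to_unix` from functions.py, which this commit view does not expand. A minimal sketch consistent with the unix_time values shown above (milliseconds since the Unix epoch, evaluated in the machine's local timezone) might look like:

import datetime

def convert_date_to_unix(x):
    # Hypothetical reconstruction of the helper from functions.py: parse a
    # pandas Timestamp rendered as "YYYY-MM-DD HH:MM:SS" and return
    # milliseconds since the Unix epoch (local timezone, which explains the
    # 1356994800000 value for Amsterdam on 2013-01-01 above).
    dt_obj = datetime.datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S")
    return int(dt_obj.timestamp() * 1000)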
air_quality-main/2_feature_pipeline.py ADDED
@@ -0,0 +1,108 @@
1
+ #!/usr/bin/env python
2
+
3
+ import modal
4
+ import datetime
5
+ import time
6
+ import requests
7
+ import pandas as pd
8
+ import json
9
+ import hopsworks
10
+ from functions import *
11
+ import warnings
12
+ from urllib.request import urlopen
13
+ warnings.filterwarnings("ignore")
14
+
15
+ stub = modal.Stub("air_quality_daily")
16
+ image = modal.Image.debian_slim().pip_install(["hopsworks", "geopy"])
17
+
18
+
19
+ def features():
20
+     target_url = 'https://repo.hops.works/dev/jdowling/target_cities.json'
21
+     response = urlopen(target_url)
22
+     target_cities = json.loads(response.read())
23
+
24
+     today = datetime.date.today()
25
+     hindcast_day = today - datetime.timedelta(days=1)
26
+     forecast_day = today + datetime.timedelta(days=7)
27
+
28
+
29
+     start_of_cell = time.time()
30
+
31
+     df_aq_raw = pd.DataFrame()
32
+
33
+     for continent in target_cities:
34
+         for city_name, coords in target_cities[continent].items():
35
+             df_ = get_aqi_data_from_open_meteo(city_name=city_name,
36
+                                                coordinates=coords,
37
+                                                start_date=str(hindcast_day),
38
+                                                end_date=str(today))
39
+             df_aq_raw = pd.concat([df_aq_raw, df_]).reset_index(drop=True)
40
+
41
+     end_of_cell = time.time()
42
+     print("-" * 64)
43
+     print(f"Parsed new PM2.5 data for ALL locations up to {str(today)}.")
44
+     print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
45
+
46
+
47
+     df_aq_update = df_aq_raw
48
+
49
+     df_aq_update['date'] = pd.to_datetime(df_aq_update['date'])
50
+     df_aq_update = df_aq_update.dropna()
51
+
52
+     df_weather_update = pd.DataFrame()
53
+
54
+     start_of_cell = time.time()
55
+     for continent in target_cities:
56
+         for city_name, coords in target_cities[continent].items():
57
+             df_ = get_weather_data_from_open_meteo(city_name=city_name,
58
+                                                    coordinates=coords,
59
+                                                    start_date=str(today),
60
+                                                    end_date=str(forecast_day),
61
+                                                    forecast=True)
62
+             df_weather_update = pd.concat([df_weather_update, df_]).reset_index(drop=True)
63
+
64
+     end_of_cell = time.time()
65
+     print("-" * 64)
66
+     print(f"Parsed new weather data for ALL cities up to {str(today)}.")
67
+     print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
68
+
69
+
70
+     df_aq_update.date = pd.to_datetime(df_aq_update.date)
71
+     df_weather_update.date = pd.to_datetime(df_weather_update.date)
72
+
73
+     df_aq_update["unix_time"] = df_aq_update["date"].apply(convert_date_to_unix)
74
+     df_weather_update["unix_time"] = df_weather_update["date"].apply(convert_date_to_unix)
75
+
76
+
77
+     df_aq_update.date = df_aq_update.date.astype(str)
78
+     df_weather_update.date = df_weather_update.date.astype(str)
79
+
80
+     return df_aq_update, df_weather_update
81
+
82
+ @stub.function(image=image, schedule=modal.Period(days=1), secret=modal.Secret.from_name("jim-hopsworks-gcp"))
83
+ def g():
84
+     df_aq_update, df_weather_update = features()
85
+
86
+     project = hopsworks.login()
87
+     fs = project.get_feature_store()
88
+
89
+     air_quality_fg = fs.get_feature_group(
90
+         name='air_quality',
91
+         version=1
92
+     )
93
+     weather_fg = fs.get_feature_group(
94
+         name='weather',
95
+         version=1
96
+     )
97
+     air_quality_fg.insert(df_aq_update, write_options={"wait_for_job": False})
98
+     weather_fg.insert(df_weather_update, write_options={"wait_for_job": False})
99
+
100
+
101
+
102
+
103
+
104
+ if __name__ == "__main__":
105
+     stub.deploy("air_quality_daily")
106
+     with stub.run():
107
+         g()
108
+
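
The two fetch helpers, `get_aqi_data_from_open_meteo` and `get_weather_data_from_open_meteo`, also live in functions.py and are not expanded in this commit view. A rough sketch of the air-quality helper, assuming the public Open-Meteo air-quality endpoint and a daily-mean aggregation to match the backfill CSVs, could be:

import requests
import pandas as pd

def get_aqi_data_from_open_meteo(city_name, coordinates, start_date, end_date):
    # Hypothetical sketch; the real implementation in functions.py may differ.
    latitude, longitude = coordinates
    response = requests.get(
        "https://air-quality-api.open-meteo.com/v1/air-quality",
        params={
            "latitude": latitude,
            "longitude": longitude,
            "hourly": "pm2_5",
            "start_date": start_date,
            "end_date": end_date,
        },
    ).json()
    df = pd.DataFrame(
        {"date": response["hourly"]["time"], "pm2_5": response["hourly"]["pm2_5"]}
    )
    # Collapse hourly readings to one daily mean per city, matching the
    # city_name / date / pm2_5 schema used by the air_quality feature group.
    df["date"] = pd.to_datetime(df["date"]).dt.date.astype(str)
    df = df.groupby("date", as_index=False)["pm2_5"].mean()
    df.insert(0, "city_name", city_name)
    return df

The weather helper presumably follows the same pattern against the forecast endpoint, with the `forecast=True` flag switching between the archive and forecast URLs.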
air_quality-main/3_training_dataset_and_modeling.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
air_quality-main/README.md ADDED
@@ -0,0 +1,60 @@
+ # <span style="font-width:bold; font-size: 3rem; color:#1EB182;"><img src="../../images/icon102.png" width="38px"></img> **Hopsworks Feature Store** </span><span style="font-width:bold; font-size: 3rem; color:#333;">Advanced Tutorial - Air Quality Prediction</span>
+
+
+ <span style="font-width:bold; font-size: 1.4rem;">
+ This is an <b>advanced example</b> of Hopsworks <a href="https://www.hopsworks.ai/feature-store">Feature Store</a> usage; you are tasked with predicting the Air Quality value <a href="https://en.wikipedia.org/wiki/Particulates">(PM2.5)</a> in Europe and the USA using weather features and air quality features from the previous days.
+
+ > The [Feature Store](https://www.hopsworks.ai/feature-store) is the essential part of AI infrastructure that helps organisations bring modern enterprise data to analytical and operational ML systems. It is the simplest, most powerful way to get your models to production. From anywhere, to anywhere.
+ You will load starting data into the feature store, create two feature groups from which we will make a feature view and training dataset, and train a model to predict PM2.5 values.
+ You will also design a data-generating and Feature Store insertion pipeline that runs on a schedule using <b>GitHub Actions</b>.
+
+ A <b>Streamlit</b> app will be created so that you can try your model on different cities interactively.
+
+ This is a <b>batch use case</b>; it will give you a high-level view of how to use our Python APIs and the UI to navigate the feature groups.
+ </span>
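+
+ As a rough illustration of "features from the previous days": lag features can be derived per city with pandas. This is only a sketch with made-up rows; the actual feature engineering lives in `3_training_dataset_and_modeling.ipynb`, and the lag column names here are hypothetical:
+
+ ```python
+ import pandas as pd
+
+ df = pd.DataFrame({
+     "city_name": ["Paris"] * 4,
+     "date": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04"]),
+     "pm2_5": [12.0, 15.0, 9.0, 11.0],
+ })
+
+ df = df.sort_values(["city_name", "date"])
+ for lag in (1, 2, 3):
+     # value of PM2.5 `lag` days earlier, computed within each city
+     df[f"pm2_5_lag_{lag}"] = df.groupby("city_name")["pm2_5"].shift(lag)
+ ```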
+
+
+ ## **🗒️ This whole tutorial is divided into 4 parts:**
+ 1. Backfill Features to the Feature Store,
+ 2. Create a feature pipeline,
+ 3. Create Feature view & Training Datasets, train a model and upload it to the Model Registry (see the sketch below),
+ 4. Deploy Streamlit app.
+
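+
+ For orientation, part 3 boils down to something like the following. This is a minimal sketch, assuming the two feature groups created in part 1; the exact join keys, label column and transformation setup live in `3_training_dataset_and_modeling.ipynb`:
+
+ ```python
+ import hopsworks
+
+ project = hopsworks.login()
+ fs = project.get_feature_store()
+
+ # join the two feature groups on city and date, then expose them as one feature view
+ air_quality_fg = fs.get_feature_group(name="air_quality", version=1)
+ weather_fg = fs.get_feature_group(name="weather", version=1)
+
+ query = air_quality_fg.select_all().join(weather_fg.select_all(), on=["city_name", "date"])
+
+ feature_view = fs.create_feature_view(
+     name="air_quality_fv",
+     version=1,
+     query=query,
+     labels=["pm2_5"],  # assumed label choice; the notebook defines the real one
+ )
+ ```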
+
+ ## Prerequisites
+ To run this tutorial, you need an account on Hopsworks. You can create a new account at [app.hopsworks.ai](https://app.hopsworks.ai).
+ In the notebook you will be prompted with a link to generate an API token to interact with your Hopsworks account.
+
+ You also need the Python libraries listed in the `requirements.txt` inside this folder.
+
+
+ ## Data
+ The data for this project was collected using several different APIs. I used the [European Environmental Agency](https://discomap.eea.europa.eu/map/fme/AirQualityExport.htm) to collect data on European cities, and the [United States Environmental Protection Agency](https://aqs.epa.gov/aqsweb/documents/data_api.html#daily) for American cities. Both are free and publicly available, and neither requires registration or API keys.
+ For the city of Seattle, I found and downloaded the data manually from [here](https://www.epa.gov/outdoor-air-quality-data/download-daily-data).
+
+ In the `feature pipeline` you will use the free [Open-Meteo](https://open-meteo.com/en/docs/air-quality-api) API, which covers many places in the world (it works by coordinates).
+
+ The reason I use so many different APIs instead of just Open-Meteo is that Open-Meteo only has data from 2022-07-29 onwards.
+
+ Again, you don't need any registration or API keys to use any of the above APIs.
+
+
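+ For example, a single Open-Meteo air-quality request looks roughly like this (a sketch mirroring the helper in `functions.py`; the coordinates are Amsterdam's, taken from `target_cities.json`):
+
+ ```python
+ import requests
+ import pandas as pd
+
+ params = {
+     "latitude": 52.37,
+     "longitude": 4.89,
+     "hourly": ["pm2_5"],
+     "start_date": "2023-01-01",
+     "end_date": "2023-01-07",
+     "timezone": "Europe/London",
+ }
+ response = requests.get("https://air-quality-api.open-meteo.com/v1/air-quality", params=params)
+
+ # hourly readings -> one mean value per day, the way the feature pipeline stores them
+ df = pd.DataFrame(response.json()["hourly"])
+ df["time"] = pd.to_datetime(df["time"])
+ daily = df.groupby(df["time"].dt.date).mean(numeric_only=True).reset_index()
+ print(daily)
+ ```
+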
+ ## Streamlit run
+ To run the Streamlit app (after you have run all notebooks and already have the required feature groups in the Feature Store and the model in the Model Registry), simply type:
+
+ `python -m streamlit run app.py` on Windows
+
+ or
+
+ `python3 -m streamlit run app.py` on Unix
+
+
+ ## Streamlit usage examples
+ ![1.png](images/1.png)
+ ![2.png](images/2.png)
+ ![3.png](images/3.png)
+ ![4.png](images/4.png)
+ ![5.png](images/5.png)
+ ![6.png](images/6.png)
+
+
+ ### Edited
air_quality-main/air_quality_model/residplot.png ADDED
air_quality-main/air_quality_model/xgboost_pipeline.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4c05647dc4d02eac4dd6463a6913c052a1f60dc1e835f12f67e2e5dc70dfa6f
+ size 355567
air_quality-main/app.py ADDED
@@ -0,0 +1,197 @@
+ import json
+ import time
+ import pickle
+ import joblib
+
+ import hopsworks
+ import streamlit as st
+ from geopy import distance
+
+ import plotly.express as px
+ import folium
+ from streamlit_folium import st_folium
+
+ # brings in datetime, pandas (pd) and the data-parsing helpers
+ from functions import *
+
+
+ def print_fancy_header(text, font_size=22, color="#ff5f27"):
+     res = f'<span style="color:{color}; font-size: {font_size}px;">{text}</span>'
+     st.markdown(res, unsafe_allow_html=True)
+
+
+ @st.cache_data()
+ def get_batch_data_from_fs(td_version, date_threshold):
+     st.write(f"Retrieving the Batch data since {date_threshold}")
+     feature_view.init_batch_scoring(training_dataset_version=td_version)
+
+     batch_data = feature_view.get_batch_data(start_time=date_threshold)
+     return batch_data
+
+
+ @st.cache_data()
+ def download_model(name="air_quality_xgboost_model", version=1):
+     mr = project.get_model_registry()
+     retrieved_model = mr.get_model(
+         name=name,
+         version=version,
+     )
+     saved_model_dir = retrieved_model.download()
+     return saved_model_dir
+
+
+ def plot_pm2_5(df):
+     # create figure with plotly express
+     fig = px.line(df, x='date', y='pm2_5', color='city_name')
+
+     # customize line colors and styles; the red dashed line marks "today"
+     fig.update_traces(mode='lines+markers')
+     fig.update_layout({
+         'plot_bgcolor': 'rgba(0, 0, 0, 0)',
+         'paper_bgcolor': 'rgba(0, 0, 0, 0)',
+         'legend_title': 'City',
+         'legend_font': {'size': 12},
+         'legend_bgcolor': 'rgba(0, 0, 0, 0)',
+         'xaxis': {'title': 'Date'},
+         'yaxis': {'title': 'PM2.5'},
+         'shapes': [{
+             'type': 'line',
+             'x0': datetime.datetime.now().strftime('%Y-%m-%d'),
+             'y0': 0,
+             'x1': datetime.datetime.now().strftime('%Y-%m-%d'),
+             'y1': df['pm2_5'].max(),
+             'line': {'color': 'red', 'width': 2, 'dash': 'dashdot'}
+         }]
+     })
+
+     # show plot
+     st.plotly_chart(fig, use_container_width=True)
+
+
+ with open('target_cities.json') as json_file:
+     target_cities = json.load(json_file)
+
+
+ #########################
+ st.title('🌫 Air Quality Prediction 🌦')
+
+ st.write(3 * "-")
+ print_fancy_header('\n📡 Connecting to Hopsworks Feature Store...')
+
+ st.write("Logging in... ")
+ # (Attention! If the app has stopped at this step,
+ # please enter your Hopsworks API Key in the command prompt.)
+ project = hopsworks.login()
+ fs = project.get_feature_store()
+ st.write("✅ Logged in successfully!")
+
+ st.write("Getting the Feature View...")
+ feature_view = fs.get_feature_view(
+     name='air_quality_fv',
+     version=1,
+ )
+ st.write("✅ Success!")
+
+ # load data for the last 60 days (needed for feature engineering)
+ today = datetime.date.today()
+ date_threshold = today - datetime.timedelta(days=60)
+
+ st.write(3 * "-")
+ print_fancy_header('\n☁️ Retrieving batch data from Feature Store...')
+ batch_data = get_batch_data_from_fs(td_version=1,
+                                     date_threshold=date_threshold)
+
+ st.write("Batch data:")
+ st.write(batch_data.sample(5))
+
+
+ saved_model_dir = download_model(
+     name="air_quality_xgboost_model",
+     version=1,
+ )
+
+ pipeline = joblib.load(saved_model_dir + "/xgboost_pipeline.pkl")
+ st.write("\n")
+ st.write("✅ Model was downloaded and cached.")
+
+ st.write(3 * '-')
+ st.write("\n")
+ print_fancy_header(text="🖍 Select the cities using the form below. \
+ Click the 'Submit' button at the bottom of the form to continue.",
+                    font_size=22)
+ dict_for_streamlit = {}
+ for continent in target_cities:
+     for city_name, coords in target_cities[continent].items():
+         dict_for_streamlit[city_name] = coords
+ selected_cities_full_list = []
+
+ with st.form(key="user_inputs"):
+     print_fancy_header(text='\n🗺 Here you can choose cities from the drop-down menu',
+                        font_size=20, color="#00FFFF")
+
+     cities_multiselect = st.multiselect(label='',
+                                         options=dict_for_streamlit.keys())
+     selected_cities_full_list.extend(cities_multiselect)
+     st.write("_" * 3)
+     print_fancy_header(text="\n📌 To add a city using the interactive map, click somewhere \
+ (for the coordinates to appear)",
+                        font_size=20, color="#00FFFF")
+
+     my_map = folium.Map(location=[42.57, -44.092], zoom_start=2)
+     # Add markers for each city
+     for city_name, coords in dict_for_streamlit.items():
+         folium.CircleMarker(
+             location=coords
+         ).add_to(my_map)
+
+     my_map.add_child(folium.LatLngPopup())
+     res_map = st_folium(my_map, width=640, height=480)
+
+     try:
+         new_lat, new_long = res_map["last_clicked"]["lat"], res_map["last_clicked"]["lng"]
+
+         # Calculate the distance between the clicked location and each city
+         distances = {city: distance.distance(coord, (new_lat, new_long)).km for city, coord in dict_for_streamlit.items()}
+
+         # Find the city with the minimum distance and print its name
+         nearest_city = min(distances, key=distances.get)
+         print_fancy_header(text=f"You have selected {nearest_city} using the map", font_size=18, color="#52fa23")
+
+         selected_cities_full_list.append(nearest_city)
+
+     except Exception as err:
+         # no click on the map yet
+         print(err)
+
+     submit_button = st.form_submit_button(label='Submit')
+
+ if submit_button:
+     st.write('Selected cities:', selected_cities_full_list)
+
+     st.write(3 * '-')
+
+     dataset = batch_data
+     dataset = dataset.sort_values(by=["city_name", "date"])
+
+     st.write("\n")
+     print_fancy_header(text='\n🧠 Predicting PM2.5 for selected cities...',
+                        font_size=18, color="#FDF4F5")
+     st.write("")
+     preds = pd.DataFrame(columns=dataset.columns)
+     for city_name in selected_cities_full_list:
+         st.write(f"\t * {city_name}...")
+         features = dataset.loc[dataset['city_name'] == city_name]
+         print(features.head())
+         features['pm2_5'] = pipeline.predict(features)
+         preds = pd.concat([preds, features])
+
+     st.write("")
+     print_fancy_header(text="📈Results 📉",
+                        font_size=22)
+     plot_pm2_5(preds[preds['city_name'].isin(selected_cities_full_list)])
+
+     st.write(3 * "-")
+     st.subheader('\n🎉 📈 🤝 App Finished Successfully 🤝 📈 🎉')
+     st.button("Re-run")
air_quality-main/data/backfill_pm2_5.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/backfill_pm2_5_eu.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/backfill_pm2_5_seattle.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/backfill_pm2_5_us.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/backfill_weather.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:961b3e35cdadab7852e087ba154e4b5046eb82f95df86f5043bb0773a94a529d
+ size 10925454
air_quality-main/data/seattle_pm25_2013.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2014.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2015.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2016.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2017.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2018.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2019.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2020.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2021.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2022.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/data/seattle_pm25_2023.csv ADDED
The diff for this file is too large to render. See raw diff
 
air_quality-main/functions.py ADDED
@@ -0,0 +1,385 @@
+ import os
+ import datetime
+ import time
+ import requests
+ import pandas as pd
+ import json
+
+ from geopy.geocoders import Nominatim
+
+
+ def convert_date_to_unix(x):
+     """
+     Convert datetime to unix time in milliseconds.
+     """
+     dt_obj = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
+     dt_obj = int(dt_obj.timestamp() * 1000)
+     return dt_obj
+
+
+ def get_city_coordinates(city_name: str):
+     """
+     Takes a city name and returns its latitude and longitude (rounded to 2 digits after the dot).
+     """
+     # Initialize Nominatim API (for getting lat and long of the city)
+     geolocator = Nominatim(user_agent="MyApp")
+     city = geolocator.geocode(city_name)
+
+     latitude = round(city.latitude, 2)
+     longitude = round(city.longitude, 2)
+
+     return latitude, longitude
+
+
+ ##################################### EEA
+ def convert_to_daily(df, pollutant: str):
+     """
+     Returns a DataFrame where the pollutant column is resampled to days and rounded.
+     """
+     res_df = df.copy()
+     # convert dates in 'date' column
+     res_df["date"] = pd.to_datetime(res_df["date"])
+
+     # we want daily data, not hourly (mean per day = 1 row per day)
+     res_df = res_df.set_index('date')
+     res_df = res_df[pollutant].resample('1d').mean().reset_index()
+     res_df[pollutant] = res_df[pollutant].fillna(res_df[pollutant].median())
+     res_df[pollutant] = res_df[pollutant].apply(lambda x: round(x, 0))
+
+     return res_df
+
+
+ def find_fullest_csv(csv_links: list, year: str):
+     """Returns the largest (fullest) CSV of observations for the given year."""
+     candidates = [link for link in csv_links if str(year) in link]
+     biggest_df = pd.read_csv(candidates[0])
+     for link in candidates[1:]:
+         _df = pd.read_csv(link)
+         if len(biggest_df) < len(_df):
+             biggest_df = _df
+     return biggest_df
+
+
+ def get_air_quality_from_eea(city_name: str,
+                              pollutant: str,
+                              start_year: str,
+                              end_year: str):
+     """
+     Takes a city name and date range and returns a pandas DataFrame with daily air quality data.
+     It parses data in 1-year batches, so please specify years, not dates (example: "2014", "2022").
+
+     EEA stands for European Environmental Agency, so it has data for European Union countries ONLY.
+     """
+     start_of_cell = time.time()
+
+     params = {
+         'CountryCode': '',
+         'CityName': city_name,
+         'Pollutant': pollutant.upper(),
+         'Year_from': start_year,
+         'Year_to': end_year,
+         'Station': '',
+         'Source': 'All',
+         'Samplingpoint': '',
+         'Output': 'TEXT',
+         'UpdateDate': '',
+         'TimeCoverage': 'Year'
+     }
+
+     # observations endpoint
+     base_url = "https://fme.discomap.eea.europa.eu/fmedatastreaming/AirQualityDownload/AQData_Extract.fmw?"
+     try:
+         response = requests.get(base_url, params=params)
+     except ConnectionError:
+         # retry once on a dropped connection
+         response = requests.get(base_url, params=params)
+
+     response.encoding = response.apparent_encoding
+     csv_links = response.text.split("\r\n")
+
+     res_df = pd.DataFrame()
+
+     for year in range(int(start_year), int(end_year) + 1):
+         try:
+             # find the fullest (biggest) csv file with observations for this particular year
+             _df = find_fullest_csv(csv_links, year)
+             # append it to res_df
+             res_df = pd.concat([res_df, _df])
+         except IndexError:
+             print(f"!! Missing data for {year} for {city_name} city.")
+
+     pollutant = pollutant.lower()
+     if pollutant == "pm2.5":
+         pollutant = "pm2_5"
+
+     res_df = res_df.rename(columns={
+         'DatetimeBegin': 'date',
+         'Concentration': pollutant
+     })
+
+     # cut timezone info
+     res_df['date'] = res_df['date'].apply(lambda x: x[:-6])
+     # convert dates in 'date' column
+     res_df['date'] = pd.to_datetime(res_df['date'])
+
+     res_df = convert_to_daily(res_df, pollutant)
+
+     res_df['city_name'] = city_name
+     res_df = res_df[['city_name', 'date', pollutant.lower()]]
+
+     end_of_cell = time.time()
+
+     print(f"Processed {pollutant.upper()} for {city_name} since {start_year} till {end_year}.")
+     print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
+
+     return res_df
+
+
+ ##################################### USEPA
+ city_code_dict = {}
+ pollutant_dict = {
+     'CO': '42101',
+     'SO2': '42401',
+     'NO2': '42602',
+     'O3': '44201',
+     'PM10': '81102',
+     'PM2.5': '88101'
+ }
+
+
+ def get_city_code(city_name: str):
+     """Encodes a city name to be used later for data parsing using USEPA."""
+     if city_code_dict:
+         city_full = [i for i in city_code_dict.keys() if city_name in i][0]
+         return city_code_dict[city_full]
+     else:
+         params = {
+             "email": "test@aqs.api",
+             "key": "test"
+         }
+         response = requests.get("https://aqs.epa.gov/data/api/list/cbsas?", params)
+         response_json = response.json()
+         data = response_json["Data"]
+         for item in data:
+             city_code_dict[item['value_represented']] = item['code']
+
+         return get_city_code(city_name)
+
+
+ def get_air_quality_from_usepa(city_name: str,
+                                pollutant: str,
+                                start_date: str,
+                                end_date: str):
+     """
+     Takes a city name and date range and returns a pandas DataFrame with daily air quality data.
+
+     USEPA stands for United States Environmental Protection Agency, so it has data for the US ONLY.
+     """
+     start_of_cell = time.time()
+     res_df = pd.DataFrame()
+
+     for start_date_, end_date_ in make_date_intervals(start_date, end_date):
+         params = {
+             "email": "test@aqs.api",
+             "key": "test",
+             "param": pollutant_dict[pollutant.upper().replace("_", ".")],  # encoded pollutant
+             "bdate": start_date_,
+             "edate": end_date_,
+             "cbsa": get_city_code(city_name)  # Core-based statistical area
+         }
+
+         # observations endpoint
+         base_url = "https://aqs.epa.gov/data/api/dailyData/byCBSA?"
+
+         response = requests.get(base_url, params=params)
+         response_json = response.json()
+
+         df_ = pd.DataFrame(response_json["Data"])
+
+         pollutant = pollutant.lower()
+         if pollutant == "pm2.5":
+             pollutant = "pm2_5"
+         df_ = df_.rename(columns={
+             'date_local': 'date',
+             'arithmetic_mean': pollutant
+         })
+
+         # convert dates in 'date' column
+         df_['date'] = pd.to_datetime(df_['date'])
+         df_['city_name'] = city_name
+         df_ = df_[['city_name', 'date', pollutant]]
+         res_df = pd.concat([res_df, df_])
+
+     # there are duplicated rows (several records for the same day and station). get rid of them.
+     res_df = res_df.groupby(['date', 'city_name'], as_index=False)[pollutant].mean()
+     res_df[pollutant] = round(res_df[pollutant], 1)
+
+     end_of_cell = time.time()
+     print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
+     print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
+
+     return res_df
+
+
+ def make_date_intervals(start_date, end_date):
+     """Splits a date range into per-year (bdate, edate) tuples, as required by the USEPA API."""
+     start_dt = datetime.datetime.strptime(start_date, '%Y-%m-%d')
+     end_dt = datetime.datetime.strptime(end_date, '%Y-%m-%d')
+     date_intervals = []
+     for year in range(start_dt.year, end_dt.year + 1):
+         year_start = datetime.datetime(year, 1, 1)
+         year_end = datetime.datetime(year, 12, 31)
+         interval_start = max(start_dt, year_start)
+         interval_end = min(end_dt, year_end)
+         if interval_start < interval_end:
+             date_intervals.append((interval_start.strftime('%Y%m%d'), interval_end.strftime('%Y%m%d')))
+     return date_intervals
+
+
+ ##################################### Weather Open Meteo
+ def get_weather_data_from_open_meteo(city_name: str,
+                                      start_date: str,
+                                      end_date: str,
+                                      coordinates: list = None,
+                                      forecast: bool = False):
+     """
+     Takes [city name OR coordinates] and returns a pandas DataFrame with weather data.
+
+     Examples of arguments:
+         coordinates=(47.755, -122.2806), start_date="2023-01-01"
+     """
+     start_of_cell = time.time()
+
+     if coordinates:
+         latitude, longitude = coordinates
+     else:
+         latitude, longitude = get_city_coordinates(city_name=city_name)
+
+     params = {
+         'latitude': latitude,
+         'longitude': longitude,
+         'daily': ["temperature_2m_max", "temperature_2m_min",
+                   "precipitation_sum", "rain_sum", "snowfall_sum",
+                   "precipitation_hours", "windspeed_10m_max",
+                   "windgusts_10m_max", "winddirection_10m_dominant"],
+         'start_date': start_date,
+         'end_date': end_date,
+         'timezone': "Europe/London"
+     }
+
+     if forecast:
+         # forecast endpoint
+         base_url = 'https://api.open-meteo.com/v1/forecast'
+     else:
+         # historical observations endpoint
+         base_url = 'https://archive-api.open-meteo.com/v1/archive'
+
+     try:
+         response = requests.get(base_url, params=params)
+     except ConnectionError:
+         # retry once on a dropped connection
+         response = requests.get(base_url, params=params)
+
+     response_json = response.json()
+     res_df = pd.DataFrame(response_json["daily"])
+     res_df["city_name"] = city_name
+
+     # rename columns
+     res_df = res_df.rename(columns={
+         "time": "date",
+         "temperature_2m_max": "temperature_max",
+         "temperature_2m_min": "temperature_min",
+         "windspeed_10m_max": "wind_speed_max",
+         "winddirection_10m_dominant": "wind_direction_dominant",
+         "windgusts_10m_max": "wind_gusts_max"
+     })
+
+     # change column order
+     res_df = res_df[
+         ['city_name', 'date', 'temperature_max', 'temperature_min',
+          'precipitation_sum', 'rain_sum', 'snowfall_sum',
+          'precipitation_hours', 'wind_speed_max',
+          'wind_gusts_max', 'wind_direction_dominant']
+     ]
+
+     # convert dates in 'date' column
+     res_df["date"] = pd.to_datetime(res_df["date"])
+     end_of_cell = time.time()
+     print(f"Parsed weather for {city_name} since {start_date} till {end_date}.")
+     print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
+
+     return res_df
+
+
+ ##################################### Air Quality data from Open Meteo
+ def get_aqi_data_from_open_meteo(city_name: str,
+                                  start_date: str,
+                                  end_date: str,
+                                  coordinates: list = None,
+                                  pollutant: str = "pm2_5"):
+     """
+     Takes [city name OR coordinates] and returns a pandas DataFrame with AQI data.
+
+     Examples of arguments:
+         coordinates=(47.755, -122.2806),
+         start_date="2023-01-01",
+         pollutant="no2"
+     """
+     start_of_cell = time.time()
+
+     if coordinates:
+         latitude, longitude = coordinates
+     else:
+         latitude, longitude = get_city_coordinates(city_name=city_name)
+
+     pollutant = pollutant.lower()
+     if pollutant == "pm2.5":
+         pollutant = "pm2_5"
+
+     # make it work with both "no2" and "nitrogen_dioxide" passed
+     if pollutant == "no2":
+         pollutant = "nitrogen_dioxide"
+
+     params = {
+         'latitude': latitude,
+         'longitude': longitude,
+         'hourly': [pollutant],
+         'start_date': start_date,
+         'end_date': end_date,
+         'timezone': "Europe/London"
+     }
+
+     # base endpoint
+     base_url = "https://air-quality-api.open-meteo.com/v1/air-quality"
+     try:
+         response = requests.get(base_url, params=params)
+     except ConnectionError:
+         # retry once on a dropped connection
+         response = requests.get(base_url, params=params)
+     response_json = response.json()
+     res_df = pd.DataFrame(response_json["hourly"])
+
+     # convert dates
+     res_df["time"] = pd.to_datetime(res_df["time"])
+
+     # resample to days
+     res_df = res_df.groupby(res_df['time'].dt.date).mean(numeric_only=True).reset_index()
+     res_df[pollutant] = round(res_df[pollutant], 1)
+
+     # rename columns
+     res_df = res_df.rename(columns={
+         "time": "date"
+     })
+
+     res_df["city_name"] = city_name
+
+     # change column order
+     res_df = res_df[
+         ['city_name', 'date', pollutant]
+     ]
+     end_of_cell = time.time()
+     print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
+     print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
+
+     return res_df
+
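
As a quick reference for the helpers above, a minimal usage sketch following the docstring examples (the station name and dates are illustrative; Open-Meteo needs no API key, and the script must run from the repo root so `functions.py` is importable):

```python
from functions import get_weather_data_from_open_meteo, get_aqi_data_from_open_meteo

# LAKE FOREST PARK TOWNE CENTER station, as in the docstring examples
coords = (47.755, -122.2806)

weather_df = get_weather_data_from_open_meteo(
    city_name="LAKE FOREST PARK TOWNE CENTER",
    coordinates=coords,
    start_date="2023-01-01",
    end_date="2023-01-07",
)
aq_df = get_aqi_data_from_open_meteo(
    city_name="LAKE FOREST PARK TOWNE CENTER",
    coordinates=coords,
    start_date="2023-01-01",
    end_date="2023-01-07",
    pollutant="pm2_5",
)
print(weather_df.head())
print(aq_df.head())
```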
air_quality-main/hopsworks-login.sh.example ADDED
@@ -0,0 +1,9 @@
+ #!/bin/bash
+
+
+ export HOPSWORKS_PROJECT=pydata
+ export HOPSWORKS_HOST=35.240.110.235
+ export HOPSWORKS_API_KEY=DDDDD
+
+ export MODAL_API_KEY=ak-DDDD
+
air_quality-main/images/1.png ADDED
air_quality-main/images/2.png ADDED
air_quality-main/images/3.png ADDED
air_quality-main/images/4.png ADDED
air_quality-main/images/5.png ADDED
air_quality-main/images/6.png ADDED
air_quality-main/requirements.txt ADDED
@@ -0,0 +1,8 @@
+ hopsworks==3.2.0rc0
+ geopy
+ python-dotenv
+ streamlit
+ streamlit-folium
+ joblib
+ plotly
+ nbconvert
air_quality-main/scripts/run-feature-pipeline.sh ADDED
@@ -0,0 +1,5 @@
+ #!/bin/bash
+
+ set -e
+
+ jupyter nbconvert --to notebook --execute 2_feature_pipeline.ipynb
air_quality-main/target_cities.json ADDED
@@ -0,0 +1 @@
+ {"EU": {"Amsterdam": [52.37, 4.89], "Athina": [37.98, 23.73], "Berlin": [52.52, 13.39], "Gdansk": [54.37, 18.61], "Krak\u00f3w": [50.06, 19.94], "London": [51.51, -0.13], "Madrid": [40.42, -3.7], "Marseille": [43.3, 5.37], "Milano": [45.46, 9.19], "M\u00fcnchen": [48.14, 11.58], "Napoli": [40.84, 14.25], "Paris": [48.85, 2.35], "Sevilla": [37.39, -6.0], "Stockholm": [59.33, 18.07], "Tallinn": [59.44, 24.75], "Varna": [43.21, 27.92], "Wien": [48.21, 16.37]}, "US": {"Albuquerque": [35.08, -106.65], "Atlanta": [33.75, -84.39], "Chicago": [41.88, -87.62], "Columbus": [39.96, -83.0], "Dallas": [32.78, -96.8], "Denver": [39.74, -104.98], "Houston": [29.76, -95.37], "Los Angeles": [34.05, -118.24], "New York": [40.71, -74.01], "Phoenix-Mesa": [33.66, -112.04], "Salt Lake City": [40.76, -111.89], "San Francisco": [37.78, -122.42], "Tampa": [27.95, -82.46]}, "Seattle": {"Bellevue-SE 12th St": [47.60086, -122.1484], "DARRINGTON - FIR ST (Darrington High School)": [48.2469, -121.6031], "KENT - JAMES & CENTRAL": [47.38611, -122.23028], "LAKE FOREST PARK TOWNE CENTER": [47.755, -122.2806], "MARYSVILLE - 7TH AVE (Marysville Junior High)": [48.05432, -122.17153], "NORTH BEND - NORTH BEND WAY": [47.49022, -121.77278], "SEATTLE - BEACON HILL": [47.56824, -122.30863], "SEATTLE - DUWAMISH": [47.55975, -122.33827], "SEATTLE - SOUTH PARK #2": [47.53091, -122.3208], "Seattle-10th & Weller": [47.59722, -122.31972], "TACOMA - ALEXANDER AVE": [47.2656, -122.3858], "TACOMA - L STREET": [47.1864, -122.4517], "Tacoma-S 36th St": [47.22634, -122.46256], "Tukwila Allentown": [47.49854, -122.27839], "Tulalip-Totem Beach Rd": [48.06534, -122.28519]}}