Spaces:
Sleeping
Sleeping
Annikaijak
commited on
Commit
•
a1aa76f
1
Parent(s):
ff9a039
Upload 37 files
Browse files- .gitattributes +1 -0
- air_quality-main/.github/workflows/feature_pipeline_seattle.yml +29 -0
- air_quality-main/.gitignore +160 -0
- air_quality-main/1_backfill_feature_groups.ipynb +1535 -0
- air_quality-main/2_feature_pipeline.py +108 -0
- air_quality-main/3_training_dataset_and_modeling.ipynb +0 -0
- air_quality-main/LICENSE +201 -0
- air_quality-main/README.md +60 -0
- air_quality-main/air_quality_model/residplot.png +0 -0
- air_quality-main/air_quality_model/xgboost_pipeline.pkl +3 -0
- air_quality-main/app.py +197 -0
- air_quality-main/data/backfill_pm2_5.csv +0 -0
- air_quality-main/data/backfill_pm2_5_eu.csv +0 -0
- air_quality-main/data/backfill_pm2_5_seattle.csv +0 -0
- air_quality-main/data/backfill_pm2_5_us.csv +0 -0
- air_quality-main/data/backfill_weather.csv +3 -0
- air_quality-main/data/seattle_pm25_2013.csv +0 -0
- air_quality-main/data/seattle_pm25_2014.csv +0 -0
- air_quality-main/data/seattle_pm25_2015.csv +0 -0
- air_quality-main/data/seattle_pm25_2016.csv +0 -0
- air_quality-main/data/seattle_pm25_2017.csv +0 -0
- air_quality-main/data/seattle_pm25_2018.csv +0 -0
- air_quality-main/data/seattle_pm25_2019.csv +0 -0
- air_quality-main/data/seattle_pm25_2020.csv +0 -0
- air_quality-main/data/seattle_pm25_2021.csv +0 -0
- air_quality-main/data/seattle_pm25_2022.csv +0 -0
- air_quality-main/data/seattle_pm25_2023.csv +0 -0
- air_quality-main/functions.py +385 -0
- air_quality-main/hopsworks-login.sh.example +9 -0
- air_quality-main/images/1.png +0 -0
- air_quality-main/images/2.png +0 -0
- air_quality-main/images/3.png +0 -0
- air_quality-main/images/4.png +0 -0
- air_quality-main/images/5.png +0 -0
- air_quality-main/images/6.png +0 -0
- air_quality-main/requirements.txt +8 -0
- air_quality-main/scripts/run-feature-pipeline.sh +5 -0
- air_quality-main/target_cities.json +1 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
air_quality-main/data/backfill_weather.csv filter=lfs diff=lfs merge=lfs -text
|
air_quality-main/.github/workflows/feature_pipeline_seattle.yml
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: air_quality_feature_pipeline
|
2 |
+
|
3 |
+
on:
|
4 |
+
workflow_dispatch:
|
5 |
+
schedule:
|
6 |
+
- cron: '0 14 * * *'
|
7 |
+
|
8 |
+
jobs:
|
9 |
+
test_schedule:
|
10 |
+
runs-on: ubuntu-latest
|
11 |
+
steps:
|
12 |
+
- name: checkout repo content
|
13 |
+
uses: actions/checkout@v3
|
14 |
+
|
15 |
+
- name: setup python
|
16 |
+
uses: actions/setup-python@v3
|
17 |
+
with:
|
18 |
+
python-version: '3.11.5'
|
19 |
+
|
20 |
+
- name: install python packages
|
21 |
+
run:
|
22 |
+
python -m pip install --upgrade pip
|
23 |
+
pip install -r requirements.txt
|
24 |
+
|
25 |
+
- name: execute Feature Pipeline
|
26 |
+
env:
|
27 |
+
HOPSWORKS_API_KEY: ${{ secrets.HOPSWORKS_API_KEY }}
|
28 |
+
CONTINENT: "Seattle"
|
29 |
+
run: ./scripts/run-feature-pipeline.sh
|
air_quality-main/.gitignore
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
|
149 |
+
# pytype static type analyzer
|
150 |
+
.pytype/
|
151 |
+
|
152 |
+
# Cython debug symbols
|
153 |
+
cython_debug/
|
154 |
+
|
155 |
+
# PyCharm
|
156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
+
#.idea/
|
air_quality-main/1_backfill_feature_groups.ipynb
ADDED
@@ -0,0 +1,1535 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "73ee3ec9",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# <span style=\"font-width:bold; font-size: 3rem; color:#1EB182;\"><img src=\"../../images/icon102.png\" width=\"38px\"></img> **Hopsworks Feature Store** </span>\n",
|
9 |
+
"\n",
|
10 |
+
"<span style=\"font-width:bold; font-size: 3rem; color:#333;\">- Part 01: Backfill Features to the Feature Store</span>\n",
|
11 |
+
"\n",
|
12 |
+
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/logicalclocks/hopsworks-tutorials/blob/master/advanced_tutorials/air_quality/1_backfill_feature_groups.ipynb)\n",
|
13 |
+
"\n",
|
14 |
+
"\n",
|
15 |
+
"## 🗒️ This notebook is divided into the following sections:\n",
|
16 |
+
"1. Fetch historical data\n",
|
17 |
+
"2. Connect to the Hopsworks feature store\n",
|
18 |
+
"3. Create feature groups and insert them to the feature store\n",
|
19 |
+
"\n",
|
20 |
+
"![tutorial-flow](../../images/01_featuregroups.png)"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "markdown",
|
25 |
+
"id": "f04d5c5e",
|
26 |
+
"metadata": {},
|
27 |
+
"source": [
|
28 |
+
"### <span style='color:#ff5f27'> 📝 Imports"
|
29 |
+
]
|
30 |
+
},
|
31 |
+
{
|
32 |
+
"cell_type": "code",
|
33 |
+
"execution_count": 2,
|
34 |
+
"id": "f65f0db4-1e4b-4f28-a17c-eadcb0d0f016",
|
35 |
+
"metadata": {
|
36 |
+
"tags": []
|
37 |
+
},
|
38 |
+
"outputs": [],
|
39 |
+
"source": [
|
40 |
+
"%pip install geopy folium streamlit-folium geopy --q"
|
41 |
+
]
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"cell_type": "code",
|
45 |
+
"execution_count": 3,
|
46 |
+
"id": "cd165941",
|
47 |
+
"metadata": {},
|
48 |
+
"outputs": [],
|
49 |
+
"source": [
|
50 |
+
"import datetime\n",
|
51 |
+
"import time\n",
|
52 |
+
"import requests\n",
|
53 |
+
"from urllib.request import urlopen\n",
|
54 |
+
"import json\n",
|
55 |
+
"import pandas as pd\n",
|
56 |
+
"import folium\n",
|
57 |
+
"from functions import *\n",
|
58 |
+
"import warnings\n",
|
59 |
+
"warnings.filterwarnings(\"ignore\")"
|
60 |
+
]
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"cell_type": "markdown",
|
64 |
+
"id": "ba9903fc",
|
65 |
+
"metadata": {},
|
66 |
+
"source": [
|
67 |
+
"---"
|
68 |
+
]
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"cell_type": "markdown",
|
72 |
+
"id": "b7a1965a-0da7-4263-a68a-8b2e8cb753f1",
|
73 |
+
"metadata": {},
|
74 |
+
"source": [
|
75 |
+
"## <span style='color:#ff5f27'> 🌍 Representing the Target cities </span>"
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "code",
|
80 |
+
"execution_count": 4,
|
81 |
+
"id": "bd578db1-69e7-4230-b3f2-807b8056283a",
|
82 |
+
"metadata": {
|
83 |
+
"tags": []
|
84 |
+
},
|
85 |
+
"outputs": [],
|
86 |
+
"source": [
|
87 |
+
"target_url='https://repo.hops.works/dev/jdowling/target_cities.json'\n",
|
88 |
+
"response = urlopen(target_url)\n",
|
89 |
+
"target_cities = json.loads(response.read())\n"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"cell_type": "markdown",
|
94 |
+
"id": "2246ca9d",
|
95 |
+
"metadata": {},
|
96 |
+
"source": [
|
97 |
+
"## <span style='color:#ff5f27'> 🌫 Processing Air Quality data</span>"
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "markdown",
|
102 |
+
"id": "b4a1c5d1",
|
103 |
+
"metadata": {},
|
104 |
+
"source": [
|
105 |
+
"### [🇪🇺 EEA](https://discomap.eea.europa.eu/map/fme/AirQualityExport.htm)\n",
|
106 |
+
"#### EEA means European Environmental Agency"
|
107 |
+
]
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"cell_type": "code",
|
111 |
+
"execution_count": 5,
|
112 |
+
"id": "96b8be01-6286-4886-8043-56e0e49b314e",
|
113 |
+
"metadata": {
|
114 |
+
"tags": []
|
115 |
+
},
|
116 |
+
"outputs": [
|
117 |
+
{
|
118 |
+
"data": {
|
119 |
+
"text/plain": [
|
120 |
+
"{'Amsterdam': [52.37, 4.89],\n",
|
121 |
+
" 'Athina': [37.98, 23.73],\n",
|
122 |
+
" 'Berlin': [52.52, 13.39],\n",
|
123 |
+
" 'Gdansk': [54.37, 18.61],\n",
|
124 |
+
" 'Kraków': [50.06, 19.94],\n",
|
125 |
+
" 'London': [51.51, -0.13],\n",
|
126 |
+
" 'Madrid': [40.42, -3.7],\n",
|
127 |
+
" 'Marseille': [43.3, 5.37],\n",
|
128 |
+
" 'Milano': [45.46, 9.19],\n",
|
129 |
+
" 'München': [48.14, 11.58],\n",
|
130 |
+
" 'Napoli': [40.84, 14.25],\n",
|
131 |
+
" 'Paris': [48.85, 2.35],\n",
|
132 |
+
" 'Sevilla': [37.39, -6.0],\n",
|
133 |
+
" 'Stockholm': [59.33, 18.07],\n",
|
134 |
+
" 'Tallinn': [59.44, 24.75],\n",
|
135 |
+
" 'Varna': [43.21, 27.92],\n",
|
136 |
+
" 'Wien': [48.21, 16.37]}"
|
137 |
+
]
|
138 |
+
},
|
139 |
+
"execution_count": 5,
|
140 |
+
"metadata": {},
|
141 |
+
"output_type": "execute_result"
|
142 |
+
}
|
143 |
+
],
|
144 |
+
"source": [
|
145 |
+
"target_cities[\"EU\"]"
|
146 |
+
]
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"cell_type": "code",
|
150 |
+
"execution_count": 6,
|
151 |
+
"id": "5bb2a868-5f3a-4065-b651-318c24826b97",
|
152 |
+
"metadata": {},
|
153 |
+
"outputs": [],
|
154 |
+
"source": [
|
155 |
+
"df_eu = pd.read_csv(\"data/backfill_pm2_5_eu.csv\")"
|
156 |
+
]
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"cell_type": "code",
|
160 |
+
"execution_count": 7,
|
161 |
+
"id": "5620df22-f744-4550-a81a-7e5d71aae542",
|
162 |
+
"metadata": {
|
163 |
+
"tags": []
|
164 |
+
},
|
165 |
+
"outputs": [
|
166 |
+
{
|
167 |
+
"data": {
|
168 |
+
"text/plain": [
|
169 |
+
"0"
|
170 |
+
]
|
171 |
+
},
|
172 |
+
"execution_count": 7,
|
173 |
+
"metadata": {},
|
174 |
+
"output_type": "execute_result"
|
175 |
+
}
|
176 |
+
],
|
177 |
+
"source": [
|
178 |
+
"df_eu.isna().sum().sum()"
|
179 |
+
]
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"cell_type": "code",
|
183 |
+
"execution_count": 8,
|
184 |
+
"id": "b0e23728-a01d-45bc-bf25-4a9c77f21d66",
|
185 |
+
"metadata": {
|
186 |
+
"tags": []
|
187 |
+
},
|
188 |
+
"outputs": [
|
189 |
+
{
|
190 |
+
"name": "stdout",
|
191 |
+
"output_type": "stream",
|
192 |
+
"text": [
|
193 |
+
"Size of this dataframe: (63548, 3)\n"
|
194 |
+
]
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"data": {
|
198 |
+
"text/html": [
|
199 |
+
"<div>\n",
|
200 |
+
"<style scoped>\n",
|
201 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
202 |
+
" vertical-align: middle;\n",
|
203 |
+
" }\n",
|
204 |
+
"\n",
|
205 |
+
" .dataframe tbody tr th {\n",
|
206 |
+
" vertical-align: top;\n",
|
207 |
+
" }\n",
|
208 |
+
"\n",
|
209 |
+
" .dataframe thead th {\n",
|
210 |
+
" text-align: right;\n",
|
211 |
+
" }\n",
|
212 |
+
"</style>\n",
|
213 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
214 |
+
" <thead>\n",
|
215 |
+
" <tr style=\"text-align: right;\">\n",
|
216 |
+
" <th></th>\n",
|
217 |
+
" <th>city_name</th>\n",
|
218 |
+
" <th>date</th>\n",
|
219 |
+
" <th>pm2_5</th>\n",
|
220 |
+
" </tr>\n",
|
221 |
+
" </thead>\n",
|
222 |
+
" <tbody>\n",
|
223 |
+
" <tr>\n",
|
224 |
+
" <th>16477</th>\n",
|
225 |
+
" <td>Kraków</td>\n",
|
226 |
+
" <td>2017-01-05</td>\n",
|
227 |
+
" <td>16.0</td>\n",
|
228 |
+
" </tr>\n",
|
229 |
+
" <tr>\n",
|
230 |
+
" <th>12612</th>\n",
|
231 |
+
" <td>Gdansk</td>\n",
|
232 |
+
" <td>2016-09-15</td>\n",
|
233 |
+
" <td>10.0</td>\n",
|
234 |
+
" </tr>\n",
|
235 |
+
" <tr>\n",
|
236 |
+
" <th>58456</th>\n",
|
237 |
+
" <td>Varna</td>\n",
|
238 |
+
" <td>2018-12-03</td>\n",
|
239 |
+
" <td>11.0</td>\n",
|
240 |
+
" </tr>\n",
|
241 |
+
" </tbody>\n",
|
242 |
+
"</table>\n",
|
243 |
+
"</div>"
|
244 |
+
],
|
245 |
+
"text/plain": [
|
246 |
+
" city_name date pm2_5\n",
|
247 |
+
"16477 Kraków 2017-01-05 16.0\n",
|
248 |
+
"12612 Gdansk 2016-09-15 10.0\n",
|
249 |
+
"58456 Varna 2018-12-03 11.0"
|
250 |
+
]
|
251 |
+
},
|
252 |
+
"execution_count": 8,
|
253 |
+
"metadata": {},
|
254 |
+
"output_type": "execute_result"
|
255 |
+
}
|
256 |
+
],
|
257 |
+
"source": [
|
258 |
+
"print(\"Size of this dataframe:\", df_eu.shape)\n",
|
259 |
+
"\n",
|
260 |
+
"df_eu.sample(3)"
|
261 |
+
]
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"cell_type": "markdown",
|
265 |
+
"id": "c2e45567-dd6b-4e5e-a153-82a2f4f32fbc",
|
266 |
+
"metadata": {},
|
267 |
+
"source": [
|
268 |
+
"### [🇺🇸 USEPA](https://aqs.epa.gov/aqsweb/documents/data_api.html#daily)\n",
|
269 |
+
"#### USEPA means United States Environmental Protection Agency\n",
|
270 |
+
"[Manual downloading](https://www.epa.gov/outdoor-air-quality-data/download-daily-data)\n",
|
271 |
+
"\n"
|
272 |
+
]
|
273 |
+
},
|
274 |
+
{
|
275 |
+
"cell_type": "code",
|
276 |
+
"execution_count": 9,
|
277 |
+
"id": "c4952759-0fb9-4229-8b78-2e37cffb144d",
|
278 |
+
"metadata": {
|
279 |
+
"tags": []
|
280 |
+
},
|
281 |
+
"outputs": [
|
282 |
+
{
|
283 |
+
"data": {
|
284 |
+
"text/plain": [
|
285 |
+
"{'Albuquerque': [35.08, -106.65],\n",
|
286 |
+
" 'Atlanta': [33.75, -84.39],\n",
|
287 |
+
" 'Chicago': [41.88, -87.62],\n",
|
288 |
+
" 'Columbus': [39.96, -83.0],\n",
|
289 |
+
" 'Dallas': [32.78, -96.8],\n",
|
290 |
+
" 'Denver': [39.74, -104.98],\n",
|
291 |
+
" 'Houston': [29.76, -95.37],\n",
|
292 |
+
" 'Los Angeles': [34.05, -118.24],\n",
|
293 |
+
" 'New York': [40.71, -74.01],\n",
|
294 |
+
" 'Phoenix-Mesa': [33.66, -112.04],\n",
|
295 |
+
" 'Salt Lake City': [40.76, -111.89],\n",
|
296 |
+
" 'San Francisco': [37.78, -122.42],\n",
|
297 |
+
" 'Tampa': [27.95, -82.46]}"
|
298 |
+
]
|
299 |
+
},
|
300 |
+
"execution_count": 9,
|
301 |
+
"metadata": {},
|
302 |
+
"output_type": "execute_result"
|
303 |
+
}
|
304 |
+
],
|
305 |
+
"source": [
|
306 |
+
"target_cities[\"US\"]"
|
307 |
+
]
|
308 |
+
},
|
309 |
+
{
|
310 |
+
"cell_type": "code",
|
311 |
+
"execution_count": 10,
|
312 |
+
"id": "c6aceaee-9431-48fd-818a-41fbdd07575c",
|
313 |
+
"metadata": {
|
314 |
+
"tags": []
|
315 |
+
},
|
316 |
+
"outputs": [],
|
317 |
+
"source": [
|
318 |
+
"df_us = pd.read_csv(\"data/backfill_pm2_5_us.csv\")"
|
319 |
+
]
|
320 |
+
},
|
321 |
+
{
|
322 |
+
"cell_type": "code",
|
323 |
+
"execution_count": 11,
|
324 |
+
"id": "4e7ff20e-8a1a-4fa3-b801-71beead7b5f2",
|
325 |
+
"metadata": {
|
326 |
+
"tags": []
|
327 |
+
},
|
328 |
+
"outputs": [
|
329 |
+
{
|
330 |
+
"data": {
|
331 |
+
"text/plain": [
|
332 |
+
"0"
|
333 |
+
]
|
334 |
+
},
|
335 |
+
"execution_count": 11,
|
336 |
+
"metadata": {},
|
337 |
+
"output_type": "execute_result"
|
338 |
+
}
|
339 |
+
],
|
340 |
+
"source": [
|
341 |
+
"df_us.isna().sum().sum()"
|
342 |
+
]
|
343 |
+
},
|
344 |
+
{
|
345 |
+
"cell_type": "code",
|
346 |
+
"execution_count": 12,
|
347 |
+
"id": "3818e3e1-8674-4634-9023-92be8410fba5",
|
348 |
+
"metadata": {
|
349 |
+
"tags": []
|
350 |
+
},
|
351 |
+
"outputs": [
|
352 |
+
{
|
353 |
+
"name": "stdout",
|
354 |
+
"output_type": "stream",
|
355 |
+
"text": [
|
356 |
+
"Size of this dataframe: (46037, 3)\n"
|
357 |
+
]
|
358 |
+
},
|
359 |
+
{
|
360 |
+
"data": {
|
361 |
+
"text/html": [
|
362 |
+
"<div>\n",
|
363 |
+
"<style scoped>\n",
|
364 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
365 |
+
" vertical-align: middle;\n",
|
366 |
+
" }\n",
|
367 |
+
"\n",
|
368 |
+
" .dataframe tbody tr th {\n",
|
369 |
+
" vertical-align: top;\n",
|
370 |
+
" }\n",
|
371 |
+
"\n",
|
372 |
+
" .dataframe thead th {\n",
|
373 |
+
" text-align: right;\n",
|
374 |
+
" }\n",
|
375 |
+
"</style>\n",
|
376 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
377 |
+
" <thead>\n",
|
378 |
+
" <tr style=\"text-align: right;\">\n",
|
379 |
+
" <th></th>\n",
|
380 |
+
" <th>date</th>\n",
|
381 |
+
" <th>city_name</th>\n",
|
382 |
+
" <th>pm2_5</th>\n",
|
383 |
+
" </tr>\n",
|
384 |
+
" </thead>\n",
|
385 |
+
" <tbody>\n",
|
386 |
+
" <tr>\n",
|
387 |
+
" <th>39995</th>\n",
|
388 |
+
" <td>2016-05-09</td>\n",
|
389 |
+
" <td>San Francisco</td>\n",
|
390 |
+
" <td>7.3</td>\n",
|
391 |
+
" </tr>\n",
|
392 |
+
" <tr>\n",
|
393 |
+
" <th>18276</th>\n",
|
394 |
+
" <td>2016-04-10</td>\n",
|
395 |
+
" <td>Denver</td>\n",
|
396 |
+
" <td>3.1</td>\n",
|
397 |
+
" </tr>\n",
|
398 |
+
" <tr>\n",
|
399 |
+
" <th>32122</th>\n",
|
400 |
+
" <td>2014-10-17</td>\n",
|
401 |
+
" <td>Phoenix-Mesa</td>\n",
|
402 |
+
" <td>11.7</td>\n",
|
403 |
+
" </tr>\n",
|
404 |
+
" </tbody>\n",
|
405 |
+
"</table>\n",
|
406 |
+
"</div>"
|
407 |
+
],
|
408 |
+
"text/plain": [
|
409 |
+
" date city_name pm2_5\n",
|
410 |
+
"39995 2016-05-09 San Francisco 7.3\n",
|
411 |
+
"18276 2016-04-10 Denver 3.1\n",
|
412 |
+
"32122 2014-10-17 Phoenix-Mesa 11.7"
|
413 |
+
]
|
414 |
+
},
|
415 |
+
"execution_count": 12,
|
416 |
+
"metadata": {},
|
417 |
+
"output_type": "execute_result"
|
418 |
+
}
|
419 |
+
],
|
420 |
+
"source": [
|
421 |
+
"print(\"Size of this dataframe:\", df_us.shape)\n",
|
422 |
+
"\n",
|
423 |
+
"df_us.sample(3)"
|
424 |
+
]
|
425 |
+
},
|
426 |
+
{
|
427 |
+
"cell_type": "markdown",
|
428 |
+
"id": "25557752-31c8-4da9-a52c-4415c4d20ae3",
|
429 |
+
"metadata": {},
|
430 |
+
"source": [
|
431 |
+
"### <span style=\"color:#ff5f27;\">🏢 Processing special city - `Seattle`</span>\n",
|
432 |
+
"#### We need different stations across the Seattle. \n",
|
433 |
+
"I downloaded daily `PM2.5` data manually [here](https://www.epa.gov/outdoor-air-quality-data/download-daily-data)"
|
434 |
+
]
|
435 |
+
},
|
436 |
+
{
|
437 |
+
"cell_type": "code",
|
438 |
+
"execution_count": 13,
|
439 |
+
"id": "2f54d2cb-991c-47cb-a686-76c9f7a87170",
|
440 |
+
"metadata": {
|
441 |
+
"tags": []
|
442 |
+
},
|
443 |
+
"outputs": [
|
444 |
+
{
|
445 |
+
"data": {
|
446 |
+
"text/plain": [
|
447 |
+
"{'Bellevue-SE 12th St': [47.60086, -122.1484],\n",
|
448 |
+
" 'DARRINGTON - FIR ST (Darrington High School)': [48.2469, -121.6031],\n",
|
449 |
+
" 'KENT - JAMES & CENTRAL': [47.38611, -122.23028],\n",
|
450 |
+
" 'LAKE FOREST PARK TOWNE CENTER': [47.755, -122.2806],\n",
|
451 |
+
" 'MARYSVILLE - 7TH AVE (Marysville Junior High)': [48.05432, -122.17153],\n",
|
452 |
+
" 'NORTH BEND - NORTH BEND WAY': [47.49022, -121.77278],\n",
|
453 |
+
" 'SEATTLE - BEACON HILL': [47.56824, -122.30863],\n",
|
454 |
+
" 'SEATTLE - DUWAMISH': [47.55975, -122.33827],\n",
|
455 |
+
" 'SEATTLE - SOUTH PARK #2': [47.53091, -122.3208],\n",
|
456 |
+
" 'Seattle-10th & Weller': [47.59722, -122.31972],\n",
|
457 |
+
" 'TACOMA - ALEXANDER AVE': [47.2656, -122.3858],\n",
|
458 |
+
" 'TACOMA - L STREET': [47.1864, -122.4517],\n",
|
459 |
+
" 'Tacoma-S 36th St': [47.22634, -122.46256],\n",
|
460 |
+
" 'Tukwila Allentown': [47.49854, -122.27839],\n",
|
461 |
+
" 'Tulalip-Totem Beach Rd': [48.06534, -122.28519]}"
|
462 |
+
]
|
463 |
+
},
|
464 |
+
"execution_count": 13,
|
465 |
+
"metadata": {},
|
466 |
+
"output_type": "execute_result"
|
467 |
+
}
|
468 |
+
],
|
469 |
+
"source": [
|
470 |
+
"target_cities[\"Seattle\"]"
|
471 |
+
]
|
472 |
+
},
|
473 |
+
{
|
474 |
+
"cell_type": "code",
|
475 |
+
"execution_count": 14,
|
476 |
+
"id": "31c8505d-68bc-40b6-be0f-42d8532dbd48",
|
477 |
+
"metadata": {
|
478 |
+
"tags": []
|
479 |
+
},
|
480 |
+
"outputs": [],
|
481 |
+
"source": [
|
482 |
+
"df_seattle = pd.read_csv(\"data/backfill_pm2_5_seattle.csv\")"
|
483 |
+
]
|
484 |
+
},
|
485 |
+
{
|
486 |
+
"cell_type": "code",
|
487 |
+
"execution_count": 15,
|
488 |
+
"id": "2f6583c9-3b2a-41c6-a020-aeede88c4867",
|
489 |
+
"metadata": {
|
490 |
+
"tags": []
|
491 |
+
},
|
492 |
+
"outputs": [
|
493 |
+
{
|
494 |
+
"data": {
|
495 |
+
"text/plain": [
|
496 |
+
"0"
|
497 |
+
]
|
498 |
+
},
|
499 |
+
"execution_count": 15,
|
500 |
+
"metadata": {},
|
501 |
+
"output_type": "execute_result"
|
502 |
+
}
|
503 |
+
],
|
504 |
+
"source": [
|
505 |
+
"df_seattle.isna().sum().sum()"
|
506 |
+
]
|
507 |
+
},
|
508 |
+
{
|
509 |
+
"cell_type": "code",
|
510 |
+
"execution_count": 16,
|
511 |
+
"id": "065a5b03-28f7-475c-9c6a-4340388157d8",
|
512 |
+
"metadata": {
|
513 |
+
"tags": []
|
514 |
+
},
|
515 |
+
"outputs": [
|
516 |
+
{
|
517 |
+
"name": "stdout",
|
518 |
+
"output_type": "stream",
|
519 |
+
"text": [
|
520 |
+
"Size of this dataframe: (46479, 3)\n"
|
521 |
+
]
|
522 |
+
},
|
523 |
+
{
|
524 |
+
"data": {
|
525 |
+
"text/html": [
|
526 |
+
"<div>\n",
|
527 |
+
"<style scoped>\n",
|
528 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
529 |
+
" vertical-align: middle;\n",
|
530 |
+
" }\n",
|
531 |
+
"\n",
|
532 |
+
" .dataframe tbody tr th {\n",
|
533 |
+
" vertical-align: top;\n",
|
534 |
+
" }\n",
|
535 |
+
"\n",
|
536 |
+
" .dataframe thead th {\n",
|
537 |
+
" text-align: right;\n",
|
538 |
+
" }\n",
|
539 |
+
"</style>\n",
|
540 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
541 |
+
" <thead>\n",
|
542 |
+
" <tr style=\"text-align: right;\">\n",
|
543 |
+
" <th></th>\n",
|
544 |
+
" <th>city_name</th>\n",
|
545 |
+
" <th>date</th>\n",
|
546 |
+
" <th>pm2_5</th>\n",
|
547 |
+
" </tr>\n",
|
548 |
+
" </thead>\n",
|
549 |
+
" <tbody>\n",
|
550 |
+
" <tr>\n",
|
551 |
+
" <th>3345</th>\n",
|
552 |
+
" <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
|
553 |
+
" <td>2013-05-03</td>\n",
|
554 |
+
" <td>5.3</td>\n",
|
555 |
+
" </tr>\n",
|
556 |
+
" <tr>\n",
|
557 |
+
" <th>22979</th>\n",
|
558 |
+
" <td>TACOMA - L STREET</td>\n",
|
559 |
+
" <td>2018-08-13</td>\n",
|
560 |
+
" <td>19.2</td>\n",
|
561 |
+
" </tr>\n",
|
562 |
+
" <tr>\n",
|
563 |
+
" <th>14456</th>\n",
|
564 |
+
" <td>DARRINGTON - FIR ST (Darrington High School)</td>\n",
|
565 |
+
" <td>2016-11-09</td>\n",
|
566 |
+
" <td>8.4</td>\n",
|
567 |
+
" </tr>\n",
|
568 |
+
" </tbody>\n",
|
569 |
+
"</table>\n",
|
570 |
+
"</div>"
|
571 |
+
],
|
572 |
+
"text/plain": [
|
573 |
+
" city_name date pm2_5\n",
|
574 |
+
"3345 MARYSVILLE - 7TH AVE (Marysville Junior High) 2013-05-03 5.3\n",
|
575 |
+
"22979 TACOMA - L STREET 2018-08-13 19.2\n",
|
576 |
+
"14456 DARRINGTON - FIR ST (Darrington High School) 2016-11-09 8.4"
|
577 |
+
]
|
578 |
+
},
|
579 |
+
"execution_count": 16,
|
580 |
+
"metadata": {},
|
581 |
+
"output_type": "execute_result"
|
582 |
+
}
|
583 |
+
],
|
584 |
+
"source": [
|
585 |
+
"print(\"Size of this dataframe:\", df_seattle.shape)\n",
|
586 |
+
"\n",
|
587 |
+
"df_seattle.sample(3)"
|
588 |
+
]
|
589 |
+
},
|
590 |
+
{
|
591 |
+
"cell_type": "code",
|
592 |
+
"execution_count": 17,
|
593 |
+
"id": "e3b17ca4-0e9d-4207-ad62-90ea9c157def",
|
594 |
+
"metadata": {
|
595 |
+
"tags": []
|
596 |
+
},
|
597 |
+
"outputs": [
|
598 |
+
{
|
599 |
+
"data": {
|
600 |
+
"text/plain": [
|
601 |
+
"city_name\n",
|
602 |
+
"NORTH BEND - NORTH BEND WAY 3705\n",
|
603 |
+
"TACOMA - L STREET 3696\n",
|
604 |
+
"SEATTLE - BEACON HILL 3691\n",
|
605 |
+
"MARYSVILLE - 7TH AVE (Marysville Junior High) 3648\n",
|
606 |
+
"DARRINGTON - FIR ST (Darrington High School) 3614\n",
|
607 |
+
"SEATTLE - SOUTH PARK #2 3577\n",
|
608 |
+
"TACOMA - ALEXANDER AVE 3569\n",
|
609 |
+
"KENT - JAMES & CENTRAL 3556\n",
|
610 |
+
"SEATTLE - DUWAMISH 3439\n",
|
611 |
+
"Seattle-10th & Weller 3097\n",
|
612 |
+
"LAKE FOREST PARK TOWNE CENTER 2999\n",
|
613 |
+
"Tacoma-S 36th St 2574\n",
|
614 |
+
"Bellevue-SE 12th St 2172\n",
|
615 |
+
"Tukwila Allentown 2074\n",
|
616 |
+
"Tulalip-Totem Beach Rd 1068\n",
|
617 |
+
"Name: count, dtype: int64"
|
618 |
+
]
|
619 |
+
},
|
620 |
+
"execution_count": 17,
|
621 |
+
"metadata": {},
|
622 |
+
"output_type": "execute_result"
|
623 |
+
}
|
624 |
+
],
|
625 |
+
"source": [
|
626 |
+
"df_seattle.city_name.value_counts()"
|
627 |
+
]
|
628 |
+
},
|
629 |
+
{
|
630 |
+
"cell_type": "markdown",
|
631 |
+
"id": "c278a55d-f083-4f95-b292-92e545b9c408",
|
632 |
+
"metadata": {},
|
633 |
+
"source": [
|
634 |
+
"### <span style=\"color:#ff5f27;\">🌟 All together</span>"
|
635 |
+
]
|
636 |
+
},
|
637 |
+
{
|
638 |
+
"cell_type": "code",
|
639 |
+
"execution_count": 18,
|
640 |
+
"id": "0d55ae92-4bf9-43ae-8841-6767f5f68bec",
|
641 |
+
"metadata": {
|
642 |
+
"tags": []
|
643 |
+
},
|
644 |
+
"outputs": [],
|
645 |
+
"source": [
|
646 |
+
"df_air_quality = pd.concat([df_eu, df_us, df_seattle]).reset_index(drop=True)"
|
647 |
+
]
|
648 |
+
},
|
649 |
+
{
|
650 |
+
"cell_type": "code",
|
651 |
+
"execution_count": 19,
|
652 |
+
"id": "d5df39e2-2ce6-48df-9063-9827da8e7317",
|
653 |
+
"metadata": {
|
654 |
+
"tags": []
|
655 |
+
},
|
656 |
+
"outputs": [
|
657 |
+
{
|
658 |
+
"data": {
|
659 |
+
"text/html": [
|
660 |
+
"<div>\n",
|
661 |
+
"<style scoped>\n",
|
662 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
663 |
+
" vertical-align: middle;\n",
|
664 |
+
" }\n",
|
665 |
+
"\n",
|
666 |
+
" .dataframe tbody tr th {\n",
|
667 |
+
" vertical-align: top;\n",
|
668 |
+
" }\n",
|
669 |
+
"\n",
|
670 |
+
" .dataframe thead th {\n",
|
671 |
+
" text-align: right;\n",
|
672 |
+
" }\n",
|
673 |
+
"</style>\n",
|
674 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
675 |
+
" <thead>\n",
|
676 |
+
" <tr style=\"text-align: right;\">\n",
|
677 |
+
" <th></th>\n",
|
678 |
+
" <th>city_name</th>\n",
|
679 |
+
" <th>date</th>\n",
|
680 |
+
" <th>pm2_5</th>\n",
|
681 |
+
" </tr>\n",
|
682 |
+
" </thead>\n",
|
683 |
+
" <tbody>\n",
|
684 |
+
" <tr>\n",
|
685 |
+
" <th>155596</th>\n",
|
686 |
+
" <td>Tacoma-S 36th St</td>\n",
|
687 |
+
" <td>2023-03-12</td>\n",
|
688 |
+
" <td>13.9</td>\n",
|
689 |
+
" </tr>\n",
|
690 |
+
" <tr>\n",
|
691 |
+
" <th>72851</th>\n",
|
692 |
+
" <td>Chicago</td>\n",
|
693 |
+
" <td>2018-07-04</td>\n",
|
694 |
+
" <td>10.3</td>\n",
|
695 |
+
" </tr>\n",
|
696 |
+
" <tr>\n",
|
697 |
+
" <th>150716</th>\n",
|
698 |
+
" <td>Bellevue-SE 12th St</td>\n",
|
699 |
+
" <td>2022-12-07</td>\n",
|
700 |
+
" <td>1.8</td>\n",
|
701 |
+
" </tr>\n",
|
702 |
+
" <tr>\n",
|
703 |
+
" <th>88999</th>\n",
|
704 |
+
" <td>Los Angeles</td>\n",
|
705 |
+
" <td>2016-07-11</td>\n",
|
706 |
+
" <td>10.5</td>\n",
|
707 |
+
" </tr>\n",
|
708 |
+
" <tr>\n",
|
709 |
+
" <th>127366</th>\n",
|
710 |
+
" <td>Tacoma-S 36th St</td>\n",
|
711 |
+
" <td>2017-12-01</td>\n",
|
712 |
+
" <td>4.6</td>\n",
|
713 |
+
" </tr>\n",
|
714 |
+
" </tbody>\n",
|
715 |
+
"</table>\n",
|
716 |
+
"</div>"
|
717 |
+
],
|
718 |
+
"text/plain": [
|
719 |
+
" city_name date pm2_5\n",
|
720 |
+
"155596 Tacoma-S 36th St 2023-03-12 13.9\n",
|
721 |
+
"72851 Chicago 2018-07-04 10.3\n",
|
722 |
+
"150716 Bellevue-SE 12th St 2022-12-07 1.8\n",
|
723 |
+
"88999 Los Angeles 2016-07-11 10.5\n",
|
724 |
+
"127366 Tacoma-S 36th St 2017-12-01 4.6"
|
725 |
+
]
|
726 |
+
},
|
727 |
+
"execution_count": 19,
|
728 |
+
"metadata": {},
|
729 |
+
"output_type": "execute_result"
|
730 |
+
}
|
731 |
+
],
|
732 |
+
"source": [
|
733 |
+
"df_air_quality.sample(5)"
|
734 |
+
]
|
735 |
+
},
|
736 |
+
{
|
737 |
+
"cell_type": "code",
|
738 |
+
"execution_count": 20,
|
739 |
+
"id": "794c30fe-fb54-4fa0-a34c-5cef68f52473",
|
740 |
+
"metadata": {
|
741 |
+
"tags": []
|
742 |
+
},
|
743 |
+
"outputs": [
|
744 |
+
{
|
745 |
+
"data": {
|
746 |
+
"text/plain": [
|
747 |
+
"(156064, 3)"
|
748 |
+
]
|
749 |
+
},
|
750 |
+
"execution_count": 20,
|
751 |
+
"metadata": {},
|
752 |
+
"output_type": "execute_result"
|
753 |
+
}
|
754 |
+
],
|
755 |
+
"source": [
|
756 |
+
"df_air_quality.shape"
|
757 |
+
]
|
758 |
+
},
|
759 |
+
{
|
760 |
+
"cell_type": "code",
|
761 |
+
"execution_count": 21,
|
762 |
+
"id": "ed9bc7f1-d62e-4b1f-97af-6ecd30fe4b67",
|
763 |
+
"metadata": {
|
764 |
+
"tags": []
|
765 |
+
},
|
766 |
+
"outputs": [
|
767 |
+
{
|
768 |
+
"data": {
|
769 |
+
"text/plain": [
|
770 |
+
"Index(['city_name', 'date', 'pm2_5'], dtype='object')"
|
771 |
+
]
|
772 |
+
},
|
773 |
+
"execution_count": 21,
|
774 |
+
"metadata": {},
|
775 |
+
"output_type": "execute_result"
|
776 |
+
}
|
777 |
+
],
|
778 |
+
"source": [
|
779 |
+
"df_air_quality.columns"
|
780 |
+
]
|
781 |
+
},
|
782 |
+
{
|
783 |
+
"cell_type": "markdown",
|
784 |
+
"id": "88a9e0ef-e9d2-4e3c-91af-c4e619b8c906",
|
785 |
+
"metadata": {},
|
786 |
+
"source": [
|
787 |
+
"---"
|
788 |
+
]
|
789 |
+
},
|
790 |
+
{
|
791 |
+
"cell_type": "markdown",
|
792 |
+
"id": "4687e802",
|
793 |
+
"metadata": {
|
794 |
+
"tags": []
|
795 |
+
},
|
796 |
+
"source": [
|
797 |
+
"## <span style='color:#ff5f27'> 🌦 Loading Weather Data from [Open Meteo](https://open-meteo.com/en/docs)"
|
798 |
+
]
|
799 |
+
},
|
800 |
+
{
|
801 |
+
"cell_type": "code",
|
802 |
+
"execution_count": 22,
|
803 |
+
"id": "c46283b4",
|
804 |
+
"metadata": {},
|
805 |
+
"outputs": [],
|
806 |
+
"source": [
|
807 |
+
"df_weather = pd.read_csv(\"data/backfill_weather.csv\")"
|
808 |
+
]
|
809 |
+
},
|
810 |
+
{
|
811 |
+
"cell_type": "code",
|
812 |
+
"execution_count": 23,
|
813 |
+
"id": "1921b61c-d002-417e-88a6-9fe1cad0a7d4",
|
814 |
+
"metadata": {
|
815 |
+
"tags": []
|
816 |
+
},
|
817 |
+
"outputs": [
|
818 |
+
{
|
819 |
+
"data": {
|
820 |
+
"text/plain": [
|
821 |
+
"city_name\n",
|
822 |
+
"Amsterdam 3767\n",
|
823 |
+
"Athina 3767\n",
|
824 |
+
"Berlin 3767\n",
|
825 |
+
"Gdansk 3767\n",
|
826 |
+
"Kraków 3767\n",
|
827 |
+
"London 3767\n",
|
828 |
+
"Madrid 3767\n",
|
829 |
+
"Marseille 3767\n",
|
830 |
+
"Milano 3767\n",
|
831 |
+
"München 3767\n",
|
832 |
+
"Napoli 3767\n",
|
833 |
+
"Paris 3767\n",
|
834 |
+
"Sevilla 3767\n",
|
835 |
+
"Stockholm 3767\n",
|
836 |
+
"Tallinn 3767\n",
|
837 |
+
"Varna 3767\n",
|
838 |
+
"Wien 3767\n",
|
839 |
+
"Albuquerque 3767\n",
|
840 |
+
"Atlanta 3767\n",
|
841 |
+
"Chicago 3767\n",
|
842 |
+
"Columbus 3767\n",
|
843 |
+
"Dallas 3767\n",
|
844 |
+
"Denver 3767\n",
|
845 |
+
"Houston 3767\n",
|
846 |
+
"Los Angeles 3767\n",
|
847 |
+
"New York 3767\n",
|
848 |
+
"Phoenix-Mesa 3767\n",
|
849 |
+
"Salt Lake City 3767\n",
|
850 |
+
"San Francisco 3767\n",
|
851 |
+
"Tampa 3767\n",
|
852 |
+
"Bellevue-SE 12th St 3767\n",
|
853 |
+
"DARRINGTON - FIR ST (Darrington High School) 3767\n",
|
854 |
+
"KENT - JAMES & CENTRAL 3767\n",
|
855 |
+
"LAKE FOREST PARK TOWNE CENTER 3767\n",
|
856 |
+
"MARYSVILLE - 7TH AVE (Marysville Junior High) 3767\n",
|
857 |
+
"NORTH BEND - NORTH BEND WAY 3767\n",
|
858 |
+
"SEATTLE - BEACON HILL 3767\n",
|
859 |
+
"SEATTLE - DUWAMISH 3767\n",
|
860 |
+
"SEATTLE - SOUTH PARK #2 3767\n",
|
861 |
+
"Seattle-10th & Weller 3767\n",
|
862 |
+
"TACOMA - ALEXANDER AVE 3767\n",
|
863 |
+
"TACOMA - L STREET 3767\n",
|
864 |
+
"Tacoma-S 36th St 3767\n",
|
865 |
+
"Tukwila Allentown 3767\n",
|
866 |
+
"Tulalip-Totem Beach Rd 3767\n",
|
867 |
+
"Name: count, dtype: int64"
|
868 |
+
]
|
869 |
+
},
|
870 |
+
"execution_count": 23,
|
871 |
+
"metadata": {},
|
872 |
+
"output_type": "execute_result"
|
873 |
+
}
|
874 |
+
],
|
875 |
+
"source": [
|
876 |
+
"df_weather.city_name.value_counts()"
|
877 |
+
]
|
878 |
+
},
|
879 |
+
{
|
880 |
+
"cell_type": "code",
|
881 |
+
"execution_count": 24,
|
882 |
+
"id": "8d5dcd0a",
|
883 |
+
"metadata": {},
|
884 |
+
"outputs": [
|
885 |
+
{
|
886 |
+
"data": {
|
887 |
+
"text/html": [
|
888 |
+
"<div>\n",
|
889 |
+
"<style scoped>\n",
|
890 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
891 |
+
" vertical-align: middle;\n",
|
892 |
+
" }\n",
|
893 |
+
"\n",
|
894 |
+
" .dataframe tbody tr th {\n",
|
895 |
+
" vertical-align: top;\n",
|
896 |
+
" }\n",
|
897 |
+
"\n",
|
898 |
+
" .dataframe thead th {\n",
|
899 |
+
" text-align: right;\n",
|
900 |
+
" }\n",
|
901 |
+
"</style>\n",
|
902 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
903 |
+
" <thead>\n",
|
904 |
+
" <tr style=\"text-align: right;\">\n",
|
905 |
+
" <th></th>\n",
|
906 |
+
" <th>city_name</th>\n",
|
907 |
+
" <th>date</th>\n",
|
908 |
+
" <th>temperature_max</th>\n",
|
909 |
+
" <th>temperature_min</th>\n",
|
910 |
+
" <th>precipitation_sum</th>\n",
|
911 |
+
" <th>rain_sum</th>\n",
|
912 |
+
" <th>snowfall_sum</th>\n",
|
913 |
+
" <th>precipitation_hours</th>\n",
|
914 |
+
" <th>wind_speed_max</th>\n",
|
915 |
+
" <th>wind_gusts_max</th>\n",
|
916 |
+
" <th>wind_direction_dominant</th>\n",
|
917 |
+
" </tr>\n",
|
918 |
+
" </thead>\n",
|
919 |
+
" <tbody>\n",
|
920 |
+
" <tr>\n",
|
921 |
+
" <th>56824</th>\n",
|
922 |
+
" <td>Varna</td>\n",
|
923 |
+
" <td>2014-03-01</td>\n",
|
924 |
+
" <td>9.4</td>\n",
|
925 |
+
" <td>5.5</td>\n",
|
926 |
+
" <td>2.6</td>\n",
|
927 |
+
" <td>2.6</td>\n",
|
928 |
+
" <td>0.00</td>\n",
|
929 |
+
" <td>7.0</td>\n",
|
930 |
+
" <td>13.2</td>\n",
|
931 |
+
" <td>22.7</td>\n",
|
932 |
+
" <td>150</td>\n",
|
933 |
+
" </tr>\n",
|
934 |
+
" <tr>\n",
|
935 |
+
" <th>146508</th>\n",
|
936 |
+
" <td>SEATTLE - SOUTH PARK #2</td>\n",
|
937 |
+
" <td>2022-12-08</td>\n",
|
938 |
+
" <td>5.6</td>\n",
|
939 |
+
" <td>1.8</td>\n",
|
940 |
+
" <td>7.9</td>\n",
|
941 |
+
" <td>7.6</td>\n",
|
942 |
+
" <td>0.21</td>\n",
|
943 |
+
" <td>15.0</td>\n",
|
944 |
+
" <td>18.1</td>\n",
|
945 |
+
" <td>38.9</td>\n",
|
946 |
+
" <td>285</td>\n",
|
947 |
+
" </tr>\n",
|
948 |
+
" <tr>\n",
|
949 |
+
" <th>53035</th>\n",
|
950 |
+
" <td>Tallinn</td>\n",
|
951 |
+
" <td>2014-01-31</td>\n",
|
952 |
+
" <td>-8.6</td>\n",
|
953 |
+
" <td>-17.0</td>\n",
|
954 |
+
" <td>1.0</td>\n",
|
955 |
+
" <td>0.0</td>\n",
|
956 |
+
" <td>0.98</td>\n",
|
957 |
+
" <td>3.0</td>\n",
|
958 |
+
" <td>29.6</td>\n",
|
959 |
+
" <td>55.8</td>\n",
|
960 |
+
" <td>158</td>\n",
|
961 |
+
" </tr>\n",
|
962 |
+
" </tbody>\n",
|
963 |
+
"</table>\n",
|
964 |
+
"</div>"
|
965 |
+
],
|
966 |
+
"text/plain": [
|
967 |
+
" city_name date temperature_max temperature_min \\\n",
|
968 |
+
"56824 Varna 2014-03-01 9.4 5.5 \n",
|
969 |
+
"146508 SEATTLE - SOUTH PARK #2 2022-12-08 5.6 1.8 \n",
|
970 |
+
"53035 Tallinn 2014-01-31 -8.6 -17.0 \n",
|
971 |
+
"\n",
|
972 |
+
" precipitation_sum rain_sum snowfall_sum precipitation_hours \\\n",
|
973 |
+
"56824 2.6 2.6 0.00 7.0 \n",
|
974 |
+
"146508 7.9 7.6 0.21 15.0 \n",
|
975 |
+
"53035 1.0 0.0 0.98 3.0 \n",
|
976 |
+
"\n",
|
977 |
+
" wind_speed_max wind_gusts_max wind_direction_dominant \n",
|
978 |
+
"56824 13.2 22.7 150 \n",
|
979 |
+
"146508 18.1 38.9 285 \n",
|
980 |
+
"53035 29.6 55.8 158 "
|
981 |
+
]
|
982 |
+
},
|
983 |
+
"execution_count": 24,
|
984 |
+
"metadata": {},
|
985 |
+
"output_type": "execute_result"
|
986 |
+
}
|
987 |
+
],
|
988 |
+
"source": [
|
989 |
+
"df_weather.sample(3)"
|
990 |
+
]
|
991 |
+
},
|
992 |
+
{
|
993 |
+
"cell_type": "markdown",
|
994 |
+
"id": "cc9b7ad6",
|
995 |
+
"metadata": {},
|
996 |
+
"source": [
|
997 |
+
"---"
|
998 |
+
]
|
999 |
+
},
|
1000 |
+
{
|
1001 |
+
"cell_type": "code",
|
1002 |
+
"execution_count": 25,
|
1003 |
+
"id": "a8f886c3-a5ac-4370-a6a2-22838ab7409e",
|
1004 |
+
"metadata": {
|
1005 |
+
"tags": []
|
1006 |
+
},
|
1007 |
+
"outputs": [],
|
1008 |
+
"source": [
|
1009 |
+
"df_air_quality.date = pd.to_datetime(df_air_quality.date)\n",
|
1010 |
+
"df_weather.date = pd.to_datetime(df_weather.date)\n",
|
1011 |
+
"\n",
|
1012 |
+
"df_air_quality[\"unix_time\"] = df_air_quality[\"date\"].apply(convert_date_to_unix)\n",
|
1013 |
+
"df_weather[\"unix_time\"] = df_weather[\"date\"].apply(convert_date_to_unix)"
|
1014 |
+
]
|
1015 |
+
},
|
1016 |
+
{
|
1017 |
+
"cell_type": "code",
|
1018 |
+
"execution_count": 26,
|
1019 |
+
"id": "1b6af890-87a3-4468-8eda-576c2dd75464",
|
1020 |
+
"metadata": {
|
1021 |
+
"tags": []
|
1022 |
+
},
|
1023 |
+
"outputs": [],
|
1024 |
+
"source": [
|
1025 |
+
"df_air_quality.date = df_air_quality.date.astype(str)\n",
|
1026 |
+
"df_weather.date = df_weather.date.astype(str)"
|
1027 |
+
]
|
1028 |
+
},
|
1029 |
+
{
|
1030 |
+
"cell_type": "code",
|
1031 |
+
"execution_count": 27,
|
1032 |
+
"id": "2ad5ea08",
|
1033 |
+
"metadata": {},
|
1034 |
+
"outputs": [
|
1035 |
+
{
|
1036 |
+
"data": {
|
1037 |
+
"text/html": [
|
1038 |
+
"<div>\n",
|
1039 |
+
"<style scoped>\n",
|
1040 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1041 |
+
" vertical-align: middle;\n",
|
1042 |
+
" }\n",
|
1043 |
+
"\n",
|
1044 |
+
" .dataframe tbody tr th {\n",
|
1045 |
+
" vertical-align: top;\n",
|
1046 |
+
" }\n",
|
1047 |
+
"\n",
|
1048 |
+
" .dataframe thead th {\n",
|
1049 |
+
" text-align: right;\n",
|
1050 |
+
" }\n",
|
1051 |
+
"</style>\n",
|
1052 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1053 |
+
" <thead>\n",
|
1054 |
+
" <tr style=\"text-align: right;\">\n",
|
1055 |
+
" <th></th>\n",
|
1056 |
+
" <th>city_name</th>\n",
|
1057 |
+
" <th>date</th>\n",
|
1058 |
+
" <th>pm2_5</th>\n",
|
1059 |
+
" <th>unix_time</th>\n",
|
1060 |
+
" </tr>\n",
|
1061 |
+
" </thead>\n",
|
1062 |
+
" <tbody>\n",
|
1063 |
+
" <tr>\n",
|
1064 |
+
" <th>0</th>\n",
|
1065 |
+
" <td>Amsterdam</td>\n",
|
1066 |
+
" <td>2013-01-01</td>\n",
|
1067 |
+
" <td>14.0</td>\n",
|
1068 |
+
" <td>1356994800000</td>\n",
|
1069 |
+
" </tr>\n",
|
1070 |
+
" <tr>\n",
|
1071 |
+
" <th>1</th>\n",
|
1072 |
+
" <td>Amsterdam</td>\n",
|
1073 |
+
" <td>2013-01-02</td>\n",
|
1074 |
+
" <td>8.0</td>\n",
|
1075 |
+
" <td>1357081200000</td>\n",
|
1076 |
+
" </tr>\n",
|
1077 |
+
" <tr>\n",
|
1078 |
+
" <th>2</th>\n",
|
1079 |
+
" <td>Amsterdam</td>\n",
|
1080 |
+
" <td>2013-01-03</td>\n",
|
1081 |
+
" <td>12.0</td>\n",
|
1082 |
+
" <td>1357167600000</td>\n",
|
1083 |
+
" </tr>\n",
|
1084 |
+
" <tr>\n",
|
1085 |
+
" <th>3</th>\n",
|
1086 |
+
" <td>Amsterdam</td>\n",
|
1087 |
+
" <td>2013-01-04</td>\n",
|
1088 |
+
" <td>12.0</td>\n",
|
1089 |
+
" <td>1357254000000</td>\n",
|
1090 |
+
" </tr>\n",
|
1091 |
+
" <tr>\n",
|
1092 |
+
" <th>4</th>\n",
|
1093 |
+
" <td>Amsterdam</td>\n",
|
1094 |
+
" <td>2013-01-05</td>\n",
|
1095 |
+
" <td>14.0</td>\n",
|
1096 |
+
" <td>1357340400000</td>\n",
|
1097 |
+
" </tr>\n",
|
1098 |
+
" <tr>\n",
|
1099 |
+
" <th>...</th>\n",
|
1100 |
+
" <td>...</td>\n",
|
1101 |
+
" <td>...</td>\n",
|
1102 |
+
" <td>...</td>\n",
|
1103 |
+
" <td>...</td>\n",
|
1104 |
+
" </tr>\n",
|
1105 |
+
" <tr>\n",
|
1106 |
+
" <th>156059</th>\n",
|
1107 |
+
" <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
|
1108 |
+
" <td>2023-03-30</td>\n",
|
1109 |
+
" <td>7.9</td>\n",
|
1110 |
+
" <td>1680127200000</td>\n",
|
1111 |
+
" </tr>\n",
|
1112 |
+
" <tr>\n",
|
1113 |
+
" <th>156060</th>\n",
|
1114 |
+
" <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
|
1115 |
+
" <td>2023-03-31</td>\n",
|
1116 |
+
" <td>3.7</td>\n",
|
1117 |
+
" <td>1680213600000</td>\n",
|
1118 |
+
" </tr>\n",
|
1119 |
+
" <tr>\n",
|
1120 |
+
" <th>156061</th>\n",
|
1121 |
+
" <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
|
1122 |
+
" <td>2023-04-01</td>\n",
|
1123 |
+
" <td>3.4</td>\n",
|
1124 |
+
" <td>1680300000000</td>\n",
|
1125 |
+
" </tr>\n",
|
1126 |
+
" <tr>\n",
|
1127 |
+
" <th>156062</th>\n",
|
1128 |
+
" <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
|
1129 |
+
" <td>2023-04-02</td>\n",
|
1130 |
+
" <td>3.1</td>\n",
|
1131 |
+
" <td>1680386400000</td>\n",
|
1132 |
+
" </tr>\n",
|
1133 |
+
" <tr>\n",
|
1134 |
+
" <th>156063</th>\n",
|
1135 |
+
" <td>MARYSVILLE - 7TH AVE (Marysville Junior High)</td>\n",
|
1136 |
+
" <td>2023-04-03</td>\n",
|
1137 |
+
" <td>4.4</td>\n",
|
1138 |
+
" <td>1680472800000</td>\n",
|
1139 |
+
" </tr>\n",
|
1140 |
+
" </tbody>\n",
|
1141 |
+
"</table>\n",
|
1142 |
+
"<p>156064 rows × 4 columns</p>\n",
|
1143 |
+
"</div>"
|
1144 |
+
],
|
1145 |
+
"text/plain": [
|
1146 |
+
" city_name date pm2_5 \\\n",
|
1147 |
+
"0 Amsterdam 2013-01-01 14.0 \n",
|
1148 |
+
"1 Amsterdam 2013-01-02 8.0 \n",
|
1149 |
+
"2 Amsterdam 2013-01-03 12.0 \n",
|
1150 |
+
"3 Amsterdam 2013-01-04 12.0 \n",
|
1151 |
+
"4 Amsterdam 2013-01-05 14.0 \n",
|
1152 |
+
"... ... ... ... \n",
|
1153 |
+
"156059 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-03-30 7.9 \n",
|
1154 |
+
"156060 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-03-31 3.7 \n",
|
1155 |
+
"156061 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-04-01 3.4 \n",
|
1156 |
+
"156062 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-04-02 3.1 \n",
|
1157 |
+
"156063 MARYSVILLE - 7TH AVE (Marysville Junior High) 2023-04-03 4.4 \n",
|
1158 |
+
"\n",
|
1159 |
+
" unix_time \n",
|
1160 |
+
"0 1356994800000 \n",
|
1161 |
+
"1 1357081200000 \n",
|
1162 |
+
"2 1357167600000 \n",
|
1163 |
+
"3 1357254000000 \n",
|
1164 |
+
"4 1357340400000 \n",
|
1165 |
+
"... ... \n",
|
1166 |
+
"156059 1680127200000 \n",
|
1167 |
+
"156060 1680213600000 \n",
|
1168 |
+
"156061 1680300000000 \n",
|
1169 |
+
"156062 1680386400000 \n",
|
1170 |
+
"156063 1680472800000 \n",
|
1171 |
+
"\n",
|
1172 |
+
"[156064 rows x 4 columns]"
|
1173 |
+
]
|
1174 |
+
},
|
1175 |
+
"execution_count": 27,
|
1176 |
+
"metadata": {},
|
1177 |
+
"output_type": "execute_result"
|
1178 |
+
}
|
1179 |
+
],
|
1180 |
+
"source": [
|
1181 |
+
"df_air_quality"
|
1182 |
+
]
|
1183 |
+
},
|
1184 |
+
{
|
1185 |
+
"cell_type": "markdown",
|
1186 |
+
"id": "f2ebd846-0420-4e4c-8a5b-0827fa91c693",
|
1187 |
+
"metadata": {},
|
1188 |
+
"source": [
|
1189 |
+
"---"
|
1190 |
+
]
|
1191 |
+
},
|
1192 |
+
{
|
1193 |
+
"cell_type": "markdown",
|
1194 |
+
"id": "cb6f83ba",
|
1195 |
+
"metadata": {},
|
1196 |
+
"source": [
|
1197 |
+
"### <span style=\"color:#ff5f27;\"> 🔮 Connecting to Hopsworks Feature Store </span>"
|
1198 |
+
]
|
1199 |
+
},
|
1200 |
+
{
|
1201 |
+
"cell_type": "code",
|
1202 |
+
"execution_count": 29,
|
1203 |
+
"id": "dd068240",
|
1204 |
+
"metadata": {},
|
1205 |
+
"outputs": [
|
1206 |
+
{
|
1207 |
+
"name": "stdout",
|
1208 |
+
"output_type": "stream",
|
1209 |
+
"text": [
|
1210 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
1211 |
+
"Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated\n",
|
1212 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n",
|
1213 |
+
"\n",
|
1214 |
+
"Multiple projects found. \n",
|
1215 |
+
"\n",
|
1216 |
+
"\t (1) annikaij\n",
|
1217 |
+
"\t (2) miknie20\n",
|
1218 |
+
"\n",
|
1219 |
+
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/549019\n",
|
1220 |
+
"Connected. Call `.close()` to terminate connection gracefully.\n"
|
1221 |
+
]
|
1222 |
+
}
|
1223 |
+
],
|
1224 |
+
"source": [
|
1225 |
+
"import hopsworks\n",
|
1226 |
+
"\n",
|
1227 |
+
"project = hopsworks.login()\n",
|
1228 |
+
"\n",
|
1229 |
+
"fs = project.get_feature_store() "
|
1230 |
+
]
|
1231 |
+
},
|
1232 |
+
{
|
1233 |
+
"cell_type": "code",
|
1234 |
+
"execution_count": 30,
|
1235 |
+
"id": "71db5ac1",
|
1236 |
+
"metadata": {},
|
1237 |
+
"outputs": [
|
1238 |
+
{
|
1239 |
+
"data": {
|
1240 |
+
"text/plain": [
|
1241 |
+
"{\"expectation_type\": \"expect_column_values_to_be_between\", \"kwargs\": {\"column\": \"pm2_5\", \"min_value\": 0.0, \"max_value\": 1000.0}, \"meta\": {}}"
|
1242 |
+
]
|
1243 |
+
},
|
1244 |
+
"execution_count": 30,
|
1245 |
+
"metadata": {},
|
1246 |
+
"output_type": "execute_result"
|
1247 |
+
}
|
1248 |
+
],
|
1249 |
+
"source": [
|
1250 |
+
"from great_expectations.core import ExpectationSuite, ExpectationConfiguration\n",
|
1251 |
+
"\n",
|
1252 |
+
"expectation_suite = ExpectationSuite(expectation_suite_name=\"pmi_data\")\n",
|
1253 |
+
"\n",
|
1254 |
+
"expectation_suite.add_expectation(\n",
|
1255 |
+
" ExpectationConfiguration(\n",
|
1256 |
+
" expectation_type=\"expect_column_values_to_be_between\",\n",
|
1257 |
+
" kwargs={\n",
|
1258 |
+
" \"column\": \"pm2_5\", \n",
|
1259 |
+
" \"min_value\": 0.0,\n",
|
1260 |
+
" \"max_value\": 1000.0,\n",
|
1261 |
+
" }\n",
|
1262 |
+
" )\n",
|
1263 |
+
")"
|
1264 |
+
]
|
1265 |
+
},
|
1266 |
+
{
|
1267 |
+
"cell_type": "markdown",
|
1268 |
+
"id": "63d8c3b9",
|
1269 |
+
"metadata": {},
|
1270 |
+
"source": [
|
1271 |
+
"## <span style=\"color:#ff5f27;\">🪄 Creating Feature Groups</span>"
|
1272 |
+
]
|
1273 |
+
},
|
1274 |
+
{
|
1275 |
+
"cell_type": "markdown",
|
1276 |
+
"id": "4a2515c4",
|
1277 |
+
"metadata": {},
|
1278 |
+
"source": [
|
1279 |
+
"### <span style='color:#ff5f27'> 🌫 Air Quality Data"
|
1280 |
+
]
|
1281 |
+
},
|
1282 |
+
{
|
1283 |
+
"cell_type": "code",
|
1284 |
+
"execution_count": 31,
|
1285 |
+
"id": "9d7088a8",
|
1286 |
+
"metadata": {
|
1287 |
+
"scrolled": true,
|
1288 |
+
"tags": []
|
1289 |
+
},
|
1290 |
+
"outputs": [],
|
1291 |
+
"source": [
|
1292 |
+
"air_quality_fg = fs.get_or_create_feature_group(\n",
|
1293 |
+
" name='air_quality',\n",
|
1294 |
+
" description='Air Quality characteristics of each day',\n",
|
1295 |
+
" version=1,\n",
|
1296 |
+
" primary_key=['city_name'], #'unix_time',\n",
|
1297 |
+
" online_enabled=False,\n",
|
1298 |
+
" expectation_suite = expectation_suite,\n",
|
1299 |
+
" event_time=\"unix_time\"\n",
|
1300 |
+
") "
|
1301 |
+
]
|
1302 |
+
},
|
1303 |
+
{
|
1304 |
+
"cell_type": "code",
|
1305 |
+
"execution_count": 32,
|
1306 |
+
"id": "7e04a975-bb58-42e2-9abd-90e68ae37864",
|
1307 |
+
"metadata": {},
|
1308 |
+
"outputs": [
|
1309 |
+
{
|
1310 |
+
"name": "stdout",
|
1311 |
+
"output_type": "stream",
|
1312 |
+
"text": [
|
1313 |
+
"Feature Group created successfully, explore it at \n",
|
1314 |
+
"https://c.app.hopsworks.ai:443/p/549019/fs/544841/fg/758117\n",
|
1315 |
+
"Validation failed.\n",
|
1316 |
+
"Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/549019/fs/544841/fg/758117\n"
|
1317 |
+
]
|
1318 |
+
},
|
1319 |
+
{
|
1320 |
+
"name": "stderr",
|
1321 |
+
"output_type": "stream",
|
1322 |
+
"text": [
|
1323 |
+
"Uploading Dataframe: 100.00% |██████████| Rows 156064/156064 | Elapsed Time: 00:16 | Remaining Time: 00:00\n"
|
1324 |
+
]
|
1325 |
+
},
|
1326 |
+
{
|
1327 |
+
"name": "stdout",
|
1328 |
+
"output_type": "stream",
|
1329 |
+
"text": [
|
1330 |
+
"Launching job: air_quality_1_offline_fg_materialization\n",
|
1331 |
+
"Job started successfully, you can follow the progress at \n",
|
1332 |
+
"https://c.app.hopsworks.ai/p/549019/jobs/named/air_quality_1_offline_fg_materialization/executions\n"
|
1333 |
+
]
|
1334 |
+
},
|
1335 |
+
{
|
1336 |
+
"data": {
|
1337 |
+
"text/plain": [
|
1338 |
+
"(<hsfs.core.job.Job at 0x7fb0dce8c6d0>,\n",
|
1339 |
+
" {\n",
|
1340 |
+
" \"evaluation_parameters\": {},\n",
|
1341 |
+
" \"success\": false,\n",
|
1342 |
+
" \"statistics\": {\n",
|
1343 |
+
" \"evaluated_expectations\": 1,\n",
|
1344 |
+
" \"successful_expectations\": 0,\n",
|
1345 |
+
" \"unsuccessful_expectations\": 1,\n",
|
1346 |
+
" \"success_percent\": 0.0\n",
|
1347 |
+
" },\n",
|
1348 |
+
" \"results\": [\n",
|
1349 |
+
" {\n",
|
1350 |
+
" \"exception_info\": {\n",
|
1351 |
+
" \"raised_exception\": false,\n",
|
1352 |
+
" \"exception_message\": null,\n",
|
1353 |
+
" \"exception_traceback\": null\n",
|
1354 |
+
" },\n",
|
1355 |
+
" \"expectation_config\": {\n",
|
1356 |
+
" \"expectation_type\": \"expect_column_values_to_be_between\",\n",
|
1357 |
+
" \"kwargs\": {\n",
|
1358 |
+
" \"column\": \"pm2_5\",\n",
|
1359 |
+
" \"min_value\": 0.0,\n",
|
1360 |
+
" \"max_value\": 1000.0\n",
|
1361 |
+
" },\n",
|
1362 |
+
" \"meta\": {\n",
|
1363 |
+
" \"expectationId\": 473089\n",
|
1364 |
+
" }\n",
|
1365 |
+
" },\n",
|
1366 |
+
" \"success\": false,\n",
|
1367 |
+
" \"result\": {\n",
|
1368 |
+
" \"element_count\": 156064,\n",
|
1369 |
+
" \"missing_count\": 0,\n",
|
1370 |
+
" \"missing_percent\": 0.0,\n",
|
1371 |
+
" \"unexpected_count\": 84,\n",
|
1372 |
+
" \"unexpected_percent\": 0.05382407217551774,\n",
|
1373 |
+
" \"unexpected_percent_total\": 0.05382407217551774,\n",
|
1374 |
+
" \"unexpected_percent_nonmissing\": 0.05382407217551774,\n",
|
1375 |
+
" \"partial_unexpected_list\": [\n",
|
1376 |
+
" -1.0,\n",
|
1377 |
+
" -1.0,\n",
|
1378 |
+
" -1.0,\n",
|
1379 |
+
" -1.0,\n",
|
1380 |
+
" -0.2,\n",
|
1381 |
+
" -0.1,\n",
|
1382 |
+
" -1.2,\n",
|
1383 |
+
" -1.2,\n",
|
1384 |
+
" -1.1,\n",
|
1385 |
+
" -0.9,\n",
|
1386 |
+
" -0.6,\n",
|
1387 |
+
" -0.2,\n",
|
1388 |
+
" -1.0,\n",
|
1389 |
+
" -0.5,\n",
|
1390 |
+
" -0.7,\n",
|
1391 |
+
" -0.1,\n",
|
1392 |
+
" -0.4,\n",
|
1393 |
+
" -0.5,\n",
|
1394 |
+
" -0.1,\n",
|
1395 |
+
" -0.2\n",
|
1396 |
+
" ]\n",
|
1397 |
+
" },\n",
|
1398 |
+
" \"meta\": {\n",
|
1399 |
+
" \"ingestionResult\": \"INGESTED\",\n",
|
1400 |
+
" \"validationTime\": \"2024-04-27T01:53:43.000307Z\"\n",
|
1401 |
+
" }\n",
|
1402 |
+
" }\n",
|
1403 |
+
" ],\n",
|
1404 |
+
" \"meta\": {\n",
|
1405 |
+
" \"great_expectations_version\": \"0.15.12\",\n",
|
1406 |
+
" \"expectation_suite_name\": \"pmi_data\",\n",
|
1407 |
+
" \"run_id\": {\n",
|
1408 |
+
" \"run_name\": null,\n",
|
1409 |
+
" \"run_time\": \"2024-04-27T13:53:43.307739+00:00\"\n",
|
1410 |
+
" },\n",
|
1411 |
+
" \"batch_kwargs\": {\n",
|
1412 |
+
" \"ge_batch_id\": \"8f57f63a-049d-11ef-9d82-e2cf145aedc8\"\n",
|
1413 |
+
" },\n",
|
1414 |
+
" \"batch_markers\": {},\n",
|
1415 |
+
" \"batch_parameters\": {},\n",
|
1416 |
+
" \"validation_time\": \"20240427T135343.307573Z\",\n",
|
1417 |
+
" \"expectation_suite_meta\": {\n",
|
1418 |
+
" \"great_expectations_version\": \"0.15.12\"\n",
|
1419 |
+
" }\n",
|
1420 |
+
" }\n",
|
1421 |
+
" })"
|
1422 |
+
]
|
1423 |
+
},
|
1424 |
+
"execution_count": 32,
|
1425 |
+
"metadata": {},
|
1426 |
+
"output_type": "execute_result"
|
1427 |
+
}
|
1428 |
+
],
|
1429 |
+
"source": [
|
1430 |
+
"air_quality_fg.insert(df_air_quality, write_options={\"wait_for_job\": False})"
|
1431 |
+
]
|
1432 |
+
},
|
1433 |
+
{
|
1434 |
+
"cell_type": "markdown",
|
1435 |
+
"id": "a73a9029",
|
1436 |
+
"metadata": {},
|
1437 |
+
"source": [
|
1438 |
+
"### <span style='color:#ff5f27'> 🌦 Weather Data"
|
1439 |
+
]
|
1440 |
+
},
|
1441 |
+
{
|
1442 |
+
"cell_type": "code",
|
1443 |
+
"execution_count": 33,
|
1444 |
+
"id": "acc2b799",
|
1445 |
+
"metadata": {},
|
1446 |
+
"outputs": [],
|
1447 |
+
"source": [
|
1448 |
+
"weather_fg = fs.get_or_create_feature_group(\n",
|
1449 |
+
" name='weather',\n",
|
1450 |
+
" description='Weather characteristics of each day',\n",
|
1451 |
+
" version=1,\n",
|
1452 |
+
" primary_key=['city_name'], #'unix_time'\n",
|
1453 |
+
" online_enabled=False,\n",
|
1454 |
+
" event_time=\"unix_time\"\n",
|
1455 |
+
") "
|
1456 |
+
]
|
1457 |
+
},
|
1458 |
+
{
|
1459 |
+
"cell_type": "code",
|
1460 |
+
"execution_count": 34,
|
1461 |
+
"id": "9583b4d1-e2e3-4f56-9e5d-23caa0c49457",
|
1462 |
+
"metadata": {
|
1463 |
+
"tags": []
|
1464 |
+
},
|
1465 |
+
"outputs": [
|
1466 |
+
{
|
1467 |
+
"name": "stdout",
|
1468 |
+
"output_type": "stream",
|
1469 |
+
"text": [
|
1470 |
+
"Feature Group created successfully, explore it at \n",
|
1471 |
+
"https://c.app.hopsworks.ai:443/p/549019/fs/544841/fg/760147\n"
|
1472 |
+
]
|
1473 |
+
},
|
1474 |
+
{
|
1475 |
+
"name": "stderr",
|
1476 |
+
"output_type": "stream",
|
1477 |
+
"text": [
|
1478 |
+
"Uploading Dataframe: 100.00% |██████████| Rows 169515/169515 | Elapsed Time: 00:22 | Remaining Time: 00:00\n"
|
1479 |
+
]
|
1480 |
+
},
|
1481 |
+
{
|
1482 |
+
"name": "stdout",
|
1483 |
+
"output_type": "stream",
|
1484 |
+
"text": [
|
1485 |
+
"Launching job: weather_1_offline_fg_materialization\n",
|
1486 |
+
"Job started successfully, you can follow the progress at \n",
|
1487 |
+
"https://c.app.hopsworks.ai/p/549019/jobs/named/weather_1_offline_fg_materialization/executions\n"
|
1488 |
+
]
|
1489 |
+
},
|
1490 |
+
{
|
1491 |
+
"data": {
|
1492 |
+
"text/plain": [
|
1493 |
+
"(<hsfs.core.job.Job at 0x7fb0dcedaf50>, None)"
|
1494 |
+
]
|
1495 |
+
},
|
1496 |
+
"execution_count": 34,
|
1497 |
+
"metadata": {},
|
1498 |
+
"output_type": "execute_result"
|
1499 |
+
}
|
1500 |
+
],
|
1501 |
+
"source": [
|
1502 |
+
"weather_fg.insert(df_weather, write_options={\"wait_for_job\": False})"
|
1503 |
+
]
|
1504 |
+
},
|
1505 |
+
{
|
1506 |
+
"cell_type": "code",
|
1507 |
+
"execution_count": null,
|
1508 |
+
"id": "b087a12f",
|
1509 |
+
"metadata": {},
|
1510 |
+
"outputs": [],
|
1511 |
+
"source": []
|
1512 |
+
}
|
1513 |
+
],
|
1514 |
+
"metadata": {
|
1515 |
+
"kernelspec": {
|
1516 |
+
"display_name": "ucloud-sml",
|
1517 |
+
"language": "python",
|
1518 |
+
"name": "python3"
|
1519 |
+
},
|
1520 |
+
"language_info": {
|
1521 |
+
"codemirror_mode": {
|
1522 |
+
"name": "ipython",
|
1523 |
+
"version": 3
|
1524 |
+
},
|
1525 |
+
"file_extension": ".py",
|
1526 |
+
"mimetype": "text/x-python",
|
1527 |
+
"name": "python",
|
1528 |
+
"nbconvert_exporter": "python",
|
1529 |
+
"pygments_lexer": "ipython3",
|
1530 |
+
"version": "3.11.9"
|
1531 |
+
}
|
1532 |
+
},
|
1533 |
+
"nbformat": 4,
|
1534 |
+
"nbformat_minor": 5
|
1535 |
+
}
|
air_quality-main/2_feature_pipeline.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
|
3 |
+
import modal
|
4 |
+
import datetime
|
5 |
+
import time
|
6 |
+
import requests
|
7 |
+
import pandas as pd
|
8 |
+
import json
|
9 |
+
import hopsworks
|
10 |
+
from functions import *
|
11 |
+
import warnings
|
12 |
+
from urllib.request import urlopen
|
13 |
+
warnings.filterwarnings("ignore")
|
14 |
+
|
15 |
+
stub = modal.Stub("air_quality_daily")
|
16 |
+
image = modal.Image.debian_slim().pip_install(["hopsworks", "geopy"])
|
17 |
+
|
18 |
+
|
19 |
+
def features():
    """Fetch the latest air-quality and weather data for every target city.

    Pulls PM2.5 readings for yesterday..today and the weather forecast for
    today..today+7d from Open-Meteo (via helpers in ``functions``), for each
    city listed in the remote ``target_cities.json``.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(df_aq_update, df_weather_update)``;
        both have a string ``date`` column and a ``unix_time`` column in
        milliseconds (the event-time key of the feature groups).
    """
    # target_cities maps continent -> {city_name: [lat, lon]}.
    target_url = 'https://repo.hops.works/dev/jdowling/target_cities.json'
    response = urlopen(target_url)
    target_cities = json.loads(response.read())

    today = datetime.date.today()
    hindcast_day = today - datetime.timedelta(days=1)
    forecast_day = today + datetime.timedelta(days=7)

    # --- Air quality: yesterday's PM2.5 for every city. ---
    start_of_cell = time.time()
    df_aq_update = pd.DataFrame()
    for continent in target_cities:
        for city_name, coords in target_cities[continent].items():
            df_ = get_aqi_data_from_open_meteo(city_name=city_name,
                                               coordinates=coords,
                                               start_date=str(hindcast_day),
                                               end_date=str(today))
            df_aq_update = pd.concat([df_aq_update, df_]).reset_index(drop=True)

    end_of_cell = time.time()
    print("-" * 64)
    print(f"Parsed new PM2.5 data for ALL locations up to {str(today)}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

    df_aq_update['date'] = pd.to_datetime(df_aq_update['date'])
    df_aq_update = df_aq_update.dropna()

    # --- Weather: 7-day forecast for every city. ---
    df_weather_update = pd.DataFrame()
    start_of_cell = time.time()
    for continent in target_cities:
        for city_name, coords in target_cities[continent].items():
            df_ = get_weather_data_from_open_meteo(city_name=city_name,
                                                   coordinates=coords,
                                                   start_date=str(today),
                                                   end_date=str(forecast_day),
                                                   forecast=True)
            df_weather_update = pd.concat([df_weather_update, df_]).reset_index(drop=True)

    end_of_cell = time.time()
    print("-" * 64)
    print(f"Parsed new weather data for ALL cities up to {str(today)}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

    # Fix: the air-quality dates were already converted to datetime above; the
    # original converted them a second time here redundantly.
    df_weather_update.date = pd.to_datetime(df_weather_update.date)

    # Event-time column in Unix milliseconds, required by the feature groups.
    df_aq_update["unix_time"] = df_aq_update["date"].apply(convert_date_to_unix)
    df_weather_update["unix_time"] = df_weather_update["date"].apply(convert_date_to_unix)

    # The feature groups store the date column as a string.
    df_aq_update.date = df_aq_update.date.astype(str)
    df_weather_update.date = df_weather_update.date.astype(str)

    return df_aq_update, df_weather_update
|
81 |
+
|
82 |
+
@stub.function(image=image, schedule=modal.Period(days=1), secret=modal.Secret.from_name("jim-hopsworks-gcp"))
def g():
    """Daily Modal job: pull fresh data and insert it into the feature store.

    Logs into Hopsworks (credentials come from the Modal secret), then writes
    the new air-quality and weather rows into feature groups v1.
    """
    df_aq_update, df_weather_update = features()

    project = hopsworks.login()
    fs = project.get_feature_store()

    # Existing feature groups created by the backfill notebook.
    air_quality_fg = fs.get_feature_group(name='air_quality', version=1)
    weather_fg = fs.get_feature_group(name='weather', version=1)

    # Fire-and-forget inserts: don't block on the materialization job.
    air_quality_fg.insert(df_aq_update, write_options={"wait_for_job": False})
    weather_fg.insert(df_weather_update, write_options={"wait_for_job": False})
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
|
103 |
+
|
104 |
+
# Local entry point: deploy the scheduled Modal app, then trigger one run now.
if __name__ == "__main__":
    stub.deploy("air_quality_daily")
    with stub.run():
        g()
|
108 |
+
|
air_quality-main/3_training_dataset_and_modeling.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
air_quality-main/README.md
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# <span style="font-width:bold; font-size: 3rem; color:#1EB182;"><img src="../../images/icon102.png" width="38px"></img> **Hopsworks Feature Store** </span><span style="font-width:bold; font-size: 3rem; color:#333;">Advanced Tutorial - Air Quality Prediction</span>
|
2 |
+
|
3 |
+
|
4 |
+
<span style="font-width:bold; font-size: 1.4rem;">
|
5 |
+
This is an <b>advanced example</b> of the Hopsworks <a href="https://www.hopsworks.ai/feature-store">Feature Store</a> usage; you are tasked with predicting the Air Quality value <a href="https://en.wikipedia.org/wiki/Particulates">(PM2.5)</a> in Europe and USA using weather features and air quality features of the previous days.
|
6 |
+
|
7 |
+
> The [Feature Store](https://www.hopsworks.ai/feature-store) is the essential part of AI infrastructure that helps organisations bring modern enterprise data to analytical and operational ML systems. It is the simplest most powerful way to get your models to production. From anywhere, to anywhere.
|
8 |
+
You will load starting data into the feature store, create two feature groups from which we will make a feature view and training dataset, and train a model to predict air quality (PM2.5).
|
9 |
+
Also, you will design a data-generating and Feature Store insertion pipeline that runs on a schedule using <b>GitHub actions</b>.
|
10 |
+
|
11 |
+
<b>Streamlit</b> app will be created so you would be able to try your model on different cities interactively.
|
12 |
+
|
13 |
+
This is a <b>batch use case</b>, it will give you a high-level view of how to use our python APIs and the UI to navigate the feature groups.
|
14 |
+
</span>
|
15 |
+
|
16 |
+
## **🗒️ This whole tutorial is divided into 4 parts:**
|
17 |
+
1. Backfill Features to the Feature Store,
|
18 |
+
2. Create a feature pipeline,
|
19 |
+
3. Create Feature view & Training Datasets, train a model and upload it to the Model Registry,
|
20 |
+
4. Deploy Streamlit app.
|
21 |
+
|
22 |
+
|
23 |
+
## Prerequisites
|
24 |
+
To run this tutorial, you need an account on Hopsworks. You can create a new account at [app.hopsworks.ai](https://app.hopsworks.ai).
|
25 |
+
In the notebook you will be prompted with a link to generate an API token to interact with your Hopsworks account.
|
26 |
+
|
27 |
+
Also, you are required to have some Python libraries installed (see the `requirements.txt` inside this folder).
|
28 |
+
|
29 |
+
|
30 |
+
## Data
|
31 |
+
The data for this project was collected using several different APIs. I used [European Environmental Agency](https://discomap.eea.europa.eu/map/fme/AirQualityExport.htm) to collect data on European cities, and [United States Environmental Protection Agency](https://aqs.epa.gov/aqsweb/documents/data_api.html#daily) for American cities. Both are free and publicly available. They don't require registration or API keys.
|
32 |
+
For the city of Seattle, I found and downloaded the data manually from [here](https://www.epa.gov/outdoor-air-quality-data/download-daily-data).
|
33 |
+
|
34 |
+
In `feature pipeline` you will use a free [Open-Meteo](https://open-meteo.com/en/docs/air-quality-api) API that covers many places in the world (works by coordinates).
|
35 |
+
|
36 |
+
The reason I use so many different APIs instead of just one Open-Meteo is that Open-Meteo only has data from 2022-07-29.
|
37 |
+
|
38 |
+
Anyway, as I said earlier, you don't need any registration or API keys to use any of the above APIs.
|
39 |
+
|
40 |
+
|
41 |
+
## Streamlit run
|
42 |
+
To run streamlit app (after you have run all notebooks and already have required feature groups in Feature Store and model in Model Registry), simply type:
|
43 |
+
|
44 |
+
`python -m streamlit run streamlit_app.py` on Windows
|
45 |
+
|
46 |
+
or
|
47 |
+
|
48 |
+
`python3 -m streamlit run streamlit_app.py` on Unix
|
49 |
+
|
50 |
+
|
51 |
+
## Streamlit usage examples
|
52 |
+
![1.png](images/1.png)
|
53 |
+
![2.png](images/2.png)
|
54 |
+
![3.png](images/3.png)
|
55 |
+
![4.png](images/4.png)
|
56 |
+
![5.png](images/5.png)
|
57 |
+
![6.png](images/6.png)
|
58 |
+
|
59 |
+
|
60 |
+
### Edited
|
air_quality-main/air_quality_model/residplot.png
ADDED
air_quality-main/air_quality_model/xgboost_pipeline.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d4c05647dc4d02eac4dd6463a6913c052a1f60dc1e835f12f67e2e5dc70dfa6f
|
3 |
+
size 355567
|
air_quality-main/app.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import time
|
3 |
+
import pickle
|
4 |
+
import joblib
|
5 |
+
|
6 |
+
import hopsworks
|
7 |
+
import streamlit as st
|
8 |
+
from geopy import distance
|
9 |
+
|
10 |
+
import plotly.express as px
|
11 |
+
import folium
|
12 |
+
from streamlit_folium import st_folium
|
13 |
+
|
14 |
+
from functions import *
|
15 |
+
|
16 |
+
|
17 |
+
|
18 |
+
def print_fancy_header(text, font_size=22, color="#ff5f27"):
    """Render *text* in Streamlit as a colored, sized HTML span."""
    styled = f'<span style="color:{color}; font-size: {font_size}px;">{text}</span>'
    st.markdown(styled, unsafe_allow_html=True)
|
21 |
+
|
22 |
+
@st.cache_data()
def get_batch_data_from_fs(td_version, date_threshold):
    """Fetch batch scoring data from the feature view since *date_threshold*.

    Cached by Streamlit so app reruns don't hit the feature store again.
    Relies on the module-level ``feature_view``.
    """
    st.write(f"Retrieving the Batch data since {date_threshold}")
    feature_view.init_batch_scoring(training_dataset_version=td_version)
    return feature_view.get_batch_data(start_time=date_threshold)
|
29 |
+
|
30 |
+
|
31 |
+
@st.cache_data()
def download_model(name="air_quality_xgboost_model", version=1):
    """Download a model from the Hopsworks Model Registry.

    Args:
        name: registered model name.
        version: model version to fetch.

    Returns:
        str: local directory where the model artifacts were saved.
    """
    mr = project.get_model_registry()
    # Bug fix: the original ignored the `name`/`version` arguments and
    # hard-coded "air_quality_xgboost_model" / 1 instead.
    retrieved_model = mr.get_model(
        name=name,
        version=version
    )
    saved_model_dir = retrieved_model.download()
    return saved_model_dir
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
def plot_pm2_5(df):
    """Plot PM2.5 over time, one line per city, with a dashed red marker at today."""
    fig = px.line(df, x='date', y='pm2_5', color='city_name')
    fig.update_traces(mode='lines+markers')

    today_str = datetime.datetime.now().strftime('%Y-%m-%d')
    layout = {
        'plot_bgcolor': 'rgba(0, 0, 0, 0)',
        'paper_bgcolor': 'rgba(0, 0, 0, 0)',
        'legend_title': 'City',
        'legend_font': {'size': 12},
        'legend_bgcolor': 'rgba(0, 0, 0, 0)',
        'xaxis': {'title': 'Date'},
        'yaxis': {'title': 'PM2.5'},
        # Vertical "today" line separating observed from forecast values.
        'shapes': [{
            'type': 'line',
            'x0': today_str,
            'y0': 0,
            'x1': today_str,
            'y1': df['pm2_5'].max(),
            'line': {'color': 'red', 'width': 2, 'dash': 'dashdot'}
        }]
    }
    fig.update_layout(layout)

    st.plotly_chart(fig, use_container_width=True)
|
69 |
+
|
70 |
+
|
71 |
+
# --- Load the target-city list: continent -> {city_name: [lat, lon]}. ---
with open('target_cities.json') as json_file:
    target_cities = json.load(json_file)


#########################
st.title('🌫 Air Quality Prediction 🌦')

st.write(3 * "-")
print_fancy_header('\n📡 Connecting to Hopsworks Feature Store...')

st.write("Logging... ")
# (Attention! If the app has stopped at this step,
# please enter your Hopsworks API Key in the command prompt.)
project = hopsworks.login()
fs = project.get_feature_store()
st.write("✅ Logged in successfully!")

# NOTE: `datetime` and `pd` are in scope via `from functions import *`
# (functions.py imports them at module level).
st.write("Getting the Feature View...")
feature_view = fs.get_feature_view(
    name = 'air_quality_fv',
    version = 1
)
st.write("✅ Success!")

# Load the last 60 days of data (needed for feature engineering).
today = datetime.date.today()
date_threshold = today - datetime.timedelta(days=60)

st.write(3 * "-")
# NOTE(review): "Retriving" typo lives in a user-facing string — fix separately.
print_fancy_header('\n☁️ Retriving batch data from Feature Store...')
batch_data = get_batch_data_from_fs(td_version=1,
                                    date_threshold=date_threshold)

st.write("Batch data:")
st.write(batch_data.sample(5))


# Download the trained XGBoost pipeline from the Model Registry and load it.
saved_model_dir = download_model(
    name="air_quality_xgboost_model",
    version=1
)

pipeline = joblib.load(saved_model_dir + "/xgboost_pipeline.pkl")
st.write("\n")
st.write("✅ Model was downloaded and cached.")
|
116 |
+
|
117 |
+
st.write(3 * '-')
st.write("\n")
print_fancy_header(text="🖍 Select the cities using the form below. \
Click the 'Submit' button at the bottom of the form to continue.",
                   font_size=22)
# Flatten the nested continent->city mapping into city -> [lat, lon].
dict_for_streamlit = {}
for continent in target_cities:
    for city_name, coords in target_cities[continent].items():
        dict_for_streamlit[city_name] = coords
selected_cities_full_list = []

# City-selection form: a multiselect plus an interactive folium map.
with st.form(key="user_inputs"):
    print_fancy_header(text='\n🗺 Here you can choose cities from the drop-down menu',
                       font_size=20, color="#00FFFF")

    cities_multiselect = st.multiselect(label='',
                                        options=dict_for_streamlit.keys())
    selected_cities_full_list.extend(cities_multiselect)
    st.write("_" * 3)
    print_fancy_header(text="\n📌 To add a city using the interactive map, click somewhere \
(for the coordinates to appear)",
                       font_size=20, color="#00FFFF")

    my_map = folium.Map(location=[42.57, -44.092], zoom_start=2)
    # Add markers for each city
    for city_name, coords in dict_for_streamlit.items():
        folium.CircleMarker(
            location=coords
        ).add_to(my_map)

    my_map.add_child(folium.LatLngPopup())
    res_map = st_folium(my_map, width=640, height=480)

    # Before any click, res_map["last_clicked"] is missing/None, so the
    # KeyError/TypeError below is expected and deliberately swallowed.
    try:
        new_lat, new_long = res_map["last_clicked"]["lat"], res_map["last_clicked"]["lng"]

        # Calculate the distance between the clicked location and each city
        distances = {city: distance.distance(coord, (new_lat, new_long)).km for city, coord in dict_for_streamlit.items()}

        # Find the city with the minimum distance and print its name
        nearest_city = min(distances, key=distances.get)
        print_fancy_header(text=f"You have selected {nearest_city} using map", font_size=18, color="#52fa23")

        selected_cities_full_list.append(nearest_city)
        # NOTE(review): `label_encoder` is not defined in this file —
        # presumably exported by `functions`; verify, else this always raises
        # and is silently caught below.
        st.write(label_encoder.transform([nearest_city])[0])

    except Exception as err:
        print(err)
        pass

    submit_button = st.form_submit_button(label='Submit')

# Runs only after the form is submitted: predict PM2.5 per selected city.
if submit_button:
    st.write('Selected cities:', selected_cities_full_list)

    st.write(3*'-')

    dataset = batch_data

    dataset = dataset.sort_values(by=["city_name", "date"])

    st.write("\n")
    print_fancy_header(text='\n🧠 Predicting PM2.5 for selected cities...',
                       font_size=18, color="#FDF4F5")
    st.write("")
    preds = pd.DataFrame(columns=dataset.columns)
    for city_name in selected_cities_full_list:
        st.write(f"\t * {city_name}...")
        features = dataset.loc[dataset['city_name'] == city_name]
        print(features.head())
        # NOTE(review): assigning into a .loc slice may trigger pandas'
        # SettingWithCopyWarning; `preds` still gets the predicted values.
        features['pm2_5'] = pipeline.predict(features)
        preds = pd.concat([preds, features])

    st.write("")
    print_fancy_header(text="📈Results 📉",
                       font_size=22)
    plot_pm2_5(preds[preds['city_name'].isin(selected_cities_full_list)])

    st.write(3 * "-")
    st.subheader('\n🎉 📈 🤝 App Finished Successfully 🤝 📈 🎉')
    st.button("Re-run")
|
air_quality-main/data/backfill_pm2_5.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/backfill_pm2_5_eu.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/backfill_pm2_5_seattle.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/backfill_pm2_5_us.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/backfill_weather.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:961b3e35cdadab7852e087ba154e4b5046eb82f95df86f5043bb0773a94a529d
|
3 |
+
size 10925454
|
air_quality-main/data/seattle_pm25_2013.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2014.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2015.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2016.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2017.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2018.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2019.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2020.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2021.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2022.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/data/seattle_pm25_2023.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
air_quality-main/functions.py
ADDED
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import datetime
|
3 |
+
import time
|
4 |
+
import requests
|
5 |
+
import pandas as pd
|
6 |
+
import json
|
7 |
+
|
8 |
+
from geopy.geocoders import Nominatim
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
|
13 |
+
def convert_date_to_unix(x):
    """
    Convert a '%Y-%m-%d %H:%M:%S' datetime-like value to unix time in milliseconds.
    """
    # str(x) lets callers pass datetimes or pandas Timestamps, not just strings.
    parsed = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    return int(parsed.timestamp() * 1000)
|
20 |
+
|
21 |
+
|
22 |
+
def get_city_coordinates(city_name: str):
    """
    Look up a city by name and return its (latitude, longitude),
    each rounded to 2 digits after the decimal point.
    """
    # Nominatim is OpenStreetMap's free geocoding service (network call).
    geolocator = Nominatim(user_agent="MyApp")
    location = geolocator.geocode(city_name)

    return round(location.latitude, 2), round(location.longitude, 2)
|
34 |
+
|
35 |
+
|
36 |
+
##################################### EEA
|
37 |
+
def convert_to_daily(df, pollutant: str):
    """
    Resample the pollutant column to one mean value per calendar day,
    fill missing days with the column median, and round the result.
    """
    daily = df.copy()
    daily["date"] = pd.to_datetime(daily["date"])

    # One row per day: average of all readings falling on that date.
    daily = daily.set_index("date")[pollutant].resample("1d").mean().reset_index()

    # Days with no readings come back as NaN -> use the overall median.
    fill_value = daily[pollutant].median()
    daily[pollutant] = daily[pollutant].fillna(fill_value)
    daily[pollutant] = daily[pollutant].apply(lambda v: round(v, 0))

    return daily
|
52 |
+
|
53 |
+
|
54 |
+
def find_fullest_csv(csv_links: list, year: str):
    """
    From the links mentioning *year*, read every CSV and return the one
    with the most rows. Raises IndexError when no link matches the year.
    """
    matching = [link for link in csv_links if str(year) in link]
    fullest = pd.read_csv(matching[0])  # IndexError here signals "no data for year"
    for link in matching[1:]:
        candidate = pd.read_csv(link)
        if len(candidate) > len(fullest):
            fullest = candidate
    return fullest
|
62 |
+
|
63 |
+
|
64 |
+
def get_air_quality_from_eea(city_name: str,
                             pollutant: str,
                             start_year: str,
                             end_year: str):
    """
    Takes city name, daterange and returns pandas DataFrame with daily air quality data.
    It parses data by 1-year batches, so please specify years, not dates. (example: "2014", "2022"...)

    EEA means European Environmental Agency. So it has data for Europe Union countries ONLY.
    """
    start_of_cell = time.time()

    params = {
        'CountryCode': '',
        'CityName': city_name,
        'Pollutant': pollutant.upper(),
        'Year_from': start_year,
        'Year_to': end_year,
        'Station': '',
        'Source': 'All',
        'Samplingpoint': '',
        'Output': 'TEXT',
        'UpdateDate': '',
        'TimeCoverage': 'Year'
    }

    # observations endpoint
    base_url = "https://fme.discomap.eea.europa.eu/fmedatastreaming/AirQualityDownload/AQData_Extract.fmw?"
    # NOTE(review): requests raises requests.exceptions.ConnectionError, not the
    # builtin ConnectionError caught here, so this one-shot retry likely never
    # fires -- kept as-is to avoid changing error behavior.
    try:
        response = requests.get(base_url, params=params)
    except ConnectionError:
        response = requests.get(base_url, params=params)

    response.encoding = response.apparent_encoding
    csv_links = response.text.split("\r\n")

    res_df = pd.DataFrame()

    for year in range(int(start_year), int(end_year) + 1):
        try:
            # find the fullest, the biggest csv file with observations for this particular year
            _df = find_fullest_csv(csv_links, year)
            # append it to res_df
            res_df = pd.concat([res_df, _df])
        except IndexError:
            # BUG FIX: previously interpolated undefined name `city`, which
            # raised NameError whenever a year had no data. Use `city_name`.
            print(f"!! Missing data for {year} for {city_name} city.")

    # Normalize the pollutant label for column naming ("PM2.5" -> "pm2_5").
    pollutant = pollutant.lower()
    if pollutant == "pm2.5":
        pollutant = "pm2_5"

    res_df = res_df.rename(columns={
        'DatetimeBegin': 'date',
        'Concentration': pollutant
    })

    # cut timezones info (last 6 chars, e.g. "+01:00")
    res_df['date'] = res_df['date'].apply(lambda x: x[:-6])
    # convert dates in 'date' column
    res_df['date'] = pd.to_datetime(res_df['date'])

    res_df = convert_to_daily(res_df, pollutant)

    res_df['city_name'] = city_name
    res_df = res_df[['city_name', 'date', pollutant.lower()]]

    end_of_cell = time.time()

    print(f"Processed {pollutant.upper()} for {city_name} since {start_year} till {end_year}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

    return res_df
|
138 |
+
|
139 |
+
|
140 |
+
|
141 |
+
##################################### USEPA
# Lazy cache mapping CBSA (Core-Based Statistical Area) full names to their
# numeric codes; populated on the first call to get_city_code() below.
city_code_dict = {}
# USEPA AQS "param" codes identifying each supported pollutant in API queries.
pollutant_dict = {
    'CO': '42101',
    'SO2': '42401',
    'NO2': '42602',
    'O3': '44201',
    'PM10': '81102',
    'PM2.5': '88101'
}
|
151 |
+
|
152 |
+
def get_city_code(city_name: str):
    """Encode a city name into its USEPA CBSA code for later data parsing."""
    if not city_code_dict:
        # First call: download the full CBSA listing once and cache it.
        creds = {
            "email": "test@aqs.api",
            "key": "test"
        }
        listing = requests.get("https://aqs.epa.gov/data/api/list/cbsas?", creds)
        for entry in listing.json()["Data"]:
            city_code_dict[entry['value_represented']] = entry['code']

    # Substring match: first cached CBSA whose full name contains city_name.
    # matches[0] raises IndexError when the city is unknown, as before.
    matches = [full_name for full_name in city_code_dict.keys() if city_name in full_name]
    return city_code_dict[matches[0]]
|
169 |
+
|
170 |
+
|
171 |
+
def get_air_quality_from_usepa(city_name: str,
                               pollutant: str,
                               start_date: str,
                               end_date: str):
    """
    Takes city name, daterange and returns pandas DataFrame with daily air quality data.

    USEPA means United States Environmental Protection Agency. So it has data for US ONLY.
    """
    started = time.time()
    res_df = pd.DataFrame()

    # The API serves one calendar year per request, so query per-year batches.
    for batch_start, batch_end in make_date_intervals(start_date, end_date):
        params = {
            "email": "test@aqs.api",
            "key": "test",
            "param": pollutant_dict[pollutant.upper().replace("_", ".")],  # encoded pollutant
            "bdate": batch_start,
            "edate": batch_end,
            "cbsa": get_city_code(city_name)  # Core-based statistical area
        }

        # observations endpoint
        base_url = "https://aqs.epa.gov/data/api/dailyData/byCBSA?"

        response = requests.get(base_url, params=params)
        batch_df = pd.DataFrame(response.json()["Data"])

        # Normalize the pollutant label for column naming ("PM2.5" -> "pm2_5").
        # Re-lowering each iteration is harmless: "pm2_5" round-trips through
        # the upper()/replace() lookup above on later iterations.
        pollutant = pollutant.lower()
        if pollutant == "pm2.5":
            pollutant = "pm2_5"
        batch_df = batch_df.rename(columns={
            'date_local': 'date',
            'arithmetic_mean': pollutant
        })

        # convert dates in 'date' column
        batch_df['date'] = pd.to_datetime(batch_df['date'])
        batch_df['city_name'] = city_name
        batch_df = batch_df[['city_name', 'date', pollutant]]
        res_df = pd.concat([res_df, batch_df])

    # there are duplicated rows (several records for the same day and station). get rid of it.
    res_df = res_df.groupby(['date', 'city_name'], as_index=False)[pollutant].mean()
    res_df[pollutant] = round(res_df[pollutant], 1)

    finished = time.time()
    print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(finished - started, 2)} sec.\n")

    return res_df
|
224 |
+
|
225 |
+
|
226 |
+
def make_date_intervals(start_date, end_date):
    """
    Split [start_date, end_date] ('%Y-%m-%d' strings, both inclusive) into
    per-calendar-year intervals, returned as ('%Y%m%d', '%Y%m%d') tuples.
    """
    start_dt = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    date_intervals = []
    for year in range(start_dt.year, end_dt.year + 1):
        year_start = datetime.datetime(year, 1, 1)
        year_end = datetime.datetime(year, 12, 31)
        # Clamp the year to the requested range.
        interval_start = max(start_dt, year_start)
        interval_end = min(end_dt, year_end)
        # BUG FIX: was `<`, which silently dropped valid one-day intervals
        # (e.g. start_date == end_date, or a range ending on Jan 1). The AQS
        # bdate/edate bounds are inclusive, so start == end is a valid query.
        if interval_start <= interval_end:
            date_intervals.append((interval_start.strftime('%Y%m%d'), interval_end.strftime('%Y%m%d')))
    return date_intervals
|
238 |
+
|
239 |
+
##################################### Weather Open Meteo
|
240 |
+
def get_weather_data_from_open_meteo(city_name: str,
                                     start_date: str,
                                     end_date: str,
                                     coordinates: list = None,
                                     forecast: bool = False):
    """
    Takes [city name OR coordinates] and returns pandas DataFrame with weather data.

    Examples of arguments:
        coordinates=(47.755, -122.2806), start_date="2023-01-01"
    """
    started = time.time()

    # Coordinates win when supplied; otherwise geocode the city name.
    if coordinates:
        latitude, longitude = coordinates
    else:
        latitude, longitude = get_city_coordinates(city_name=city_name)

    params = {
        'latitude': latitude,
        'longitude': longitude,
        'daily': ["temperature_2m_max", "temperature_2m_min",
                  "precipitation_sum", "rain_sum", "snowfall_sum",
                  "precipitation_hours", "windspeed_10m_max",
                  "windgusts_10m_max", "winddirection_10m_dominant"],
        'start_date': start_date,
        'end_date': end_date,
        'timezone': "Europe/London"
    }

    # Choose endpoint: forecasts vs archived observations.
    if forecast:
        base_url = 'https://api.open-meteo.com/v1/forecast'
    else:
        base_url = 'https://archive-api.open-meteo.com/v1/archive'

    try:
        response = requests.get(base_url, params=params)
    except ConnectionError:
        response = requests.get(base_url, params=params)

    res_df = pd.DataFrame(response.json()["daily"])
    res_df["city_name"] = city_name

    # rename columns to the project's naming scheme
    res_df = res_df.rename(columns={
        "time": "date",
        "temperature_2m_max": "temperature_max",
        "temperature_2m_min": "temperature_min",
        "windspeed_10m_max": "wind_speed_max",
        "winddirection_10m_dominant": "wind_direction_dominant",
        "windgusts_10m_max": "wind_gusts_max"
    })

    # change columns order
    res_df = res_df[
        ['city_name', 'date', 'temperature_max', 'temperature_min',
         'precipitation_sum', 'rain_sum', 'snowfall_sum',
         'precipitation_hours', 'wind_speed_max',
         'wind_gusts_max', 'wind_direction_dominant']
    ]

    # convert dates in 'date' column
    res_df["date"] = pd.to_datetime(res_df["date"])

    finished = time.time()
    print(f"Parsed weather for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(finished - started, 2)} sec.\n")

    return res_df
|
311 |
+
|
312 |
+
|
313 |
+
##################################### Air Quality data from Open Meteo
|
314 |
+
def get_aqi_data_from_open_meteo(city_name: str,
                                 start_date: str,
                                 end_date: str,
                                 coordinates: list = None,
                                 pollutant: str = "pm2_5"):
    """
    Takes [city name OR coordinates] and returns pandas DataFrame with AQI data.

    Examples of arguments:
        coordinates=(47.755, -122.2806),
        start_date="2023-01-01",
        pollutant="no2"
    """
    started = time.time()

    # Coordinates win when supplied; otherwise geocode the city name.
    if coordinates:
        latitude, longitude = coordinates
    else:
        latitude, longitude = get_city_coordinates(city_name=city_name)

    # Normalize the pollutant label to Open-Meteo's naming.
    pollutant = pollutant.lower()
    if pollutant == "pm2.5":
        pollutant = "pm2_5"
    # make it work with both "no2" and "nitrogen_dioxide" passed.
    if pollutant == "no2":
        pollutant = "nitrogen_dioxide"

    params = {
        'latitude': latitude,
        'longitude': longitude,
        'hourly': [pollutant],
        'start_date': start_date,
        'end_date': end_date,
        'timezone': "Europe/London"
    }

    # base endpoint
    base_url = "https://air-quality-api.open-meteo.com/v1/air-quality"
    try:
        response = requests.get(base_url, params=params)
    except ConnectionError:
        response = requests.get(base_url, params=params)

    res_df = pd.DataFrame(response.json()["hourly"])

    # convert dates
    res_df["time"] = pd.to_datetime(res_df["time"])

    # Hourly -> daily: average every reading sharing a calendar date.
    res_df = res_df.groupby(res_df['time'].dt.date).mean(numeric_only=True).reset_index()
    res_df[pollutant] = round(res_df[pollutant], 1)

    # rename columns
    res_df = res_df.rename(columns={
        "time": "date"
    })

    res_df["city_name"] = city_name

    # change columns order
    res_df = res_df[
        ['city_name', 'date', pollutant]
    ]

    finished = time.time()
    print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(finished - started, 2)} sec.\n")

    return res_df
|
385 |
+
|
air_quality-main/hopsworks-login.sh.example
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
|
4 |
+
export HOPSWORKS_PROJECT=pydata
|
5 |
+
export HOPSWORKS_HOST=35.240.110.235
|
6 |
+
export HOPSWORKS_API_KEY=DDDDD
|
7 |
+
|
8 |
+
export MODAL_API_KEY=ak-DDDD
|
9 |
+
|
air_quality-main/images/1.png
ADDED
air_quality-main/images/2.png
ADDED
air_quality-main/images/3.png
ADDED
air_quality-main/images/4.png
ADDED
air_quality-main/images/5.png
ADDED
air_quality-main/images/6.png
ADDED
air_quality-main/requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
hopsworks==3.2.0rc0
|
2 |
+
geopy
|
3 |
+
python-dotenv
|
4 |
+
streamlit
|
5 |
+
streamlit-folium
|
6 |
+
joblib
|
7 |
+
plotly
|
8 |
+
nbconvert
|
air_quality-main/scripts/run-feature-pipeline.sh
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
set -e
|
4 |
+
|
5 |
+
jupyter nbconvert --to notebook --execute 2_feature_pipeline.ipynb
|
air_quality-main/target_cities.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"EU": {"Amsterdam": [52.37, 4.89], "Athina": [37.98, 23.73], "Berlin": [52.52, 13.39], "Gdansk": [54.37, 18.61], "Krak\u00f3w": [50.06, 19.94], "London": [51.51, -0.13], "Madrid": [40.42, -3.7], "Marseille": [43.3, 5.37], "Milano": [45.46, 9.19], "M\u00fcnchen": [48.14, 11.58], "Napoli": [40.84, 14.25], "Paris": [48.85, 2.35], "Sevilla": [37.39, -6.0], "Stockholm": [59.33, 18.07], "Tallinn": [59.44, 24.75], "Varna": [43.21, 27.92], "Wien": [48.21, 16.37]}, "US": {"Albuquerque": [35.08, -106.65], "Atlanta": [33.75, -84.39], "Chicago": [41.88, -87.62], "Columbus": [39.96, -83.0], "Dallas": [32.78, -96.8], "Denver": [39.74, -104.98], "Houston": [29.76, -95.37], "Los Angeles": [34.05, -118.24], "New York": [40.71, -74.01], "Phoenix-Mesa": [33.66, -112.04], "Salt Lake City": [40.76, -111.89], "San Francisco": [37.78, -122.42], "Tampa": [27.95, -82.46]}, "Seattle": {"Bellevue-SE 12th St": [47.60086, -122.1484], "DARRINGTON - FIR ST (Darrington High School)": [48.2469, -121.6031], "KENT - JAMES & CENTRAL": [47.38611, -122.23028], "LAKE FOREST PARK TOWNE CENTER": [47.755, -122.2806], "MARYSVILLE - 7TH AVE (Marysville Junior High)": [48.05432, -122.17153], "NORTH BEND - NORTH BEND WAY": [47.49022, -121.77278], "SEATTLE - BEACON HILL": [47.56824, -122.30863], "SEATTLE - DUWAMISH": [47.55975, -122.33827], "SEATTLE - SOUTH PARK #2": [47.53091, -122.3208], "Seattle-10th & Weller": [47.59722, -122.31972], "TACOMA - ALEXANDER AVE": [47.2656, -122.3858], "TACOMA - L STREET": [47.1864, -122.4517], "Tacoma-S 36th St": [47.22634, -122.46256], "Tukwila Allentown": [47.49854, -122.27839], "Tulalip-Totem Beach Rd": [48.06534, -122.28519]}}
|