Robzy committed on
Commit
2e28476
•
1 Parent(s): be43693

finalizing

Browse files
air_quality_model/images/feature_importance.png ADDED
air_quality_model/images/pm25_hindcast.png ADDED
air_quality_model/model.json ADDED
The diff for this file is too large to render. See raw diff
 
app_streamlit.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
3
  import numpy as np
4
  import datetime
5
  import hopsworks
6
- from functions import figure, util
7
  import os
8
  import pickle
9
  import plotly.express as px
@@ -13,17 +13,18 @@ import os
13
 
14
 
15
  # Real data
16
- #df = get_merged_dataframe()
 
 
17
 
18
  # Dummy data
19
- size = 400
20
- data = {
21
- 'date': pd.date_range(start='2023-01-01', periods=size, freq='D'),
22
- 'pm25': np.random.randint(50, 150, size=size),
23
- 'predicted_pm25': np.random.randint(50, 150, size=size)
24
- }
25
- df = pd.DataFrame(data)
26
-
27
 
28
  # Page configuration
29
 
@@ -42,5 +43,5 @@ st.subheader('Forecast and hindcast')
42
  st.subheader('Unit: PM25 - particulate matter of diameter < 2.5 micrometers')
43
 
44
  # Plotting
45
- fig = figure.plot(df)
46
  st.plotly_chart(fig, use_container_width=True)
 
3
  import numpy as np
4
  import datetime
5
  import hopsworks
6
+ from functions import figure, retrieve
7
  import os
8
  import pickle
9
  import plotly.express as px
 
13
 
14
 
15
  # Real data
16
+ today = datetime.today().strftime('%Y-%m-%d')
17
+ df = retrieve.get_merged_dataframe()
18
+ n = len(df[df['pm25'].isna()]) - 1
19
 
20
  # Dummy data
21
+ # size = 400
22
+ # data = {
23
+ # 'date': pd.date_range(start='2023-01-01', periods=size, freq='D'),
24
+ # 'pm25': np.random.randint(50, 150, size=size),
25
+ # 'predicted_pm25': np.random.randint(50, 150, size=size)
26
+ # }
27
+ # df = pd.DataFrame(data)
 
28
 
29
  # Page configuration
30
 
 
43
  st.subheader('Unit: PM25 - particulate matter of diameter < 2.5 micrometers')
44
 
45
  # Plotting
46
+ fig = figure.plot(df, n=n)
47
  st.plotly_chart(fig, use_container_width=True)
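
For orientation, the new plotting call reads together with n = len(df[df['pm25'].isna()]) - 1: n counts the trailing rows of the merged frame that carry a prediction but no observed pm25 yet. functions/figure.py is not part of this diff, so the following is only a sketch under the assumption that plot(df, n) draws both series and marks where the hindcast ends and the pure forecast begins (note also that datetime.today() in the new code assumes from datetime import datetime is in scope, as in the debug notebook):

import pandas as pd
import plotly.express as px

def plot(df: pd.DataFrame, n: int):
    # Hypothetical sketch of functions/figure.py; the real module is not shown in this commit.
    long_df = df.melt(id_vars='date', value_vars=['pm25', 'predicted_pm25'],
                      var_name='series', value_name='value')
    fig = px.line(long_df, x='date', y='value', color='series',
                  title='PM2.5: observed vs. predicted')
    if n > 0:
        # The last n rows carry only predictions: mark the hindcast/forecast boundary.
        fig.add_vline(x=df['date'].iloc[-n], line_dash='dash')
    return fig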
backfill.ipynb ADDED
@@ -0,0 +1,587 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
13
+ "\n",
14
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1160340\n",
15
+ "2024-11-21 05:38:56,037 WARNING: using legacy validation callback\n",
16
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
17
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
18
+ "Deleted air_quality_fv/1\n",
19
+ "Deleted air_quality/1\n",
20
+ "Deleted weather/1\n",
21
+ "Deleted aq_predictions/1\n",
22
+ "Deleted model air_quality_xgboost_model/1\n",
23
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
24
+ "No SENSOR_LOCATION_JSON secret found\n"
25
+ ]
26
+ }
27
+ ],
28
+ "source": [
29
+ "import datetime\n",
30
+ "import requests\n",
31
+ "import pandas as pd\n",
32
+ "import hopsworks\n",
33
+ "import datetime\n",
34
+ "from pathlib import Path\n",
35
+ "from functions import util\n",
36
+ "import json\n",
37
+ "import re\n",
38
+ "import os\n",
39
+ "import warnings\n",
40
+ "warnings.filterwarnings(\"ignore\")\n",
41
+ "\n",
42
+ "AQI_API_KEY = os.getenv('AQI_API_KEY')\n",
43
+ "api_key = os.getenv('HOPSWORKS_API_KEY')\n",
44
+ "project_name = os.getenv('HOPSWORKS_PROJECT')\n",
45
+ "project = hopsworks.login(project=project_name, api_key_value=api_key)\n",
46
+ "util.purge_project(project)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 2,
52
+ "metadata": {},
53
+ "outputs": [
54
+ {
55
+ "name": "stdout",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "File successfully found at the path: data/lahore.csv\n"
59
+ ]
60
+ }
61
+ ],
62
+ "source": [
63
+ "country=\"pakistan\"\n",
64
+ "city = \"lahore\"\n",
65
+ "street = \"pakistan-lahore-cantonment\"\n",
66
+ "aqicn_url=\"https://api.waqi.info/feed/A74005\"\n",
67
+ "\n",
68
+ "latitude, longitude = util.get_city_coordinates(city)\n",
69
+ "today = datetime.date.today()\n",
70
+ "\n",
71
+ "csv_file=\"data/lahore.csv\"\n",
72
+ "util.check_file_path(csv_file)"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": 3,
78
+ "metadata": {},
79
+ "outputs": [
80
+ {
81
+ "name": "stdout",
82
+ "output_type": "stream",
83
+ "text": [
84
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
85
+ ]
86
+ }
87
+ ],
88
+ "source": [
89
+ "secrets = util.secrets_api(project.name)\n",
90
+ "try:\n",
91
+ " secrets.create_secret(\"AQI_API_KEY\", AQI_API_KEY)\n",
92
+ "except hopsworks.RestAPIError:\n",
93
+ " AQI_API_KEY = secrets.get_secret(\"AQI_API_KEY\").value"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 4,
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "try:\n",
103
+ " aq_today_df = util.get_pm25(aqicn_url, country, city, street, today, AQI_API_KEY)\n",
104
+ "except hopsworks.RestAPIError:\n",
105
+ " print(\"It looks like the AQI_API_KEY doesn't work for your sensor. Is the API key correct? Is the sensor URL correct?\")\n"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "name": "stdout",
115
+ "output_type": "stream",
116
+ "text": [
117
+ "<class 'pandas.core.frame.DataFrame'>\n",
118
+ "RangeIndex: 1802 entries, 0 to 1801\n",
119
+ "Data columns (total 2 columns):\n",
120
+ " # Column Non-Null Count Dtype \n",
121
+ "--- ------ -------------- ----- \n",
122
+ " 0 date 1802 non-null object \n",
123
+ " 1 pm25 1802 non-null float32\n",
124
+ "dtypes: float32(1), object(1)\n",
125
+ "memory usage: 21.2+ KB\n"
126
+ ]
127
+ },
128
+ {
129
+ "data": {
130
+ "text/plain": [
131
+ "'2019-12-09'"
132
+ ]
133
+ },
134
+ "execution_count": 6,
135
+ "metadata": {},
136
+ "output_type": "execute_result"
137
+ }
138
+ ],
139
+ "source": [
140
+ "aq_today_df.head()\n",
141
+ "\n",
142
+ "df = pd.read_csv(csv_file, parse_dates=['date'], skipinitialspace=True)\n",
143
+ "\n",
144
+ "# These commands will succeed if your CSV file didn't have a `median` or `timestamp` column\n",
145
+ "df = df.rename(columns={\"median\": \"pm25\"})\n",
146
+ "# df = df.rename(columns={\"timestamp\": \"date\"})\n",
147
+ "df['date'] = pd.to_datetime(df['date']).dt.date\n",
148
+ "\n",
149
+ "df_aq = df[['date', 'pm25']]\n",
150
+ "df_aq['pm25'] = df_aq['pm25'].astype('float32')\n",
151
+ "df_aq.info()\n",
152
+ "df_aq.dropna(inplace=True)\n",
153
+ "df_aq['country']=country\n",
154
+ "df_aq['city']=city\n",
155
+ "df_aq['street']=street\n",
156
+ "df_aq['url']=aqicn_url\n",
157
+ "df_aq\n",
158
+ "\n",
159
+ "df_aq =df_aq.set_index(\"date\")\n",
160
+ "df_aq['past_air_quality'] = df_aq['pm25'].rolling(3).mean()\n",
161
+ "df_aq[\"past_air_quality\"] = df_aq[\"past_air_quality\"].fillna(df_aq[\"past_air_quality\"].mean())\n",
162
+ "df_aq = df_aq.reset_index()\n",
163
+ "df_aq.date.describe()\n",
164
+ "\n",
165
+ "earliest_aq_date = pd.Series.min(df_aq['date'])\n",
166
+ "earliest_aq_date = earliest_aq_date.strftime('%Y-%m-%d')\n",
167
+ "earliest_aq_date"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "data": {
177
+ "text/plain": [
178
+ "datetime.date(2024, 11, 20)"
179
+ ]
180
+ },
181
+ "execution_count": 8,
182
+ "metadata": {},
183
+ "output_type": "execute_result"
184
+ }
185
+ ],
186
+ "source": [
187
+ "today"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": null,
193
+ "metadata": {},
194
+ "outputs": [
195
+ {
196
+ "name": "stdout",
197
+ "output_type": "stream",
198
+ "text": [
199
+ "Coordinates 31.59929656982422Β°N 74.26347351074219Β°E\n",
200
+ "Elevation 215.0 m asl\n",
201
+ "Timezone None None\n",
202
+ "Timezone difference to GMT+0 0 s\n",
203
+ "<class 'pandas.core.frame.DataFrame'>\n",
204
+ "Index: 1807 entries, 0 to 1806\n",
205
+ "Data columns (total 6 columns):\n",
206
+ " # Column Non-Null Count Dtype \n",
207
+ "--- ------ -------------- ----- \n",
208
+ " 0 date 1807 non-null datetime64[ns]\n",
209
+ " 1 temperature_2m_mean 1807 non-null float32 \n",
210
+ " 2 precipitation_sum 1807 non-null float32 \n",
211
+ " 3 wind_speed_10m_max 1807 non-null float32 \n",
212
+ " 4 wind_direction_10m_dominant 1807 non-null float32 \n",
213
+ " 5 city 1807 non-null object \n",
214
+ "dtypes: datetime64[ns](1), float32(4), object(1)\n",
215
+ "memory usage: 70.6+ KB\n"
216
+ ]
217
+ }
218
+ ],
219
+ "source": [
220
+ "weather_df = util.get_historical_weather(city, earliest_aq_date, str(today - datetime.timedelta(days=1)), latitude, longitude)\n",
221
+ "# weather_df = util.get_historical_weather(city, earliest_aq_date, \"2024-11-05\", latitude, longitude)\n",
222
+ "weather_df.info()"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 10,
228
+ "metadata": {},
229
+ "outputs": [
230
+ {
231
+ "data": {
232
+ "text/plain": [
233
+ "{\"expectation_type\": \"expect_column_min_to_be_between\", \"kwargs\": {\"column\": \"pm25\", \"min_value\": -0.1, \"max_value\": 500.0, \"strict_min\": true}, \"meta\": {}}"
234
+ ]
235
+ },
236
+ "execution_count": 10,
237
+ "metadata": {},
238
+ "output_type": "execute_result"
239
+ }
240
+ ],
241
+ "source": [
242
+ "\n",
243
+ "import great_expectations as ge\n",
244
+ "aq_expectation_suite = ge.core.ExpectationSuite(\n",
245
+ " expectation_suite_name=\"aq_expectation_suite\"\n",
246
+ ")\n",
247
+ "\n",
248
+ "aq_expectation_suite.add_expectation(\n",
249
+ " ge.core.ExpectationConfiguration(\n",
250
+ " expectation_type=\"expect_column_min_to_be_between\",\n",
251
+ " kwargs={\n",
252
+ " \"column\":\"pm25\",\n",
253
+ " \"min_value\":-0.1,\n",
254
+ " \"max_value\":500.0,\n",
255
+ " \"strict_min\":True\n",
256
+ " }\n",
257
+ " )\n",
258
+ ")\n"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 11,
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "import great_expectations as ge\n",
268
+ "weather_expectation_suite = ge.core.ExpectationSuite(\n",
269
+ " expectation_suite_name=\"weather_expectation_suite\"\n",
270
+ ")\n",
271
+ "\n",
272
+ "def expect_greater_than_zero(col):\n",
273
+ " weather_expectation_suite.add_expectation(\n",
274
+ " ge.core.ExpectationConfiguration(\n",
275
+ " expectation_type=\"expect_column_min_to_be_between\",\n",
276
+ " kwargs={\n",
277
+ " \"column\":col,\n",
278
+ " \"min_value\":-0.1,\n",
279
+ " \"max_value\":1000.0,\n",
280
+ " \"strict_min\":True\n",
281
+ " }\n",
282
+ " )\n",
283
+ " )\n",
284
+ "expect_greater_than_zero(\"precipitation_sum\")\n",
285
+ "expect_greater_than_zero(\"wind_speed_10m_max\")"
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": 12,
291
+ "metadata": {},
292
+ "outputs": [
293
+ {
294
+ "name": "stdout",
295
+ "output_type": "stream",
296
+ "text": [
297
+ "Connected. Call `.close()` to terminate connection gracefully.\n"
298
+ ]
299
+ }
300
+ ],
301
+ "source": [
302
+ "fs = project.get_feature_store() "
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": 13,
308
+ "metadata": {},
309
+ "outputs": [
310
+ {
311
+ "name": "stdout",
312
+ "output_type": "stream",
313
+ "text": [
314
+ "Secret created successfully, explore it at https://c.app.hopsworks.ai:443/account/secrets\n"
315
+ ]
316
+ }
317
+ ],
318
+ "source": [
319
+ "dict_obj = {\n",
320
+ " \"country\": country,\n",
321
+ " \"city\": city,\n",
322
+ " \"street\": street,\n",
323
+ " \"aqicn_url\": aqicn_url,\n",
324
+ " \"latitude\": latitude,\n",
325
+ " \"longitude\": longitude\n",
326
+ "}\n",
327
+ "\n",
328
+ "# Convert the dictionary to a JSON string\n",
329
+ "str_dict = json.dumps(dict_obj)\n",
330
+ "\n",
331
+ "try:\n",
332
+ " secrets.create_secret(\"SENSOR_LOCATION_JSON\", str_dict)\n",
333
+ "except hopsworks.RestAPIError:\n",
334
+ " print(\"SENSOR_LOCATION_JSON already exists. To update, delete the secret in the UI (https://c.app.hopsworks.ai/account/secrets) and re-run this cell.\")\n",
335
+ " existing_key = secrets.get_secret(\"SENSOR_LOCATION_JSON\").value\n",
336
+ " print(f\"{existing_key}\")\n"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "execution_count": 15,
342
+ "metadata": {},
343
+ "outputs": [],
344
+ "source": [
345
+ "air_quality_fg = fs.get_or_create_feature_group(\n",
346
+ " name='air_quality',\n",
347
+ " description='Air Quality characteristics of each day',\n",
348
+ " version=1,\n",
349
+ " primary_key=['city', 'street', 'date'],\n",
350
+ " event_time=\"date\",\n",
351
+ " expectation_suite=aq_expectation_suite\n",
352
+ ")\n"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": 16,
358
+ "metadata": {},
359
+ "outputs": [
360
+ {
361
+ "name": "stdout",
362
+ "output_type": "stream",
363
+ "text": [
364
+ "Feature Group created successfully, explore it at \n",
365
+ "https://c.app.hopsworks.ai:443/p/1160340/fs/1151043/fg/1362254\n",
366
+ "2024-11-21 05:44:54,527 INFO: \t1 expectation(s) included in expectation_suite.\n",
367
+ "Validation succeeded.\n",
368
+ "Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1160340/fs/1151043/fg/1362254\n"
369
+ ]
370
+ },
371
+ {
372
+ "data": {
373
+ "application/vnd.jupyter.widget-view+json": {
374
+ "model_id": "506badbe42224a17b3ccc6d6b1ae7927",
375
+ "version_major": 2,
376
+ "version_minor": 0
377
+ },
378
+ "text/plain": [
379
+ "Uploading Dataframe: 0.00% | | Rows 0/1802 | Elapsed Time: 00:00 | Remaining Time: ?"
380
+ ]
381
+ },
382
+ "metadata": {},
383
+ "output_type": "display_data"
384
+ },
385
+ {
386
+ "name": "stdout",
387
+ "output_type": "stream",
388
+ "text": [
389
+ "Launching job: air_quality_1_offline_fg_materialization\n",
390
+ "Job started successfully, you can follow the progress at \n",
391
+ "https://c.app.hopsworks.ai/p/1160340/jobs/named/air_quality_1_offline_fg_materialization/executions\n"
392
+ ]
393
+ },
394
+ {
395
+ "data": {
396
+ "text/plain": [
397
+ "(<hsfs.core.job.Job at 0x74c9c7eb8c20>,\n",
398
+ " {\n",
399
+ " \"success\": true,\n",
400
+ " \"results\": [\n",
401
+ " {\n",
402
+ " \"success\": true,\n",
403
+ " \"expectation_config\": {\n",
404
+ " \"expectation_type\": \"expect_column_min_to_be_between\",\n",
405
+ " \"kwargs\": {\n",
406
+ " \"column\": \"pm25\",\n",
407
+ " \"min_value\": -0.1,\n",
408
+ " \"max_value\": 500.0,\n",
409
+ " \"strict_min\": true\n",
410
+ " },\n",
411
+ " \"meta\": {\n",
412
+ " \"expectationId\": 686087\n",
413
+ " }\n",
414
+ " },\n",
415
+ " \"result\": {\n",
416
+ " \"observed_value\": 1.9899998903274536,\n",
417
+ " \"element_count\": 1802,\n",
418
+ " \"missing_count\": null,\n",
419
+ " \"missing_percent\": null\n",
420
+ " },\n",
421
+ " \"meta\": {\n",
422
+ " \"ingestionResult\": \"INGESTED\",\n",
423
+ " \"validationTime\": \"2024-11-20T09:44:54.000525Z\"\n",
424
+ " },\n",
425
+ " \"exception_info\": {\n",
426
+ " \"raised_exception\": false,\n",
427
+ " \"exception_message\": null,\n",
428
+ " \"exception_traceback\": null\n",
429
+ " }\n",
430
+ " }\n",
431
+ " ],\n",
432
+ " \"evaluation_parameters\": {},\n",
433
+ " \"statistics\": {\n",
434
+ " \"evaluated_expectations\": 1,\n",
435
+ " \"successful_expectations\": 1,\n",
436
+ " \"unsuccessful_expectations\": 0,\n",
437
+ " \"success_percent\": 100.0\n",
438
+ " },\n",
439
+ " \"meta\": {\n",
440
+ " \"great_expectations_version\": \"0.18.12\",\n",
441
+ " \"expectation_suite_name\": \"aq_expectation_suite\",\n",
442
+ " \"run_id\": {\n",
443
+ " \"run_name\": null,\n",
444
+ " \"run_time\": \"2024-11-21T05:44:54.526004+08:00\"\n",
445
+ " },\n",
446
+ " \"batch_kwargs\": {\n",
447
+ " \"ge_batch_id\": \"adcf6d76-a788-11ef-a237-1091d10619ea\"\n",
448
+ " },\n",
449
+ " \"batch_markers\": {},\n",
450
+ " \"batch_parameters\": {},\n",
451
+ " \"validation_time\": \"20241120T214454.525505Z\",\n",
452
+ " \"expectation_suite_meta\": {\n",
453
+ " \"great_expectations_version\": \"0.18.12\"\n",
454
+ " }\n",
455
+ " }\n",
456
+ " })"
457
+ ]
458
+ },
459
+ "execution_count": 16,
460
+ "metadata": {},
461
+ "output_type": "execute_result"
462
+ }
463
+ ],
464
+ "source": [
465
+ "air_quality_fg.insert(df_aq)"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 17,
471
+ "metadata": {},
472
+ "outputs": [
473
+ {
474
+ "data": {
475
+ "text/plain": [
476
+ "<hsfs.feature_group.FeatureGroup at 0x74c9c7ed3d10>"
477
+ ]
478
+ },
479
+ "execution_count": 17,
480
+ "metadata": {},
481
+ "output_type": "execute_result"
482
+ }
483
+ ],
484
+ "source": [
485
+ "air_quality_fg.update_feature_description(\"date\", \"Date of measurement of air quality\")\n",
486
+ "air_quality_fg.update_feature_description(\"country\", \"Country where the air quality was measured (sometimes a city in acqcn.org)\")\n",
487
+ "air_quality_fg.update_feature_description(\"city\", \"City where the air quality was measured\")\n",
488
+ "air_quality_fg.update_feature_description(\"street\", \"Street in the city where the air quality was measured\")\n",
489
+ "air_quality_fg.update_feature_description(\"pm25\", \"Particles less than 2.5 micrometers in diameter (fine particles) pose health risk\")\n",
490
+ "air_quality_fg.update_feature_description(\"past_air_quality\", \"mean air quality of the past 3 days\")\n"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": 18,
496
+ "metadata": {},
497
+ "outputs": [
498
+ {
499
+ "name": "stdout",
500
+ "output_type": "stream",
501
+ "text": [
502
+ "Feature Group created successfully, explore it at \n",
503
+ "https://c.app.hopsworks.ai:443/p/1160340/fs/1151043/fg/1362255\n",
504
+ "2024-11-21 05:56:51,769 INFO: \t2 expectation(s) included in expectation_suite.\n",
505
+ "Validation succeeded.\n",
506
+ "Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1160340/fs/1151043/fg/1362255\n"
507
+ ]
508
+ },
509
+ {
510
+ "data": {
511
+ "application/vnd.jupyter.widget-view+json": {
512
+ "model_id": "455439f2dd8643b4b06da1d3851d2f8c",
513
+ "version_major": 2,
514
+ "version_minor": 0
515
+ },
516
+ "text/plain": [
517
+ "Uploading Dataframe: 0.00% | | Rows 0/1807 | Elapsed Time: 00:00 | Remaining Time: ?"
518
+ ]
519
+ },
520
+ "metadata": {},
521
+ "output_type": "display_data"
522
+ },
523
+ {
524
+ "name": "stdout",
525
+ "output_type": "stream",
526
+ "text": [
527
+ "Launching job: weather_1_offline_fg_materialization\n",
528
+ "Job started successfully, you can follow the progress at \n",
529
+ "https://c.app.hopsworks.ai/p/1160340/jobs/named/weather_1_offline_fg_materialization/executions\n"
530
+ ]
531
+ },
532
+ {
533
+ "data": {
534
+ "text/plain": [
535
+ "<hsfs.feature_group.FeatureGroup at 0x74c9c7ebaea0>"
536
+ ]
537
+ },
538
+ "execution_count": 18,
539
+ "metadata": {},
540
+ "output_type": "execute_result"
541
+ }
542
+ ],
543
+ "source": [
544
+ "weather_fg = fs.get_or_create_feature_group(\n",
545
+ " name='weather',\n",
546
+ " description='Weather characteristics of each day',\n",
547
+ " version=1,\n",
548
+ " primary_key=['city', 'date'],\n",
549
+ " event_time=\"date\",\n",
550
+ " expectation_suite=weather_expectation_suite\n",
551
+ ") \n",
552
+ "\n",
553
+ "weather_fg.insert(weather_df)\n",
554
+ "\n",
555
+ "weather_fg.update_feature_description(\"date\", \"Date of measurement of weather\")\n",
556
+ "weather_fg.update_feature_description(\"city\", \"City where weather is measured/forecast for\")\n",
557
+ "weather_fg.update_feature_description(\"temperature_2m_mean\", \"Temperature in Celsius\")\n",
558
+ "weather_fg.update_feature_description(\"precipitation_sum\", \"Precipitation (rain/snow) in mm\")\n",
559
+ "weather_fg.update_feature_description(\"wind_speed_10m_max\", \"Wind speed at 10m abouve ground\")\n",
560
+ "weather_fg.update_feature_description(\"wind_direction_10m_dominant\", \"Dominant Wind direction over the dayd\")\n",
561
+ "\n",
562
+ "\n"
563
+ ]
564
+ }
565
+ ],
566
+ "metadata": {
567
+ "kernelspec": {
568
+ "display_name": ".venv",
569
+ "language": "python",
570
+ "name": "python3"
571
+ },
572
+ "language_info": {
573
+ "codemirror_mode": {
574
+ "name": "ipython",
575
+ "version": 3
576
+ },
577
+ "file_extension": ".py",
578
+ "mimetype": "text/x-python",
579
+ "name": "python",
580
+ "nbconvert_exporter": "python",
581
+ "pygments_lexer": "ipython3",
582
+ "version": "3.12.4"
583
+ }
584
+ },
585
+ "nbformat": 4,
586
+ "nbformat_minor": 2
587
+ }
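
The feature that sets this backfill notebook apart from the deleted backfill.py is past_air_quality: a 3-day trailing mean of pm25 whose leading NaNs are filled with the column mean, so the model never sees missing values. A self-contained sketch of that transform on synthetic data (same logic as the notebook cell above):

import numpy as np
import pandas as pd

# Synthetic stand-in for data/lahore.csv: one pm25 reading per day.
df_aq = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=10, freq='D'),
    'pm25': np.random.uniform(20, 180, 10).astype('float32'),
})

df_aq = df_aq.set_index('date')
# 3-day trailing mean; the first two rows are NaN by construction...
df_aq['past_air_quality'] = df_aq['pm25'].rolling(3).mean()
# ...and get filled with the column mean, exactly as in the notebook.
df_aq['past_air_quality'] = df_aq['past_air_quality'].fillna(df_aq['past_air_quality'].mean())
df_aq = df_aq.reset_index()
print(df_aq.head())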
backfill.py DELETED
@@ -1,62 +0,0 @@
1
- import datetime
2
- import requests
3
- import pandas as pd
4
- import hopsworks
5
- import datetime
6
- from pathlib import Path
7
- from functions import util
8
- import json
9
- import re
10
- import os
11
- import warnings
12
- import pandas as pd
13
-
14
- api_key = os.getenv('HOPSWORKS_API_KEY')
15
- project_name = os.getenv('HOPSWORKS_PROJECT')
16
-
17
- project = hopsworks.login(project=project_name, api_key_value=api_key)
18
- fs = project.get_feature_store()
19
- secrets = util.secrets_api(project.name)
20
-
21
- AQI_API_KEY = secrets.get_secret("AQI_API_KEY").value
22
- location_str = secrets.get_secret("SENSOR_LOCATION_JSON").value
23
- location = json.loads(location_str)
24
-
25
- country=location['country']
26
- city=location['city']
27
- street=location['street']
28
- aqicn_url=location['aqicn_url']
29
- latitude=location['latitude']
30
- longitude=location['longitude']
31
-
32
- today = datetime.date.today()
33
-
34
- # Retrieve feature groups
35
- air_quality_fg = fs.get_feature_group(
36
- name='air_quality',
37
- version=1,
38
- )
39
- weather_fg = fs.get_feature_group(
40
- name='weather',
41
- version=1,
42
- )
43
-
44
- aq_today_df = util.get_pm25(aqicn_url, country, city, street, today, AQI_API_KEY)
45
- #aq_today_df = util.get_pm25(aqicn_url, country, city, street, "2024-11-15", AQI_API_KEY)
46
- aq_today_df['date'] = pd.to_datetime(aq_today_df['date']).dt.date
47
- aq_today_df
48
-
49
- # Get weather forecast data
50
-
51
- hourly_df = util.get_hourly_weather_forecast(city, latitude, longitude)
52
- hourly_df = hourly_df.set_index('date')
53
-
54
- # We will only make 1 daily prediction, so we will replace the hourly forecasts with a single daily forecast
55
- # We only want the daily weather data, so only get weather at 12:00
56
- daily_df = hourly_df.between_time('11:59', '12:01')
57
- daily_df = daily_df.reset_index()
58
- daily_df['date'] = pd.to_datetime(daily_df['date']).dt.date
59
- daily_df['date'] = pd.to_datetime(daily_df['date'])
60
- # daily_df['date'] = daily_df['date'].astype(str)
61
- daily_df['city'] = city
62
- daily_df
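
One detail of the deleted script worth keeping in mind: it reduced the hourly weather forecast to a single daily row by keeping only the 12:00 reading. A standalone sketch of that reduction, with synthetic data standing in for util.get_hourly_weather_forecast (whose output format is assumed here):

import numpy as np
import pandas as pd

# Fake hourly forecast frame; the real one comes from util.get_hourly_weather_forecast.
hourly_df = pd.DataFrame({
    'date': pd.date_range('2024-11-21', periods=72, freq='h'),
    'temperature_2m': np.random.uniform(15, 25, 72),
})

# One row per day: keep only the forecast issued at 12:00, as backfill.py did.
hourly_df = hourly_df.set_index('date')
daily_df = hourly_df.between_time('11:59', '12:01').reset_index()
daily_df['date'] = pd.to_datetime(daily_df['date']).dt.date
print(daily_df)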
data/lahore.csv ADDED
The diff for this file is too large to render. See raw diff
 
debug.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 5,
6
  "metadata": {},
7
  "outputs": [
8
  {
@@ -10,266 +10,70 @@
10
  "output_type": "stream",
11
  "text": [
12
  "Connection closed.\n",
13
- "Connected. Call `.close()` to terminate connection gracefully.\n"
14
- ]
15
- },
16
- {
17
- "name": "stdout",
18
- "output_type": "stream",
19
- "text": [
20
  "\n",
21
  "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1160344\n",
22
  "Connected. Call `.close()` to terminate connection gracefully.\n",
23
- "Connected. Call `.close()` to terminate connection gracefully.\n"
 
 
24
  ]
25
  }
26
  ],
27
  "source": [
28
- "import datetime\n",
29
  "import pandas as pd\n",
30
- "from xgboost import XGBRegressor\n",
 
31
  "import hopsworks\n",
 
 
 
 
32
  "import json\n",
33
- "from functions import util\n",
34
  "import os\n",
35
  "\n",
36
- "# Set up\n",
37
- "\n",
38
- "api_key = os.getenv('HOPSWORKS_API_KEY')\n",
39
- "project_name = os.getenv('HOPSWORKS_PROJECT')\n",
40
- "project = hopsworks.login(project=project_name, api_key_value=api_key)\n",
41
- "fs = project.get_feature_store() \n",
42
- "secrets = util.secrets_api(project.name)\n",
43
- "location_str = secrets.get_secret(\"SENSOR_LOCATION_JSON\").value\n",
44
- "location = json.loads(location_str)\n",
45
- "country=location['country']\n",
46
- "city=location['city']\n",
47
- "street=location['street']\n",
48
- "\n",
49
- "AQI_API_KEY = secrets.get_secret(\"AQI_API_KEY\").value\n",
50
- "location_str = secrets.get_secret(\"SENSOR_LOCATION_JSON\").value\n",
51
- "location = json.loads(location_str)\n",
52
  "\n",
53
- "today = datetime.datetime.now() - datetime.timedelta(0)"
 
 
54
  ]
55
  },
56
  {
57
  "cell_type": "code",
58
- "execution_count": 3,
59
  "metadata": {},
60
  "outputs": [
61
  {
62
  "name": "stdout",
63
  "output_type": "stream",
64
  "text": [
 
65
  "Connected. Call `.close()` to terminate connection gracefully.\n",
66
- "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.02s) \n"
67
- ]
68
- }
69
- ],
70
- "source": [
71
- "### Retreive model\n",
72
- "\n",
73
- "mr = project.get_model_registry()\n",
74
- "\n",
75
- "retrieved_model = mr.get_model(\n",
76
- " name=\"air_quality_xgboost_model\",\n",
77
- " version=1,\n",
78
- ")\n",
79
- "\n",
80
- "saved_model_dir = retrieved_model.download()\n",
81
- "retrieved_xgboost_model = XGBRegressor()\n",
82
- "retrieved_xgboost_model.load_model(saved_model_dir + \"/model.json\")\n",
83
- "\n",
84
- "### Retrieve features \n",
85
- "\n",
86
- "weather_fg = fs.get_feature_group(\n",
87
- " name='weather',\n",
88
- " version=1,\n",
89
- ")\n",
90
- "\n",
91
- "today_timestamp = pd.to_datetime(today)\n",
92
- "batch_data = weather_fg.filter(weather_fg.date >= today_timestamp ).read().sort_values(by=['date'])"
93
- ]
94
- },
95
- {
96
- "cell_type": "code",
97
- "execution_count": 7,
98
- "metadata": {},
99
- "outputs": [
100
- {
101
- "data": {
102
- "text/html": [
103
- "<div>\n",
104
- "<style scoped>\n",
105
- " .dataframe tbody tr th:only-of-type {\n",
106
- " vertical-align: middle;\n",
107
- " }\n",
108
- "\n",
109
- " .dataframe tbody tr th {\n",
110
- " vertical-align: top;\n",
111
- " }\n",
112
- "\n",
113
- " .dataframe thead th {\n",
114
- " text-align: right;\n",
115
- " }\n",
116
- "</style>\n",
117
- "<table border=\"1\" class=\"dataframe\">\n",
118
- " <thead>\n",
119
- " <tr style=\"text-align: right;\">\n",
120
- " <th></th>\n",
121
- " <th>date</th>\n",
122
- " <th>temperature_2m_mean</th>\n",
123
- " <th>precipitation_sum</th>\n",
124
- " <th>wind_speed_10m_max</th>\n",
125
- " <th>wind_direction_10m_dominant</th>\n",
126
- " <th>city</th>\n",
127
- " </tr>\n",
128
- " </thead>\n",
129
- " <tbody>\n",
130
- " <tr>\n",
131
- " <th>1</th>\n",
132
- " <td>2024-11-21 00:00:00+00:00</td>\n",
133
- " <td>21.700001</td>\n",
134
- " <td>0.0</td>\n",
135
- " <td>1.138420</td>\n",
136
- " <td>71.564964</td>\n",
137
- " <td>lahore</td>\n",
138
- " </tr>\n",
139
- " <tr>\n",
140
- " <th>4</th>\n",
141
- " <td>2024-11-22 00:00:00+00:00</td>\n",
142
- " <td>21.850000</td>\n",
143
- " <td>0.0</td>\n",
144
- " <td>4.610250</td>\n",
145
- " <td>128.659836</td>\n",
146
- " <td>lahore</td>\n",
147
- " </tr>\n",
148
- " <tr>\n",
149
- " <th>7</th>\n",
150
- " <td>2024-11-23 00:00:00+00:00</td>\n",
151
- " <td>22.250000</td>\n",
152
- " <td>0.0</td>\n",
153
- " <td>5.091168</td>\n",
154
- " <td>44.999897</td>\n",
155
- " <td>lahore</td>\n",
156
- " </tr>\n",
157
- " <tr>\n",
158
- " <th>6</th>\n",
159
- " <td>2024-11-24 00:00:00+00:00</td>\n",
160
- " <td>21.400000</td>\n",
161
- " <td>0.0</td>\n",
162
- " <td>4.334974</td>\n",
163
- " <td>318.366547</td>\n",
164
- " <td>lahore</td>\n",
165
- " </tr>\n",
166
- " <tr>\n",
167
- " <th>5</th>\n",
168
- " <td>2024-11-25 00:00:00+00:00</td>\n",
169
- " <td>20.750000</td>\n",
170
- " <td>0.0</td>\n",
171
- " <td>6.439876</td>\n",
172
- " <td>296.564972</td>\n",
173
- " <td>lahore</td>\n",
174
- " </tr>\n",
175
- " <tr>\n",
176
- " <th>2</th>\n",
177
- " <td>2024-11-26 00:00:00+00:00</td>\n",
178
- " <td>20.750000</td>\n",
179
- " <td>0.0</td>\n",
180
- " <td>4.680000</td>\n",
181
- " <td>270.000000</td>\n",
182
- " <td>lahore</td>\n",
183
- " </tr>\n",
184
- " <tr>\n",
185
- " <th>0</th>\n",
186
- " <td>2024-11-27 00:00:00+00:00</td>\n",
187
- " <td>20.350000</td>\n",
188
- " <td>0.0</td>\n",
189
- " <td>4.104631</td>\n",
190
- " <td>37.875053</td>\n",
191
- " <td>lahore</td>\n",
192
- " </tr>\n",
193
- " <tr>\n",
194
- " <th>3</th>\n",
195
- " <td>2024-11-28 00:00:00+00:00</td>\n",
196
- " <td>19.799999</td>\n",
197
- " <td>0.0</td>\n",
198
- " <td>2.189795</td>\n",
199
- " <td>9.462248</td>\n",
200
- " <td>lahore</td>\n",
201
- " </tr>\n",
202
- " </tbody>\n",
203
- "</table>\n",
204
- "</div>"
205
- ],
206
- "text/plain": [
207
- " date temperature_2m_mean precipitation_sum \\\n",
208
- "1 2024-11-21 00:00:00+00:00 21.700001 0.0 \n",
209
- "4 2024-11-22 00:00:00+00:00 21.850000 0.0 \n",
210
- "7 2024-11-23 00:00:00+00:00 22.250000 0.0 \n",
211
- "6 2024-11-24 00:00:00+00:00 21.400000 0.0 \n",
212
- "5 2024-11-25 00:00:00+00:00 20.750000 0.0 \n",
213
- "2 2024-11-26 00:00:00+00:00 20.750000 0.0 \n",
214
- "0 2024-11-27 00:00:00+00:00 20.350000 0.0 \n",
215
- "3 2024-11-28 00:00:00+00:00 19.799999 0.0 \n",
216
- "\n",
217
- " wind_speed_10m_max wind_direction_10m_dominant city \n",
218
- "1 1.138420 71.564964 lahore \n",
219
- "4 4.610250 128.659836 lahore \n",
220
- "7 5.091168 44.999897 lahore \n",
221
- "6 4.334974 318.366547 lahore \n",
222
- "5 6.439876 296.564972 lahore \n",
223
- "2 4.680000 270.000000 lahore \n",
224
- "0 4.104631 37.875053 lahore \n",
225
- "3 2.189795 9.462248 lahore "
226
- ]
227
- },
228
- "execution_count": 7,
229
- "metadata": {},
230
- "output_type": "execute_result"
231
- }
232
- ],
233
- "source": [
234
- "batch_data"
235
- ]
236
- },
237
- {
238
- "cell_type": "code",
239
- "execution_count": 6,
240
- "metadata": {},
241
- "outputs": [
242
- {
243
- "ename": "ValueError",
244
- "evalue": "feature_names mismatch: ['past_air_quality', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant'] ['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']\nexpected past_air_quality in input data",
245
- "output_type": "error",
246
- "traceback": [
247
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
248
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
249
- "Cell \u001b[0;32mIn[6], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m### Predict and upload\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m batch_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpredicted_pm25\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mretrieved_xgboost_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_data\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtemperature_2m_mean\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mprecipitation_sum\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwind_speed_10m_max\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwind_direction_10m_dominant\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6\u001b[0m batch_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstreet\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m street\n\u001b[1;32m 7\u001b[0m batch_data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcity\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m city\n",
250
- "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/xgboost/sklearn.py:1168\u001b[0m, in \u001b[0;36mXGBModel.predict\u001b[0;34m(self, X, output_margin, validate_features, base_margin, iteration_range)\u001b[0m\n\u001b[1;32m 1166\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_can_use_inplace_predict():\n\u001b[1;32m 1167\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1168\u001b[0m predts \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_booster\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minplace_predict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1169\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1170\u001b[0m \u001b[43m \u001b[49m\u001b[43miteration_range\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43miteration_range\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1171\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredict_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmargin\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43moutput_margin\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mvalue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1172\u001b[0m \u001b[43m \u001b[49m\u001b[43mmissing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmissing\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1173\u001b[0m \u001b[43m \u001b[49m\u001b[43mbase_margin\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbase_margin\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1174\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate_features\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidate_features\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1175\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1176\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _is_cupy_array(predts):\n\u001b[1;32m 1177\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mcupy\u001b[39;00m \u001b[38;5;66;03m# pylint: disable=import-error\u001b[39;00m\n",
251
- "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/xgboost/core.py:2418\u001b[0m, in \u001b[0;36mBooster.inplace_predict\u001b[0;34m(self, data, iteration_range, predict_type, missing, validate_features, base_margin, strict_shape)\u001b[0m\n\u001b[1;32m 2416\u001b[0m data, fns, _ \u001b[38;5;241m=\u001b[39m _transform_pandas_df(data, enable_categorical)\n\u001b[1;32m 2417\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m validate_features:\n\u001b[0;32m-> 2418\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_features\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfns\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2419\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _is_list(data) \u001b[38;5;129;01mor\u001b[39;00m _is_tuple(data):\n\u001b[1;32m 2420\u001b[0m data \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray(data)\n",
252
- "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/xgboost/core.py:2970\u001b[0m, in \u001b[0;36mBooster._validate_features\u001b[0;34m(self, feature_names)\u001b[0m\n\u001b[1;32m 2964\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m my_missing:\n\u001b[1;32m 2965\u001b[0m msg \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 2966\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mtraining data did not have the following fields: \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2967\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;28mstr\u001b[39m(s) \u001b[38;5;28;01mfor\u001b[39;00m s \u001b[38;5;129;01min\u001b[39;00m my_missing)\n\u001b[1;32m 2968\u001b[0m )\n\u001b[0;32m-> 2970\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg\u001b[38;5;241m.\u001b[39mformat(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfeature_names, feature_names))\n",
253
- "\u001b[0;31mValueError\u001b[0m: feature_names mismatch: ['past_air_quality', 'temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant'] ['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']\nexpected past_air_quality in input data"
254
  ]
255
  }
256
  ],
257
  "source": [
258
- "### Predict and upload\n",
259
- "\n",
260
- "batch_data['predicted_pm25'] = retrieved_xgboost_model.predict(\n",
261
- " batch_data[['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])\n",
262
- "\n",
263
- "batch_data['street'] = street\n",
264
- "batch_data['city'] = city\n",
265
- "batch_data['country'] = country\n",
266
- "# Fill in the number of days before the date on which you made the forecast (base_date)\n",
267
- "batch_data['days_before_forecast_day'] = range(1, len(batch_data)+1)\n",
268
- "batch_data = batch_data.sort_values(by=['date'])\n",
269
- "#batch_data['date'] = batch_data['date'].dt.tz_convert(None).astype('datetime64[ns]')\n",
270
  "\n",
271
- "plt = util.plot_air_quality_forecast(city, street, batch_data, file_path=\"./img/pm25_forecast.png\")\n",
272
- "plt.show()\n"
 
 
 
273
  ]
274
  },
275
  {
@@ -277,156 +81,53 @@
277
  "execution_count": null,
278
  "metadata": {},
279
  "outputs": [
280
- {
281
- "name": "stdout",
282
- "output_type": "stream",
283
- "text": [
284
- "Batch data: date temperature_2m_mean precipitation_sum \\\n",
285
- "0 2024-11-21 00:00:00+00:00 3.40 0.2 \n",
286
- "3 2024-11-22 00:00:00+00:00 4.05 0.7 \n",
287
- "2 2024-11-23 00:00:00+00:00 5.45 0.0 \n",
288
- "1 2024-11-24 00:00:00+00:00 5.60 0.0 \n",
289
- "\n",
290
- " wind_speed_10m_max wind_direction_10m_dominant city \\\n",
291
- "0 19.995398 246.665939 Helsingborg \n",
292
- "3 23.540806 246.571289 Helsingborg \n",
293
- "2 30.631746 240.422256 Helsingborg \n",
294
- "1 13.755580 276.008911 Helsingborg \n",
295
- "\n",
296
- " predicted_pm25 street country days_before_forecast_day \n",
297
- "0 39.168438 Drottninggatan Sweden 1 \n",
298
- "3 20.740093 Drottninggatan Sweden 2 \n",
299
- "2 46.448105 Drottninggatan Sweden 3 \n",
300
- "1 61.713448 Drottninggatan Sweden 4 \n"
301
- ]
302
- },
303
  {
304
  "data": {
305
- "application/vnd.jupyter.widget-view+json": {
306
- "model_id": "0c3e8fd8c8f545a597e504acf5f077e8",
307
- "version_major": 2,
308
- "version_minor": 0
309
- },
310
  "text/plain": [
311
- "Uploading Dataframe: 0.00% | | Rows 0/4 | Elapsed Time: 00:00 | Remaining Time: ?"
312
  ]
313
  },
 
314
  "metadata": {},
315
- "output_type": "display_data"
316
- },
317
- {
318
- "name": "stdout",
319
- "output_type": "stream",
320
- "text": [
321
- "Launching job: aq_predictions_1_offline_fg_materialization\n",
322
- "Job started successfully, you can follow the progress at \n",
323
- "https://c.app.hopsworks.ai/p/1160340/jobs/named/aq_predictions_1_offline_fg_materialization/executions\n",
324
- "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.95s) \n",
325
- "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.85s) \n"
326
- ]
327
- }
328
- ],
329
- "source": [
330
- "monitor_fg = fs.get_or_create_feature_group(\n",
331
- " name='aq_predictions',\n",
332
- " description='Air Quality prediction monitoring',\n",
333
- " version=1,\n",
334
- " primary_key=['city','street','date','days_before_forecast_day'],\n",
335
- " event_time=\"date\"\n",
336
- ")\n",
337
- "\n",
338
- "print(f\"Batch data: {batch_data}\")\n",
339
- "\n",
340
- "monitor_fg.insert(batch_data, write_options={\"wait_for_job\": True})\n",
341
- "monitoring_df = monitor_fg.filter(monitor_fg.days_before_forecast_day == 1).read()\n",
342
- "\n",
343
- "# Hindcast monitoring\n",
344
- "\n",
345
- "air_quality_fg = fs.get_feature_group(\n",
346
- " name='air_quality',\n",
347
- " version=1,\n",
348
- ")\n",
349
- "air_quality_df = air_quality_fg.read()\n",
350
- "\n",
351
- "outcome_df = air_quality_df[['date', 'pm25']]\n",
352
- "preds_df = monitoring_df[['date', 'predicted_pm25']]"
353
- ]
354
- },
355
- {
356
- "cell_type": "code",
357
- "execution_count": null,
358
- "metadata": {},
359
- "outputs": [
360
- {
361
- "ename": "ValueError",
362
- "evalue": "You are trying to merge on datetime64[us, UTC] and object columns for key 'date'. If you wish to proceed you should use pd.concat",
363
- "output_type": "error",
364
- "traceback": [
365
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
366
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
367
- "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m hindcast_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmerge\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpreds_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutcome_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdate\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m hindcast_df \u001b[38;5;241m=\u001b[39m hindcast_df\u001b[38;5;241m.\u001b[39msort_values(by\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdate\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(hindcast_df) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
368
- "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/pandas/core/reshape/merge.py:169\u001b[0m, in \u001b[0;36mmerge\u001b[0;34m(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)\u001b[0m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _cross_merge(\n\u001b[1;32m 155\u001b[0m left_df,\n\u001b[1;32m 156\u001b[0m right_df,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 166\u001b[0m copy\u001b[38;5;241m=\u001b[39mcopy,\n\u001b[1;32m 167\u001b[0m )\n\u001b[1;32m 168\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 169\u001b[0m op \u001b[38;5;241m=\u001b[39m \u001b[43m_MergeOperation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 170\u001b[0m \u001b[43m \u001b[49m\u001b[43mleft_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 171\u001b[0m \u001b[43m \u001b[49m\u001b[43mright_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 172\u001b[0m \u001b[43m \u001b[49m\u001b[43mhow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhow\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[43m \u001b[49m\u001b[43mon\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[43m \u001b[49m\u001b[43mleft_on\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mleft_on\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 175\u001b[0m \u001b[43m \u001b[49m\u001b[43mright_on\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mright_on\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[43m \u001b[49m\u001b[43mleft_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mleft_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 177\u001b[0m \u001b[43m \u001b[49m\u001b[43mright_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mright_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 178\u001b[0m \u001b[43m \u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 179\u001b[0m \u001b[43m \u001b[49m\u001b[43msuffixes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msuffixes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 180\u001b[0m \u001b[43m \u001b[49m\u001b[43mindicator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindicator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 181\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 182\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m op\u001b[38;5;241m.\u001b[39mget_result(copy\u001b[38;5;241m=\u001b[39mcopy)\n",
369
- "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/pandas/core/reshape/merge.py:804\u001b[0m, in \u001b[0;36m_MergeOperation.__init__\u001b[0;34m(self, left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, indicator, validate)\u001b[0m\n\u001b[1;32m 800\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_tolerance(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mleft_join_keys)\n\u001b[1;32m 802\u001b[0m \u001b[38;5;66;03m# validate the merge keys dtypes. We may need to coerce\u001b[39;00m\n\u001b[1;32m 803\u001b[0m \u001b[38;5;66;03m# to avoid incompatible dtypes\u001b[39;00m\n\u001b[0;32m--> 804\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_coerce_merge_keys\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 806\u001b[0m \u001b[38;5;66;03m# If argument passed to validate,\u001b[39;00m\n\u001b[1;32m 807\u001b[0m \u001b[38;5;66;03m# check if columns specified as unique\u001b[39;00m\n\u001b[1;32m 808\u001b[0m \u001b[38;5;66;03m# are in fact unique.\u001b[39;00m\n\u001b[1;32m 809\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m validate \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
370
- "File \u001b[0;32m~/Documents/scalable-ml/lab1-new/hbg-weather/.venv/lib/python3.12/site-packages/pandas/core/reshape/merge.py:1483\u001b[0m, in \u001b[0;36m_MergeOperation._maybe_coerce_merge_keys\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1481\u001b[0m \u001b[38;5;66;03m# datetimelikes must match exactly\u001b[39;00m\n\u001b[1;32m 1482\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m needs_i8_conversion(lk\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m needs_i8_conversion(rk\u001b[38;5;241m.\u001b[39mdtype):\n\u001b[0;32m-> 1483\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[1;32m 1484\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m needs_i8_conversion(lk\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m needs_i8_conversion(rk\u001b[38;5;241m.\u001b[39mdtype):\n\u001b[1;32m 1485\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n",
371
- "\u001b[0;31mValueError\u001b[0m: You are trying to merge on datetime64[us, UTC] and object columns for key 'date'. If you wish to proceed you should use pd.concat"
372
- ]
373
  }
374
  ],
375
  "source": [
376
- "hindcast_df = pd.merge(preds_df, outcome_df, on=\"date\")\n",
377
- "hindcast_df = hindcast_df.sort_values(by=['date'])\n",
378
- "\n",
379
- "if len(hindcast_df) == 0:\n",
380
- " hindcast_df = util.backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, retrieved_xgboost_model)\n",
381
- "\n",
382
- "plt = util.plot_air_quality_forecast(city, street, hindcast_df, file_path=\"./img/pm25_hindcast_1day.png\", hindcast=True)"
 
383
  ]
384
  },
385
  {
386
  "cell_type": "code",
387
  "execution_count": null,
388
  "metadata": {},
389
- "outputs": [
390
- {
391
- "name": "stdout",
392
- "output_type": "stream",
393
- "text": [
394
- "2024-11-20 14:23:12,559 WARNING: SettingWithCopyWarning: \n",
395
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
396
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
397
- "\n",
398
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
399
- "\n"
400
- ]
401
- }
402
- ],
403
- "source": [
404
- "import numpy as np\n",
405
- "\n",
406
- "scale = 5\n",
407
- "outcome_df['predicted_pm25'] = outcome_df['pm25'] + scale * np.random.uniform(-1, 1, outcome_df.shape[0])\n",
408
- "outcome_df.sort_values(by=['date'])\n",
409
- "outcome_df.to_pickle('outcome_df.pkl')"
410
- ]
411
  }
412
  ],
413
  "metadata": {
414
- "kernelspec": {
415
- "display_name": ".venv",
416
- "language": "python",
417
- "name": "python3"
418
- },
419
  "language_info": {
420
- "codemirror_mode": {
421
- "name": "ipython",
422
- "version": 3
423
- },
424
- "file_extension": ".py",
425
- "mimetype": "text/x-python",
426
- "name": "python",
427
- "nbconvert_exporter": "python",
428
- "pygments_lexer": "ipython3",
429
- "version": "3.12.4"
430
  }
431
  },
432
  "nbformat": 4,
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 8,
6
  "metadata": {},
7
  "outputs": [
8
  {
 
10
  "output_type": "stream",
11
  "text": [
12
  "Connection closed.\n",
13
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
14
  "\n",
15
  "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1160344\n",
16
  "Connected. Call `.close()` to terminate connection gracefully.\n",
17
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
18
+ "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.28s) \n",
19
+ "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.04s) \n"
20
  ]
21
  }
22
  ],
23
  "source": [
24
+ "import streamlit as st\n",
25
  "import pandas as pd\n",
26
+ "import numpy as np\n",
27
+ "import datetime\n",
28
  "import hopsworks\n",
29
+ "from functions import figure, retrieve\n",
30
+ "import os\n",
31
+ "import pickle\n",
32
+ "import plotly.express as px\n",
33
  "import json\n",
34
+ "from datetime import datetime\n",
35
  "import os\n",
36
  "\n",
37
  "\n",
38
+ "# Real data\n",
39
+ "today = datetime.today().strftime('%Y-%m-%d')\n",
40
+ "df = retrieve.get_merged_dataframe()"
41
  ]
42
  },
43
  {
44
  "cell_type": "code",
45
+ "execution_count": 17,
46
  "metadata": {},
47
  "outputs": [
48
  {
49
  "name": "stdout",
50
  "output_type": "stream",
51
  "text": [
52
+ "Connection closed.\n",
53
  "Connected. Call `.close()` to terminate connection gracefully.\n",
54
+ "\n",
55
+ "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1160344\n",
56
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
57
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
58
+ "No air_quality_fv feature view found\n",
59
+ "No air_quality feature group found\n",
60
+ "No weather feature group found\n",
61
+ "No aq_predictions feature group found\n",
62
+ "No air_quality_xgboost_model model found\n",
63
+ "Connected. Call `.close()` to terminate connection gracefully.\n",
64
+ "Deleted secret SENSOR_LOCATION_JSON\n"
65
  ]
66
  }
67
  ],
68
  "source": [
69
+ "import hopsworks\n",
70
+ "import os\n",
71
  "\n",
72
+ "from functions import util\n",
73
+ "api_key = os.getenv('HOPSWORKS_API_KEY')\n",
74
+ "project_name = os.getenv('HOPSWORKS_PROJECT')\n",
75
+ "project = hopsworks.login(project=project_name, api_key_value=api_key)\n",
76
+ "util.purge_project(project)"
77
  ]
78
  },
79
  {
 
81
  "execution_count": null,
82
  "metadata": {},
83
  "outputs": [
84
  {
85
  "data": {
86
  "text/plain": [
87
+ "9"
88
  ]
89
  },
90
+ "execution_count": 16,
91
  "metadata": {},
92
+ "output_type": "execute_result"
93
  }
94
  ],
95
  "source": [
96
+ "def backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, model):\n",
97
+ " weather_df = weather_fg.read()\n",
98
+ " weather_df = weather_df.sort_values(by=['date'], ascending=True)\n",
99
+ " weather_df['date'] = weather_df['date'].dt.tz_convert(None).astype('datetime64[ns]')\n",
100
+ " air_quality_df_filter = air_quality_df[['date', 'past_air_quality']]\n",
101
+ " monitor_fg_filter = monitor_fg.read()[['date','past_air_quality']]\n",
102
+ " combined_df = pd.concat([air_quality_df_filter, monitor_fg_filter])\n",
103
+ " combined_df['date'] = pd.to_datetime(combined_df['date'], utc=True)\n",
104
+ " combined_df['date'] = combined_df['date'].dt.tz_convert(None).astype('datetime64[ns]')\n",
105
+ " features_df = pd.merge(weather_df, combined_df, on='date', how='left')\n",
106
+ " \n",
107
+ " features_df = features_df.tail(10)\n",
108
+ " features_df['predicted_pm25'] = model.predict(features_df[['past_air_quality','temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])\n",
109
+ " air_quality_df['date'] = pd.to_datetime(air_quality_df['date'])\n",
110
+ " # features_df['date'] = features_df['date'].dt.tz_convert(None).astype('datetime64[ns]')\n",
111
+ " \n",
112
+ " df = pd.merge(features_df, air_quality_df[['date','pm25','street','country']], on=\"date\")\n",
113
+ " df['days_before_forecast_day'] = 1\n",
114
+ " hindcast_df = df\n",
115
+ " df = df.drop('pm25', axis=1)\n",
116
+ " monitor_fg.insert(df, write_options={\"wait_for_job\": True})\n",
117
+ " return hindcast_df"
118
  ]
119
  },
120
  {
121
  "cell_type": "code",
122
  "execution_count": null,
123
  "metadata": {},
124
+ "outputs": [],
125
+ "source": []
126
  }
127
  ],
128
  "metadata": {
129
  "language_info": {
130
+ "name": "python"
131
  }
132
  },
133
  "nbformat": 4,
functions/__pycache__/util.cpython-312.pyc CHANGED
Binary files a/functions/__pycache__/util.cpython-312.pyc and b/functions/__pycache__/util.cpython-312.pyc differ
 
functions/{merge_df.py β†’ retrieve.py} RENAMED
@@ -50,15 +50,17 @@ def get_merged_dataframe():
50
  selected_features = air_quality_fg.select_all(['pm25', 'past_air_quality']).join(weather_fg.select(['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']), on=['city'])
51
  selected_features = selected_features.read()
52
  selected_features['date'] = pd.to_datetime(selected_features['date'], utc=True).dt.tz_convert(None).astype('datetime64[ns]')
 
53
 
54
  predicted_data = monitor_fg.read()
55
  predicted_data = predicted_data[['date','predicted_pm25']]
56
  predicted_data['date'] = predicted_data['date'].dt.tz_convert(None).astype('datetime64[ns]')
57
  predicted_data = predicted_data.sort_values(by=['date'], ascending=True).reset_index(drop=True)
 
58
 
59
 
60
  #get historical predicted pm25
61
- selected_features['predicted_pm25'] = retrieved_xgboost_model.predict(selected_features[['past_air_quality','temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])
62
 
63
  #merge data
64
  selected_features = selected_features[['date', 'pm25', 'predicted_pm25']]
@@ -70,5 +72,6 @@ def get_merged_dataframe():
70
 
71
  # Drop the individual columns after merging
72
  combined_df = combined_df.drop(columns=['predicted_pm25_x', 'predicted_pm25_y'])
 
73
 
74
  return combined_df
 
50
  selected_features = air_quality_fg.select_all(['pm25', 'past_air_quality']).join(weather_fg.select(['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']), on=['city'])
51
  selected_features = selected_features.read()
52
  selected_features['date'] = pd.to_datetime(selected_features['date'], utc=True).dt.tz_convert(None).astype('datetime64[ns]')
53
+ selected_features = selected_features.tail(100)
54
 
55
  predicted_data = monitor_fg.read()
56
  predicted_data = predicted_data[['date','predicted_pm25']]
57
  predicted_data['date'] = predicted_data['date'].dt.tz_convert(None).astype('datetime64[ns]')
58
  predicted_data = predicted_data.sort_values(by=['date'], ascending=True).reset_index(drop=True)
59
+
60
 
61
 
62
  #get historical predicted pm25
63
+ selected_features['predicted_pm25'] = retrieved_xgboost_model.predict(selected_features[['past_air_quality','temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])
64
 
65
  #merge data
66
  selected_features = selected_features[['date', 'pm25', 'predicted_pm25']]
 
72
 
73
  # Drop the individual columns after merging
74
  combined_df = combined_df.drop(columns=['predicted_pm25_x', 'predicted_pm25_y'])
75
+ combined_df = combined_df.drop_duplicates(subset=['date']).reset_index(drop=True)
76
 
77
  return combined_df
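
The newly added drop_duplicates(subset=['date']) line is the substantive fix here: merging the actual and predicted frames on 'date' can leave several rows per day, presumably breaking the one-row-per-date assumption of the Streamlit app. A toy pandas illustration with made-up values, independent of the project data:

import pandas as pd

# Two frames that both contribute rows for the same day
actual = pd.DataFrame({'date': pd.to_datetime(['2024-03-01', '2024-03-01', '2024-03-02']),
                       'pm25': [80.0, 80.0, 95.0]})
predicted = pd.DataFrame({'date': pd.to_datetime(['2024-03-01', '2024-03-02']),
                          'predicted_pm25': [78.5, 97.1]})

combined = pd.merge(actual, predicted, on='date', how='outer')
print(len(combined))   # 3 rows: 2024-03-01 appears twice
combined = combined.drop_duplicates(subset=['date']).reset_index(drop=True)
print(len(combined))   # 2 rows: one per date
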
functions/util.py CHANGED
@@ -15,10 +15,6 @@ import hopsworks
15
  import hsfs
16
  from pathlib import Path
17
 
18
- import sys
19
- print(sys.path)
20
-
21
-
22
  def get_historical_weather(city, start_date, end_date, latitude, longitude):
23
  # latitude, longitude = get_city_coordinates(city)
24
 
@@ -300,16 +296,24 @@ def check_file_path(file_path):
300
  print(f"File successfully found at the path: {file_path}")
301
 
302
  def backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, model):
303
- features_df = weather_fg.read()
304
- features_df = features_df.sort_values(by=['date'], ascending=True)
305
  features_df = features_df.tail(10)
306
- features_df['predicted_pm25'] = model.predict(features_df[['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])
307
  air_quality_df['date'] = pd.to_datetime(air_quality_df['date'])
308
- features_df['date'] = features_df['date'].dt.tz_convert(None).astype('datetime64[ns]')
309
 
310
  df = pd.merge(features_df, air_quality_df[['date','pm25','street','country']], on="date")
311
  df['days_before_forecast_day'] = 1
312
  hindcast_df = df
313
  df = df.drop('pm25', axis=1)
314
  monitor_fg.insert(df, write_options={"wait_for_job": True})
315
- return hindcast_df
 
15
  import hsfs
16
  from pathlib import Path
17
 
18
  def get_historical_weather(city, start_date, end_date, latitude, longitude):
19
  # latitude, longitude = get_city_coordinates(city)
20
 
 
296
  print(f"File successfully found at the path: {file_path}")
297
 
298
  def backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, model):
299
+ weather_df = weather_fg.read()
300
+ weather_df = weather_df.sort_values(by=['date'], ascending=True)
301
+ weather_df['date'] = weather_df['date'].dt.tz_convert(None).astype('datetime64[ns]')
302
+ air_quality_df_filter = air_quality_df[['date', 'past_air_quality']]
303
+ monitor_fg_filter = monitor_fg.read()[['date','past_air_quality']]
304
+ combined_df = pd.concat([air_quality_df_filter, monitor_fg_filter])
305
+ combined_df['date'] = pd.to_datetime(combined_df['date'], utc=True)
306
+ combined_df['date'] = combined_df['date'].dt.tz_convert(None).astype('datetime64[ns]')
307
+ features_df = pd.merge(weather_df, combined_df, on='date', how='left')
308
+
309
  features_df = features_df.tail(10)
310
+ features_df['predicted_pm25'] = model.predict(features_df[['past_air_quality','temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])
311
  air_quality_df['date'] = pd.to_datetime(air_quality_df['date'])
312
+ # features_df['date'] = features_df['date'].dt.tz_convert(None).astype('datetime64[ns]')
313
 
314
  df = pd.merge(features_df, air_quality_df[['date','pm25','street','country']], on="date")
315
  df['days_before_forecast_day'] = 1
316
  hindcast_df = df
317
  df = df.drop('pm25', axis=1)
318
  monitor_fg.insert(df, write_options={"wait_for_job": True})
319
+ return hindcast_df
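
A detail worth flagging in both versions of this helper: pandas refuses to merge a timezone-aware 'date' column with a timezone-naive one, which is why every frame read back from Hopsworks is normalized with dt.tz_convert(None) before the merge. A minimal standalone illustration of the pattern:

import pandas as pd

df = pd.DataFrame({'date': pd.to_datetime(['2024-03-01 00:00:00+00:00'])})
print(df['date'].dtype)   # datetime64[ns, UTC] -- tz-aware, cannot be merged with naive dates

# Same normalization used in util.py: convert to UTC and drop the timezone info
df['date'] = df['date'].dt.tz_convert(None).astype('datetime64[ns]')
print(df['date'].dtype)   # datetime64[ns]
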
inference_pipeline.py CHANGED
@@ -30,9 +30,6 @@ today = datetime.datetime.now() - datetime.timedelta(0)
30
  tomorrow = today + datetime.timedelta(days = 1)
31
  today
32
 
33
-
34
- # ## <span style="color:#ff5f27;"> 📑 Connect to Hopsworks Feature Store </span>
35
-
36
  # In[3]:
37
 
38
 
@@ -50,10 +47,6 @@ country=location['country']
50
  city=location['city']
51
  street=location['street']
52
 
53
-
54
- # ## <span style="color:#ff5f27;"> ⚙️ Feature View Retrieval</span>
55
- #
56
-
57
  # In[4]:
58
 
59
 
@@ -62,9 +55,6 @@ feature_view = fs.get_feature_view(
62
  version=1,
63
  )
64
 
65
-
66
- # ## <span style="color:#ff5f27;">🪝 Download the model from Model Registry</span>
67
-
68
  # In[5]:
69
 
70
 
@@ -74,38 +64,22 @@ retrieved_model = mr.get_model(
74
  name="air_quality_xgboost_model",
75
  version=1,
76
  )
77
-
78
- # Download the saved model artifacts to a local directory
79
  saved_model_dir = retrieved_model.download()
80
 
81
 
82
  # In[6]:
83
 
84
 
85
- # Loading the XGBoost regressor model and label encoder from the saved model directory
86
- # retrieved_xgboost_model = joblib.load(saved_model_dir + "/xgboost_regressor.pkl")
87
  retrieved_xgboost_model = XGBRegressor()
88
-
89
  retrieved_xgboost_model.load_model(saved_model_dir + "/model.json")
90
-
91
- # Displaying the retrieved XGBoost regressor model
92
  retrieved_xgboost_model
93
 
94
 
95
  # In[7]:
96
 
97
-
98
- # Access the feature names of the trained XGBoost model
99
  feature_names = retrieved_xgboost_model.get_booster().feature_names
100
-
101
- # Print the feature names
102
  print("Feature names:", feature_names)
103
 
104
-
105
- # ## <span style="color:#ff5f27;">✨ Get Weather Forecast Features with Feature View </span>
106
- #
107
- #
108
-
109
  # In[8]:
110
 
111
 
@@ -117,12 +91,8 @@ today_timestamp = pd.to_datetime(today)
117
  batch_data = weather_fg.filter(weather_fg.date >= today_timestamp ).read()
118
  batch_data
119
 
120
-
121
- # ### Get Mean air quality for past days
122
-
123
  # In[9]:
124
 
125
-
126
  air_quality_fg = fs.get_feature_group(
127
  name='air_quality',
128
  version=1,
@@ -130,39 +100,22 @@ air_quality_fg = fs.get_feature_group(
130
  selected_features = air_quality_fg.select_all() #(['pm25']).join(weather_fg.select_all(), on=['city'])
131
  selected_features = selected_features.read()
132
 
133
-
134
  # In[10]:
135
 
136
-
137
  selected_features = selected_features.sort_values(by='date').reset_index(drop=True)
138
 
139
-
140
  # In[11]:
141
 
142
-
143
  past_air_q_list = selected_features[['date', 'pm25']][-3:]['pm25'].tolist()
144
 
145
-
146
  # In[12]:
147
 
148
-
149
  batch_data = batch_data.sort_values(by='date').reset_index(drop=True)
150
 
151
-
152
  # In[13]:
153
 
154
-
155
  batch_data['past_air_quality'] = None
156
 
157
-
158
- # In[14]:
159
-
160
-
161
- batch_data
162
-
163
-
164
- # ### <span style="color:#ff5f27;">🤖 Making the predictions</span>
165
-
166
  # In[15]:
167
 
168
 
@@ -196,23 +149,11 @@ batch_data['predicted_pm25'] = predictions
196
  # Display the updated DataFrame
197
  batch_data
198
 
199
-
200
- # In[16]:
201
-
202
-
203
- # batch_data['predicted_pm25'] = retrieved_xgboost_model.predict(
204
- # batch_data[['temperature_2m_mean', 'precipitation_sum', 'wind_speed_10m_max', 'wind_direction_10m_dominant']])
205
- # batch_data
206
-
207
-
208
  # In[17]:
209
 
210
 
211
  batch_data.info()
212
 
213
-
214
- # ### <span style="color:#ff5f27;">🤖 Saving the predictions (for monitoring) to a Feature Group</span>
215
-
216
  # In[18]:
217
 
218
 
@@ -226,24 +167,6 @@ batch_data['date'] = batch_data['date'].dt.tz_convert(None).astype('datetime64[n
226
  batch_data
227
 
228
 
229
- # In[19]:
230
-
231
-
232
- batch_data.info()
233
-
234
-
235
- # ### Create Forecast Graph
236
- # Draw a graph of the predictions with dates as a PNG and save it to the github repo
237
- # Show it on github pages
238
-
239
- # In[20]:
240
-
241
-
242
- file_path = "img/pm25_forecast.png"
243
- plt = util.plot_air_quality_forecast(city, street, batch_data, file_path)
244
- plt.show()
245
-
246
-
247
  # In[21]:
248
 
249
 
@@ -268,8 +191,6 @@ monitor_fg.insert(batch_data, write_options={"wait_for_job": True})
268
 
269
  # We will create a hindcast chart for only the forecasts made 1 day beforehand
270
  monitoring_df = monitor_fg.filter(monitor_fg.days_before_forecast_day == 1).read()
271
- monitoring_df
272
-
273
 
274
  # In[24]:
275
 
@@ -331,19 +252,4 @@ hindcast_df = hindcast_df.sort_values(by=['date'])
331
  # If there are no outcomes for predictions yet, generate some predictions/outcomes from existing data
332
  if len(hindcast_df) == 0:
333
  hindcast_df = util.backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, retrieved_xgboost_model)
334
- hindcast_df
335
-
336
-
337
- # ### Plot the Hindcast comparing predicted with forecasted values (1-day prior forecast)
338
- #
339
- # __This graph will be empty to begin with - this is normal.__
340
- #
341
- # After a few days of predictions and observations, you will get data points in this graph.
342
-
343
- # In[32]:
344
-
345
-
346
- file_path = "img/pm25_hindcast_1day.png"
347
- plt = util.plot_air_quality_forecast(city, street, hindcast_df, file_path, hindcast=True)
348
- plt.show()
349
- # %%
 
30
  tomorrow = today + datetime.timedelta(days = 1)
31
  today
32
 
33
  # In[3]:
34
 
35
 
 
47
  city=location['city']
48
  street=location['street']
49
 
50
  # In[4]:
51
 
52
 
 
55
  version=1,
56
  )
57
 
58
  # In[5]:
59
 
60
 
 
64
  name="air_quality_xgboost_model",
65
  version=1,
66
  )
 
 
67
  saved_model_dir = retrieved_model.download()
68
 
69
 
70
  # In[6]:
71
 
72
 
 
 
73
  retrieved_xgboost_model = XGBRegressor()
 
74
  retrieved_xgboost_model.load_model(saved_model_dir + "/model.json")
 
 
75
  retrieved_xgboost_model
76
 
77
 
78
  # In[7]:
79
 
 
 
80
  feature_names = retrieved_xgboost_model.get_booster().feature_names
 
 
81
  print("Feature names:", feature_names)
82
 
83
  # In[8]:
84
 
85
 
 
91
  batch_data = weather_fg.filter(weather_fg.date >= today_timestamp ).read()
92
  batch_data
93
 
94
  # In[9]:
95
 
 
96
  air_quality_fg = fs.get_feature_group(
97
  name='air_quality',
98
  version=1,
 
100
  selected_features = air_quality_fg.select_all() #(['pm25']).join(weather_fg.select_all(), on=['city'])
101
  selected_features = selected_features.read()
102
 
 
103
  # In[10]:
104
 
 
105
  selected_features = selected_features.sort_values(by='date').reset_index(drop=True)
106
 
 
107
  # In[11]:
108
 
 
109
  past_air_q_list = selected_features[['date', 'pm25']][-3:]['pm25'].tolist()
110
 
 
111
  # In[12]:
112
 
 
113
  batch_data = batch_data.sort_values(by='date').reset_index(drop=True)
114
 
 
115
  # In[13]:
116
 
 
117
  batch_data['past_air_quality'] = None
118
 
119
  # In[15]:
120
 
121
 
 
149
  # Display the updated DataFrame
150
  batch_data
151
 
152
  # In[17]:
153
 
154
 
155
  batch_data.info()
156
 
157
  # In[18]:
158
 
159
 
 
167
  batch_data
168
 
169
 
170
  # In[21]:
171
 
172
 
 
191
 
192
  # We will create a hindcast chart for only the forecasts made 1 day beforehand
193
  monitoring_df = monitor_fg.filter(monitor_fg.days_before_forecast_day == 1).read()
 
 
194
 
195
  # In[24]:
196
 
 
252
  # If there are no outcomes for predictions yet, generate some predictions/outcomes from existing data
253
  if len(hindcast_df) == 0:
254
  hindcast_df = util.backfill_predictions_for_monitoring(weather_fg, air_quality_df, monitor_fg, retrieved_xgboost_model)
255
+ hindcast_df
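
The body of cell In[15] falls outside the diff context shown above, so the rolling forecast below is a hypothetical reconstruction rather than the committed code. It assumes past_air_quality is the mean of the three most recent pm25 values (consistent with past_air_q_list holding the last three observations) and that each day's prediction is fed back as history for the next day; it reuses batch_data, past_air_q_list and retrieved_xgboost_model from the script above.

predictions = []
for i in range(len(batch_data)):
    # Feature assumed to be the mean of the three most recent pm25 values
    batch_data.loc[i, 'past_air_quality'] = sum(past_air_q_list[-3:]) / 3
    row = batch_data.loc[[i], ['past_air_quality', 'temperature_2m_mean',
                               'precipitation_sum', 'wind_speed_10m_max',
                               'wind_direction_10m_dominant']]
    pred = float(retrieved_xgboost_model.predict(row)[0])
    predictions.append(pred)
    past_air_q_list.append(pred)   # feed the forecast back for the next day
batch_data['predicted_pm25'] = predictions
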
training.py ADDED
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # # <span style="font-weight:bold; font-size: 3rem; color:#333;">Training Pipeline</span>
5
+ #
6
+ # ## 🗒️ This notebook is divided into the following sections:
7
+ #
8
+ # 1. Select features for the model and create a Feature View with the selected features
9
+ # 2. Create training data using the feature view
10
+ # 3. Train model
11
+ # 4. Evaluate model performance
12
+ # 5. Save model to model registry
13
+
14
+ # ### <span style='color:#ff5f27'> 📝 Imports</span>
15
+
16
+ # In[1]:
17
+
18
+
19
+ import os
20
+ from datetime import datetime, timedelta
21
+ import pandas as pd
22
+ import matplotlib.pyplot as plt
23
+ from xgboost import XGBRegressor
24
+ from xgboost import plot_importance
25
+ from sklearn.metrics import mean_squared_error, r2_score
26
+ import hopsworks
27
+ from functions import util
28
+
29
+ import warnings
30
+ warnings.filterwarnings("ignore")
31
+
32
+
33
+ # ## <span style="color:#ff5f27;"> 📑 Connect to Hopsworks Feature Store </span>
34
+
35
+ # In[2]:
36
+
37
+
38
+ project = hopsworks.login()
39
+ api_key = os.getenv('HOPSWORKS_API_KEY')
40
+ project_name = os.getenv('HOPSWORKS_PROJECT')
41
+ project = hopsworks.login(project=project_name, api_key_value=api_key)
42
+ fs = project.get_feature_store()
43
+ secrets = util.secrets_api(project.name)
44
+
45
+ # In[3]:
46
+
47
+
48
+ # Retrieve feature groups
49
+ air_quality_fg = fs.get_feature_group(
50
+ name='air_quality',
51
+ version=1,
52
+ )
53
+ weather_fg = fs.get_feature_group(
54
+ name='weather',
55
+ version=1,
56
+ )
57
+
58
+
59
+ # ---
60
+ #
61
+ # ## <span style="color:#ff5f27;"> 🖍 Feature View Creation and Retrieval </span>
62
+
63
+ # In[4]:
64
+
65
+
66
+ # Select features for training data.
67
+ selected_features = air_quality_fg.select(['pm25', 'past_air_quality']).join(weather_fg.select_all(), on=['city'])
68
+ selected_features.show(10)
69
+
70
+
71
+ # In[9]:
72
+
73
+
74
+ feature_view = fs.get_or_create_feature_view(
75
+ name='air_quality_fv',
76
+ description="weather features with air quality as the target",
77
+ version=1,
78
+ labels=['pm25'],
79
+ query=selected_features,
80
+ )
81
+
82
+ # In[10]:
83
+
84
+
85
+ start_date_test_data = "2024-03-01"
86
+ # Convert string to datetime object
87
+ test_start = datetime.strptime(start_date_test_data, "%Y-%m-%d")
88
+
89
+
90
+ # In[11]:
91
+
92
+
93
+ X_train, X_test, y_train, y_test = feature_view.train_test_split(
94
+ test_start=test_start
95
+ )
96
+
97
+
98
+ # In[12]:
99
+
100
+
101
+ X_train
102
+
103
+
104
+ # In[13]:
105
+
106
+
107
+ # Drop the index columns - 'date' (event_time) and 'city' (primary key)
108
+
109
+ train_features = X_train.drop(['date', 'city'], axis=1)
110
+ test_features = X_test.drop(['date', 'city'], axis=1)
111
+
112
+
113
+ # In[14]:
114
+
115
+
116
+ y_train
117
+
118
+
119
+ # The `Feature View` is now saved in Hopsworks and you can retrieve it using `FeatureStore.get_feature_view(name='...', version=1)`.
120
+
121
+ # ---
122
+
123
+ # ## <span style="color:#ff5f27;">🧬 Modeling</span>
124
+ #
125
+ # We will train a regression model to predict pm25 from the past air quality feature (past_air_quality) together with our 4 weather features (wind_speed, wind_dir, temp, precipitation)
126
+
127
+ # In[16]:
128
+
129
+
130
+ # Creating an instance of the XGBoost Regressor
131
+ xgb_regressor = XGBRegressor()
132
+
133
+ # Fitting the XGBoost Regressor to the training data
134
+ xgb_regressor.fit(train_features, y_train)
135
+
136
+
137
+ # In[17]:
138
+
139
+
140
+ # Predicting target values on the test set
141
+ y_pred = xgb_regressor.predict(test_features)
142
+
143
+ # Calculating Mean Squared Error (MSE) using sklearn
144
+ mse = mean_squared_error(y_test.iloc[:,0], y_pred)
145
+ print("MSE:", mse)
146
+
147
+ # Calculating R squared using sklearn
148
+ r2 = r2_score(y_test.iloc[:,0], y_pred)
149
+ print("R squared:", r2)
150
+
151
+
152
+ # In[18]:
153
+
154
+
155
+ df = y_test.copy()
156
+ df['predicted_pm25'] = y_pred
157
+
158
+
159
+ # In[19]:
160
+
161
+
162
+ df['date'] = X_test['date']
163
+ df = df.sort_values(by=['date'])
164
+ df.head(5)
165
+
166
+
167
+ # In[20]:
168
+
169
+
170
+ # Creating a directory for the model artifacts if it doesn't exist
171
+ model_dir = "air_quality_model"
172
+ if not os.path.exists(model_dir):
173
+ os.mkdir(model_dir)
174
+ images_dir = model_dir + "/images"
175
+ if not os.path.exists(images_dir):
176
+ os.mkdir(images_dir)
177
+
178
+
179
+ # In[21]:
180
+
181
+
182
+ file_path = images_dir + "/pm25_hindcast.png"
183
+ plt = util.plot_air_quality_forecast("lahore", "pakistan-lahore-cantonment", df, file_path, hindcast=True)
184
+ plt.show()
185
+
186
+
187
+ # In[22]:
188
+
189
+
190
+ # Plotting feature importances using the plot_importance function from XGBoost
191
+ plot_importance(xgb_regressor, max_num_features=5)
192
+ feature_importance_path = images_dir + "/feature_importance.png"
193
+ plt.savefig(feature_importance_path)
194
+ plt.show()
195
+
196
+
197
+ # ---
198
+
199
+ # ## <span style='color:#ff5f27'>🗄 Model Registry</span>
200
+ #
201
+ # One of the features in Hopsworks is the model registry. This is where you can store different versions of models and compare their performance. Models from the registry can then be served as API endpoints.
202
+
203
+ # ### <span style="color:#ff5f27;">⚙️ Model Schema</span>
204
+
205
+ # The model needs to be set up with a [Model Schema](https://docs.hopsworks.ai/machine-learning-api/latest/generated/model_schema/), which describes the inputs and outputs for a model.
206
+ #
207
+ # A Model Schema can be automatically generated from training examples, as shown below.
208
+
209
+ # In[23]:
210
+
211
+
212
+ from hsml.schema import Schema
213
+ from hsml.model_schema import ModelSchema
214
+
215
+ # Creating input and output schemas using the 'Schema' class for features (X) and target variable (y)
216
+ input_schema = Schema(X_train)
217
+ output_schema = Schema(y_train)
218
+
219
+ # Creating a model schema using 'ModelSchema' with the input and output schemas
220
+ model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)
221
+
222
+ # Converting the model schema to a dictionary representation
223
+ schema_dict = model_schema.to_dict()
224
+
225
+
226
+ # In[24]:
227
+
228
+
229
+ # Saving the XGBoost regressor object as a json file in the model directory
230
+ xgb_regressor.save_model(model_dir + "/model.json")
231
+
232
+
233
+ # In[25]:
234
+
235
+
236
+ res_dict = {
237
+ "MSE": str(mse),
238
+ "R squared": str(r2),
239
+ }
240
+
241
+
242
+ # In[26]:
243
+
244
+
245
+ mr = project.get_model_registry()
246
+
247
+ # Creating a Python model in the model registry named 'air_quality_xgboost_model'
248
+
249
+ aq_model = mr.python.create_model(
250
+ name="air_quality_xgboost_model",
251
+ metrics=res_dict,
252
+ model_schema=model_schema,
253
+ input_example=X_test.sample().values,
254
+ description="Air Quality (PM2.5) predictor",
255
+ )
256
+
257
+ # Saving the model artifacts to the 'air_quality_model' directory in the model registry
258
+ aq_model.save(model_dir)
259
+
260
+
261
+ # ---
262
+ # ## <span style="color:#ff5f27;">⏭️ **Next:** Part 04: Batch Inference</span>
263
+ #
264
+ # In the following notebook you will use your model for Batch Inference.
265
+ #
266
+
267
+ # In[ ]:
268
+
269
+
270
+
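
To close the loop with the "Batch Inference" pointer above: once aq_model.save(model_dir) has run, the registered model can be pulled back and restored, as inference_pipeline.py in this same commit already does. A condensed excerpt using only calls that appear elsewhere in this commit:

import hopsworks
from xgboost import XGBRegressor

project = hopsworks.login()
mr = project.get_model_registry()

# Fetch the registered model and download its artifacts locally
retrieved_model = mr.get_model(name="air_quality_xgboost_model", version=1)
saved_model_dir = retrieved_model.download()

# Restore the booster from the saved json and inspect its expected features
model = XGBRegressor()
model.load_model(saved_model_dir + "/model.json")
print(model.get_booster().feature_names)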