import os
from datetime import date, timedelta

import joblib
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download, login

from src.data_api_calls import get_combined_data
from src.features_pipeline import create_features

def load_model(particle):
    """Download and load the trained forecasting model for the given particle from the Hugging Face Hub."""
    load_dotenv()
    login(token=os.getenv("HUGGINGFACE_DOWNLOAD_TOKEN"))

    repo_id = f"elisaklunder/Utrecht-{particle}-Forecasting-Model"
    if particle == "O3":
        file_name = "O3_svr_model.pkl"
    elif particle == "NO2":
        file_name = "NO2_svr_model.pkl"
    else:
        # Fail early instead of hitting a NameError on an unknown particle.
        raise ValueError(f"Unsupported particle {particle!r}; expected 'O3' or 'NO2'.")

    model_path = hf_hub_download(repo_id=repo_id, filename=file_name)
    model = joblib.load(model_path)
    return model

def run_model(particle, data):
    """Build model features, predict, and map the predictions back to original units."""
    input_data = create_features(data=data, target_particle=particle)
    model = load_model(particle)
    prediction = model.predict(input_data)

    # The model was trained on a scaled target, so invert the scaling with the
    # scaler fitted during training.
    target_scaler = joblib.load(f"scalers/target_scaler_{particle}.joblib")
    prediction = target_scaler.inverse_transform(prediction)
    return prediction

def get_data_and_predictions():
    PREDICTIONS_FILE = "predictions_history.csv"

    week_data = get_combined_data()

    o3_predictions = run_model("O3", data=week_data)
    no2_predictions = run_model("NO2", data=week_data)

    # Each model returns a single row holding the forecasts for the next three
    # days; unpack it into one record per pollutant per forecast day.
    prediction_data = []
    for i in range(3):
        prediction_data.append(
            {
                "pollutant": "O3",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": o3_predictions[0][i],
            }
        )
        prediction_data.append(
            {
                "pollutant": "NO2",
                "date_predicted": date.today(),
                "date": date.today() + timedelta(days=i + 1),
                "prediction_value": no2_predictions[0][i],
            }
        )

    predictions_df = pd.DataFrame(prediction_data)

    # Append to the prediction history, keeping the first prediction ever made
    # for each (pollutant, date_predicted, date) combination.
    if os.path.exists(PREDICTIONS_FILE):
        existing_data = pd.read_csv(PREDICTIONS_FILE)
        combined_data = pd.concat([existing_data, predictions_df])
        combined_data = combined_data.drop_duplicates(
            subset=["pollutant", "date_predicted", "date"], keep="first"
        )
    else:
        combined_data = predictions_df

    combined_data.to_csv(PREDICTIONS_FILE, index=False)
    return week_data, o3_predictions, no2_predictions
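

# Minimal usage sketch (an assumption, not part of the original module): run the
# full fetch-and-predict cycle once. Assumes HUGGINGFACE_DOWNLOAD_TOKEN is set
# in the environment (or a .env file) and the scalers/ directory is present.
if __name__ == "__main__":
    week_data, o3_preds, no2_preds = get_data_and_predictions()
    # Each prediction array holds one row with the 3-day-ahead forecast.
    print("O3 forecast:", o3_preds[0])
    print("NO2 forecast:", no2_preds[0])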