Spaces:
Runtime error
Runtime error
import pandas as pd | |
import numpy as np | |
import gradio as gr | |
import datetime | |
from dateutil.relativedelta import relativedelta | |
import datarobot as dr | |
from function import get_fish_qty, get_estat, dr_prediction_deployment | |
import yaml | |
with open('config.yaml') as file: | |
config = yaml.safe_load(file.read()) | |
def create_train_data(): | |
# ターゲットを抽出 | |
household_survey = get_estat.get_household_survey() | |
expence_df = pd.DataFrame({'年月':household_survey['時間軸(月次)'].unique()}) | |
cate='3.1 電気代' | |
temp_df = household_survey.loc[household_survey['品目分類(2020年改定)'] == cate] | |
unit = temp_df['unit'].unique()[0] | |
temp_df = temp_df.rename(columns={'value':f'{cate}_({unit})'}) | |
expence_df = pd.merge(expence_df, | |
temp_df[['時間軸(月次)', f'{cate}_({unit})']].rename(columns={'時間軸(月次)':'年月'}), | |
on='年月', | |
how='left') | |
expence_df = expence_df.rename(columns={'3.1 電気代_(円)':'電気代'}) | |
expence_df['年月'] = pd.to_datetime(expence_df['年月'], format='%Y年%m月').astype(str).apply(lambda x:''.join(x.split('-'))[:6]).astype(int) | |
# 原油価格を抽出し作成 | |
oil_price_df = pd.read_excel(config['oil_price_url'], header=5) | |
oil_price_df = oil_price_df.rename(columns={oil_price_df.columns[0]:'年'}) | |
oil_price_df['年'] = oil_price_df['年'].interpolate(method='ffill') | |
oil_price_df['年月'] = oil_price_df['年'] + oil_price_df['月'].astype(str) + '月' | |
oil_price_df['年月'] = pd.to_datetime(oil_price_df['年月'], format='%Y年%m月').astype(str).apply(lambda x:''.join(x.split('-'))[:6]).astype(int) | |
# 燃料調達価格のデータを作成 | |
fuel_procurement_cost_df = pd.read_excel(config['fuel_procurement_cost_url'], header=4) | |
fuel_procurement_cost_df = fuel_procurement_cost_df.iloc[:, 3:] | |
for i in fuel_procurement_cost_df.columns: | |
if '\n' in i: | |
fuel_procurement_cost_df = fuel_procurement_cost_df.rename(columns={i:i.replace('\n', '')}) | |
fuel_procurement_cost_df['燃料費調整単価適用期間'] = fuel_procurement_cost_df['燃料費調整単価適用期間'].interpolate(method='ffill') | |
fuel_procurement_cost_df['燃料費調整単価適用期間'] = pd.to_datetime(fuel_procurement_cost_df['燃料費調整単価適用期間'], | |
format='%Y年\n%m月').astype(str).apply(lambda x:''.join(x.split('-'))[:6]).astype(int) | |
for kind in fuel_procurement_cost_df['種別'].unique(): | |
temp_df = fuel_procurement_cost_df.loc[fuel_procurement_cost_df['種別']==kind].drop('種別', axis=1) | |
temp_df = temp_df.rename(columns={temp_df.columns[0]:'年月'}) | |
for i in temp_df.columns: | |
if i != '年月': | |
temp_df = temp_df.rename(columns={i:f'{i}_{kind}_lag1'}) | |
temp_df[f'{i}_{kind}_lag1'] = temp_df[f'{i}_{kind}_lag1'].shift(1) | |
expence_df = pd.merge(expence_df, | |
temp_df, | |
on='年月', | |
how='left') | |
# 各データを結合 | |
oil_price_df[['ブレント_lag3', 'ドバイ_lag3', 'WTI_lag3', 'OPECバスケット_lag3']] = oil_price_df[['ブレント', 'ドバイ', 'WTI', 'OPECバスケット']].shift(3) | |
expence_df = pd.merge(expence_df, | |
oil_price_df[['ブレント_lag3', 'ドバイ_lag3', 'WTI_lag3', 'OPECバスケット_lag3', '年月']], | |
on='年月', | |
how='left') | |
# 魚の卸売りデータを読み込み | |
last_time_fish_arch = pd.read_csv('data/fish_sell_ach.csv') | |
start_date = pd.to_datetime(str(int(last_time_fish_arch['date'].max()))) | |
today = datetime.datetime.now() | |
end_date = pd.to_datetime(today + relativedelta(days=1)) | |
use_fish_list = config['use_fish_list'] | |
temp_sell_ach = get_fish_qty.get_fish_price_data(start_date, end_date, use_fish_list) | |
temp_sell_ach['date'] = temp_sell_ach['date'].astype(int) | |
sell_ach = pd.concat([last_time_fish_arch, | |
temp_sell_ach.loc[~temp_sell_ach['date'].isin(last_time_fish_arch['date'].unique())]]) | |
sell_ach.to_csv('data/fish_sell_ach.csv', index=False) | |
# trainデータの作成 | |
sell_ach['target_date'] = sell_ach['date'].apply(lambda x:int((pd.to_datetime(str(x))+relativedelta(months=1)).strftime('%Y%m%d'))) | |
sell_ach['年月'] = sell_ach['target_date'].astype(str).str[:6].astype(int) | |
col_list=['するめいか_卸売数量計(kg)', | |
'いわし_卸売数量計(kg)', | |
'ぶり・わらさ_卸売数量計(kg)', | |
'冷さけ_卸売数量計(kg)', | |
'塩さけ_卸売数量計(kg)', | |
'さけます類_卸売数量計(kg)', | |
'全卸売数量計(kg)'] | |
for shift_i in [7, 14, 21, 28]: | |
change_col_list = [f'{i}_lag{shift_i}' for i in col_list] | |
sell_ach[change_col_list] = sell_ach[col_list].shift(shift_i) | |
sell_ach = sell_ach.rename(columns={'date':'forecast_point'}) | |
train_df = pd.merge(expence_df, | |
sell_ach, | |
on='年月') | |
train_df.to_csv('data/train.csv', index=False) | |
return train_df | |
def modeling(): | |
train_df = create_train_data() | |
# モデリングに必要な各設定値 | |
## データロボットとの接続設定 | |
token = 'NjQwMDVmNGI0ZDQzZDFhYzI2YThmZDJiOnVZejljTXFNTXNoUnlKMStoUFhXSFdYMEZRck9lY3dobnEvRFZ1aVBHbVE9' | |
### デモ環境これっぽい | |
endpoint = 'https://app.datarobot.com/api/v2' | |
## プロジェクト名 | |
project_name = f'{datetime.datetime.now().strftime("%Y%m%d")}_ESTYLEU_電気代予測_再学習' | |
## 各種設定 | |
### 特徴量設定 | |
target = '電気代' | |
feature_timeline = 'target_date' #時系列 | |
not_use_feature = ['年月', 'forecast_point'] | |
# 最適化指標 | |
metric = 'RMSE' | |
### ギャップ | |
gap='P0Y' # これで0?要確認 | |
### バックテストの数 | |
number_of_backtests = 1 | |
end_date = int(train_df[feature_timeline].max()) | |
### 日付 | |
holdout_end_date=pd.to_datetime(str(end_date)) | |
holdout_start_date=holdout_end_date - relativedelta(years=1) | |
backtest_end_date = holdout_start_date - relativedelta(days=1) | |
backtest_start_date = backtest_end_date - relativedelta(years=1) | |
train_end_date = backtest_start_date - relativedelta(days=1) | |
train_start_date = pd.to_datetime(str(int(train_df[feature_timeline].min()))) | |
### モデリングモード | |
mode = dr.AUTOPILOT_MODE.QUICK | |
# mode = dr.AUTOPILOT_MODE.FULL_AUTO | |
dr.Client( | |
endpoint=endpoint, | |
token=token | |
) | |
# バックテスト設定 | |
backtests_setting = [dr.BacktestSpecification( | |
index=0, | |
primary_training_start_date=train_start_date, | |
primary_training_end_date=train_end_date, | |
validation_start_date=backtest_start_date, | |
validation_end_date=backtest_end_date | |
)] | |
spec = dr.DatetimePartitioningSpecification( | |
feature_timeline, | |
use_time_series=False, | |
disable_holdout=False, | |
holdout_start_date=holdout_start_date, | |
holdout_end_date=holdout_end_date, | |
gap_duration=gap, | |
number_of_backtests=number_of_backtests, | |
backtests=backtests_setting, | |
) | |
use_feature_list = train_df.columns.to_list() | |
print('now creating project') | |
project = dr.Project.create( | |
train_df, | |
project_name=project_name | |
) | |
raw = [feat_list for feat_list in project.get_featurelists() if feat_list.name == 'Informative Features'][0] | |
raw_features = [feat for feat in raw.features if f'{feature_timeline} ' in feat] | |
for i in not_use_feature: | |
if i in use_feature_list: | |
use_feature_list.remove(i) | |
use_feature_list = use_feature_list.extend(raw_features) | |
print("start modeling") | |
project.analyze_and_model( | |
target = target, | |
mode = mode, | |
partitioning_method=spec, | |
max_wait=3000, | |
worker_count=-1, | |
featurelist_id = project.create_featurelist('モデリング', use_feature_list).id | |
) | |
project.wait_for_autopilot() | |
project.unlock_holdout() | |
model_df = pd.DataFrame( | |
[[model.id, | |
model.model_type, | |
model.metrics['RMSE']['validation'], | |
model.metrics['RMSE']['backtesting'], | |
model.metrics['RMSE']['holdout'], | |
model] for model in project.get_datetime_models() if model.model_type != 'Baseline Predictions Using Most Recent Value'], | |
columns=['ID', 'モデル名', 'バックテスト1', '全てのバックテスト', 'holdout', 'model']) | |
model_df = model_df.sort_values('holdout').reset_index(drop=True) | |
model = model_df['model'][0] | |
model_management_df = pd.DataFrame() | |
temp_model_management_df = pd.DataFrame({ | |
'作成日':[int(datetime.datetime.now().strftime('%Y%m%d'))], | |
'作成時間':[int(datetime.datetime.now().strftime('%H%M%S'))], | |
'project_url':[project.get_uri()], | |
'model_url':[model.get_uri()], | |
'model_type':[model.model_type] | |
}) | |
model_management_df = pd.concat([model_management_df, | |
temp_model_management_df]) | |
model_management_df.to_csv('data/temp_model_management.csv') | |
deployment = dr.Deployment.get(deployment_id='640d791796a6a52d92c368a0') | |
deployment.replace_model(model.id, dr.enums.MODEL_REPLACEMENT_REASON.SCHEDULED_REFRESH) | |
return model_management_df | |