|
import pandas as pd |
|
import numpy as np |
|
import gradio as gr |
|
import datetime |
|
from dateutil.relativedelta import relativedelta |
|
import datarobot as dr |
|
from function import get_fish_qty, get_estat, dr_prediction_deployment |
|
|
|
import yaml |
|
# Load project-wide settings (data-source URLs, fish list, etc.) from config.yaml.
with open('config.yaml') as config_file:
    config = yaml.safe_load(config_file.read())
|
|
|
def create_train_data():
    """Build the training dataset for the electricity-cost model and persist it.

    Combines four sources into one monthly table keyed on '年月' (YYYYMM int):
      1. e-Stat household survey -> electricity expenditure ('電気代', the target)
      2. Oil price history (Excel at config['oil_price_url']), lagged 3 months
      3. Fuel procurement cost (Excel at config['fuel_procurement_cost_url']),
         lagged 1 month, one column set per fuel 種別 (kind)
      4. Fish market sales quantities (local CSV 'data/fish_sell_ach.csv'
         incrementally extended via get_fish_qty), lagged 7/14/21/28 rows

    Side effects: updates 'data/fish_sell_ach.csv' and writes 'data/train.csv'.

    Returns:
        pd.DataFrame: the merged training table (one row per fish-market date).
    """
    # --- 1. Household survey: extract the electricity-cost series ----------
    household_survey = get_estat.get_household_survey()
    # One row per month; '時間軸(月次)' is the month axis label (e.g. '2020年1月').
    expence_df = pd.DataFrame({'年月':household_survey['時間軸(月次)'].unique()})
    cate='3.1 電気代'
    temp_df = household_survey.loc[household_survey['品目分類(2020年改定)'] == cate]
    # Assumes a single unit for the category — TODO confirm against e-Stat output.
    unit = temp_df['unit'].unique()[0]
    temp_df = temp_df.rename(columns={'value':f'{cate}_({unit})'})
    expence_df = pd.merge(expence_df,
                          temp_df[['時間軸(月次)', f'{cate}_({unit})']].rename(columns={'時間軸(月次)':'年月'}),
                          on='年月',
                          how='left')
    expence_df = expence_df.rename(columns={'3.1 電気代_(円)':'電気代'})
    # '2020年1月' -> 202001 (int) so every source can join on the same key.
    expence_df['年月'] = pd.to_datetime(expence_df['年月'], format='%Y年%m月').astype(str).apply(lambda x:''.join(x.split('-'))[:6]).astype(int)

    # --- 2. Oil prices ------------------------------------------------------
    oil_price_df = pd.read_excel(config['oil_price_url'], header=5)
    oil_price_df = oil_price_df.rename(columns={oil_price_df.columns[0]:'年'})
    # Year cells appear only on the first month of each year in the sheet;
    # forward-fill them down. NOTE(review): interpolate(method='ffill') is a
    # deprecated alias of ffill() — confirm it still works on this pandas version.
    oil_price_df['年'] = oil_price_df['年'].interpolate(method='ffill')
    oil_price_df['年月'] = oil_price_df['年'] + oil_price_df['月'].astype(str) + '月'
    oil_price_df['年月'] = pd.to_datetime(oil_price_df['年月'], format='%Y年%m月').astype(str).apply(lambda x:''.join(x.split('-'))[:6]).astype(int)

    # --- 3. Fuel procurement cost, one block of lag-1 features per fuel kind
    fuel_procurement_cost_df = pd.read_excel(config['fuel_procurement_cost_url'], header=4)
    fuel_procurement_cost_df = fuel_procurement_cost_df.iloc[:, 3:]
    # Strip embedded newlines from the sheet's wrapped column headers.
    for i in fuel_procurement_cost_df.columns:
        if '\n' in i:
            fuel_procurement_cost_df = fuel_procurement_cost_df.rename(columns={i:i.replace('\n', '')})

    fuel_procurement_cost_df['燃料費調整単価適用期間'] = fuel_procurement_cost_df['燃料費調整単価適用期間'].interpolate(method='ffill')
    # Cell values still contain the newline ('%Y年\n%m月') even though headers were cleaned.
    fuel_procurement_cost_df['燃料費調整単価適用期間'] = pd.to_datetime(fuel_procurement_cost_df['燃料費調整単価適用期間'],
                                                       format='%Y年\n%m月').astype(str).apply(lambda x:''.join(x.split('-'))[:6]).astype(int)
    for kind in fuel_procurement_cost_df['種別'].unique():
        temp_df = fuel_procurement_cost_df.loc[fuel_procurement_cost_df['種別']==kind].drop('種別', axis=1)
        temp_df = temp_df.rename(columns={temp_df.columns[0]:'年月'})
        # Suffix every value column with the fuel kind and shift by one month,
        # since the current month's figure is not yet known at prediction time.
        for i in temp_df.columns:
            if i != '年月':
                temp_df = temp_df.rename(columns={i:f'{i}_{kind}_lag1'})
                temp_df[f'{i}_{kind}_lag1'] = temp_df[f'{i}_{kind}_lag1'].shift(1)
        expence_df = pd.merge(expence_df,
                              temp_df,
                              on='年月',
                              how='left')

    # Oil prices enter with a 3-month lag (publication delay — TODO confirm).
    oil_price_df[['ブレント_lag3', 'ドバイ_lag3', 'WTI_lag3', 'OPECバスケット_lag3']] = oil_price_df[['ブレント', 'ドバイ', 'WTI', 'OPECバスケット']].shift(3)
    expence_df = pd.merge(expence_df,
                          oil_price_df[['ブレント_lag3', 'ドバイ_lag3', 'WTI_lag3', 'OPECバスケット_lag3', '年月']],
                          on='年月',
                          how='left')

    # --- 4. Fish sales: extend local archive with newly scraped rows --------
    last_time_fish_arch = pd.read_csv('data/fish_sell_ach.csv')
    # 'date' is stored as YYYYMMDD int; resume scraping from the last archived day.
    start_date = pd.to_datetime(str(int(last_time_fish_arch['date'].max())))
    today = datetime.datetime.now()
    end_date = pd.to_datetime(today + relativedelta(days=1))
    use_fish_list = config['use_fish_list']
    temp_sell_ach = get_fish_qty.get_fish_price_data(start_date, end_date, use_fish_list)
    temp_sell_ach['date'] = temp_sell_ach['date'].astype(int)
    # Append only dates not already in the archive, then persist the union.
    sell_ach = pd.concat([last_time_fish_arch,
                          temp_sell_ach.loc[~temp_sell_ach['date'].isin(last_time_fish_arch['date'].unique())]])
    sell_ach.to_csv('data/fish_sell_ach.csv', index=False)

    # Each row predicts one month ahead: target_date = date + 1 month.
    sell_ach['target_date'] = sell_ach['date'].apply(lambda x:int((pd.to_datetime(str(x))+relativedelta(months=1)).strftime('%Y%m%d')))
    sell_ach['年月'] = sell_ach['target_date'].astype(str).str[:6].astype(int)

    # Wholesale-quantity columns to turn into lag features (names include units).
    col_list=['するめいか_卸売数量計(kg)',
              'いわし_卸売数量計(kg)',
              'ぶり・わらさ_卸売数量計(kg)',
              '冷さけ_卸売数量計(kg)',
              '塩さけ_卸売数量計(kg)',
              'さけます類_卸売数量計(kg)',
              '全卸売数量計(kg)']

    # Row-based shifts — assumes one row per day with no gaps; TODO confirm.
    for shift_i in [7, 14, 21, 28]:
        change_col_list = [f'{i}_lag{shift_i}' for i in col_list]
        sell_ach[change_col_list] = sell_ach[col_list].shift(shift_i)

    sell_ach = sell_ach.rename(columns={'date':'forecast_point'})
    # Inner join: keeps only months present in both the expense table and sales.
    train_df = pd.merge(expence_df,
                        sell_ach,
                        on='年月')
    train_df.to_csv('data/train.csv', index=False)

    return train_df
|
|
|
|
|
def modeling():
    """Retrain the electricity-cost model on DataRobot and refresh the deployment.

    Steps:
      1. Rebuild the training data via create_train_data().
      2. Create a datetime-partitioned project (1 backtest + 1-year holdout).
      3. Run full autopilot on a feature list that excludes identifier columns
         and includes DataRobot's derived date features.
      4. Rank models by holdout RMSE (excluding the naive baseline), log the
         winner to data/model_management.csv, and swap it into the deployment.

    Side effects: creates a DataRobot project, appends to
    'data/model_management.csv', and replaces the deployed model.
    """
    train_df = create_train_data()

    # NOTE(security): API token and deployment id are hard-coded below; they
    # should be moved to config.yaml or environment variables and rotated.
    token = 'NjQwMDVmNGI0ZDQzZDFhYzI2YThmZDJiOnVZejljTXFNTXNoUnlKMStoUFhXSFdYMEZRck9lY3dobnEvRFZ1aVBHbVE9'
    endpoint = 'https://app.datarobot.com/api/v2'

    project_name = f'{datetime.datetime.now().strftime("%Y%m%d")}_ESTYLEU_電気代予測_再学習'

    target = '電気代'
    feature_timeline = 'target_date'
    # Join/bookkeeping columns that must not be used as model features.
    not_use_feature = ['年月', 'forecast_point']
    metric = 'RMSE'
    gap = 'P0Y'
    number_of_backtests = 1

    # Partition windows, working backwards from the newest target_date:
    # [train ... | backtest: 1 year | holdout: 1 year].
    end_date = int(train_df[feature_timeline].max())
    holdout_end_date = pd.to_datetime(str(end_date))
    holdout_start_date = holdout_end_date - relativedelta(years=1)
    backtest_end_date = holdout_start_date - relativedelta(days=1)
    backtest_start_date = backtest_end_date - relativedelta(years=1)
    train_end_date = backtest_start_date - relativedelta(days=1)
    train_start_date = pd.to_datetime(str(int(train_df[feature_timeline].min())))

    mode = dr.AUTOPILOT_MODE.FULL_AUTO
    dr.Client(
        endpoint=endpoint,
        token=token
    )

    backtests_setting = [dr.BacktestSpecification(
        index=0,
        primary_training_start_date=train_start_date,
        primary_training_end_date=train_end_date,
        validation_start_date=backtest_start_date,
        validation_end_date=backtest_end_date
    )]

    spec = dr.DatetimePartitioningSpecification(
        feature_timeline,
        use_time_series=False,
        disable_holdout=False,
        holdout_start_date=holdout_start_date,
        holdout_end_date=holdout_end_date,
        gap_duration=gap,
        number_of_backtests=number_of_backtests,
        backtests=backtests_setting,
    )

    use_feature_list = train_df.columns.to_list()

    print('now creating project')
    project = dr.Project.create(
        train_df,
        project_name=project_name
    )

    # DataRobot derives date features named '<timeline> (Year)' etc.; pick them
    # up from the auto-generated 'Informative Features' list.
    raw = [feat_list for feat_list in project.get_featurelists() if feat_list.name == 'Informative Features'][0]
    raw_features = [feat for feat in raw.features if f'{feature_timeline} ' in feat]

    # Drop identifier columns, then add the derived date features.
    use_feature_list = [col for col in use_feature_list if col not in not_use_feature]
    # BUG FIX: the original did `use_feature_list = use_feature_list.extend(...)`,
    # which rebinds the name to None (list.extend returns None) and would pass
    # None to create_featurelist below.
    use_feature_list.extend(raw_features)

    print("start modeling")
    project.analyze_and_model(
        target=target,
        mode=mode,
        partitioning_method=spec,
        max_wait=3000,
        worker_count=-1,
        featurelist_id=project.create_featurelist('モデリング', use_feature_list).id
    )
    project.wait_for_autopilot()
    project.unlock_holdout()

    # Collect per-model RMSE scores, excluding the naive baseline blueprint.
    model_df = pd.DataFrame(
        [[model.id,
          model.model_type,
          model.metrics[metric]['validation'],
          model.metrics[metric]['backtesting'],
          model.metrics[metric]['holdout'],
          model] for model in project.get_datetime_models()
         if model.model_type != 'Baseline Predictions Using Most Recent Value'],
        columns=['ID', 'モデル名', 'バックテスト1', '全てのバックテスト', 'holdout', 'model'])
    # Best (lowest) holdout RMSE first.
    model_df = model_df.sort_values('holdout').reset_index(drop=True)

    model = model_df['model'][0]

    # BUG FIX: the original called bare `read_csv` (NameError) inside a bare
    # `except:`, so the history file was silently recreated on every run.
    # Use pd.read_csv and catch only the genuinely-expected missing file.
    try:
        model_management_df = pd.read_csv('data/model_management.csv')
    except FileNotFoundError:
        model_management_df = pd.DataFrame()

    # Capture a single timestamp so 作成日/作成時間 cannot straddle midnight.
    now = datetime.datetime.now()
    temp_model_management_df = pd.DataFrame({
        '作成日':[int(now.strftime('%Y%m%d'))],
        '作成時間':[int(now.strftime('%H%M%S'))],
        'project_url':[project.get_uri()],
        'model_url':[model.get_uri()],
        'model_type':[model.model_type]
    })

    model_management_df = pd.concat([model_management_df,
                                     temp_model_management_df])
    # index=False so re-reading the log does not accumulate 'Unnamed: 0' columns.
    model_management_df.to_csv('data/model_management.csv', index=False)

    deployment = dr.Deployment.get(deployment_id='640d791796a6a52d92c368a0')
    deployment.replace_model(model.id, dr.enums.MODEL_REPLACEMENT_REASON.SCHEDULED_REFRESH)
|
|
|
|
|
|