|
import pandas as pd |
|
import numpy as np |
|
import gradio as gr |
|
import datetime |
|
from dateutil.relativedelta import relativedelta |
|
from func import get_fish_qty, get_estat, dr_prediction_deployment |
|
|
|
import yaml |
|
with open('config.yaml') as file: |
|
config = yaml.safe_load(file.read()) |
|
|
|
def create_train_data(): |
|
|
|
household_survey = get_estat.get_household_survey() |
|
expence_df = pd.DataFrame({'年月':household_survey['時間軸(月次)'].unique()}) |
|
cate='3.1 電気代' |
|
temp_df = household_survey.loc[household_survey['品目分類(2020年改定)'] == cate] |
|
unit = temp_df['unit'].unique()[0] |
|
temp_df = temp_df.rename(columns={'value':f'{cate}_({unit})'}) |
|
expence_df = pd.merge(expence_df, |
|
temp_df[['時間軸(月次)', f'{cate}_({unit})']].rename(columns={'時間軸(月次)':'年月'}), |
|
on='年月', |
|
how='left') |
|
expence_df = expence_df.rename(columns={'3.1 電気代_(円)':'電気代'}) |
|
expence_df['年月'] = pd.to_datetime(expence_df['年月'], format='%Y年%m月').astype(str).apply(lambda x:''.join(x.split('-'))[:6]).astype(int) |
|
|
|
|
|
oil_price_df = pd.read_excel(config['oil_price_url'], header=5) |
|
oil_price_df = oil_price_df.rename(columns={oil_price_df.columns[0]:'年'}) |
|
oil_price_df['年'] = oil_price_df['年'].interpolate(method='ffill') |
|
oil_price_df['年月'] = oil_price_df['年'] + oil_price_df['月'].astype(str) + '月' |
|
oil_price_df['年月'] = pd.to_datetime(oil_price_df['年月'], format='%Y年%m月').astype(str).apply(lambda x:''.join(x.split('-'))[:6]).astype(int) |
|
|
|
|
|
fuel_procurement_cost_df = pd.read_excel(config['fuel_procurement_cost_url'], header=4) |
|
fuel_procurement_cost_df = fuel_procurement_cost_df.iloc[:, 3:] |
|
for i in fuel_procurement_cost_df.columns: |
|
if '\n' in i: |
|
fuel_procurement_cost_df = fuel_procurement_cost_df.rename(columns={i:i.replace('\n', '')}) |
|
|
|
fuel_procurement_cost_df['燃料費調整単価適用期間'] = fuel_procurement_cost_df['燃料費調整単価適用期間'].interpolate(method='ffill') |
|
fuel_procurement_cost_df['燃料費調整単価適用期間'] = pd.to_datetime(fuel_procurement_cost_df['燃料費調整単価適用期間'], |
|
format='%Y年\n%m月').astype(str).apply(lambda x:''.join(x.split('-'))[:6]).astype(int) |
|
for kind in fuel_procurement_cost_df['種別'].unique(): |
|
temp_df = fuel_procurement_cost_df.loc[fuel_procurement_cost_df['種別']==kind].drop('種別', axis=1) |
|
temp_df = temp_df.rename(columns={temp_df.columns[0]:'年月'}) |
|
for i in temp_df.columns: |
|
if i != '年月': |
|
temp_df = temp_df.rename(columns={i:f'{i}_{kind}_lag1'}) |
|
temp_df[f'{i}_{kind}_lag1'] = temp_df[f'{i}_{kind}_lag1'].shift(1) |
|
expence_df = pd.merge(expence_df, |
|
temp_df, |
|
on='年月', |
|
how='left') |
|
|
|
|
|
oil_price_df[['ブレント_lag3', 'ドバイ_lag3', 'WTI_lag3', 'OPECバスケット_lag3']] = oil_price_df[['ブレント', 'ドバイ', 'WTI', 'OPECバスケット']].shift(3) |
|
expence_df = pd.merge(expence_df, |
|
oil_price_df[['ブレント_lag3', 'ドバイ_lag3', 'WTI_lag3', 'OPECバスケット_lag3', '年月']], |
|
on='年月', |
|
how='left') |
|
|
|
|
|
last_time_fish_arch = pd.read_csv('data/fish_sell_ach.csv') |
|
start_date = pd.to_datetime(str(last_time_fish_arch['date'].max())) |
|
end_date = pd.to_datetime(today + relativedelta(days=1)) |
|
use_fish_list = config['use_fish_list'] |
|
temp_sell_ach = get_fish_qty.get_fish_price_data(start_date, end_date, use_fish_list) |
|
temp_sell_ach['date'] = temp_sell_ach['date'].astype(int) |
|
sell_ach = pd.concat([last_time_fish_arch, |
|
temp_sell_ach.loc[~temp_sell_ach['date'].isin(last_time_fish_arch['date'].unique())]]) |
|
sell_ach.to_csv('data/fish_sell_ach.csv', index=False) |
|
|
|
|
|
sell_ach['target_date'] = sell_ach['date'].apply(lambda x:int((pd.to_datetime(str(x))+relativedelta(months=1)).strftime('%Y%m%d'))) |
|
sell_ach['年月'] = sell_ach['target_date'].astype(str).str[:6].astype(int) |
|
|
|
col_list=['するめいか_卸売数量計(kg)', |
|
'いわし_卸売数量計(kg)', |
|
'ぶり・わらさ_卸売数量計(kg)', |
|
'冷さけ_卸売数量計(kg)', |
|
'塩さけ_卸売数量計(kg)', |
|
'さけます類_卸売数量計(kg)', |
|
'全卸売数量計(kg)'] |
|
|
|
for shift_i in [7, 14, 21, 28]: |
|
change_col_list = [f'{i}_lag{shift_i}' for i in col_list] |
|
sell_ach[change_col_list] = sell_ach[col_list].shift(shift_i) |
|
|
|
sell_ach = sell_ach.rename(columns={'date':'forecast_point'}) |
|
train_df = pd.merge(expence_df, |
|
sell_ach, |
|
on='年月') |
|
|
|
return train_df |
|
|
|
|
|
def modeling(): |
|
train_df = create_train_data() |
|
|
|
|
|
token = 'NjQwMDVmNGI0ZDQzZDFhYzI2YThmZDJiOnVZejljTXFNTXNoUnlKMStoUFhXSFdYMEZRck9lY3dobnEvRFZ1aVBHbVE9' |
|
|
|
endpoint = 'https://app.datarobot.com/api/v2' |
|
|
|
|
|
project_name = f'{datetime.datetime.now().strftime("Y%%m%d")}_ESTYLEU_電気代予測_再学習' |
|
|
|
|
|
|
|
target = '電気代' |
|
feature_timeline = 'target_date' |
|
not_use_feature = ['年月', 'forecast_point'] |
|
|
|
metric = 'RMSE' |
|
|
|
gap='P0Y' |
|
|
|
number_of_backtests = 1 |
|
end_date = train_df[target].max() |
|
|
|
holdout_end_date=pd.to_datetime(str(end_date)) |
|
holdout_start_date=holdout_end_date - relativedelta(year=1) |
|
backtest_end_date = holdout_start_date - relativedelta(days=1) |
|
backtest_start_date = backtest_end_date - relativedelta(years=1) |
|
train_end_date = backtest_start_date - relativedelta(days=1) |
|
train_start_date = pd.to_datetime(str(train_df[target].min())) |
|
|
|
|
|
|
|
mode = dr.AUTOPILOT_MODE.FULL_AUTO |
|
dr.Client( |
|
endpoint=endpoint, |
|
token=token |
|
) |
|
|
|
|
|
backtests_setting = [dr.BacktestSpecification( |
|
index=0, |
|
primary_training_start_date=train_start_date, |
|
primary_training_end_date=train_end_date, |
|
validation_start_date=backtest_start_date, |
|
validation_end_date=backtest_end_date |
|
)] |
|
|
|
spec = dr.DatetimePartitioningSpecification( |
|
feature_timeline, |
|
use_time_series=False, |
|
disable_holdout=False, |
|
holdout_start_date=holdout_start_date, |
|
holdout_end_date=holdout_end_date, |
|
gap_duration=gap, |
|
number_of_backtests=number_of_backtests, |
|
backtests=backtests_setting, |
|
) |
|
|
|
use_feature_list = df.columns.to_list() |
|
|
|
print('now creating project') |
|
project = dr.Project.create( |
|
train_df, |
|
project_name=project_name |
|
) |
|
|
|
raw = [feat_list for feat_list in project.get_featurelists() if feat_list.name == 'Informative Features'][0] |
|
raw_features = [feat for feat in raw.features if f'{feature_timeline} ' in feat] |
|
|
|
for i in not_use_feature: |
|
if i in use_feature_list: |
|
use_feature_list.remove(i) |
|
|
|
use_feature_list = use_feature_list.extend(raw_features) |
|
print("start modeling") |
|
project.analyze_and_model( |
|
target = target, |
|
mode = mode, |
|
partitioning_method=spec, |
|
max_wait=3000, |
|
worker_count=-1, |
|
featurelist_id = project.create_featurelist('モデリング', use_feature_list).id |
|
) |
|
print(project.get_leaderboard_ui_permalink()) |
|
project.wait_for_autopilot() |
|
|
|
|