NaokiOkamoto committed e76c3de
Parent(s): 32b2fac

Delete function
- function/.DS_Store +0 -0
- function/dr_prediction_deployment.py +0 -119
- function/get_estat.py +0 -31
- function/get_fish_qty.py +0 -107
- function/prediction_func.py +0 -118
- function/train_modeling.py +0 -221
function/.DS_Store
DELETED
Binary file (6.15 kB)
function/dr_prediction_deployment.py
DELETED
@@ -1,119 +0,0 @@
import sys
import json
import requests


class DataRobotPredictionError(Exception):
    """Raised if there are issues getting predictions from DataRobot"""


def make_datarobot_deployment_predictions(data, deployment_id):
    """
    Make predictions on data provided using DataRobot deployment_id provided.
    See docs for details:
    https://app.datarobot.com/docs-jp/predictions/api/dr-predapi.html

    Parameters
    ----------
    data : str
        If using CSV as input:
        Feature1,Feature2
        numeric_value,string

        Or if using JSON as input:
        [{"Feature1":numeric_value,"Feature2":"string"}]

    deployment_id : str
        The ID of the deployment to make predictions with.

    Returns
    -------
    Response schema:
    https://app.datarobot.com/docs-jp/predictions/api/dr-predapi.html#response-schema

    Raises
    ------
    DataRobotPredictionError if there are issues getting predictions from DataRobot
    """
    # Set HTTP headers. The charset should match the contents of the file.
    headers = {
        # As default, we expect CSV as input data.
        # Should you wish to supply JSON instead,
        # comment out the line below and use the line after that instead:
        'Content-Type': 'text/plain; charset=UTF-8',
        # 'Content-Type': 'application/json; charset=UTF-8',

        'Authorization': 'Bearer {}'.format('NjQwMDVmNGI0ZDQzZDFhYzI2YThmZDJiOnVZejljTXFNTXNoUnlKMStoUFhXSFdYMEZRck9lY3dobnEvRFZ1aVBHbVE9'),
        'DataRobot-Key': '84f96e49-d400-ec9c-92fc-30fc6e9329d1',
    }
    API_URL = 'https://jppdemo.orm.datarobot.com/predApi/v1.0/deployments/{deployment_id}/predictions'
    url = API_URL.format(deployment_id=deployment_id)

    # Prediction Explanations:
    # See the documentation for more information:
    # https://app.datarobot.com/docs-jp/predictions/api/dr-predapi.html#request-pred-explanations
    # Should you wish to include Prediction Explanations or Prediction Warnings in the result,
    # change the parameters below accordingly, and remove the comment from the params field below:

    params = {
        # If explanations are required, uncomment the line below
        # 'maxExplanations': 3,
        # 'thresholdHigh': 0.5,
        # 'thresholdLow': 0.15,
        # If text explanations are required, uncomment the line below.
        # 'maxNgramExplanations': 'all',
        # Uncomment this for Prediction Warnings, if enabled for your deployment.
        # 'predictionWarningEnabled': 'true',
    }
    # Make API request for predictions
    predictions_response = requests.post(
        url,
        data=data,
        headers=headers,
        # Prediction Explanations:
        # Uncomment this to include explanations in your prediction
        # params=params,
    )
    _raise_dataroboterror_for_status(predictions_response)
    # Return a Python dict following the schema in the documentation
    return predictions_response.json()


def _raise_dataroboterror_for_status(response):
    """Raise DataRobotPredictionError if the request fails along with the response returned"""
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        err_msg = '{code} Error: {msg}'.format(
            code=response.status_code, msg=response.text)
        raise DataRobotPredictionError(err_msg)


def main(filename, deployment_id):
    """
    Return an exit code on script completion or error. Codes > 0 are errors to the shell.
    Also useful as a usage demonstration of
    `make_datarobot_deployment_predictions(data, deployment_id)`
    """
    MAX_PREDICTION_FILE_SIZE_BYTES = 52428800  # 50 MB
    if not filename:
        print(
            'Input file is a required argument. '
            'Usage: python datarobot-predict.py <input-file.csv>')
        return 1
    data = open(filename, 'rb').read()
    data_size = sys.getsizeof(data)
    if data_size >= MAX_PREDICTION_FILE_SIZE_BYTES:
        print((
            'Input file is too large: {} bytes. '
            'Max allowed size is: {} bytes.'
        ).format(data_size, MAX_PREDICTION_FILE_SIZE_BYTES))
        return 1
    try:
        predictions = make_datarobot_deployment_predictions(data, deployment_id)
    except DataRobotPredictionError as exc:
        print(exc)
        return 1
    return predictions

# def prediction_formatting_to_dataframe(prediction_json):
#     prediction_df = pd.json_normalize(prediction_json)[['rowId'
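For reference, a minimal usage sketch of this module follows; it is not part of the deleted file. It assumes the deployment responds with the documented schema (a "data" list whose entries include "rowId" and "prediction") and reuses the CSV path and deployment ID that prediction_func.py passes to main().

# Hypothetical usage sketch: score a CSV against the deployment and
# normalize the JSON response into a DataFrame.
import pandas as pd

from function import dr_prediction_deployment

DEPLOYMENT_ID = "640d791796a6a52d92c368a0"  # deployment ID used elsewhere in this repo

prediction_json = dr_prediction_deployment.main("data/temp_prediction.csv", DEPLOYMENT_ID)
if prediction_json == 1:  # main() returns 1 on error, otherwise the response dict
    raise SystemExit("prediction request failed")
prediction_df = pd.json_normalize(prediction_json["data"])[["rowId", "prediction"]]
print(prediction_df.head())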
function/get_estat.py
DELETED
@@ -1,31 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
import datetime
|
3 |
-
|
4 |
-
|
5 |
-
def get_household_survey():
|
6 |
-
# e-Statにユーザー登録し、APIキーを取得しておくこと
|
7 |
-
# URL: https://www.e-stat.go.jp/api/
|
8 |
-
API_KEY = "ddc1349cf530bdee69ca6a7ad6c0e2301aeb0780"
|
9 |
-
|
10 |
-
# 取得年月の設定
|
11 |
-
latest_year = int(datetime.datetime.now().strftime('%Y'))
|
12 |
-
year_period = 5
|
13 |
-
years = list(range(latest_year, latest_year - year_period, -1))
|
14 |
-
months = range(1, 13)
|
15 |
-
periods = []
|
16 |
-
for y in years:
|
17 |
-
y = y * 1_000_000
|
18 |
-
for m in months:
|
19 |
-
ym = y + m * 100 + m
|
20 |
-
periods.append(str(ym))
|
21 |
-
periods = ("%2C").join(periods)
|
22 |
-
|
23 |
-
# データ取得
|
24 |
-
url = f"http://api.e-stat.go.jp/rest/3.0/app/getSimpleStatsData?cdTab=01&cdCat02=03&cdArea=00000&cdTime={periods}&appId={API_KEY}&lang=J&statsDataId=0003343671&metaGetFlg=Y&cntGetFlg=N&explanationGetFlg=Y&annotationGetFlg=Y§ionHeaderFlg=1&replaceSpChars=0"
|
25 |
-
df = pd.read_csv(url, header=28)
|
26 |
-
return df
|
27 |
-
|
28 |
-
|
29 |
-
if __name__ == "__main__":
|
30 |
-
df = get_household_survey()
|
31 |
-
df.to_csv("household_survey.csv", index=False)
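A worked example of the period-code arithmetic above, shown only to make the construction concrete; whether these codes match the cdTime values that statsDataId 0003343671 expects is an assumption carried over from the original code, not verified here.

# Period code for January 2023, following get_household_survey():
y = 2023 * 1_000_000   # 2023000000
m = 1                  # January
ym = y + m * 100 + m   # 2023000101
assert ym == 2023000101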
function/get_fish_qty.py
DELETED
@@ -1,107 +0,0 @@
import codecs
import io
import random
import requests
import time
from datetime import date, timedelta
from tqdm import tqdm
from typing import Generator, Tuple

import numpy as np
import pandas as pd


def date_range(
    start: date, stop: date, step: timedelta = timedelta(1)
) -> Generator[date, None, None]:
    """Generator that yields dates from start to stop, step days at a time"""
    current = start
    while current < stop:
        yield current
        current += step


def get_url(download_date: date) -> Tuple[str, str]:
    """Return the URL to download and the date string"""
    month = download_date.strftime("%Y%m")
    day = download_date.strftime("%Y%m%d")
    return (
        f"https://www.shijou-nippo.metro.tokyo.lg.jp/SN/{month}/{day}/Sui/Sui_K1.csv",
        day,
    )


def content_wrap(content):
    """Skip lines until the header row appears as the first line"""
    buffer = ""
    first = True
    for line in io.BytesIO(content):
        line_str = codecs.decode(line, "shift-jis")
        if first:
            if "品名" in line_str:
                first = False
                buffer = line_str
            else:
                continue
        else:
            buffer += line_str
    return io.StringIO(buffer)


def to_numeric(x):
    """Convert a string to a numeric value"""
    if isinstance(x, str):
        return float(x)
    else:
        return x


def get_fish_price_data(start_date: date, end_date: date, use_fish_list) -> pd.core.frame.DataFrame:
    """
    Pull data from the Tokyo Metropolitan Central Wholesale Market

    :param start_date: start date
    :param end_date: end date
    :return: data with the horse mackerel prices joined
    """
    columns = ['date'] + [i + '_卸売数量計(kg)' for i in use_fish_list] + ['全卸売数量計(kg)']
    fish_qty_df = pd.DataFrame(columns=columns)

    iterator = tqdm(
        date_range(start_date, end_date), total=(end_date - start_date).days
    )

    for download_date in iterator:
        url, day = get_url(download_date)
        iterator.set_description(day)
        response = requests.get(url)

        # When the URL does not exist (no report published for that day)
        temp_df = pd.DataFrame([{'date': day}])
        if response.status_code == 404:
            continue
        assert (
            response.status_code == 200
        ), f"Unexpected HTTP response. Please check the website {url}."

        df = pd.read_csv(content_wrap(response.content))

        for i in use_fish_list:
            temp = df.loc[df["品名"] == i, ['卸売数量計']]

            # display(temp)

            if len(temp) == 0:
                temp_df[f'{i}_卸売数量計(kg)'] = 0
            else:
                temp_df[f'{i}_卸売数量計(kg)'] = temp['卸売数量計'].sum()

        all_qty = df[['卸売数量計']].dropna().values[-1][0]

        temp_df['全卸売数量計(kg)'] = all_qty

        fish_qty_df = pd.concat([fish_qty_df, temp_df])
        time.sleep(max(0.5 + random.normalvariate(0, 0.3), 0.1))
    return fish_qty_df
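A minimal usage sketch, not part of the deleted file. It assumes Sui_K1.csv is published for the requested dates; the fish names are examples of 品名 values and are taken from the column lists used elsewhere in this repository.

# Hypothetical usage sketch: fetch one week of wholesale quantities.
from datetime import date

from function import get_fish_qty

fish_df = get_fish_qty.get_fish_price_data(
    start_date=date(2023, 3, 1),
    end_date=date(2023, 3, 8),
    use_fish_list=["するめいか", "いわし"],  # matched against the 品名 column
)
print(fish_df.head())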
function/prediction_func.py
DELETED
@@ -1,118 +0,0 @@
import pandas as pd
import numpy as np
import gradio as gr
import datetime
from dateutil.relativedelta import relativedelta
from function import get_fish_qty, get_estat, dr_prediction_deployment

import yaml
with open('config.yaml') as file:
    config = yaml.safe_load(file.read())

def create_prediction_data(fish_sell_ach, oil_price_url, fuel_procurement_cost_url):
    oil_price_df = pd.read_excel(oil_price_url, header=5)
    oil_price_df = oil_price_df.rename(columns={oil_price_df.columns[0]: '年'})
    oil_price_df['年'] = oil_price_df['年'].interpolate(method='ffill')
    oil_price_df['年月'] = oil_price_df['年'] + oil_price_df['月'].astype(str) + '月'
    oil_price_df['年月'] = pd.to_datetime(oil_price_df['年月'], format='%Y年%m月')
    oil_price_df['年月'] = oil_price_df['年月'].apply(lambda x: x + relativedelta(months=3))
    oil_price_df['年月'] = oil_price_df['年月'].apply(lambda x: ''.join(str(x).split('-'))[:6]).astype(int)
    oil_price_df = oil_price_df.drop(['年', '月'], axis=1)
    for i in oil_price_df.columns:
        if i != '年月':
            oil_price_df = oil_price_df.rename(columns={i: f'{i}_lag3'})
            oil_price_df[f'{i}_lag3'] = oil_price_df[f'{i}_lag3'].shift(1)

    fuel_procurement_cost_df = pd.read_excel(fuel_procurement_cost_url, header=4)
    fuel_procurement_cost_df = fuel_procurement_cost_df.iloc[:, 3:]
    for i in fuel_procurement_cost_df.columns:
        if '\n' in i:
            fuel_procurement_cost_df = fuel_procurement_cost_df.rename(columns={i: i.replace('\n', '')})

    fuel_procurement_cost_df['燃料費調整単価適用期間'] = fuel_procurement_cost_df['燃料費調整単価適用期間'].interpolate(method='ffill')
    fuel_procurement_cost_df['燃料費調整単価適用期間'] = pd.to_datetime(fuel_procurement_cost_df['燃料費調整単価適用期間'],
                                                     format='%Y年\n%m月').astype(str).apply(lambda x: ''.join(x.split('-'))[:6]).astype(int)

    col_list = ['するめいか_卸売数量計(kg)',
                'いわし_卸売数量計(kg)',
                'ぶり・わらさ_卸売数量計(kg)',
                '冷さけ_卸売数量計(kg)',
                '塩さけ_卸売数量計(kg)',
                'さけます類_卸売数量計(kg)',
                '全卸売数量計(kg)']

    for shift_i in [7, 14, 21, 28]:
        change_col_list = [f'{i}_lag{shift_i}' for i in col_list]
        fish_sell_ach[change_col_list] = fish_sell_ach[col_list].shift(shift_i)

    fish_sell_ach['target_date'] = fish_sell_ach['date'].apply(lambda x: int((pd.to_datetime(str(x)) + relativedelta(months=1)).strftime('%Y%m%d')))
    fish_sell_ach['年月'] = fish_sell_ach['target_date'].astype(str).str[:6].astype(int)

    prediction_df = pd.merge(fish_sell_ach,
                             oil_price_df,
                             on='年月',
                             how='left')

    for kind in fuel_procurement_cost_df['種別'].unique():
        temp_df = fuel_procurement_cost_df.loc[fuel_procurement_cost_df['種別'] == kind].drop('種別', axis=1)
        temp_df = temp_df.rename(columns={temp_df.columns[0]: '年月'})
        for i in temp_df.columns:
            if i != '年月':
                temp_df = temp_df.rename(columns={i: f'{i}_{kind}_lag1'})
        temp_df['年月'] = pd.to_datetime(temp_df['年月'], format='%Y%m')
        temp_df['年月'] = temp_df['年月'].apply(lambda x: x + relativedelta(months=1))
        temp_df['年月'] = temp_df['年月'].apply(lambda x: ''.join(str(x).split('-'))[:6]).astype(int)
        prediction_df = pd.merge(prediction_df,
                                 temp_df,
                                 on='年月')
    prediction_df = prediction_df.rename(columns={'date': 'forecast_point'})

    return prediction_df

def prediction_to_dr(oil_price_url, fuel_procurement_cost_url):
    today = datetime.datetime.now()
    last_prediction_result = pd.read_csv('data/prediction_result.csv')
    last_time_fish_arch = pd.read_csv('data/fish_sell_ach.csv')
    # Skip when a prediction already exists for today or the fish data already covers today
    if (str(last_prediction_result['forecast_point'].max()) == today.strftime('%Y%m%d')) | (str(last_time_fish_arch['date'].max()) == today.strftime('%Y%m%d')):
        pass

    else:
        start_date = pd.to_datetime(str(last_time_fish_arch['date'].max()))
        end_date = pd.to_datetime(today + relativedelta(days=1))
        use_fish_list = config['use_fish_list']
        temp_sell_ach = get_fish_qty.get_fish_price_data(start_date, end_date, use_fish_list)
        temp_sell_ach['date'] = temp_sell_ach['date'].astype(int)
        if str(temp_sell_ach['date'].max()) != today.strftime('%Y%m%d'):
            pass

        else:
            temp_sell_ach = pd.concat([last_time_fish_arch,
                                       temp_sell_ach.loc[~temp_sell_ach['date'].isin(last_time_fish_arch['date'].unique())]])
            temp_sell_ach.to_csv('data/fish_sell_ach.csv', index=False)
            prediction_df = create_prediction_data(temp_sell_ach,
                                                   oil_price_url,
                                                   fuel_procurement_cost_url)
            prediction_df = prediction_df.loc[(prediction_df['forecast_point'].astype(int) > last_prediction_result['forecast_point'].max())
                                              &
                                              (prediction_df['forecast_point'].astype(int) <= int(today.strftime('%Y%m%d')))].reset_index(drop=True)

            DEPLOYMENT_ID = '640d791796a6a52d92c368a0'
            prediction_df.to_csv('data/temp_prediction.csv', index=False)
            prediction_json = dr_prediction_deployment.main('data/temp_prediction.csv', DEPLOYMENT_ID)

            prediction_result = pd.DataFrame({
                'target_date': prediction_df['target_date'],
                'forecast_point': prediction_df['forecast_point'],
                '電気代': pd.json_normalize(prediction_json['data'])['prediction']
            })

            prediction_result = pd.merge(prediction_df,
                                         prediction_result,
                                         on=['target_date', 'forecast_point'])

            last_prediction_result = pd.concat([last_prediction_result,
                                                prediction_result])

            last_prediction_result.to_csv('data/prediction_result.csv', index=False)

    return last_prediction_result
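This module reads config.yaml at import time and uses its use_fish_list entry (train_modeling.py additionally reads oil_price_url and fuel_procurement_cost_url). A sketch of a compatible config is shown inline below; only the key names come from the code in this repository, the values are placeholders.

# Hypothetical config.yaml contents, parsed inline for illustration.
import yaml

config_text = """
use_fish_list:
  - するめいか
  - いわし
  - ぶり・わらさ
  - 冷さけ
  - 塩さけ
  - さけます類
oil_price_url: https://example.com/oil_price.xls          # placeholder
fuel_procurement_cost_url: https://example.com/fuel.xlsx  # placeholder
"""
config = yaml.safe_load(config_text)
print(config["use_fish_list"])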
function/train_modeling.py
DELETED
@@ -1,221 +0,0 @@
import pandas as pd
import numpy as np
import gradio as gr
import datetime
from dateutil.relativedelta import relativedelta
import datarobot as dr
from function import get_fish_qty, get_estat, dr_prediction_deployment

import yaml
with open('config.yaml') as file:
    config = yaml.safe_load(file.read())

def create_train_data():
    # Extract the target
    household_survey = get_estat.get_household_survey()
    expence_df = pd.DataFrame({'年月': household_survey['時間軸(月次)'].unique()})
    cate = '3.1 電気代'
    temp_df = household_survey.loc[household_survey['品目分類(2020年改定)'] == cate]
    unit = temp_df['unit'].unique()[0]
    temp_df = temp_df.rename(columns={'value': f'{cate}_({unit})'})
    expence_df = pd.merge(expence_df,
                          temp_df[['時間軸(月次)', f'{cate}_({unit})']].rename(columns={'時間軸(月次)': '年月'}),
                          on='年月',
                          how='left')
    expence_df = expence_df.rename(columns={'3.1 電気代_(円)': '電気代'})
    expence_df['年月'] = pd.to_datetime(expence_df['年月'], format='%Y年%m月').astype(str).apply(lambda x: ''.join(x.split('-'))[:6]).astype(int)

    # Extract and build the crude oil price data
    oil_price_df = pd.read_excel(config['oil_price_url'], header=5)
    oil_price_df = oil_price_df.rename(columns={oil_price_df.columns[0]: '年'})
    oil_price_df['年'] = oil_price_df['年'].interpolate(method='ffill')
    oil_price_df['年月'] = oil_price_df['年'] + oil_price_df['月'].astype(str) + '月'
    oil_price_df['年月'] = pd.to_datetime(oil_price_df['年月'], format='%Y年%m月').astype(str).apply(lambda x: ''.join(x.split('-'))[:6]).astype(int)

    # Build the fuel procurement cost data
    fuel_procurement_cost_df = pd.read_excel(config['fuel_procurement_cost_url'], header=4)
    fuel_procurement_cost_df = fuel_procurement_cost_df.iloc[:, 3:]
    for i in fuel_procurement_cost_df.columns:
        if '\n' in i:
            fuel_procurement_cost_df = fuel_procurement_cost_df.rename(columns={i: i.replace('\n', '')})

    fuel_procurement_cost_df['燃料費調整単価適用期間'] = fuel_procurement_cost_df['燃料費調整単価適用期間'].interpolate(method='ffill')
    fuel_procurement_cost_df['燃料費調整単価適用期間'] = pd.to_datetime(fuel_procurement_cost_df['燃料費調整単価適用期間'],
                                                     format='%Y年\n%m月').astype(str).apply(lambda x: ''.join(x.split('-'))[:6]).astype(int)
    for kind in fuel_procurement_cost_df['種別'].unique():
        temp_df = fuel_procurement_cost_df.loc[fuel_procurement_cost_df['種別'] == kind].drop('種別', axis=1)
        temp_df = temp_df.rename(columns={temp_df.columns[0]: '年月'})
        for i in temp_df.columns:
            if i != '年月':
                temp_df = temp_df.rename(columns={i: f'{i}_{kind}_lag1'})
                temp_df[f'{i}_{kind}_lag1'] = temp_df[f'{i}_{kind}_lag1'].shift(1)
        expence_df = pd.merge(expence_df,
                              temp_df,
                              on='年月',
                              how='left')

    # Join the datasets
    oil_price_df[['ブレント_lag3', 'ドバイ_lag3', 'WTI_lag3', 'OPECバスケット_lag3']] = oil_price_df[['ブレント', 'ドバイ', 'WTI', 'OPECバスケット']].shift(3)
    expence_df = pd.merge(expence_df,
                          oil_price_df[['ブレント_lag3', 'ドバイ_lag3', 'WTI_lag3', 'OPECバスケット_lag3', '年月']],
                          on='年月',
                          how='left')

    # Load the fish wholesale data
    last_time_fish_arch = pd.read_csv('data/fish_sell_ach.csv')
    start_date = pd.to_datetime(str(int(last_time_fish_arch['date'].max())))
    today = datetime.datetime.now()
    end_date = pd.to_datetime(today + relativedelta(days=1))
    use_fish_list = config['use_fish_list']
    temp_sell_ach = get_fish_qty.get_fish_price_data(start_date, end_date, use_fish_list)
    temp_sell_ach['date'] = temp_sell_ach['date'].astype(int)
    sell_ach = pd.concat([last_time_fish_arch,
                          temp_sell_ach.loc[~temp_sell_ach['date'].isin(last_time_fish_arch['date'].unique())]])
    sell_ach.to_csv('data/fish_sell_ach.csv', index=False)

    # Build the training data
    sell_ach['target_date'] = sell_ach['date'].apply(lambda x: int((pd.to_datetime(str(x)) + relativedelta(months=1)).strftime('%Y%m%d')))
    sell_ach['年月'] = sell_ach['target_date'].astype(str).str[:6].astype(int)

    col_list = ['するめいか_卸売数量計(kg)',
                'いわし_卸売数量計(kg)',
                'ぶり・わらさ_卸売数量計(kg)',
                '冷さけ_卸売数量計(kg)',
                '塩さけ_卸売数量計(kg)',
                'さけます類_卸売数量計(kg)',
                '全卸売数量計(kg)']

    for shift_i in [7, 14, 21, 28]:
        change_col_list = [f'{i}_lag{shift_i}' for i in col_list]
        sell_ach[change_col_list] = sell_ach[col_list].shift(shift_i)

    sell_ach = sell_ach.rename(columns={'date': 'forecast_point'})
    train_df = pd.merge(expence_df,
                        sell_ach,
                        on='年月')
    train_df.to_csv('data/train.csv', index=False)

    return train_df


def modeling():
    train_df = create_train_data()
    # Settings required for modeling
    ## DataRobot connection settings
    token = 'NjQwMDVmNGI0ZDQzZDFhYzI2YThmZDJiOnVZejljTXFNTXNoUnlKMStoUFhXSFdYMEZRck9lY3dobnEvRFZ1aVBHbVE9'
    ### This appears to be the demo environment
    endpoint = 'https://app.datarobot.com/api/v2'

    ## Project name
    project_name = f'{datetime.datetime.now().strftime("%Y%m%d")}_ESTYLEU_電気代予測_再学習'

    ## Various settings
    ### Feature settings
    target = '電気代'
    feature_timeline = 'target_date'  # time-series (date) column
    not_use_feature = ['年月', 'forecast_point']
    # Optimization metric
    metric = 'RMSE'
    ### Gap
    gap = 'P0Y'  # Does this mean zero? Needs confirmation
    ### Number of backtests
    number_of_backtests = 1
    end_date = int(train_df[feature_timeline].max())
    ### Dates
    holdout_end_date = pd.to_datetime(str(end_date))
    holdout_start_date = holdout_end_date - relativedelta(years=1)
    backtest_end_date = holdout_start_date - relativedelta(days=1)
    backtest_start_date = backtest_end_date - relativedelta(years=1)
    train_end_date = backtest_start_date - relativedelta(days=1)
    train_start_date = pd.to_datetime(str(int(train_df[feature_timeline].min())))

    ### Modeling mode
    # mode = dr.AUTOPILOT_MODE.QUICK
    mode = dr.AUTOPILOT_MODE.FULL_AUTO
    dr.Client(
        endpoint=endpoint,
        token=token
    )

    # Backtest settings
    backtests_setting = [dr.BacktestSpecification(
        index=0,
        primary_training_start_date=train_start_date,
        primary_training_end_date=train_end_date,
        validation_start_date=backtest_start_date,
        validation_end_date=backtest_end_date
    )]

    spec = dr.DatetimePartitioningSpecification(
        feature_timeline,
        use_time_series=False,
        disable_holdout=False,
        holdout_start_date=holdout_start_date,
        holdout_end_date=holdout_end_date,
        gap_duration=gap,
        number_of_backtests=number_of_backtests,
        backtests=backtests_setting,
    )

    use_feature_list = train_df.columns.to_list()

    print('now creating project')
    project = dr.Project.create(
        train_df,
        project_name=project_name
    )

    raw = [feat_list for feat_list in project.get_featurelists() if feat_list.name == 'Informative Features'][0]
    raw_features = [feat for feat in raw.features if f'{feature_timeline} ' in feat]

    for i in not_use_feature:
        if i in use_feature_list:
            use_feature_list.remove(i)

    use_feature_list.extend(raw_features)
    print("start modeling")
    project.analyze_and_model(
        target=target,
        mode=mode,
        partitioning_method=spec,
        max_wait=3000,
        worker_count=-1,
        featurelist_id=project.create_featurelist('モデリング', use_feature_list).id
    )
    project.wait_for_autopilot()
    project.unlock_holdout()

    model_df = pd.DataFrame(
        [[model.id,
          model.model_type,
          model.metrics['RMSE']['validation'],
          model.metrics['RMSE']['backtesting'],
          model.metrics['RMSE']['holdout'],
          model] for model in project.get_datetime_models() if model.model_type != 'Baseline Predictions Using Most Recent Value'],
        columns=['ID', 'モデル名', 'バックテスト1', '全てのバックテスト', 'holdout', 'model'])
    model_df = model_df.sort_values('holdout').reset_index(drop=True)

    model = model_df['model'][0]

    try:
        model_management_df = pd.read_csv('data/model_management.csv')
    except FileNotFoundError:
        model_management_df = pd.DataFrame()

    temp_model_management_df = pd.DataFrame({
        '作成日': [int(datetime.datetime.now().strftime('%Y%m%d'))],
        '作成時間': [int(datetime.datetime.now().strftime('%H%M%S'))],
        'project_url': [project.get_uri()],
        'model_url': [model.get_uri()],
        'model_type': [model.model_type]
    })

    model_management_df = pd.concat([model_management_df,
                                     temp_model_management_df])
    model_management_df.to_csv('data/model_management.csv')

    deployment = dr.Deployment.get(deployment_id='640d791796a6a52d92c368a0')

    deployment.replace_model(model.id, dr.enums.MODEL_REPLACEMENT_REASON.SCHEDULED_REFRESH)
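Assuming the intent was to run retraining as a standalone script, a minimal entry point might look like the sketch below; the deleted file defines modeling() but never calls it, so this wrapper is hypothetical and not part of the original repository.

# Hypothetical entry point: rebuild data/train.csv, rerun Autopilot, and
# replace the model behind deployment 640d791796a6a52d92c368a0.
from function import train_modeling

if __name__ == "__main__":
    train_modeling.modeling()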