assignment / predict.py
EstyleSeiya's picture
Update predict.py
52ec042
import pandas as pd
import datetime as dt
import lightgbm as lgb
import pickle
import datetime
import json
import numpy as np
import requests
from bs4 import BeautifulSoup #ダウンロードしてなかったらpipでできるからやってね。
import csv
# Convert a scraped value to float. (The JMA embeds "/" when a value could not be obtained, so such values become 0.)
def str2float(str):
    """Convert a scraped cell value to ``float``, falling back to 0.0.

    Args:
        str: The value to convert (typically the text of a table cell).
            The parameter name shadows the builtin ``str``; it is kept
            unchanged so existing callers are not affected.

    Returns:
        ``float(str)`` when the conversion succeeds, otherwise ``0.0``
        (for example for ``"/"``, an empty string, or ``None``).
    """
    try:
        return float(str)
    # Only conversion failures are treated as "no data"; a bare ``except``
    # would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return 0.0
def rename_multicol(df):
    """Flatten the (multi-level) column labels of ``df`` into plain strings.

    Each column label is replaced by the concatenation of its level values
    (``("a", "x")`` becomes ``"ax"``).

    Args:
        df: DataFrame whose column labels are tuples of strings (a
            ``MultiIndex``) or plain strings.

    Returns:
        A new DataFrame with the same rows and flat string column names.
        The frame is round-tripped through a transpose, so mixed-dtype
        input comes back as ``object`` columns; cast afterwards if numeric
        dtypes are required.
    """
    original_columns = df.columns
    # Number of header rows that the transpose round-trip will inject.
    header_rows = original_columns.nlevels
    flat_names = ["".join(label) for label in original_columns]

    # Transpose, move the column levels into ordinary columns, transpose
    # back: the former column levels become the first ``header_rows`` rows
    # and the columns become 0..n-1.
    flattened = df.T.reset_index(drop=False).T
    # Drop the injected header rows by position so this works regardless
    # of how many levels there are or whether the levels are named.
    flattened = flattened.iloc[header_rows:]
    # Rename every positional column in a single pass.
    return flattened.rename(columns=dict(enumerate(flat_names)))
def _forecast_row(forecast_day, weather_condition_dict):
    """Turn one weatherapi.com ``forecastday`` entry into a table row.

    The returned list is ordered like the observed-weather columns:
    date, average temperature, maximum temperature, minimum temperature,
    daytime summary, night-time summary. The API entry carries a single
    daily condition, so it is used for both summaries.
    """
    day = forecast_day["day"]
    condition = weather_condition_dict[day["condition"]["code"]]
    return [
        forecast_day["date"],
        day["avgtemp_c"],
        day["maxtemp_c"],
        day["mintemp_c"],
        condition,
        condition,
    ]


def _first_available(primary, *fallbacks):
    """Return ``primary`` with missing entries filled from each fallback in order."""
    result = primary
    for fallback in fallbacks:
        result = result.where(result.notna(), fallback)
    return result


def _count_weather_keywords(frame, column, prefix, keywords):
    """Count, per (年, 月) group, the rows of ``column`` that contain each keyword.

    Returns a frame with columns 年, 月 and one ``{prefix}_{keyword}``
    column per keyword, in the order the keywords were given.
    """
    grouped = frame.groupby(["年", "月"])[column]
    counts = None
    for keyword in keywords:
        keyword_counts = (
            grouped.apply(lambda summary, keyword=keyword: summary.str.contains(keyword).sum())
            .reset_index()
            .rename(columns={column: f"{prefix}_{keyword}"})
        )
        if counts is None:
            counts = keyword_counts
        else:
            counts = pd.merge(counts, keyword_counts, on=["年", "月"])
    return counts


def predict():
    """Predict this month's electricity bill with the pickled model.

    Steps:
      1. Load the monthly bill history, the weather-condition code table
         and the pickled model from the working directory.
      2. Scrape the current month's daily weather table from the JMA site.
      3. Fetch a 14-day forecast and per-day long-range forecasts from
         weatherapi.com.
      4. For every date in the scraped table, take the observed value when
         present, otherwise the short forecast, otherwise the long-range
         forecast.
      5. Aggregate to monthly features (weather-keyword counts, temperature
         max/mean/min, the bill of the same month one year earlier, two
         fixed surcharge values) and run the model.

    Returns:
        int: The first value returned by ``model.predict`` for the feature
        row(s), truncated to an integer.

    Raises:
        Whatever ``requests``, ``pandas``, ``pickle`` or the model raise on
        network, file or data errors (including a request timeout).
    """
    tokyo_monthly_electric_bill_path = "tokyo_monthly_electiric_bill.csv"
    weather_condition_path = "weather_conditions.csv"
    model_path = 'nowcast_electriic_bill_model.pickle'
    # Seconds to wait for each HTTP request before giving up instead of hanging.
    request_timeout = 30
    # NOTE(review): the API key is hard-coded in source (and is printed as
    # part of the long-range URLs below); consider moving it to configuration.
    api_key = "9184b04d480140f8a3c133051232903"

    # --- Load local data -------------------------------------------------
    tokyo_monthly_electric_bill_df = pd.read_csv(tokyo_monthly_electric_bill_path)
    weather_condition_df = pd.read_csv(weather_condition_path)

    # Parse the month label and split it into year / month key columns.
    tokyo_monthly_electric_bill_df["時間軸(月次)"] = pd.to_datetime(
        tokyo_monthly_electric_bill_df["時間軸(月次)"], format="%Y年%m月"
    )
    tokyo_monthly_electric_bill_df["年"] = tokyo_monthly_electric_bill_df["時間軸(月次)"].dt.year
    tokyo_monthly_electric_bill_df["月"] = tokyo_monthly_electric_bill_df["時間軸(月次)"].dt.month

    weather_condition_dict = dict(zip(weather_condition_df["code"], weather_condition_df["天候"]))

    # NOTE: unpickling executes code from the file; only load a trusted model file.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # NOTE(review): uses the machine's local clock; confirm it matches the
    # intended (Tokyo) calendar month.
    dt_now = datetime.datetime.now()
    year = dt_now.year
    month = dt_now.month

    # --- This month's observed weather, scraped from the JMA site --------
    place_code_a = 44
    place_code_b = 47662
    # Year and month are selected through the query string.
    base_url = "http://www.data.jma.go.jp/obd/stats/etrn/view/daily_s1.php?prec_no=%s&block_no=%s&year=%s&month=%s&day=1&view=p1"
    r = requests.get(base_url % (place_code_a, place_code_b, year, month), timeout=request_timeout)
    r.encoding = r.apparent_encoding
    # NOTE(review): no parser is specified, so BeautifulSoup picks whichever
    # is installed; pin one explicitly once the choice is confirmed.
    soup = BeautifulSoup(r.text)
    # Data rows are <tr class="mtx">; the first four of them are header rows.
    rows = soup.find_all('tr', class_='mtx')[4:]

    column_list = ['年月日', '平均気温', '最高気温', '最低気温', '天気概況(昼:06時〜18時)', '天気概況(夜:18時〜翌日06時)']
    observed_rows = []
    for row in rows:
        cells = row.find_all('td')
        observed_rows.append([
            f"{year}/{month}/{cells[0].string}",
            str2float(cells[6].string),
            str2float(cells[7].string),
            str2float(cells[8].string),
            cells[19].string,
            cells[20].string,
        ])
    observed_weather_df = pd.DataFrame(observed_rows, columns=column_list)
    observed_weather_df["年月日"] = pd.to_datetime(observed_weather_df["年月日"])
    observed_weather_df = observed_weather_df.add_prefix('天気実績_')

    # --- Short-range forecast (up to 14 days) ----------------------------
    forecast_days = 14
    forecast_url = f"http://api.weatherapi.com/v1/forecast.json?key={api_key}&q=Tokyo&days={forecast_days}"
    r = requests.get(forecast_url, timeout=request_timeout)
    r.encoding = r.apparent_encoding
    response_json = json.loads(r.text)
    print("======"*10)
    # Iterate over what the API actually returned: it may contain fewer
    # than ``forecast_days`` entries.
    short_rows = [
        _forecast_row(forecast_day, weather_condition_dict)
        for forecast_day in response_json["forecast"]["forecastday"][:forecast_days]
    ]
    short_forecast_weather_df = pd.DataFrame(short_rows, columns=column_list)
    short_forecast_weather_df["年月日"] = pd.to_datetime(short_forecast_weather_df["年月日"])
    short_forecast_weather_df = short_forecast_weather_df.add_prefix('短期天気予報_')

    # --- Long-range forecast (day 15 onwards, one request per day) -------
    necessary_days = 20
    base_url = f"https://api.weatherapi.com/v1/future.json?key={api_key}&q=Tokyo&hour=24&dt="
    long_rows = []
    for forecast_day_point in range(0, necessary_days):
        forecast_day = (dt_now + datetime.timedelta(days=14+forecast_day_point)).strftime('%Y-%m-%d')
        url = base_url + forecast_day
        print(url)
        r = requests.get(url, timeout=request_timeout)
        r.encoding = r.apparent_encoding
        response_json = json.loads(r.text)
        print("======"*10)
        print(response_json)
        long_rows.append(_forecast_row(response_json["forecast"]["forecastday"][0], weather_condition_dict))
    long_forecast_weather_df = pd.DataFrame(long_rows, columns=column_list)
    long_forecast_weather_df["年月日"] = pd.to_datetime(long_forecast_weather_df["年月日"])
    long_forecast_weather_df = long_forecast_weather_df.add_prefix('長期天気予報_')

    # --- Combine observations with forecasts -----------------------------
    # Left joins: only dates present in the scraped table are kept.
    merged_weather_df = pd.merge(observed_weather_df, short_forecast_weather_df, left_on="天気実績_年月日", right_on="短期天気予報_年月日", how="left")
    merged_weather_df = pd.merge(merged_weather_df, long_forecast_weather_df, left_on="天気実績_年月日", right_on="長期天気予報_年月日", how="left")

    total_weather_df = pd.DataFrame({"年月日": merged_weather_df["天気実績_年月日"]})
    for column in ["平均気温", "最高気温", "最低気温"]:
        observed = merged_weather_df[f"天気実績_{column}"]
        # str2float stores 0.0 for values that could not be read, so 0.0 in
        # the observed data is treated as missing (a genuine 0.0 reading is
        # indistinguishable from it).
        observed = observed.mask(observed == 0.0)
        total_weather_df[column] = _first_available(
            observed,
            merged_weather_df[f"短期天気予報_{column}"],
            merged_weather_df[f"長期天気予報_{column}"],
        )
    for column in ["天気概況(昼:06時〜18時)", "天気概況(夜:18時〜翌日06時)"]:
        total_weather_df[column] = _first_available(
            merged_weather_df[f"天気実績_{column}"],
            merged_weather_df[f"短期天気予報_{column}"],
            merged_weather_df[f"長期天気予報_{column}"],
        )
    total_weather_df["年"] = total_weather_df["年月日"].dt.year
    total_weather_df["月"] = total_weather_df["年月日"].dt.month

    # --- Monthly weather-keyword counts ----------------------------------
    # The order of this list fixes the order of the feature columns.
    weather_keywords = ["曇", "快晴", "晴", "雨", "大雨", "霧雨", "雪", "雷", "みぞれ", "後", "一時"]
    first_half_weather_df = _count_weather_keywords(total_weather_df, '天気概況(昼:06時〜18時)', "日中", weather_keywords)
    latter_half_weather_df = _count_weather_keywords(total_weather_df, '天気概況(夜:18時〜翌日06時)', "夜間", weather_keywords)
    # Both frames have the same groups in the same order, so the key
    # columns are kept only once before placing them side by side.
    latter_half_weather_df = latter_half_weather_df.drop(["年", "月"], axis=1)
    weather_count_df = pd.concat([first_half_weather_df, latter_half_weather_df], axis=1)

    # --- Monthly temperature aggregates ----------------------------------
    temperature_agg_list = ['max', 'mean', 'min']
    total_weather_agg_df = total_weather_df.groupby(["年", "月"]).agg({'平均気温': temperature_agg_list, '最高気温': temperature_agg_list, '最低気温': temperature_agg_list}).reset_index()
    total_weather_agg_df = rename_multicol(total_weather_agg_df)
    total_weather_agg_df = total_weather_agg_df.astype(float)

    # Shift the bill history forward one year so that joining on (年, 月)
    # attaches the bill of the same month in the previous year.
    tokyo_monthly_electric_bill_df["年"] = tokyo_monthly_electric_bill_df["年"] + 1
    total_weather_agg_df = pd.merge(total_weather_agg_df, tokyo_monthly_electric_bill_df, on=["年", "月"], how="left")
    total_weather_agg_df = total_weather_agg_df.rename(columns={"value": "前年同月実績"})

    # --- Assemble the feature row and predict ----------------------------
    predict_df = pd.merge(weather_count_df, total_weather_agg_df, on=["年", "月"])
    predict_df["再生エネルギー賦課金"] = 3.45
    predict_df["燃料費調整(低圧)"] = 4.69
    predict_df = predict_df.drop("時間軸(月次)", axis=1)
    this_month_electric_price = int(model.predict(predict_df)[0])
    return this_month_electric_price