"""Gradio dashboard for the household electricity-bill forecast.

The page has three tabs: next month's daily predictions, information about
the current DataRobot model (with a retrain button), and data-drift metrics
for the deployment.
"""
import calendar
import datetime
import os

import datarobot as dr
import gradio as gr
import matplotlib.pyplot as plt
import japanize_matplotlib
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import yaml
from dateutil.relativedelta import relativedelta

from function import (
    get_fish_qty,
    get_estat,
    dr_prediction_deployment,
    prediction_func,
    train_modeling,
)

with open('config.yaml', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Number of features shown in the feature-impact bar plots.
TOP_FEATURE_COUNT = 20

DATAROBOT_ENDPOINT = os.environ.get(
    'DATAROBOT_ENDPOINT', 'https://app.datarobot.com/api/v2'
)
# SECURITY: this token was committed to source control and must be treated as
# leaked. Set DATAROBOT_API_TOKEN in the environment (e.g. as a Space secret),
# rotate the key, and then delete this fallback.
_LEGACY_DATAROBOT_TOKEN = 'NjQwMDVmNGI0ZDQzZDFhYzI2YThmZDJiOnVZejljTXFNTXNoUnlKMStoUFhXSFdYMEZRck9lY3dobnEvRFZ1aVBHbVE9'

# Deployment whose drift statistics are displayed in the drift tab.
DRIFT_DEPLOYMENT_ID = '640d791796a6a52d92c368a0'


def _top_feature_impact(model, top_n=TOP_FEATURE_COUNT):
    """Return the ``top_n`` rows of a model's feature impact, strongest first.

    The result is an independent copy with a fresh 0..n-1 index, so callers
    can modify it without touching the full feature-impact table.
    """
    impact = pd.DataFrame(model.get_or_request_feature_impact())
    impact = impact.sort_values('impactNormalized', ascending=False)
    impact = impact.reset_index(drop=True)
    return impact.iloc[:top_n, :].copy()


def _add_rank_prefix(feature_impact):
    """Return a copy whose ``featureName`` values are prefixed ``01_``, ``02_``...

    The prefix follows row order. The column is rebuilt in one assignment on
    a copy instead of element-wise chained assignment, which pandas does not
    guarantee to write through to the frame.
    """
    ranked = feature_impact.copy()
    if ranked.empty:
        return ranked
    ranked['featureName'] = [
        f'{rank:02d}_{name}'
        for rank, name in enumerate(ranked['featureName'], start=1)
    ]
    return ranked


def retrain():
    """Retrain via ``train_modeling.modeling()`` and describe the new model.

    Returns:
        A 3-tuple ``(model_type, holdout_rmse, feature_impact)`` built from
        the first row of the table returned by ``train_modeling.modeling()``.
        ``feature_impact`` holds the top features with rank-prefixed names.
    """
    model_management_df = train_modeling.modeling()
    latest = model_management_df.iloc[0]
    # The project id and model id are read from fixed positions of the
    # '/'-separated model URL.
    url_parts = latest['model_url'].split('/')
    model = dr.Model.get(
        project=dr.Project.get(url_parts[4]),
        model_id=url_parts[-1],
    )
    feature_impact = _add_rank_prefix(_top_feature_impact(model))
    return latest['model_type'], model.metrics['RMSE']['holdout'], feature_impact


def get_prediction_result():
    """Build the table of daily predictions for next calendar month.

    Every day of next month gets a row. Days without a prediction get a
    forecast point one month before the target date, and days earlier than
    "now + 1 month" that still have no prediction are dropped.

    Returns:
        DataFrame with columns ``forecast_point``, ``target_date`` and
        ``電気代_予測``.
    """
    today = datetime.datetime.now()
    next_month = today + relativedelta(months=1)
    prediction_month = next_month.strftime('%Y%m')
    days_in_month = calendar.monthrange(next_month.year, next_month.month)[1]
    month_days = [
        pd.to_datetime(f'{prediction_month}{day:02d}')
        for day in range(1, days_in_month + 1)
    ]
    calendar_df = pd.DataFrame({'target_date': month_days})

    df = prediction_func.prediction_to_dr(
        config['oil_price_url'], config['fuel_procurement_cost_url']
    )
    # Keep only rows whose target date (compared on its first six characters)
    # falls in the prediction month; copy so the assignments below modify an
    # independent frame.
    in_month = df['target_date'].astype(str).str[:6] == prediction_month
    df = df.loc[in_month].copy()
    df['target_date'] = pd.to_datetime(df['target_date'].astype(str))
    df['forecast_point'] = pd.to_datetime(df['forecast_point'].astype(str))

    df = pd.merge(calendar_df, df, on='target_date', how='left')
    missing_point = df['forecast_point'].isnull()
    df.loc[missing_point, 'forecast_point'] = df['target_date'].apply(
        lambda x: x - relativedelta(months=1)
    )
    unpredicted_past = (df['target_date'] < next_month) & df['電気代'].isnull()
    df = df.loc[~unpredicted_past]
    df = df.rename(columns={'電気代': '電気代_予測'})
    return df[['forecast_point', 'target_date', '電気代_予測']]


def plot_prediction_result():
    """Return a ``gr.LinePlot`` update showing the latest prediction table."""
    return gr.LinePlot.update(
        value=get_prediction_result(),
        x="target_date",
        y="電気代_予測",
        title="昨日までの魚の卸売り量から予測された、来月の2人世帯の平均電気料金の推移",
        width=500,
        height=300,
    )


def get_model_infomation():
    """Describe the model with the lowest holdout RMSE in the forecast project.

    Configures the DataRobot client, picks the first project whose string
    representation contains '電気代予測', and ranks its datetime models
    (excluding the most-recent-value baseline) by holdout RMSE.

    Returns:
        ``(model_info, feature_impact)`` where ``model_info`` has the keys
        ``'RMSE'`` and ``'model_type'`` and ``feature_impact`` is the
        DataFrame of the top features (names not rank-prefixed).

    Raises:
        IndexError: If no project matches.
    """
    dr.Client(
        endpoint=DATAROBOT_ENDPOINT,
        token=os.environ.get('DATAROBOT_API_TOKEN', _LEGACY_DATAROBOT_TOKEN),
    )
    matching_projects = [p for p in dr.Project.list() if '電気代予測' in str(p)]
    if not matching_projects:
        raise IndexError("no DataRobot project containing '電気代予測' was found")
    project = dr.Project.get(matching_projects[0].id)

    model_df = pd.DataFrame(
        [
            [
                model.id,
                model.model_type,
                model.metrics['RMSE']['validation'],
                model.metrics['RMSE']['backtesting'],
                model.metrics['RMSE']['holdout'],
                model,
            ]
            for model in project.get_datetime_models()
            if model.model_type != 'Baseline Predictions Using Most Recent Value'
        ],
        columns=['ID', 'モデル名', 'バックテスト1', '全てのバックテスト', 'holdout', 'model'],
    )
    model_df = model_df.sort_values('holdout').reset_index(drop=True)
    model = model_df.loc[0, 'model']

    model_info = {
        'RMSE': model.metrics['RMSE']['holdout'],
        'model_type': model.model_type,
    }
    return model_info, _top_feature_impact(model)


def get_featuredrift():
    """Collect target and feature drift for the monitored deployment.

    Returns:
        ``(drift_df, start_point, end_point)``. ``drift_df`` has the columns
        ``feature_name``, ``drift_score`` and ``feature_impact``; its first
        row is the target, with feature impact fixed to 1. The two points are
        the drift period bounds formatted as strings.
    """
    deployment = dr.Deployment.get(deployment_id=DRIFT_DEPLOYMENT_ID)
    target_drift = dr.models.TargetDrift.get(deployment.id)
    feature_drift_list = dr.models.FeatureDrift.list(deployment.id)

    columns = ['feature_name', 'drift_score', 'feature_impact']
    target_row = pd.DataFrame(
        {
            'feature_name': [target_drift.target_name],
            'drift_score': [target_drift.drift_score],
            'feature_impact': [1],
        }
    )
    feature_rows = pd.DataFrame(
        [
            [drift.name, drift.drift_score, drift.feature_impact]
            for drift in feature_drift_list
        ],
        columns=columns,
    )
    drift_df = pd.concat([target_row, feature_rows])

    # Shift the period bounds by +9 hours before formatting
    # (presumably UTC -> JST; confirm against the DataRobot API).
    time_format = "%Y / %m / %d %H:%M:%S"
    start_point = (target_drift.period['start'] + relativedelta(hours=9)).strftime(time_format)
    end_point = (target_drift.period['end'] + relativedelta(hours=9)).strftime(time_format)
    # NOTE: a disabled draft that labelled rows with an alert level from
    # drift/impact thresholds was removed here; the thresholds it referenced
    # are not defined in this file.
    return drift_df, start_point, end_point


with gr.Blocks() as electoric_ploting:
    gr.Markdown(
        """
        # その日の魚の卸売り量から、来月の家計データ月別支出の電気代を予測するAI
        使用データ
        * 東京卸売市場日報
        * 家計調査の月別支出
        * 原油価格データ
        * 燃料調達価格データ

        why
        電気代のtrendは原油価格などが大きく影響するが、細かい変化は気候に影響し、気候はある程度海水温に関連性があると考えられる。
        また、魚の卸売量は水揚げ量に関係し、水揚げ量は海水温に関係するという考えからモデルを作成。
        """
    )
    with gr.Tab("予測結果"):
        with gr.Row():
            with gr.Column():
                plot = gr.LinePlot(show_label=False)
            with gr.Column():
                # Summary statistics are computed once, when the UI is built.
                df = get_prediction_result()
                predicted = df['電気代_予測']
                gr.Textbox(predicted.max(), label='現在までの予測値の最大値')
                gr.Textbox(predicted.min(), label='現在までの予測値の最小値')
                gr.Textbox(predicted.mean(), label='現在までの予測値の平均値')
                gr.Textbox(predicted.median(), label='現在までの予測値の中央値')
        with gr.Row():
            gr.DataFrame(get_prediction_result)
    with gr.Tab("モデル情報"):
        gr.Markdown(
            """
            注意:
            再学習後はモデルのデプロイが自動で行われます。
            huggingfaceの使用上csvを上書きできないため。
            """
        )
        retrain_btn = gr.Button(value="再学習")
        with gr.Row():
            with gr.Column():
                model_info, feature_impact_df = get_model_infomation()
                gr.Textbox(model_info['model_type'], label='現在のモデル')
            with gr.Column():
                output_model_type = gr.Textbox(label='再学習後のモデル')
        with gr.Row():
            with gr.Column():
                gr.Textbox(model_info['RMSE'], label='Holdout RMSE精度')
            with gr.Column():
                output_acc = gr.Textbox(label='再学習後のHoldout RMSE精度')
        with gr.Row():
            with gr.Column():
                feature_impact_df = _add_rank_prefix(feature_impact_df)
                gr.BarPlot(
                    value=feature_impact_df,
                    title='特徴量インパクト上位20',
                    x='featureName',
                    y='impactNormalized',
                    tooltip=['impactNormalized'],
                    x_title='特徴量名',
                    y_title='特徴量インパクト_相対値',
                    vertical=False,
                    y_lim=[0, 1.2],
                    width=400,
                    height=300,
                )
            with gr.Column():
                output_plot = gr.BarPlot(
                    title='再学習後特徴量インパクト上位20',
                    x='featureName',
                    y='impactNormalized',
                    tooltip=['impactNormalized'],
                    x_title='特徴量名',
                    y_title='特徴量インパクト_相対値',
                    vertical=False,
                    y_lim=[0, 1.2],
                    width=400,
                    height=300,
                )
    with gr.Tab("データドリフト情報"):
        drift_df, start_point, end_point = get_featuredrift()
        with gr.Row():
            gr.Markdown(
                """
                こちらの図はデータドリフトと特徴量の有用性を表した図になっています。
                見方は以下の通り
                * ドリフトスコア:予測データに含まれるデータが、どれぐらい過去のデータに比べてずれが発生しているかを表しており、上に行けば行くほどズレが大きい
                * 特徴量の有用性:ターゲットの有用性を1とした時に、どれぐらいそれぞれの特徴量の有用性が高いかを表したもので、右に行くほど有用性が高い
                """
            )
        with gr.Row():
            gr.Textbox(f"{start_point}〜{end_point}", label='データドリフト確認期間')
        with gr.Row():
            # Exactly one distinct drift score means there is nothing to plot;
            # the message below is shown instead.
            if len(drift_df["drift_score"].unique()) != 1:
                gr.ScatterPlot(
                    drift_df,
                    x="feature_impact",
                    y="drift_score",
                    title="データドリフトとデータの有用性",
                    color_legend_title="Species",
                    x_title="特徴量の有用性",
                    y_title="ドリフトスコア",
                    x_lim=[-0.1, drift_df["feature_impact"].max() * 1.4],
                    y_lim=[-0.1, drift_df["drift_score"].max() * 1.4],
                    tooltip=["feature_name", "feature_impact", "drift_score"],
                    caption="",
                    height=500,
                    width=500,
                )
            else:
                gr.Markdown(
                    """
                    モデルの入れ替え後に予測が実行されていないためdriftは表示できません。
                    """
                )
    retrain_btn.click(
        retrain,
        inputs=None,
        outputs=[output_model_type, output_acc, output_plot],
    )
    # Hourly tick with no output component (its former target, c_time2, is
    # disabled); kept so the registered events match the original.
    electoric_ploting.load(lambda: datetime.datetime.now(), None, every=3600)
    dep = electoric_ploting.load(plot_prediction_result, None, plot, every=3600)

# Events registered with `every=` need the queue; enable it explicitly rather
# than relying on the hosting environment to do so.
electoric_ploting.queue()
electoric_ploting.launch()
plt.close()