Spaces:
Runtime error
Runtime error
| import altair | |
| import gradio as gr | |
| from math import sqrt | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import pandas as pd | |
| import datetime | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.metrics import log_loss | |
| from sklearn.preprocessing import StandardScaler | |
| import requests | |
| from bs4 import BeautifulSoup as bs | |
| from requests_html import AsyncHTMLSession | |
| import codecs | |
| import io | |
| import random | |
| import requests | |
| import time | |
| from datetime import date, timedelta | |
| from tqdm import tqdm | |
| from typing import Generator, Tuple | |
| import numpy as np | |
| import pandas as pd | |
def date_range(
    start: date, stop: date, step: timedelta = timedelta(1)
) -> Generator[date, None, None]:
    """Yield successive dates from ``start`` up to, but not including, ``stop``.

    Args:
        start: First date to yield.
        stop: Exclusive upper bound; nothing is yielded when ``start >= stop``.
        step: Increment between consecutive dates (one day by default).
            Must be strictly positive.

    Yields:
        ``start``, ``start + step``, ... for as long as the value is strictly
        earlier than ``stop``.

    Raises:
        ValueError: If ``step`` is zero or negative. The check runs when the
            generator is first advanced, not when it is created.
    """
    # A zero step would never reach ``stop`` (infinite loop) and a negative
    # step would walk backwards until the date underflows, so reject both.
    if step <= timedelta(0):
        raise ValueError(f"step must be a positive timedelta, got {step!r}")
    current = start
    while current < stop:
        yield current
        current += step
def get_url(download_date: date) -> Tuple[str, str]:
    """Build the market-report CSV URL for a date, plus the date as text.

    Args:
        download_date: Day whose report should be addressed.

    Returns:
        A ``(url, day)`` pair. ``day`` is ``download_date`` formatted as
        ``YYYYMMDD``; ``url`` embeds both the ``YYYYMM`` and the ``YYYYMMDD``
        form of the date in its path.
    """
    year_month = f"{download_date:%Y%m}"
    day = f"{download_date:%Y%m%d}"
    url = f"https://www.shijou-nippo.metro.tokyo.lg.jp/SN/{year_month}/{day}/Sui/Sui_K1.csv"
    return url, day
def content_wrap(content):
    """Decode a Shift-JIS payload and drop everything before the header row.

    The header row is the first line containing "品名". That line and every
    line after it are kept unchanged; all earlier lines are discarded.

    Args:
        content: Raw bytes of the downloaded file.

    Returns:
        An ``io.StringIO`` positioned at the start of the kept text. It is
        empty when no line contains "品名".
    """
    kept_lines = []
    header_found = False
    for raw_line in io.BytesIO(content):
        text = codecs.decode(raw_line, "shift-jis")
        # Skip the preamble until the header row shows up.
        if not header_found and "品名" not in text:
            continue
        header_found = True
        kept_lines.append(text)
    return io.StringIO("".join(kept_lines))
def insert_data(data, day, low_price, center_price, high_price, quantity):
    """Append one day's record to the column-oriented ``data`` dict, in place.

    Args:
        data: Dict holding a list under each of the keys ``"date"``,
            ``"low_price"``, ``"center_price"``, ``"high_price"`` and
            ``"quantity"``.
        day: Value appended to ``data["date"]``.
        low_price: Value appended to ``data["low_price"]``.
        center_price: Value appended to ``data["center_price"]``.
        high_price: Value appended to ``data["high_price"]``.
        quantity: Value appended to ``data["quantity"]``.
    """
    record = (
        ("date", day),
        ("low_price", low_price),
        ("center_price", center_price),
        ("high_price", high_price),
        ("quantity", quantity),
    )
    for column, value in record:
        data[column].append(value)
def to_numeric(x):
    """Convert a string to ``float``; return any non-string value unchanged."""
    return float(x) if isinstance(x, str) else x
def _is_nan(value) -> bool:
    """Return True only for a float NaN; strings, ints and None are not NaN."""
    return isinstance(value, float) and np.isnan(value)


def _summarise_aji_rows(df_aji, price_cols, quantity_col):
    """Collapse one day's "あじ" rows into (low, center, high, quantity)."""
    low_prices = []
    center_prices = []
    high_prices = []
    total_quantity = 0
    for _, row in df_aji.iterrows():
        lp, cp, hp = (to_numeric(row[c]) for c in price_cols)
        q = to_numeric(row[quantity_col])
        if _is_nan(lp) and _is_nan(hp) and not _is_nan(cp):
            # Only the middle price is recorded: a single price band, so use
            # it for the low and high price as well.
            lp = hp = cp
        elif not _is_nan(lp) and not _is_nan(hp) and _is_nan(cp):
            # Low and high are recorded but not the middle: use their mean.
            cp = (lp + hp) / 2
        low_prices.append(lp)
        center_prices.append(cp)
        high_prices.append(hp)
        # A missing quantity counts as zero.
        if not _is_nan(q):
            total_quantity += q

    # Drop prices that are still missing: min()/max() give order-dependent
    # results when NaN is present, and int(nan) raises ValueError.
    low_prices = [p for p in low_prices if not _is_nan(p)]
    center_prices = [p for p in center_prices if not _is_nan(p)]
    high_prices = [p for p in high_prices if not _is_nan(p)]

    low_price = int(min(low_prices)) if low_prices else np.nan
    center_price = (
        int(sum(center_prices) / len(center_prices)) if center_prices else np.nan
    )
    high_price = int(max(high_prices)) if high_prices else np.nan
    quantity = int(float(total_quantity))
    return low_price, center_price, high_price, quantity


def get_fish_price_data(start_date: date, end_date: date) -> pd.core.frame.DataFrame:
    """Download daily "あじ" prices from the Tokyo wholesale market report.

    One CSV is requested per day via ``get_url``. For each day, the rows
    whose "品名" equals "あじ" are reduced to the lowest low price, the mean
    middle price, the highest high price and the summed quantity.

    Args:
        start_date: First day to download (inclusive).
        end_date: Day at which to stop (exclusive, as in ``date_range``).

    Returns:
        A DataFrame with the columns ``date`` (``YYYYMMDD`` string),
        ``low_price``, ``center_price``, ``high_price`` and ``quantity``,
        one row per day. Days whose CSV does not exist (HTTP 404) or that
        contain no "あじ" row get NaN prices and a quantity of 0.

    Raises:
        AssertionError: If the server answers with a status other than
            200 or 404.
    """
    price_cols = ["安値(円)", "中値(円)", "高値(円)"]
    quantity_col = "卸売数量"
    data = {
        "date": [],
        "low_price": [],
        "center_price": [],
        "high_price": [],
        "quantity": [],
    }
    iterator = tqdm(
        date_range(start_date, end_date), total=(end_date - start_date).days
    )
    for download_date in iterator:
        url, day = get_url(download_date)
        iterator.set_description(day)
        # A timeout keeps a stalled connection from hanging the app forever.
        response = requests.get(url, timeout=30)

        # The URL does not exist for this day.
        if response.status_code == 404:
            insert_data(data, day, np.nan, np.nan, np.nan, 0)
            continue
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is kept for callers.
        if response.status_code != 200:
            raise AssertionError(
                f"Unexpected HTTP response. Please check the website {url}."
            )

        df = pd.read_csv(content_wrap(response.content))

        # Turn the "no value" placeholders (ASCII hyphen and full-width minus)
        # into NaN. Assign the result back: an in-place update on ``df[c]``
        # does not reliably write through to ``df``.
        for c in price_cols + [quantity_col]:
            is_placeholder = (df[c] == "-") | (df[c] == "−")
            df[c] = df[c].mask(is_placeholder, np.nan)

        # Keep only the rows whose item name is "あじ".
        df_aji = df.loc[df["品名"] == "あじ", [quantity_col] + price_cols]

        # No "あじ" was sold that day: record it as missing.
        if len(df_aji) == 0:
            insert_data(data, day, np.nan, np.nan, np.nan, 0)
            continue

        low_price, center_price, high_price, quantity = _summarise_aji_rows(
            df_aji, price_cols, quantity_col
        )
        insert_data(data, day, low_price, center_price, high_price, quantity)

        # Random cool-down so requests are not sent in a tight burst.
        time.sleep(max(0.5 + random.normalvariate(0, 0.3), 0.1))

    return pd.DataFrame(data)
# Fetch and parse a Football Lab match page.
# NOTE(review): `html` and `soup` are not used in the visible part of this
# file; confirm whether this request is still needed.
load_url = "https://www.football-lab.jp/kyot/match/"
html = requests.get(load_url, timeout=30)  # timeout: never hang at start-up
soup = bs(html.content, "html.parser")

# Fit the attendance model on the prepared training table.
df_train = pd.read_csv('df_train.csv')
X = df_train.drop('audience', axis=1)
y = df_train['audience']
linear_regression = LinearRegression()
model = linear_regression.fit(X, y)

# Read the clock once so the three dates stay consistent even when the
# script starts right around midnight.
d_today = datetime.date.today()
d_tom = d_today + datetime.timedelta(days=1)
d_y = d_today + datetime.timedelta(days=-1)

# Fetch the previous day's "あじ" market data.
# NOTE(review): indentation was lost in the reviewed copy; the guard is
# assumed to cover only the three statements below. Everything after it
# needs `df_aji_pre`, so the file only works when run as a script.
if __name__ == "__main__":
    start_date = d_y
    end_date = d_today
    df_aji_pre = get_fish_price_data(start_date=start_date, end_date=end_date)
df_aji_pre['date'] = df_aji_pre['date'].astype(int)

url23 = 'https://www.football-lab.jp/ka-f/match/'
dfs23 = pd.read_html(url23)

# Tag every row with its season label.
res23 = pd.DataFrame([['S2023']] * len(dfs23[0])).join(dfs23)
df = res23
df = df.rename(columns={'会場': 'stadium', 0: 'year', '開催日': 'date', '観客数': 'audience'})
df = df.query('stadium=="等々力"').reset_index()
df = df.query('audience.notna()', engine='python').reset_index()
# .copy() so the column assignments below act on an independent frame
# instead of a slice of the filtered one.
df = df[['audience', 'year', 'date']].copy()

# Extract the four-digit year from the season label ("S2023" -> "2023").
df["year"] = df["year"].apply(lambda x: str(x)[1:5])

# Split the match date ("month.day") into month and day.
month_day = df['date'].str.split(pat='.', expand=True)
df['month'] = month_day[0]
df['day'] = month_day[1]

# Assemble a real datetime column from year / month / day.
df['date'] = pd.to_datetime({'year': df['year'], 'month': df['month'], 'day': df['day']})

# Sort by date, oldest first.
df = df.sort_values('date', ascending=True)
df['date_ymd'] = df['date'].dt.strftime('%Y%m%d')
df['date_ym'] = df['date'].dt.strftime('%Y%m')
df["date_ymd"] = df["date_ymd"].astype(int)
# Previous calendar day as a YYYYMMDD integer. Subtracting 1 from the
# integer form breaks on the first day of a month (20230301 - 1 == 20230300),
# so do the arithmetic on the datetime column instead.
df['date_before'] = (df['date'] - pd.Timedelta(days=1)).dt.strftime('%Y%m%d').astype(int)
df = df[['audience', 'date_ymd', 'date_before']].copy()
df['last_audience'] = df['audience'].shift(1)

# The most recent match supplies the "previous attendance" feature.
df_pre = df.tail(1).reset_index(drop=True)

# Put the latest match and the previous day's market data side by side.
df_aji_ft_pre = pd.concat([df_pre, df_aji_pre], axis=1)
df_aji_ft_pre = df_aji_ft_pre[['date_ymd', 'audience', 'low_price', 'center_price', 'high_price', 'quantity']]
df_aji_ft_pre = df_aji_ft_pre.rename(columns={'audience': 'last_audience'})
df_aji_ft_pre['last_audience'] = df_aji_ft_pre['last_audience'].astype(int)

# NOTE(review): the column order here presumably matches the feature columns
# of df_train.csv; verify against the training data.
pred = linear_regression.predict(df_aji_ft_pre)
df_aji_ft_pre['audience_pred'] = pred
df_aji_ft_pre['date_ymd'] = df_aji_ft_pre['date_ymd'].astype(int)
def outbreak(date):
    """Plot the training attendance series together with the prediction.

    Reads the module-level ``df_train``, ``df_aji_ft_pre`` and ``pred``.

    Args:
        date: Truthy to request the plot (wired to the checkbox in the UI).

    Returns:
        A matplotlib ``Figure`` when ``date`` is truthy, otherwise ``None``.
    """
    if not date:
        return None

    fig, ax = plt.subplots()
    ax.plot(df_train['date_ymd'], df_train['audience'], label='original')
    ax.plot(df_aji_ft_pre['date_ymd'], df_aji_ft_pre['audience_pred'], '*', label='predict')
    ax.set_title(f"prediction of audince 「today prediction value : {pred}」")
    ax.set_ylabel("audience")
    ax.set_xlabel("Days")
    ax.legend()
    # Unregister the figure from pyplot so repeated clicks do not pile up
    # open figures; the returned Figure object can still be rendered.
    plt.close(fig)
    return fig
# Markdown shown at the top of the page (project summary, data sources,
# prediction logic, model features and a caveat about missing data).
_APP_DESCRIPTION = """
# 川崎フロンターレの観客動員数の予測
川崎フロンターレの等々力陸上競技場での試合の観客数を「あじ」の価格をもとに予測する。
## 使用データ
* 東京卸売市場日報
* Football Lab
## 予測ロジック
観客動員数は雨天か否かで左右されると考えられる。そこで雨天の可能性をあじの価格を利用し表した。
一般的に雨天の場合、低気圧の影響で海面が上昇し漁に出ることが難しくなる。
そのため漁獲量が減少し、あじの価格が上昇すると考えられる。
## モデルについて
モデル名:sklearn
特徴量:予測日前日のあじの高値、予測日前日のあじの中値、予測日前日のあじの安値、
予測日前日のあじの卸売数量、等々力競技場での川崎フロンターレの前回試合の観客数
## 注意点
予測日前日のあじのデータがない場合はErrorとなります。
"""

# Page layout: description on top, then a row with the controls on the left
# and the resulting plot on the right.
with gr.Blocks() as demo:
    gr.Markdown(_APP_DESCRIPTION)
    with gr.Row():
        with gr.Column():
            date_input = gr.Checkbox(label='Do you want to predict audiences?')
            prediction_btn = gr.Button(value="predict")
        with gr.Column():
            prediction = gr.Plot(label="時系列プロット")
    # Clicking the button passes the checkbox state to `outbreak` and shows
    # the returned figure in the plot component.
    prediction_btn.click(fn=outbreak, inputs=date_input, outputs=prediction)

demo.launch()