|
import codecs |
|
import io |
|
import random |
|
import requests |
|
import time |
|
from datetime import date, timedelta |
|
from tqdm import tqdm |
|
from typing import Generator, Tuple |
|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
|
|
def date_range(
    start: date, stop: date, step: timedelta = timedelta(1)
) -> Generator[date, None, None]:
    """Yield dates from ``start`` (inclusive) up to ``stop`` (exclusive).

    :param start: first date to yield
    :param stop: exclusive upper bound; nothing is yielded when
        ``start >= stop``
    :param step: distance between consecutive dates (one day by default)
    :raises ValueError: on first iteration, when ``start < stop`` but
        ``step`` is zero or negative, because such a range could never
        reach ``stop`` and would loop forever
    """
    # Only reject the combinations that cannot terminate; an already-empty
    # range stays empty whatever the step is.
    if start < stop and step <= timedelta(0):
        raise ValueError(
            f"step must be a positive timedelta to reach stop, got {step!r}"
        )

    current = start
    while current < stop:
        yield current
        current += step
|
|
|
|
|
def get_url(download_date: date) -> Tuple[str, str]:
    """Build the CSV download URL for a date.

    :param download_date: the date whose report should be fetched
    :return: ``(url, day)`` where ``day`` is the date formatted as
        ``YYYYMMDD`` and ``url`` embeds both the ``YYYYMM`` month and
        that ``YYYYMMDD`` day
    """
    # A non-empty format spec on a date is forwarded to strftime.
    day = format(download_date, "%Y%m%d")
    month = format(download_date, "%Y%m")
    url = f"https://www.shijou-nippo.metro.tokyo.lg.jp/SN/{month}/{day}/Sui/Sui_K1.csv"
    return url, day
|
|
|
|
|
def content_wrap(content, header_keyword="品名", encoding="shift-jis"):
    """Decode downloaded CSV bytes and drop everything before the header row.

    The bytes are read line by line, each line is decoded with
    ``encoding``, and lines are discarded until the first one that
    contains ``header_keyword``. That line and every line after it are
    kept unchanged (line endings included).

    :param content: raw bytes of the downloaded file
    :param header_keyword: text that identifies the header line
    :param encoding: codec used to decode each line
    :return: an ``io.StringIO`` holding the header line and all following
        lines; it is empty when no line contains ``header_keyword``
    :raises UnicodeDecodeError: if any line, including a skipped one,
        cannot be decoded with ``encoding``
    """
    kept_lines = []
    header_seen = False
    for raw_line in io.BytesIO(content):
        text = codecs.decode(raw_line, encoding)
        if not header_seen:
            if header_keyword not in text:
                continue
            header_seen = True
        kept_lines.append(text)
    # Join once at the end instead of growing a string line by line.
    return io.StringIO("".join(kept_lines))
|
|
|
|
|
def to_numeric(x):
    """Return ``float(x)`` when ``x`` is a string; otherwise return ``x`` as is."""
    return float(x) if isinstance(x, str) else x
|
|
|
|
|
def get_fish_price_data(
    start_date: date,
    end_date: date,
    use_fish_list,
    *,
    timeout: float = 30.0,
) -> pd.DataFrame:
    """Download daily wholesale quantities from the Tokyo wholesale market site.

    One CSV is requested for every day in ``[start_date, end_date)``. Days
    for which the server answers 404 are skipped. For each downloaded day
    the ``卸売数量計`` values of the rows whose ``品名`` equals each
    requested item are summed.

    :param start_date: first date to download (inclusive)
    :param end_date: date at which to stop (exclusive)
    :param use_fish_list: item names to match against the ``品名`` column
    :param timeout: seconds passed to ``requests.get`` as its timeout
    :return: DataFrame with one row per downloaded day and the columns
        ``date`` (``YYYYMMDD`` string), ``<item>_卸売数量計(kg)`` for each
        requested item, and ``全卸売数量計(kg)``; rows are indexed
        0..n-1 in download order
    :raises AssertionError: if the server answers with a status other
        than 200 or 404
    """
    # Materialise once so a one-shot iterable can be reused for every day.
    use_fish_list = list(use_fish_list)
    fish_columns = {fish: fish + '_卸売数量計(kg)' for fish in use_fish_list}
    columns = ['date'] + list(fish_columns.values()) + ['全卸売数量計(kg)']

    iterator = tqdm(
        date_range(start_date, end_date), total=(end_date - start_date).days
    )

    rows = []
    for download_date in iterator:
        url, day = get_url(download_date)
        iterator.set_description(day)
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)

        if response.status_code == 404:
            # No report published for this day.
            continue
        if response.status_code != 200:
            # Raised explicitly (not via ``assert``) so the check survives
            # ``python -O``; the exception type and message are unchanged.
            raise AssertionError(
                f"Unexpected HTTP response. Please check the website {url}."
            )

        df = pd.read_csv(content_wrap(response.content))

        row = {'date': day}
        for fish, column in fish_columns.items():
            # Summing an empty selection gives 0, so an item missing from
            # the day's report needs no special case.
            row[column] = df.loc[df["品名"] == fish, '卸売数量計'].sum()

        # Last non-null value of the quantity column; presumably the
        # report's grand-total row -- TODO confirm against the CSV layout.
        row['全卸売数量計(kg)'] = df['卸売数量計'].dropna().iloc[-1]
        rows.append(row)

        # Randomised pause (about 0.5 s, never below 0.1 s) after each
        # successful download.
        time.sleep(max(0.5 + random.normalvariate(0, 0.3), 0.1))

    # Build the frame once rather than concatenating inside the loop.
    return pd.DataFrame(rows, columns=columns)
|
|