NaokiOkamoto's picture
Upload 6 files
a4b2e63
raw
history blame
No virus
3.13 kB
import codecs
import io
import random
import requests
import time
from datetime import date, timedelta
from tqdm import tqdm
from typing import Generator, Tuple
import numpy as np
import pandas as pd
def date_range(
    start: date, stop: date, step: timedelta = timedelta(1)
) -> Generator[date, None, None]:
    """Yield dates from ``start`` up to, but not including, ``stop``.

    Args:
        start: First date to yield.
        stop: Exclusive upper bound; iteration ends as soon as the current
            date is no longer earlier than ``stop``.
        step: Increment added after each yielded date. Must be positive.
            Defaults to one day.

    Yields:
        ``start``, ``start + step``, ``start + 2 * step``, ... for as long
        as the value is earlier than ``stop``.

    Raises:
        ValueError: If ``step`` is zero or negative. Because this is a
            generator, the error surfaces when iteration begins, not when
            ``date_range`` is called.
    """
    # A zero step would yield ``start`` forever and a negative step would
    # walk away from ``stop``; reject both instead of looping endlessly.
    if step <= timedelta(0):
        raise ValueError(f"step must be a positive timedelta, got {step!r}")

    current = start
    while current < stop:
        yield current
        current += step
def get_url(download_date: date) -> Tuple[str, str]:
    """Build the CSV download URL for ``download_date``.

    Returns:
        A ``(url, day)`` pair, where ``day`` is the date formatted as
        ``YYYYMMDD`` and ``url`` embeds both the ``YYYYMM`` month and
        that day string in its path.
    """
    day = download_date.strftime("%Y%m%d")
    month = download_date.strftime("%Y%m")
    url = f"https://www.shijou-nippo.metro.tokyo.lg.jp/SN/{month}/{day}/Sui/Sui_K1.csv"
    return url, day
def content_wrap(content):
    """Drop everything before the header line and return the rest as text.

    ``content`` is raw bytes. Each line is decoded as Shift-JIS; lines are
    discarded until the first one containing "品名" (the header row), and
    that line plus all following lines are returned in an ``io.StringIO``.
    If no line contains the header marker the returned buffer is empty.
    """
    kept = []
    for raw_line in io.BytesIO(content):
        text = codecs.decode(raw_line, "shift-jis")
        # Once the header has been stored, every later line is kept as-is.
        if kept or "品名" in text:
            kept.append(text)
    return io.StringIO("".join(kept))
def to_numeric(x):
    """Convert ``x`` to ``float`` when it is a string; otherwise return it unchanged."""
    return float(x) if isinstance(x, str) else x
def _build_daily_row(df, day, use_fish_list):
    """Collect one day's wholesale quantities into a ``column -> value`` dict."""
    row = {'date': day}
    for fish in use_fish_list:
        # Sum every CSV row whose 品名 (item name) equals this fish. An item
        # with no matching rows sums to 0, so no special case is needed.
        row[f'{fish}_卸売数量計(kg)'] = df.loc[df["品名"] == fish, '卸売数量計'].sum()
    # The last non-null 卸売数量計 value is taken as the day's overall total.
    # NOTE(review): presumably the CSV ends with a grand-total row - confirm
    # against the published file format.
    row['全卸売数量計(kg)'] = df['卸売数量計'].dropna().iloc[-1]
    return row


def get_fish_price_data(
    start_date: date, end_date: date, use_fish_list, *, timeout: float = 30.0
) -> pd.DataFrame:
    """Download daily wholesale quantities from the Tokyo market daily report.

    One CSV is requested per day in ``[start_date, end_date)``. Days whose
    URL answers 404 are skipped. After each successfully processed day the
    function sleeps for a short, randomised interval before the next request.

    Args:
        start_date: First day to download (inclusive).
        end_date: Day at which to stop (exclusive).
        use_fish_list: Item names (values of the CSV's 品名 column) to
            extract. It is iterated once per downloaded day, so pass a
            re-iterable collection such as a list, not a one-shot iterator.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with one row per downloaded day and the columns
        ``date``, ``<item>_卸売数量計(kg)`` for each requested item, and
        ``全卸売数量計(kg)``. Rows are in date order; the index is not reset,
        so every row keeps index label 0.

    Raises:
        AssertionError: If a response status is neither 200 nor 404.
    """
    columns = ['date'] + [i + '_卸売数量計(kg)' for i in use_fish_list] + ['全卸売数量計(kg)']
    # Empty frame that fixes the column set even when no day could be fetched.
    fish_qty_df = pd.DataFrame(columns=columns)
    daily_frames = []

    iterator = tqdm(
        date_range(start_date, end_date), total=(end_date - start_date).days
    )
    for download_date in iterator:
        url, day = get_url(download_date)
        iterator.set_description(day)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=timeout)

        # No report exists for this day.
        if response.status_code == 404:
            continue
        # Raised explicitly (not via ``assert``) so the check still runs
        # under ``python -O``; the exception type is unchanged for callers.
        if response.status_code != 200:
            raise AssertionError(
                f"Unexpected HTTP response. Please check the website {url}."
            )

        df = pd.read_csv(content_wrap(response.content))
        daily_frames.append(
            pd.DataFrame([_build_daily_row(df, day, use_fish_list)])
        )

        # Randomised pause between requests, never shorter than 0.1 s.
        time.sleep(max(0.5 + random.normalvariate(0, 0.3), 0.1))

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat([fish_qty_df, *daily_frames])