File size: 3,128 Bytes
a4b2e63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import codecs
import io
import random
import requests
import time
from datetime import date, timedelta
from tqdm import tqdm
from typing import Generator, Tuple

import numpy as np
import pandas as pd


def date_range(
    start: date, stop: date, step: timedelta = timedelta(1)
) -> Generator[date, None, None]:
    """Yield dates from ``start`` (inclusive) up to ``stop`` (exclusive).

    :param start: first date to yield
    :param stop: upper bound; never yielded itself
    :param step: distance between consecutive dates (default: one day)
    :raises ValueError: if ``step`` is zero or negative. Because this is a
        generator, the error surfaces on the first iteration, not at call time.
    """
    # A zero step would never reach ``stop`` (infinite loop) and a negative
    # step would walk away from it, so reject both up front.
    if step <= timedelta(0):
        raise ValueError(f"step must be a positive timedelta, got {step!r}")
    current = start
    while current < stop:
        yield current
        current += step


def get_url(download_date: date) -> Tuple[str, str]:
    """Build the download URL for one day and return it with the day string.

    :param download_date: the date whose CSV should be downloaded
    :return: ``(url, day)`` where ``day`` is ``download_date`` formatted as
        ``YYYYMMDD``
    """
    # The URL path contains both the year-month and the full date.
    day = download_date.strftime("%Y%m%d")
    month = download_date.strftime("%Y%m")
    url = f"https://www.shijou-nippo.metro.tokyo.lg.jp/SN/{month}/{day}/Sui/Sui_K1.csv"
    return url, day


def content_wrap(content):
    """Return the text of ``content`` from its header row onward as a stream.

    ``content`` (bytes) is split into lines and every line is decoded as
    Shift-JIS. All lines before the first one containing ``品名`` are dropped;
    that line and everything after it are kept. If no line contains ``品名``
    the returned stream is empty.

    :param content: raw bytes of the downloaded file
    :return: ``io.StringIO`` positioned at the start of the kept text
    """
    decoded = [codecs.decode(raw, "shift-jis") for raw in io.BytesIO(content)]
    # Index of the header line; fall back to "past the end" so the slice
    # below is empty when no header is found.
    header_at = next(
        (idx for idx, text in enumerate(decoded) if "品名" in text),
        len(decoded),
    )
    return io.StringIO("".join(decoded[header_at:]))


def to_numeric(x):
    """Convert ``x`` to ``float`` when it is a string; otherwise return it as is."""
    return float(x) if isinstance(x, str) else x


def get_fish_price_data(
    start_date: date,
    end_date: date,
    use_fish_list,
    timeout: float = 30.0,
) -> pd.core.frame.DataFrame:
    """
    Download daily wholesale quantities from the Tokyo wholesale market site.

    For every date in ``[start_date, end_date)`` the day's CSV is fetched and
    turned into one output row. Dates for which the server answers 404 are
    skipped.

    :param start_date: first date to download (inclusive)
    :param end_date: date at which to stop (exclusive)
    :param use_fish_list: item names, matched exactly against the ``品名``
        column of each CSV
    :param timeout: seconds to wait for each HTTP request before giving up
    :return: DataFrame with a ``date`` column (``YYYYMMDD`` string), one
        ``<name>_卸売数量計(kg)`` column per requested item (sum of that item's
        ``卸売数量計`` values, 0 when the item is absent) and a
        ``全卸売数量計(kg)`` column. Every row keeps index label 0 because the
        per-day frames are concatenated without re-indexing.
    :raises AssertionError: if the server answers with a status other than
        200 or 404
    """
    # Materialise once: the names are iterated for the column list and again
    # for every downloaded day, which would exhaust a one-shot iterator.
    fish_names = list(use_fish_list)
    columns = ['date'] + [name + '_卸売数量計(kg)' for name in fish_names] + ['全卸売数量計(kg)']

    iterator = tqdm(
        date_range(start_date, end_date), total=(end_date - start_date).days
    )

    daily_frames = []
    for download_date in iterator:
        url, day = get_url(download_date)
        iterator.set_description(day)
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)

        # The URL does not exist for this date: nothing to record.
        if response.status_code == 404:
            continue
        # Raised explicitly (not via ``assert``) so the check survives ``-O``.
        if response.status_code != 200:
            raise AssertionError(
                f"Unexpected HTTP response. Please check the website {url}."
            )

        df = pd.read_csv(content_wrap(response.content))

        row = {'date': day}
        for name in fish_names:
            # Summing an empty selection already yields 0, so an absent item
            # needs no special case.
            row[f'{name}_卸売数量計(kg)'] = df.loc[df["品名"] == name, '卸売数量計'].sum()

        # Last non-missing value of the quantity column.
        # NOTE(review): presumably the grand-total row of the CSV - confirm
        # against the file layout.
        row['全卸売数量計(kg)'] = df[['卸売数量計']].dropna().values[-1][0]

        daily_frames.append(pd.DataFrame([row]))
        # Randomised pause (about 0.5 s, never below 0.1 s) after each
        # successfully processed day; 404 days skip it via ``continue`` above.
        time.sleep(max(0.5 + random.normalvariate(0, 0.3), 0.1))

    if not daily_frames:
        # Nothing was downloaded: return an empty frame with the expected columns.
        return pd.DataFrame(columns=columns)
    # One concat at the end instead of re-copying the accumulated frame on
    # every iteration.
    return pd.concat(daily_frames)