penut85420 committed on
Commit
877c973
1 Parent(s): 773c0a4

First Commit

Browse files
Files changed (1) hide show
  1. app.py +96 -0
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import requests
6
+ from bs4 import BeautifulSoup as BS
7
+ from bs4.element import Tag
8
+
9
+
10
class App:
    """Gradio UI wrapping the Yuyu Tei price crawler.

    Presents a URL textbox and a submit button; on submit the listing page
    is crawled and the resulting Excel file is offered for download.
    """

    def __init__(self) -> None:
        # Build the Blocks layout: URL input + submit on the left,
        # generated Excel file + status message on the right.
        with gr.Blocks() as self.demo:
            gr.Markdown("# Yuyu Tei Price Crawler")
            with gr.Row():
                with gr.Column():
                    ph = "e.g. https://yuyu-tei.jp/game_ws/sell/sell_price.php?ver=uma&menu=newest"
                    self.input_url = gr.Textbox(label="URL", placeholder=ph)
                    self.submit_btn = gr.Button("Submit")
                with gr.Column():
                    self.output_file = gr.File(label="Excel", file_count="single")
                    self.status = gr.Markdown("Ready")

            self.submit_btn.click(
                self.Download,
                self.input_url,
                [self.output_file, self.status],
            )

    def Download(self, url):
        """Crawl *url* and return ``(excel_path, status_message)``.

        On failure returns ``(None, "Error: ...")`` so the failure surfaces
        as a status message in the UI instead of an unhandled traceback
        inside the Gradio callback.
        """
        try:
            # Timestamped output filename in UTC+8 (apparently Taiwan local
            # time). Timezone-aware now() replaces the deprecated
            # datetime.utcnow() while producing the same wall-clock values.
            ts = datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=8)
            output_path = ts.strftime("%Y%m%d_%H%M%S.xlsx")
            CrawlPage(url, output_path)
            return output_path, "Success"
        except Exception as e:
            # Broad catch is deliberate: this is the UI boundary and any
            # crawl/parse/network error should be shown to the user.
            return None, f"Error: {e}"

    def Launch(self):
        """Start the Gradio server (blocking)."""
        self.demo.launch()
40
+
41
+
42
def CrawlPage(url, output_path):
    """Fetch a Yuyu Tei listing page and write all card rows to *output_path*.

    Each ``<li class="card_unit">`` element becomes one row (dict) in the
    resulting Excel file.
    """
    print(f"Visiting {url}")
    # A timeout prevents a stalled request from hanging the UI forever, and
    # raise_for_status turns HTTP errors into exceptions instead of silently
    # parsing an error page; App.Download reports them to the user.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    bs = BS(res.text, features="html.parser")

    elems = bs.find_all("li", attrs={"class": "card_unit"})

    data = [IterElem(e) for e in elems]
    pd.DataFrame(data).to_excel(output_path, index=False)
50
+
51
+
52
def IterElem(e: Tag):
    """Extract one card's fields from a ``<li class="card_unit">`` element.

    Returns a dict keyed 卡號 (card id), 卡名 (card name), 價格 (price),
    稀有度 (rarity).
    """
    # Card id
    card_id = e.find_next("p", attrs={"class": "id"})
    card_id = card_id.text.strip()

    # Name source 1 - the title text, which the site may truncate with "..."
    card_name_elem = e.find_next("p", attrs={"class": "name"})
    card_name = card_name_elem.text.strip()

    # Name source 2 - the image alt text, but placeholder images carry
    # "NowPrinting" instead of the real card name
    card_alt_name = e.find_next("p", attrs={"class": "image"})
    card_alt_name = card_alt_name.find_next("img")
    card_alt_name = card_alt_name.get("alt")

    # Name source 3 - fetch the card's detail page. Slow, and too many
    # requests may get us banned as a bot, so only done when the alt text
    # is useless ("NowPrinting") AND the title was truncated.
    if card_name.endswith("...") and card_alt_name == "NowPrinting":
        url = card_name_elem.find_next("a").get("href")
        card_name = GetCardNameFromPage(url)

    card_name = card_name if card_alt_name == "NowPrinting" else card_alt_name

    # Price (drop the trailing 円 currency sign)
    price = e.find_next("p", attrs={"class": "price"})
    price = price.text.strip().strip("円")

    # Rarity: the second CSS class is "rarity_<RARITY>".
    # BUG FIX: the original .strip("rarity_") removes any of the CHARACTERS
    # r/a/i/t/y/_ from both ends (corrupting rarities containing them),
    # not the literal prefix — remove the exact prefix instead.
    rarity = e.get("class")[1]
    if rarity.startswith("rarity_"):
        rarity = rarity[len("rarity_"):]

    return {"卡號": card_id, "卡名": card_name, "價格": price, "稀有度": rarity}
83
+
84
+
85
def GetCardNameFromPage(url):
    """Fetch a card's detail page (site-relative *url*) and return its full name.

    The name is taken from the first ``<td>`` inside
    ``<div class="information_box">``.
    """
    url = f"https://yuyu-tei.jp{url}"
    print(f"Visiting {url}")
    # Timeout + status check: don't hang forever, and don't scrape an
    # HTTP error page as if it were card data.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    bs = BS(res.text, features="html.parser")
    info_box = bs.find("div", attrs={"class": "information_box"})
    card_name = info_box.find("td")
    return card_name.text.strip()
93
+
94
+
95
if __name__ == "__main__":
    # Script entry point: build the Gradio UI and start serving.
    app = App()
    app.Launch()