Spaces:
Sleeping
Sleeping
| import datetime | |
| import json | |
| import gradio as gr | |
| import pandas as pd | |
| import requests | |
| from bs4 import BeautifulSoup as BS | |
| from bs4.element import Tag | |
class App:
    """Gradio front-end for the Yuyu Tei card-list crawler.

    The constructor reads ``bookmarks.json`` (a mapping of bookmark name to
    URL) and ``Intro.md`` from the working directory, builds the Gradio
    ``Blocks`` layout and wires up the event handlers. Call :meth:`Launch`
    to start serving the UI.
    """

    # Fixed UTC+8 offset used to timestamp output files.
    _OUTPUT_TZ = datetime.timezone(datetime.timedelta(hours=8))

    def __init__(self) -> None:
        with open("bookmarks.json", "rt", encoding="UTF-8") as fp:
            self.bookmark_info = json.load(fp)

        with open("Intro.md", "rt", encoding="UTF-8") as fp:
            intro = fp.read()

        theme = gr.themes.Soft()
        with gr.Blocks(title="Yuyu Tei Crawler", theme=theme) as self.demo:
            gr.Markdown(intro)
            with gr.Row(equal_height=False):
                self.__CreateColumns__()
            # Events are registered while the Blocks context is still open.
            self.__RegisterEvents__()

    def __CreateColumns__(self):
        """Create the input column (URL, submit, bookmarks) and output column."""
        with gr.Column():
            ph = "e.g. https://yuyu-tei.jp/game_ws/sell/sell_price.php?ver=uma&menu=newest"
            self.input_url = gr.Textbox(label="URL", placeholder=ph)
            self.submit_btn = gr.Button("Submit")
            self.__CreateBookmarks__()

        with gr.Column():
            self.output_file = gr.File(label="Result", file_count="single")
            self.status = gr.Textbox("Ready", label="Status", interactive=False)

    def __CreateBookmarks__(self):
        """Create one button per entry of ``self.bookmark_info``."""
        with gr.Tab("Bookmarks"):
            with gr.Row():
                self.bookmarks = {name: gr.Button(value=name) for name in self.bookmark_info}

    def __RegisterEvents__(self):
        """Bind the submit actions and the bookmark buttons to their handlers."""
        # The button click and pressing Enter in the textbox share one binding.
        args_submit = KwargsToDict(
            fn=self.Download,
            inputs=self.input_url,
            outputs=[self.output_file, self.status],
        )
        self.submit_btn.click(**args_submit)
        self.input_url.submit(**args_submit)

        for name, button in self.bookmarks.items():
            # The button itself is the input, so the handler receives its
            # value (the bookmark name) and returns the URL for the textbox.
            args_bookmark = KwargsToDict(
                fn=self.ClickBookmark,
                inputs=button,
                outputs=self.input_url,
                show_progress=False,
            )
            button.click(**args_bookmark)

    def ClickBookmark(self, name):
        """Return the URL stored for the bookmark called ``name``.

        Raises:
            KeyError: If ``name`` is not a key of ``self.bookmark_info``.
        """
        return self.bookmark_info[name]

    def Download(self, url):
        """Crawl ``url`` and save the result to a timestamped ``.xlsx`` file.

        Args:
            url: Address of the listing page to crawl. Surrounding whitespace
                is ignored.

        Returns:
            ``(output_path, "Success")`` when the crawl completes, or
            ``(None, "Error: <message>")`` when anything goes wrong.
        """
        try:
            # Aware "now" at UTC+8; yields the same file name as the
            # deprecated ``utcnow() + timedelta(hours=8)``.
            ts = datetime.datetime.now(self._OUTPUT_TZ)
            output_path = ts.strftime("%Y%m%d_%H%M%S.xlsx")
            CrawlPage((url or "").strip(), output_path)
            return output_path, "Success"
        except Exception as e:
            # UI boundary: report every failure in the status box instead of
            # letting it propagate into Gradio.
            return None, f"Error: {e}"

    def Launch(self):
        """Start the Gradio server, using ``icon.png`` as the favicon."""
        self.demo.launch(favicon_path="icon.png")
def CrawlPage(url, output_path, timeout=30):
    """Download a card listing page and export its cards to an Excel file.

    Args:
        url: Address of the listing page.
        output_path: Destination path of the ``.xlsx`` file.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    print(f"Visiting {url}")
    res = requests.get(url, timeout=timeout)
    print(f"Status: {res.status_code}")
    # Fail loudly on an HTTP error rather than parsing the error page into an
    # empty spreadsheet.
    res.raise_for_status()

    bs = BS(res.text, features="html.parser")
    elems = bs.find_all("li", attrs={"class": "card_unit"})
    data = [IterElem(e) for e in elems]
    pd.DataFrame(data).to_excel(output_path, index=False)
def IterElem(e: Tag):
    """Extract one card's fields from its ``<li class="card_unit ...">`` element.

    Args:
        e: The list-item element of a single card.

    Returns:
        A dict with the card number, card name, price and rarity.
    """
    # NOTE(review): find_next() searches forward through the rest of the
    # document, not only inside ``e`` - confirm every card carries all fields.

    # Card number.
    card_id = e.find_next("p", attrs={"class": "id"}).text.strip()

    # Card name, source 1: the title text, which may be truncated with "...".
    card_name_elem = e.find_next("p", attrs={"class": "name"})
    card_name = card_name_elem.text.strip()

    # Card name, source 2: the image's alt text. Some images carry the
    # placeholder "NowPrinting" instead of the real card name.
    image_elem = e.find_next("p", attrs={"class": "image"})
    card_alt_name = image_elem.find_next("img").get("alt")

    if card_alt_name != "NowPrinting":
        card_name = card_alt_name
    elif card_name.endswith("..."):
        # Card name, source 3: the card's detail page. This is slower and too
        # many requests may get the crawler banned as a bot, so it is only
        # used when the alt text is a placeholder and the title is truncated.
        url = card_name_elem.find_next("a").get("href")
        card_name = GetCardNameFromPage(url)

    # Price, without the trailing yen sign.
    price = e.find_next("p", attrs={"class": "price"})
    price = price.text.strip().strip("円")

    # Rarity: taken from the element's second CSS class, e.g. "rarity_RR".
    # Remove the prefix explicitly; str.strip("rarity_") would strip a
    # character set from both ends and could eat part of the rarity itself.
    rarity = e.get("class")[1]
    prefix = "rarity_"
    if rarity.startswith(prefix):
        rarity = rarity[len(prefix):]

    return {"卡號": card_id, "卡名": card_name, "價格": price, "稀有度": rarity}
def GetCardNameFromPage(url, timeout=30):
    """Fetch a card's detail page and return the card name shown on it.

    Args:
        url: Site-relative path of the detail page (appended to
            ``https://yuyu-tei.jp``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stripped text of the first ``<td>`` inside the page's
        ``information_box`` block.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
        ValueError: If the page does not contain the expected elements.
    """
    url = f"https://yuyu-tei.jp{url}"
    print(f"Visiting {url}")
    res = requests.get(url, timeout=timeout)
    print(f"Status: {res.status_code}")
    res.raise_for_status()

    bs = BS(res.text, features="html.parser")
    info_box = bs.find("div", attrs={"class": "information_box"})
    card_name = info_box.find("td") if info_box is not None else None
    if card_name is None:
        # Give a readable message instead of an AttributeError on None.
        raise ValueError(f"Card name not found on page: {url}")
    return card_name.text.strip()
def KwargsToDict(**kwargs):
    """Collect the given keyword arguments into a plain dict.

    Lets a set of call arguments be written once in keyword form and then
    reused with ``**`` unpacking.
    """
    collected = dict(kwargs)
    return collected
if __name__ == "__main__":
    # Build the UI and start serving it when the file is run as a script.
    app = App()
    app.Launch()