"""Gradio front end that scrapes a Yuyu Tei price-list page into an Excel file."""

import datetime
from urllib.parse import urljoin

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from bs4.element import Tag

# Host that relative card-detail links are resolved against.
BASE_URL = "https://yuyu-tei.jp"
# Seconds to wait for the server before giving up, so the UI never hangs.
REQUEST_TIMEOUT = 30
# Output filenames are stamped with a fixed UTC+8 wall-clock time.
OUTPUT_TZ = datetime.timezone(datetime.timedelta(hours=8))
# CSS class prefix that carries the rarity on each card element.
RARITY_PREFIX = "rarity_"
# Alt text the site uses for cards whose image is not available yet.
PLACEHOLDER_ALT = "NowPrinting"


class App:
    """Gradio UI: takes a price-list URL and offers the crawled Excel file."""

    def __init__(self) -> None:
        """Build the Blocks layout and wire the submit button to `Download`."""
        with gr.Blocks() as self.demo:
            gr.Markdown("# Yuyu Tei Price Crawler")
            with gr.Row():
                with gr.Column():
                    ph = "e.g. https://yuyu-tei.jp/game_ws/sell/sell_price.php?ver=uma&menu=newest"
                    self.input_url = gr.Textbox(label="URL", placeholder=ph)
                    self.submit_btn = gr.Button("Submit")
                with gr.Column():
                    self.output_file = gr.File(label="Excel", file_count="single")
                    self.status = gr.Markdown("Ready")
            self.submit_btn.click(
                self.Download,
                self.input_url,
                [self.output_file, self.status],
            )

    def Download(self, url):
        """Crawl *url* into a timestamped ``.xlsx`` file.

        Returns:
            ``(output_path, "Success")`` on success, or
            ``(None, "Error: <message>")`` if anything goes wrong.
        """
        try:
            # Aware "now" in UTC+8; replaces the deprecated, naive utcnow().
            ts = datetime.datetime.now(OUTPUT_TZ)
            output_path = ts.strftime("%Y%m%d_%H%M%S.xlsx")
            CrawlPage(url, output_path)
            return output_path, "Success"
        except Exception as e:
            # UI boundary: surface any failure in the status box instead of
            # crashing the Gradio callback.
            return None, f"Error: {e}"

    def Launch(self):
        """Start the Gradio server."""
        self.demo.launch()


def CrawlPage(url, output_path):
    """Fetch the listing at *url* and write one row per card to *output_path*.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status (so an error page is never parsed as a card list).
    """
    print(f"Visiting {url}")
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    bs = BS(res.text, features="html.parser")
    elems = bs.find_all("li", attrs={"class": "card_unit"})
    data = [IterElem(e) for e in elems]
    pd.DataFrame(data).to_excel(output_path, index=False)


def _ExtractRarity(e: Tag) -> str:
    """Return the rarity encoded in *e*'s ``rarity_<X>`` CSS class, or ``""``."""
    for cls in e.get("class") or []:
        if cls.startswith(RARITY_PREFIX):
            # Slice the prefix off. str.strip("rarity_") would remove any of
            # the characters r/a/i/t/y/_ from BOTH ends and corrupt values
            # such as "rarity_pr".
            return cls[len(RARITY_PREFIX):]
    return ""


def IterElem(e: Tag) -> dict:
    """Extract card id, name, price and rarity from one ``li.card_unit`` element.

    NOTE(review): ``find_next`` searches forward in document order and is not
    limited to descendants of *e*; if a card lacks one of these fields, the
    next card's value would be picked up. Confirm against the page structure
    before switching to ``e.find``.
    """
    # Card id
    card_id = e.find_next("p", attrs={"class": "id"})
    card_id = card_id.text.strip()

    # Card name, source 1: the listing title. May be truncated with "...".
    card_name_elem = e.find_next("p", attrs={"class": "name"})
    card_name = card_name_elem.text.strip()

    # Card name, source 2: the image alt text. Some images use the
    # placeholder alt "NowPrinting", which is not the real card name.
    card_alt_name = e.find_next("p", attrs={"class": "image"})
    card_alt_name = card_alt_name.find_next("img")
    card_alt_name = card_alt_name.get("alt")

    # Card name, source 3: the card's detail page. Slower, and too many
    # requests may get the client banned as a bot, so it is only fetched when
    # the alt text is the placeholder AND the listing title is truncated.
    if card_name.endswith("...") and card_alt_name == PLACEHOLDER_ALT:
        url = card_name_elem.find_next("a").get("href")
        card_name = GetCardNameFromPage(url)

    # Prefer the alt text unless it is the placeholder.
    card_name = card_name if card_alt_name == PLACEHOLDER_ALT else card_alt_name

    # Price, with the yen sign removed.
    price = e.find_next("p", attrs={"class": "price"})
    price = price.text.strip().strip("円")

    # Rarity
    rarity = _ExtractRarity(e)

    return {"卡號": card_id, "卡名": card_name, "價格": price, "稀有度": rarity}


def GetCardNameFromPage(url):
    """Fetch a card's detail page and return the full card name.

    Args:
        url: href taken from the listing; resolved against ``BASE_URL``.

    Raises:
        requests.RequestException: on network failure, timeout, or HTTP error.
        ValueError: if the page has no ``div.information_box`` with a ``td``.
    """
    # urljoin handles "/path", "path" and already-absolute hrefs alike.
    url = urljoin(BASE_URL, url)
    print(f"Visiting {url}")
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    bs = BS(res.text, features="html.parser")
    info_box = bs.find("div", attrs={"class": "information_box"})
    card_name = info_box.find("td") if info_box is not None else None
    if card_name is None:
        raise ValueError(f"Card name not found on {url}")
    return card_name.text.strip()


if __name__ == "__main__":
    App().Launch()