import datetime
import json

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from bs4.element import Tag
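
# Gradio front end for a small scraper: the user submits a yuyu-tei.jp listing
# URL (or picks a bookmarked one) and gets back an Excel file with each card's
# number, name, price, and rarity.
# Requires gradio, pandas, requests, and beautifulsoup4; pandas' .xlsx export
# additionally needs an engine such as openpyxl.
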
class App:
    def __init__(self) -> None:
        with open("bookmarks.json", "rt", encoding="UTF-8") as fp:
            self.bookmark_info = json.load(fp)
        with open("Intro.md", "rt", encoding="UTF-8") as fp:
            intro = fp.read()

        theme = gr.themes.Soft()
        with gr.Blocks(title="Yuyu Tei Crawler", theme=theme) as self.demo:
            gr.Markdown(intro)
            with gr.Row(equal_height=False):
                self.__CreateColumns__()
            self.__RegisterEvents__()
    def __CreateColumns__(self):
        with gr.Column():
            ph = "e.g. https://yuyu-tei.jp/game_ws/sell/sell_price.php?ver=uma&menu=newest"
            self.input_url = gr.Textbox(label="URL", placeholder=ph)
            self.submit_btn = gr.Button("Submit")
            self.__CreateBookmarks__()
        with gr.Column():
            self.output_file = gr.File(label="Result", file_count="single")
            self.status = gr.Textbox("Ready", label="Status", interactive=False)

    def __CreateBookmarks__(self):
        with gr.Tab("Bookmarks"):
            with gr.Row():
                self.bookmarks = {name: gr.Button(value=name) for name in self.bookmark_info}
    def __RegisterEvents__(self):
        args_submit = KwargsToDict(
            fn=self.Download,
            inputs=self.input_url,
            outputs=[self.output_file, self.status],
        )
        self.submit_btn.click(**args_submit)
        self.input_url.submit(**args_submit)

        def GetArgsBookmark(name):
            return KwargsToDict(
                fn=self.ClickBookmark,
                inputs=self.bookmarks[name],
                outputs=self.input_url,
                show_progress=False,
            )

        for name in self.bookmarks:
            args_bookmark = GetArgsBookmark(name)
            self.bookmarks[name].click(**args_bookmark)

    def ClickBookmark(self, name):
        # A bookmark button used as an input passes its label (the bookmark name),
        # which is looked up here to fill the URL textbox.
        return self.bookmark_info[name]
    def Download(self, url):
        try:
            # Timestamp in UTC+8, used as the output filename.
            ts = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
            output_path = ts.strftime("%Y%m%d_%H%M%S.xlsx")
            CrawlPage(url, output_path)
            return output_path, "Success"
        except Exception as e:
            return None, f"Error: {e}"

    def Launch(self):
        self.demo.launch(favicon_path="icon.png")

def CrawlPage(url, output_path):
    print(f"Visiting {url}")
    res = requests.get(url)
    print(f"Status: {res.status_code}")
    bs = BS(res.text, features="html.parser")
    elems = bs.find_all("li", attrs={"class": "card_unit"})
    data = [IterElem(e) for e in elems]
    pd.DataFrame(data).to_excel(output_path, index=False)
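
# Hypothetical standalone usage without the UI (the output filename is illustrative;
# the URL is the example from the textbox placeholder):
# CrawlPage(
#     "https://yuyu-tei.jp/game_ws/sell/sell_price.php?ver=uma&menu=newest",
#     "cards.xlsx",
# )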

def IterElem(e: Tag):
    # Card number
    card_id = e.find_next("p", attrs={"class": "id"})
    card_id = card_id.text.strip()
    # Card name, source 1: the listing title, which may be truncated
    card_name_elem = e.find_next("p", attrs={"class": "name"})
    card_name = card_name_elem.text.strip()
    # Card name, source 2: the image's alt text, but some images carry the
    # placeholder "NowPrinting" instead of the actual card name
    card_alt_name = e.find_next("p", attrs={"class": "image"})
    card_alt_name = card_alt_name.find_next("img")
    card_alt_name = card_alt_name.get("alt")
    # Card name, source 3: scrape the card's detail page. This is slower, and too
    # many requests risk getting flagged as a bot and banned, so it is only done
    # when the alt text is "NowPrinting" and the title has been truncated.
    if card_name.endswith("...") and card_alt_name == "NowPrinting":
        url = card_name_elem.find_next("a").get("href")
        card_name = GetCardNameFromPage(url)
    card_name = card_name if card_alt_name == "NowPrinting" else card_alt_name
    # Price (strip the trailing "円" yen sign)
    price = e.find_next("p", attrs={"class": "price"})
    price = price.text.strip().strip("円")
    # Rarity, taken from the element's second CSS class ("rarity_<RARITY>")
    rarity = e.get("class")[1].strip("rarity_")
    # Column headers: card number, card name, price, rarity
    return {"卡號": card_id, "卡名": card_name, "價格": price, "稀有度": rarity}

def GetCardNameFromPage(url):
    url = f"https://yuyu-tei.jp{url}"
    print(f"Visiting {url}")
    res = requests.get(url)
    print(f"Status: {res.status_code}")
    bs = BS(res.text, features="html.parser")
    info_box = bs.find("div", attrs={"class": "information_box"})
    card_name = info_box.find("td")
    return card_name.text.strip()

def KwargsToDict(**kwargs):
    # Small helper: packs keyword arguments into a dict so the same event
    # configuration can be reused across several listeners.
    return kwargs


if __name__ == "__main__":
    App().Launch()