# YuyuTeiCrawler / app.py
# Author: Penut Chen
# Commit: 5bd4136 ("Add Bookmarks")
import datetime
import json
import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from bs4.element import Tag
class App:
    """Gradio front-end for the Yuyu Tei card-price crawler.

    Builds a two-column UI: a URL input with bookmark shortcut buttons on the
    left, and the resulting .xlsx download plus a status line on the right.
    """

    def __init__(self) -> None:
        # Bookmark name -> URL mapping, rendered as quick-access buttons.
        with open("bookmarks.json", "rt", encoding="UTF-8") as fp:
            self.bookmark_info = json.load(fp)
        # Markdown intro shown at the top of the page.
        with open("Intro.md", "rt", encoding="UTF-8") as fp:
            intro = fp.read()
        theme = gr.themes.Soft()
        with gr.Blocks(title="Yuyu Tei Crawler", theme=theme) as self.demo:
            gr.Markdown(intro)
            with gr.Row(equal_height=False):
                self.__CreateColumns__()
            self.__RegisterEvents__()

    def __CreateColumns__(self):
        """Build the input column (URL box, submit, bookmarks) and the output column."""
        with gr.Column():
            ph = "e.g. https://yuyu-tei.jp/game_ws/sell/sell_price.php?ver=uma&menu=newest"
            self.input_url = gr.Textbox(label="URL", placeholder=ph)
            self.submit_btn = gr.Button("Submit")
            self.__CreateBookmarks__()
        with gr.Column():
            self.output_file = gr.File(label="Result", file_count="single")
            self.status = gr.Textbox("Ready", label="Status", interactive=False)

    def __CreateBookmarks__(self):
        """Create one button per bookmark; clicking it fills the URL textbox."""
        with gr.Tab("Bookmarks"):
            with gr.Row():
                self.bookmarks = {name: gr.Button(value=name) for name in self.bookmark_info}

    def __RegisterEvents__(self):
        """Wire the submit button / Enter key to the crawl, and each bookmark to the URL box."""
        args_submit = KwargsToDict(
            fn=self.Download,
            inputs=self.input_url,
            outputs=[self.output_file, self.status],
        )
        self.submit_btn.click(**args_submit)
        self.input_url.submit(**args_submit)

        def GetArgsBookmark(name):
            # Bind per-bookmark click args; the button itself is the input, so
            # Gradio passes its label (the bookmark name) to ClickBookmark.
            return KwargsToDict(
                fn=self.ClickBookmark,
                inputs=self.bookmarks[name],
                outputs=self.input_url,
                show_progress=False,
            )

        for name in self.bookmarks:
            args_bookmark = GetArgsBookmark(name)
            self.bookmarks[name].click(**args_bookmark)

    def ClickBookmark(self, name):
        """Return the URL stored for bookmark *name* (fills the URL textbox)."""
        return self.bookmark_info[name]

    def Download(self, url):
        """Crawl *url* and return ``(xlsx_path, status_message)`` for the UI.

        On any failure the exception text is surfaced in the status box and
        no file is produced.
        """
        try:
            # Timestamped output name in UTC+8 (site-local time, presumably —
            # confirm intended zone). utcnow() is deprecated and naive, so
            # derive the offset from an aware UTC datetime instead.
            now_utc = datetime.datetime.now(datetime.timezone.utc)
            ts = now_utc + datetime.timedelta(hours=8)
            output_path = ts.strftime("%Y%m%d_%H%M%S.xlsx")
            CrawlPage(url, output_path)
            return output_path, "Success"
        except Exception as e:
            # Broad catch is intentional: this is a UI callback boundary, so
            # report the error instead of crashing the app.
            return None, f"Error: {e}"

    def Launch(self):
        """Start the Gradio web server."""
        self.demo.launch(favicon_path="icon.png")
def CrawlPage(url, output_path, timeout=30):
    """Fetch a Yuyu Tei sell-price listing page and write the card data to xlsx.

    Parameters:
        url: full URL of the listing page.
        output_path: destination ``.xlsx`` path.
        timeout: per-request timeout in seconds (added; default keeps existing
            callers working).

    Raises:
        requests.HTTPError: on a non-2xx response — previously error pages
            were silently parsed, yielding an empty or garbage spreadsheet.
    """
    print(f"Visiting {url}")
    # timeout prevents the UI callback from hanging forever on a dead server.
    res = requests.get(url, timeout=timeout)
    print(f"Status: {res.status_code}")
    res.raise_for_status()
    bs = BS(res.text, features="html.parser")
    # Each card on the listing page is a <li class="card_unit">.
    elems = bs.find_all("li", attrs={"class": "card_unit"})
    data = [IterElem(e) for e in elems]
    pd.DataFrame(data).to_excel(output_path, index=False)
def IterElem(e: Tag):
    """Extract one card's fields from a ``<li class="card_unit">`` element.

    Returns a dict with Chinese keys (card ID, name, price, rarity) consumed
    directly as spreadsheet column headers.
    """
    # Card ID
    card_id = e.find_next("p", attrs={"class": "id"})
    card_id = card_id.text.strip()
    # Card name, source 1: the title text — may be truncated with "...".
    card_name_elem = e.find_next("p", attrs={"class": "name"})
    card_name = card_name_elem.text.strip()
    # Card name, source 2: the image alt text — but placeholder images carry
    # the literal alt "NowPrinting", which is not the real card name.
    card_alt_name = e.find_next("p", attrs={"class": "image"})
    card_alt_name = card_alt_name.find_next("img")
    card_alt_name = card_alt_name.get("alt")
    # Card name, source 3: the card's detail page. This is slow and too many
    # requests may get us banned as a bot, so only fetch it when the alt text
    # is the "NowPrinting" placeholder AND the title text was truncated.
    if card_name.endswith("...") and card_alt_name == "NowPrinting":
        url = card_name_elem.find_next("a").get("href")
        card_name = GetCardNameFromPage(url)
    # Prefer the alt text unless it is the placeholder.
    card_name = card_name if card_alt_name == "NowPrinting" else card_alt_name
    # Price: e.g. "120円" -> "120".
    price = e.find_next("p", attrs={"class": "price"})
    price = price.text.strip().strip("円")
    # Rarity: second CSS class, e.g. "rarity_RR" -> "RR".
    # BUG FIX: the original used .strip("rarity_"), which removes any of those
    # *characters* from both ends rather than the literal prefix, mangling any
    # rarity code that starts/ends with r/a/i/t/y/_ — strip the prefix instead.
    rarity = e.get("class")[1]
    if rarity.startswith("rarity_"):
        rarity = rarity[len("rarity_"):]
    return {"卡號": card_id, "卡名": card_name, "價格": price, "稀有度": rarity}
def GetCardNameFromPage(url, timeout=30):
    """Fetch a card's detail page and return its full (untruncated) name.

    Parameters:
        url: site-relative href taken from the listing page.
        timeout: per-request timeout in seconds (added; default keeps existing
            callers working).

    Raises:
        requests.HTTPError: on a non-2xx response — previously error pages
            were silently parsed.
    """
    url = f"https://yuyu-tei.jp{url}"
    print(f"Visiting {url}")
    # timeout prevents a single dead detail page from hanging the whole crawl.
    res = requests.get(url, timeout=timeout)
    print(f"Status: {res.status_code}")
    res.raise_for_status()
    bs = BS(res.text, features="html.parser")
    # The full name is the first <td> inside the information box.
    info_box = bs.find("div", attrs={"class": "information_box"})
    card_name = info_box.find("td")
    return card_name.text.strip()
def KwargsToDict(**kwargs):
    """Return the received keyword arguments as a plain dict.

    Small readability helper: lets event-wiring code spell out Gradio call
    arguments as keywords and reuse them via ``**`` unpacking.
    """
    return dict(kwargs)
if __name__ == "__main__":
    # Build the Gradio UI and start the local web server (blocks until exit).
    App().Launch()