YuyuTeiCrawler / app.py
penut85420's picture
First Commit
877c973
raw
history blame
3.28 kB
import datetime
import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from bs4.element import Tag
class App:
    """Gradio front end that turns a Yuyu Tei price-list URL into an Excel file.

    The UI has a URL textbox and a submit button in one column, and a file
    output plus a status line in the other.  Pressing the button calls
    ``Download`` with the textbox content.
    """

    def __init__(self) -> None:
        """Build the Blocks layout and wire the submit button to ``Download``."""
        with gr.Blocks() as self.demo:
            gr.Markdown("# Yuyu Tei Price Crawler")
            with gr.Row():
                with gr.Column():
                    ph = "e.g. https://yuyu-tei.jp/game_ws/sell/sell_price.php?ver=uma&menu=newest"
                    self.input_url = gr.Textbox(label="URL", placeholder=ph)
                    self.submit_btn = gr.Button("Submit")
                with gr.Column():
                    self.output_file = gr.File(label="Excel", file_count="single")
                    self.status = gr.Markdown("Ready")
            # Input: the URL textbox.  Outputs: the file widget and the status text,
            # in the same order as the tuple returned by Download.
            self.submit_btn.click(
                self.Download,
                self.input_url,
                [self.output_file, self.status],
            )

    def Download(self, url):
        """Crawl ``url`` and save the result to a timestamped ``.xlsx`` file.

        Args:
            url: Address of the price-list page, as typed by the user.

        Returns:
            A ``(path, status)`` tuple.  On success ``path`` is the name of the
            written Excel file and ``status`` is ``"Success"``; on any failure
            ``path`` is ``None`` and ``status`` is ``"Error: <reason>"``.
        """
        # Reject blank input up front instead of letting the HTTP layer fail
        # with a less obvious message.
        url = (url or "").strip()
        if not url:
            return None, "Error: URL is empty"

        # The file name is the current wall-clock time at UTC+8.  A
        # timezone-aware "now" replaces the deprecated naive utcnow().
        utc_plus_8 = datetime.timezone(datetime.timedelta(hours=8))
        ts = datetime.datetime.now(utc_plus_8)
        output_path = ts.strftime("%Y%m%d_%H%M%S.xlsx")

        try:
            CrawlPage(url, output_path)
        except Exception as e:
            # UI boundary: report every failure as status text rather than
            # letting it propagate into the Gradio event handler.
            return None, f"Error: {e}"
        return output_path, "Success"

    def Launch(self):
        """Start the Gradio server for the assembled interface."""
        self.demo.launch()
def CrawlPage(url, output_path, timeout=30):
    """Download a card list page and write one spreadsheet row per card.

    Args:
        url: Address of the page to fetch.
        output_path: Destination of the Excel file.
        timeout: Seconds to wait for the server before giving up.  Without an
            explicit value ``requests`` waits indefinitely.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
    """
    print(f"Visiting {url}")
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing the error page and
    # silently producing an empty sheet.
    response.raise_for_status()

    soup = BS(response.text, features="html.parser")
    # Every card on the page is an <li> carrying the "card_unit" class.
    card_elems = soup.find_all("li", attrs={"class": "card_unit"})
    rows = [IterElem(elem) for elem in card_elems]
    pd.DataFrame(rows).to_excel(output_path, index=False)
def IterElem(e: "Tag"):
    """Extract the number, name, price and rarity of one card element.

    Args:
        e: The ``<li class="card_unit ...">`` element of a single card.

    Returns:
        A dict with the keys ``"卡號"`` (card number), ``"卡名"`` (card name),
        ``"價格"`` (price without the trailing yen sign) and ``"稀有度"``
        (rarity).
    """
    # NOTE(review): find_next() searches the rest of the document, not only the
    # descendants of ``e``; if a card lacks one of these elements, the value of
    # a later card would be picked up.  Confirm whether e.find() is intended.

    # Card number.
    card_id = e.find_next("p", attrs={"class": "id"}).text.strip()

    # Card name, source 1: the title text, which the site may truncate
    # (a truncated title ends with "...").
    card_name_elem = e.find_next("p", attrs={"class": "name"})
    card_name = card_name_elem.text.strip()

    # Card name, source 2: the alt text of the card image.  Some images carry
    # the placeholder alt "NowPrinting" instead of the real card name.
    image_elem = e.find_next("p", attrs={"class": "image"})
    card_alt_name = image_elem.find_next("img").get("alt")

    if card_alt_name != "NowPrinting":
        # The alt text is usable, so prefer it over the title.
        card_name = card_alt_name
    elif card_name.endswith("..."):
        # Card name, source 3: the card's detail page.  This is slower and too
        # many requests may get the crawler banned as a bot, so it is only used
        # when the alt text is the placeholder AND the title is truncated.
        detail_href = card_name_elem.find_next("a").get("href")
        card_name = GetCardNameFromPage(detail_href)

    # Price, with surrounding whitespace and the yen sign removed.
    price = e.find_next("p", attrs={"class": "price"}).text.strip().strip("円")

    # Rarity is encoded in the element's second CSS class as "rarity_<value>".
    # Remove the prefix explicitly: str.strip("rarity_") would strip any of the
    # characters r/a/i/t/y/_ from BOTH ends and corrupt values such as "tr".
    rarity = e.get("class")[1]
    prefix = "rarity_"
    if rarity.startswith(prefix):
        rarity = rarity[len(prefix):]

    return {"卡號": card_id, "卡名": card_name, "價格": price, "稀有度": rarity}
def GetCardNameFromPage(url, timeout=30):
    """Fetch a card's detail page and return the card name shown on it.

    Args:
        url: The href of the detail page, normally a site-relative path such
            as ``/game_ws/...``.  Absolute URLs are accepted as well.
        timeout: Seconds to wait for the server before giving up.  Without an
            explicit value ``requests`` waits indefinitely.

    Returns:
        The stripped text of the first ``<td>`` inside the page's
        ``information_box`` div.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
        AttributeError: If the page does not contain the expected elements.
    """
    from urllib.parse import urljoin

    # urljoin yields the same result as plain concatenation for "/path" hrefs,
    # and also copes with hrefs that are absolute or lack the leading slash.
    url = urljoin("https://yuyu-tei.jp", url)
    print(f"Visiting {url}")
    res = requests.get(url, timeout=timeout)
    # Do not try to parse an HTTP error page as a card page.
    res.raise_for_status()

    bs = BS(res.text, features="html.parser")
    info_box = bs.find("div", attrs={"class": "information_box"})
    card_name = info_box.find("td")
    return card_name.text.strip()
# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    app = App()
    app.Launch()