Viper373's picture
Upload Data_CrawlProcess/Other.py with huggingface_hub
e789ee2 verified
# -*- coding:utf-8 -*-
# @Software :PyCharm
# @Project :LOL-DeepWinPredictor
# @Path :/Data_CrawlProcess
# @FileName :Other.py
# @Time :2025/4/27 22:59
# @Author :Viper373
# @GitHub :https://github.com/Viper373
# @Home :https://viper3.top
# @Blog :https://blog.viper3.top
import re
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional
import orjson
import requests
from Data_CrawlProcess import env
from tool_utils.log_utils import RichLogger
from tool_utils.mysql_utils import MySQLUtils
from tool_utils.progress_utils import RichProgressUtils
rich_logger = RichLogger()
mysql_utils = MySQLUtils()
class Other:
def __init__(self, rich_progress: Optional[RichProgressUtils] = None):
self.rich_progress = rich_progress or RichProgressUtils()
self.hero_list_url = (
"https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js"
)
self.logo_url_template = "https://game.gtimg.cn/images/lol/act/img/champion/{}.png" # 官方Logo URL模板
self.hero_win_rate_url = "https://op.gg/zh-cn/lol/champions"
self.team_list_url = (
"https://open.tjstats.com/match-auth-app/open/v1/compound/public/team"
)
self.hero_list_path = env.HERO_LIST
self.hero_logo_path = env.HERO_LOGO
self.hero_info_path = env.HERO_INFO
self.hero_win_rate_path = env.HERO_WIN_RATE
self.team_list_path = env.TEAM_LIST
self.cookies = {
"_ol": "zh_CN",
"_olvt": "false",
"_opvc": "7",
}
self.headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "no-cache",
"pragma": "no-cache",
"priority": "u=0, i",
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1 Edg/135.0.0.0",
}
self.team_list_headers = {
"accept": "*/*",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6,ja;q=0.5,ko;q=0.4,fr;q=0.3",
"authorization": "7935be4c41d8760a28c05581a7b1f570",
"cache-control": "no-cache",
"origin": "https://lpl.qq.com",
"pragma": "no-cache",
"priority": "u=1, i",
"referer": "https://lpl.qq.com/",
"sec-ch-ua": '"Microsoft Edge";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "cross-site",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Edg/135.0.0.0",
}
self.params = {
"tier": "all",
}
self.team_list_params = {
"seasonId": None,
"stageIds": None,
}
def get_hero_list(self) -> None:
"""
获取英雄列表并保存为格式化的JSON文件,并写入MySQL(分路胜率表)。
:return: None
"""
try:
task_id = self.rich_progress.add_task("[Other] 获取hero_list", total=1)
response = requests.get(self.hero_list_url)
if response.status_code != 200:
rich_logger.error(
f"[Other] 请求hero_list失败,状态码: {response.status_code}"
)
return
try:
hero_data = response.json()
except orjson.JSONDecodeError:
rich_logger.error("[Other] hero_list 响应内容不是有效的 JSON 格式")
return
with open(self.hero_list_path, "wb") as f:
f.write(orjson.dumps(hero_data, option=env.ORJSON_OPTS))
rich_logger.info(
f"[Other] {hero_data.get('fileName')}保存完成 丨 version:{hero_data.get('version')}{hero_data.get('fileTime')}"
)
self.rich_progress.advance(task_id)
except Exception as _error:
rich_logger.error(f"[Other] 获取hero_list失败: {_error}")
def append_hero_id(self) -> None:
"""
将英雄列表中的英雄ID添加到胜率数据中
:return: None
"""
try:
task_id = self.rich_progress.add_task("[Other] append_hero_id", total=1)
hero_win_rate = orjson.loads(open(self.hero_win_rate_path, "rb").read())
hero_list = orjson.loads(open(self.hero_list_path, "rb").read())
# 用alias(英文名,小写)做映射
alias_to_id = {
hero["alias"].lower(): hero["heroId"]
for hero in hero_list.get("hero", [])
if "alias" in hero and "heroId" in hero
}
for hero in hero_win_rate.get("data", []):
key = hero.get("key", "").lower()
if key in alias_to_id:
hero["champion_id"] = alias_to_id[key]
with open(self.hero_win_rate_path, "wb") as f:
f.write(orjson.dumps(hero_win_rate, option=orjson.OPT_INDENT_2))
rich_logger.info(f"[Other] append_hero_id完成")
self.rich_progress.advance(task_id)
except Exception as e:
rich_logger.error(f"[Other] append_hero_id失败: {e}")
def get_hero_win_rate(self) -> None:
"""
获取英雄胜率数据并保存为格式化的JSON文件,并写入MySQL(分路胜率表,每个英雄一条,分路缺失补0.5)。
:return: None
"""
try:
task_id = self.rich_progress.add_task("[Other] 获取hero_win_rate", total=1)
max_retries = 3
retry_count = 0
while retry_count < max_retries:
try:
response = requests.get(
url=self.hero_win_rate_url,
params=self.params,
cookies=self.cookies,
headers=self.headers,
)
if response.status_code != 200:
rich_logger.warning(
f"请求失败,状态码: {response.status_code},尝试重试"
)
retry_count += 1
continue
scripts = re.findall(
r"<script\b[^>]*>([\s\S]*?)<\/script>",
response.text,
re.IGNORECASE,
)
target_index = None
for index, content in enumerate(scripts, start=1):
if "positionWinRate" in content:
target_index = index
break
target_script = scripts[target_index - 1]
pattern = r"(\{.*\})"
match = re.search(pattern, target_script, re.DOTALL)
if not match:
rich_logger.warning("目标script中未匹配到JSON数据")
continue
json_str = match.group(1)
json_str = json_str.replace('\\"', '"').replace("\\\\", "\\")
try:
hero_data = orjson.loads(json_str.encode("utf-8"))
save_path = self.hero_win_rate_path
with open(save_path, "wb") as f:
f.write(
orjson.dumps(
hero_data,
option=orjson.OPT_INDENT_2
| orjson.OPT_APPEND_NEWLINE,
)
)
rich_logger.info(f"[Other] hero_win_rate保存完成")
if task_id is not None:
self.rich_progress.advance(task_id)
return
except Exception as e:
rich_logger.error(f"[Other] JSON解析失败: {e},尝试重试")
retry_count += 1
continue
except Exception as _error:
rich_logger.error(
f"[Other] 获取hero_win_rate失败: {_error},尝试重试"
)
retry_count += 1
continue
rich_logger.error(f"[Other] 获取hero_win_rate失败:已重试 {max_retries} 次")
except Exception as e:
rich_logger.error(f"[Other] 获取hero_win_rate主流程异常: {e}")
def write_hero_win_rate_to_mysql(self) -> None:
"""
读取env.HERO_WIN_RATE路径的json数据,聚合后写入MySQL,每个英雄一条,分路缺失补0.5。
:return: None
"""
with open(self.hero_win_rate_path, "rb") as f:
hero_data = orjson.loads(f.read())
winrate_list = hero_data.get("data", [])
hero_dict = {}
# 分路名映射,兼容多种写法
pos_map = {
"TOP": "TOP",
"JUN": "JUN", "JUNGLE": "JUN",
"MID": "MID",
"ADC": "ADC", "BOTTOM": "ADC",
"SUP": "SUP", "SUPPORT": "SUP"
}
for hero in winrate_list:
hero_id = int(hero.get("champion_id", 0))
hero_name = hero.get("name", "")
pos_raw = hero.get("positionName", "").upper()
pos = pos_map.get(pos_raw)
win_rate = hero.get("positionWinRate", 0.5)
if hero_id == 0:
continue
if hero_id not in hero_dict:
hero_dict[hero_id] = {
"hero_id": hero_id,
"hero_name": hero_name,
"TOP": 0.5,
"JUN": 0.5,
"MID": 0.5,
"ADC": 0.5,
"SUP": 0.5,
}
if pos:
try:
hero_dict[hero_id][pos] = float(win_rate / 100)
except Exception:
hero_dict[hero_id][pos] = 0.5
insert_data = []
for hero in hero_dict.values():
insert_data.append(
(
hero["hero_id"],
hero["hero_name"],
hero["TOP"],
hero["JUN"],
hero["MID"],
hero["ADC"],
hero["SUP"],
)
)
mysql_task_id = self.rich_progress.add_task("[Other] hero_win_rate写入MySQL", total=len(insert_data))
mysql_utils.insert_hero_win_rate(insert_data)
for _ in range(len(insert_data)):
self.rich_progress.advance(mysql_task_id)
rich_logger.info(f"[Other] hero_win_rate已写入MySQL,共{len(insert_data)}条")
def append_counter(self, json_path: str) -> None:
"""
为MySQL hero_win_rate表中的每个英雄添加克制关系字段(counter,分路分组,字段为{"TOP": [...], "JUN": [...]},多线程加速写入)。
:param json_path: json文件路径
:return: None
"""
json_data = orjson.loads(open(json_path, "rb").read())
# 分路名映射,兼容多种写法
pos_map = {
"TOP": "TOP",
"JUN": "JUN", "JUNGLE": "JUN",
"MID": "MID",
"ADC": "ADC", "BOTTOM": "ADC",
"SUP": "SUP", "SUPPORT": "SUP"
}
hero_counter_map = {}
for hero in json_data.get("data", []):
hero_id = int(hero.get("champion_id")) if "champion_id" in hero else None
if hero_id is not None:
if hero_id not in hero_counter_map:
hero_counter_map[hero_id] = {}
pos_counters = hero_counter_map[hero_id]
# 顶层分路
if "positionCounters" in hero and isinstance(hero["positionCounters"], list):
pos_raw = hero.get("positionName", "").upper()
pos = pos_map.get(pos_raw)
if pos:
pos_counters[pos] = [int(c.get("champion_id")) for c in hero["positionCounters"] if isinstance(c, dict) and c.get("champion_id") is not None]
# positions数组
if "positions" in hero and isinstance(hero["positions"], list):
for pos_item in hero["positions"]:
pos_raw = pos_item.get("positionName", "").upper()
pos = pos_map.get(pos_raw)
if pos:
pos_counters[pos] = [int(c.get("champion_id")) for c in pos_item.get("positionCounters", []) if isinstance(c, dict) and c.get("champion_id") is not None]
lane_order = ["TOP", "JUN", "MID", "ADC", "SUP"]
# 多线程写入MySQL
try:
hero_ids = list(hero_counter_map.keys())
task_id = self.rich_progress.add_task(f"[Other] hero_counter写入MySQL", total=len(hero_ids))
def update_one(hero_id):
counter_dict = hero_counter_map[hero_id]
# 按顺序构造有序字典
ordered_counter = OrderedDict()
for lane in lane_order:
if lane in counter_dict:
ordered_counter[lane] = counter_dict[lane]
mysql_utils.update_hero_counter(hero_id, ordered_counter)
self.rich_progress.advance(task_id)
with ThreadPoolExecutor(max_workers=12) as executor:
futures = [executor.submit(update_one, hero_id) for hero_id in hero_ids]
for _ in as_completed(futures):
pass
rich_logger.info(f"[Other] hero_counter已写入MySQL,共{len(hero_ids)}条")
except Exception as e:
rich_logger.error(f"[Other] hero_counter写入MySQL失败: {e}")
def generate_logo_data(self) -> None:
"""
生成英雄Logo数据。
"""
count = 0
try:
with open(self.hero_info_path, "rb") as f:
hero_info = orjson.loads(f.read())
logo_data = []
total = len(hero_info)
task_id = self.rich_progress.add_task("[Other] 生成hero_Logo", total=total)
for hero in hero_info:
try:
hero_id = str(hero.get("heroId"))
name = hero.get("name")
heroLogo = hero.get("heroLogo")
logo_url = self.logo_url_template.format(heroLogo)
logo_data.append(
{"heroId": hero_id, "name": name, "heroLogo": logo_url}
)
count += 1
except KeyError as e:
rich_logger.warning(f"字段缺失: {str(e)} | 英雄数据: {hero}")
except Exception as e:
rich_logger.error(f"处理英雄异常: {str(e)} | 数据: {hero}")
self.rich_progress.advance(task_id)
with open(self.hero_logo_path, "wb") as f:
f.write(orjson.dumps(logo_data, option=orjson.OPT_INDENT_2))
rich_logger.info(f"[Other] hero_Logo生成完成丨共{count}条")
except Exception as e:
rich_logger.error(f"[Other] hero_Logo生成失败: {str(e)}")
def get_team_list(self) -> None:
"""
获取队伍列表。
"""
def get_latest_season_game_id():
"""
从 env.py 中读取 SEASONS(list结构),返回 id 最大且 type 最大的赛季的 id 和 type。
处理多余逗号导致的 JSON 解析报错。
:return: (id, type)
"""
with open(env.ENV_SEASONS, "r", encoding="utf-8") as f:
content = f.read()
seasons_match = re.search(r"SEASONS\s*=\s*(\[.*?\])", content, re.DOTALL)
if not seasons_match:
raise Exception(f"未在 {env.ENV_SEASONS} 中找到 SEASONS 变量")
try:
# 先去除多余的结尾逗号
seasons_json_str = seasons_match.group(1)
# 用正则去掉最后一个元素后面的逗号
seasons_json_str = re.sub(r",\s*\]", "]", seasons_json_str)
seasons_list = orjson.loads(seasons_json_str)
except Exception as e:
raise Exception(f"解析 SEASONS 出错: {e}")
# 先找id最大,再在这些中找type最大
max_id = max(int(s["id"]) for s in seasons_list if s.get("id"))
candidates = [s for s in seasons_list if int(s["id"]) == max_id]
def type_max(type_str):
if not type_str:
return -1
return max(int(x) for x in type_str.split(",") if x.isdigit())
best = max(candidates, key=lambda s: type_max(s.get("type")))
return best["id"], best["type"]
seasonId, stageIds = get_latest_season_game_id()
self.team_list_params["seasonId"] = seasonId
self.team_list_params["stageIds"] = stageIds
try:
task_id = self.rich_progress.add_task("[Other] 获取team_list", total=1)
response = requests.get(
url=self.team_list_url,
params=self.team_list_params,
cookies=self.cookies,
headers=self.team_list_headers,
)
if response.status_code != 200:
rich_logger.error(
f"[Other] 请求team_list失败,状态码:{response.status_code}"
)
return
try:
team_list = response.json()
except orjson.JSONDecodeError:
rich_logger.error("[Other] team_list 响应内容不是有效的 JSON 格式")
return
with open(self.team_list_path, "wb") as f:
f.write(orjson.dumps(team_list, option=env.ORJSON_OPTS))
rich_logger.info(f"[Other] team_list获取完成")
self.rich_progress.advance(task_id)
except Exception as _error:
rich_logger.error(f"[Other] 获取team_list失败: {str(_error)}")
def main(self) -> None:
"""
英雄数据主流程。
"""
self.get_hero_list()
self.generate_logo_data()
self.get_team_list()
self.get_hero_win_rate() # 只保存json
self.append_hero_id() # 补全champion_id
self.write_hero_win_rate_to_mysql() # 聚合并写入MySQL
self.append_counter(self.hero_win_rate_path)