# WIFX / scripts/build_dashboard_data.py
# (GitHub page scrape residue preserved as a comment: author "amadabhu",
#  commit 36f80b6, "updated to historical" — these lines were not valid Python.)
#!/usr/bin/env python3
"""Pre-aggregate raw data into small JSON files for the interactive dashboard.
Usage:
python scripts/build_dashboard_data.py
Reads from data/ and writes JSON files to output/ (see OUT below).
"""
from __future__ import annotations
import json
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
# Silence pandas mixed-dtype warnings triggered by the large raw CSVs.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
# Repository root (this file lives under scripts/).
ROOT = Path(__file__).resolve().parent.parent
DATA = ROOT / "data"
# NOTE(review): the module docstring says output goes to data/dashboard/,
# but OUT points at output/ — confirm which location the dashboard reads.
OUT = ROOT / "output"
OUT.mkdir(parents=True, exist_ok=True)
# StatsBomb competition directories under data/statsbomb/, each tagged with
# a competition type ("league" vs "tournament") and a short display label.
COMPETITIONS = {
    "FA_Womens_Super_League_2018-2019": {"type": "league", "label": "FAWSL 2018-19"},
    "FA_Womens_Super_League_2019-2020": {"type": "league", "label": "FAWSL 2019-20"},
    "FA_Womens_Super_League_2020-2021": {"type": "league", "label": "FAWSL 2020-21"},
    "NWSL_2018": {"type": "league", "label": "NWSL 2018"},
    "UEFA_Womens_Euro_2022": {"type": "tournament", "label": "Euros 2022"},
    "UEFA_Womens_Euro_2025": {"type": "tournament", "label": "Euros 2025"},
    "Womens_World_Cup_2019": {"type": "tournament", "label": "WWC 2019"},
    "Womens_World_Cup_2023": {"type": "tournament", "label": "WWC 2023"},
}
# Only these event columns are loaded (see load_events) to keep memory down;
# all aggregations below use this subset exclusively.
EVENT_COLS = [
    "type", "player", "player_id", "team", "match_id", "minute",
    "shot_outcome", "shot_statsbomb_xg",
    "pass_goal_assist", "pass_shot_assist", "pass_through_ball",
    "pass_cross", "pass_switch", "pass_outcome",
    "dribble_outcome",
    "interception_outcome",
    "duel_type", "duel_outcome",
    "position",
]
def load_events(comp_dir: str) -> pd.DataFrame:
    """Read one competition's events CSV, restricted to EVENT_COLS.

    A ``competition`` column carrying *comp_dir* is appended so frames
    from several competitions can be concatenated and told apart later.
    """
    csv_path = DATA / "statsbomb" / comp_dir / "events.csv"
    events = pd.read_csv(
        csv_path,
        usecols=lambda col: col in EVENT_COLS,
        low_memory=False,
    )
    events["competition"] = comp_dir
    return events
def load_matches(comp_dir: str) -> pd.DataFrame:
    """Read one competition's matches CSV and tag rows with *comp_dir*."""
    frame = pd.read_csv(DATA / "statsbomb" / comp_dir / "matches.csv")
    frame["competition"] = comp_dir
    return frame
def load_lineups(comp_dir: str) -> pd.DataFrame:
    """Read one competition's lineups CSV and tag rows with *comp_dir*."""
    frame = pd.read_csv(DATA / "statsbomb" / comp_dir / "lineups.csv")
    frame["competition"] = comp_dir
    return frame
def percentile_rank(series: pd.Series) -> pd.Series:
    """Map a numeric series onto a 0-100 percentile scale (ties averaged)."""
    return 100 * series.rank(pct=True)
# ---------------------------------------------------------------------------
# StatsBomb Player Aggregates
# ---------------------------------------------------------------------------
def build_sb_players(all_events: pd.DataFrame, all_lineups: pd.DataFrame) -> dict:
    """Aggregate StatsBomb events into per-player scores and leaderboards.

    Counts per-player components (goals, key passes, tackles, ...), folds
    three component groups into percentile scores (goal_threat, playmaker,
    defensive) plus their mean (composite), and returns top-N tables.

    Returns a dict with: one top-30 list per metric, a
    "league_vs_tournament" goals/assists split, and "by_position" top-10s.
    """
    ev = all_events[all_events["player"].notna()].copy()
    # --- Goal threat components (each a Series indexed by player) ---
    goals = ev[ev["shot_outcome"] == "Goal"].groupby("player").size().rename("goals")
    shots_on = ev[ev["shot_outcome"].isin(["Goal", "Saved", "Saved Off Target", "Saved to Post"])].groupby("player").size().rename("shots_on_target")
    xg = ev[ev["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg")
    # Flag columns (pass_goal_assist etc.) are treated as set-when-non-null,
    # so notna() counts the flagged events.
    assists = ev[ev["pass_goal_assist"].notna()].groupby("player").size().rename("assists")
    key_passes = ev[ev["pass_shot_assist"].notna()].groupby("player").size().rename("key_passes")
    # --- Playmaker components ---
    through_balls = ev[ev["pass_through_ball"].notna()].groupby("player").size().rename("through_balls")
    crosses = ev[ev["pass_cross"].notna()].groupby("player").size().rename("crosses")
    switches = ev[ev["pass_switch"].notna()].groupby("player").size().rename("switches")
    dribbles_ok = ev[(ev["type"] == "Dribble") & (ev["dribble_outcome"] == "Complete")].groupby("player").size().rename("dribbles")
    # --- Defensive components ---
    interceptions = ev[ev["type"] == "Interception"].groupby("player").size().rename("interceptions")
    tackles_won = ev[(ev["duel_type"] == "Tackle") & (ev["duel_outcome"].isin(["Won", "Success In Play", "Success Out"]))].groupby("player").size().rename("tackles_won")
    blocks = ev[ev["type"] == "Block"].groupby("player").size().rename("blocks")
    clearances = ev[ev["type"] == "Clearance"].groupby("player").size().rename("clearances")
    pressures = ev[ev["type"] == "Pressure"].groupby("player").size().rename("pressures")
    recoveries = ev[ev["type"] == "Ball Recovery"].groupby("player").size().rename("recoveries")
    fouls_won = ev[ev["type"] == "Foul Won"].groupby("player").size().rename("fouls_won")
    fouls_committed = ev[ev["type"] == "Foul Committed"].groupby("player").size().rename("fouls_committed")
    # Primary team/competition per player = mode (most frequent value).
    player_team = ev.groupby("player")["team"].agg(lambda x: x.value_counts().index[0]).rename("team")
    player_comp = ev.groupby("player")["competition"].agg(lambda x: x.value_counts().index[0])
    player_comp_label = player_comp.map(lambda c: COMPETITIONS.get(c, {}).get("label", c)).rename("competition")
    # Primary position comes from lineups; the "positions" column appears to
    # hold a Python-literal list of dicts (hence ast.literal_eval below).
    pos_df = all_lineups[["player_name", "positions"]].copy()
    pos_df = pos_df[pos_df["positions"].notna()]
    def extract_primary_pos(pos_str):
        # Parse the serialized positions list; fall back to "Unknown" on any
        # malformed entry rather than failing the whole build.
        try:
            import ast
            positions = ast.literal_eval(pos_str)
            if positions and isinstance(positions, list):
                return positions[0].get("position", "Unknown") if isinstance(positions[0], dict) else str(positions[0])
        except Exception:
            pass
        return "Unknown"
    pos_df["primary_position"] = pos_df["positions"].apply(extract_primary_pos)
    player_positions = pos_df.groupby("player_name")["primary_position"].agg(
        lambda x: x.value_counts().index[0]
    ).rename("position")
    def simplify_position(pos):
        # Collapse detailed position names into GK/DF/MF/FW buckets;
        # anything unrecognised defaults to MF.
        pos = str(pos).lower()
        if "goalkeeper" in pos or pos == "gk":
            return "GK"
        elif "back" in pos or "defender" in pos or pos in ("cb", "lb", "rb", "lwb", "rwb"):
            return "DF"
        elif "midfield" in pos or pos in ("cm", "cdm", "cam", "lm", "rm", "dm", "am"):
            return "MF"
        elif "forward" in pos or "wing" in pos or "striker" in pos or pos in ("st", "cf", "lw", "rw", "ss"):
            return "FW"
        return "MF"
    player_pos_simple = player_positions.map(simplify_position).rename("position_group")
    # Combine every component Series into one frame indexed by player.
    stats = pd.DataFrame({
        "team": player_team,
        "competition": player_comp_label,
    })
    for s in [goals, shots_on, xg, assists, key_passes, through_balls, crosses,
              switches, dribbles_ok, interceptions, tackles_won, blocks, clearances,
              pressures, recoveries, fouls_won, fouls_committed]:
        stats = stats.join(s, how="left")
    stats = stats.join(player_pos_simple, how="left")
    stats = stats.fillna(0)
    # fillna(0) above also zeroed missing position groups; patch those to MF.
    stats["position_group"] = stats["position_group"].replace(0, "MF")
    # Each score = percentile rank of the plain sum of its component group.
    stats["goal_threat"] = percentile_rank(
        stats[["goals", "shots_on_target", "xg", "assists", "key_passes"]].sum(axis=1)
    )
    stats["playmaker"] = percentile_rank(
        stats[["assists", "key_passes", "through_balls", "crosses", "switches", "dribbles"]].sum(axis=1)
    )
    stats["defensive"] = percentile_rank(
        stats[["interceptions", "tackles_won", "blocks", "clearances", "pressures", "recoveries"]].sum(axis=1)
    )
    stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3
    # NOTE(review): the index is named "player" after the groupbys, so the
    # {"index": "player"} rename is likely a no-op — confirm the column name.
    stats = stats.reset_index().rename(columns={"index": "player"})
    # Top 30 per metric.
    result = {}
    for metric in ["goal_threat", "playmaker", "defensive", "composite"]:
        top = stats.nlargest(30, metric)
        result[metric] = top[["player", "team", "competition", "position_group",
                              "goals", "assists", "xg", "key_passes",
                              "interceptions", "tackles_won", "blocks",
                              metric]].to_dict(orient="records")
    # League vs tournament goal/assist split for the 25 biggest totals.
    ev_with_type = ev.copy()
    ev_with_type["comp_type"] = ev_with_type["competition"].map(
        lambda c: COMPETITIONS.get(c, {}).get("type", "unknown")
    )
    league_goals = ev_with_type[(ev_with_type["shot_outcome"] == "Goal") & (ev_with_type["comp_type"] == "league")].groupby("player").size().rename("league_goals")
    tourn_goals = ev_with_type[(ev_with_type["shot_outcome"] == "Goal") & (ev_with_type["comp_type"] == "tournament")].groupby("player").size().rename("tournament_goals")
    league_assists = ev_with_type[(ev_with_type["pass_goal_assist"].notna()) & (ev_with_type["comp_type"] == "league")].groupby("player").size().rename("league_assists")
    tourn_assists = ev_with_type[(ev_with_type["pass_goal_assist"].notna()) & (ev_with_type["comp_type"] == "tournament")].groupby("player").size().rename("tournament_assists")
    lvt = pd.DataFrame({"league_goals": league_goals, "tournament_goals": tourn_goals,
                        "league_assists": league_assists, "tournament_assists": tourn_assists}).fillna(0)
    lvt["total"] = lvt.sum(axis=1)
    lvt = lvt.nlargest(25, "total").reset_index().rename(columns={"index": "player"})
    result["league_vs_tournament"] = lvt.to_dict(orient="records")
    # Top 10 by composite within each simplified position bucket.
    by_pos = {}
    for pos in ["FW", "MF", "DF", "GK"]:
        subset = stats[stats["position_group"] == pos].nlargest(10, "composite")
        by_pos[pos] = subset[["player", "team", "composite", "goal_threat", "playmaker", "defensive"]].to_dict(orient="records")
    result["by_position"] = by_pos
    return result
# ---------------------------------------------------------------------------
# StatsBomb Club/Country Aggregates
# ---------------------------------------------------------------------------
def compute_team_rankings(all_matches: pd.DataFrame, all_events: pd.DataFrame, comp_type: str) -> dict:
    """Rank teams within one competition type ("league" or "tournament").

    Combines points-per-game, a simple Elo rating, and per-game xG
    dominance into a composite percentile score.

    Parameters
    ----------
    all_matches : concatenated matches frames (must carry a ``competition``
        column plus home/away team, score, date, and match_id columns).
    all_events : concatenated events frames (only ``shot_statsbomb_xg`` and
        its match/team keys are used).
    comp_type : selects which COMPETITIONS entries to include.

    Returns
    -------
    dict with a single ``"teams"`` key: per-team records sorted by the
    composite score (empty list when no matches are available).
    """
    comps = [c for c, info in COMPETITIONS.items() if info["type"] == comp_type]
    matches = all_matches[all_matches["competition"].isin(comps)].copy()
    events = all_events[all_events["competition"].isin(comps)]
    if matches.empty:
        return {"teams": []}
    # Date order matters for the Elo pass below.
    matches = matches.sort_values("match_date")
    # Per-(match_id, team) xG totals as a plain dict. The previous version
    # re-filtered the whole xG frame twice per match inside the loop, which
    # was accidentally quadratic; a single groupby + dict gives O(1) lookups.
    xg_lookup = (
        events[events["shot_statsbomb_xg"].notna()]
        .groupby(["match_id", "team"])["shot_statsbomb_xg"]
        .sum()
        .to_dict()
    )
    # One record per team per match (home and away perspectives).
    records = []
    for _, m in matches.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        mid = m["match_id"]
        comp_label = COMPETITIONS.get(m["competition"], {}).get("label", m["competition"])
        for team, opp, gs, gc in [(home, away, hs, as_), (away, home, as_, hs)]:
            records.append({
                "team": team,
                "match_id": mid,
                "match_date": m["match_date"],
                "competition": comp_label,
                "goals_scored": gs,
                "goals_conceded": gc,
                "points": 3 if gs > gc else (1 if gs == gc else 0),
                "xg_for": float(xg_lookup.get((mid, team), 0.0)),
                "xg_against": float(xg_lookup.get((mid, opp), 0.0)),
            })
    df = pd.DataFrame(records)
    # Aggregate across all selected competitions.
    team_stats = df.groupby("team").agg(
        matches=("match_id", "count"),
        total_points=("points", "sum"),
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
        xg_for=("xg_for", "sum"),
        xg_against=("xg_against", "sum"),
        competition=("competition", lambda x: ", ".join(x.unique()[:3])),  # cap label at 3 comps
    ).reset_index()
    team_stats["ppg"] = (team_stats["total_points"] / team_stats["matches"]).round(2)
    team_stats["gd_per_game"] = ((team_stats["goals_scored"] - team_stats["goals_conceded"]) / team_stats["matches"]).round(2)
    team_stats["xg_dominance"] = ((team_stats["xg_for"] - team_stats["xg_against"]) / team_stats["matches"]).round(3)
    # Simple Elo over the date-sorted matches (start 1500, K=40, no
    # home-advantage term).
    elo = {}
    K = 40
    for _, m in matches.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        eh = elo.get(home, 1500)
        ea = elo.get(away, 1500)
        exp_h = 1 / (1 + 10 ** ((ea - eh) / 400))
        actual_h = 1.0 if hs > as_ else (0.5 if hs == as_ else 0.0)
        elo[home] = eh + K * (actual_h - exp_h)
        elo[away] = ea + K * ((1 - actual_h) - (1 - exp_h))
    team_stats["elo"] = team_stats["team"].map(elo).round(0)
    # Composite = mean of the three percentile-ranked signals.
    for col in ["ppg", "elo", "xg_dominance"]:
        team_stats[f"{col}_pct"] = percentile_rank(team_stats[col])
    team_stats["composite"] = ((team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["xg_dominance_pct"]) / 3).round(1)
    team_stats = team_stats.sort_values("composite", ascending=False)
    cols = ["team", "competition", "matches", "ppg", "elo", "xg_dominance", "gd_per_game", "composite"]
    return {"teams": team_stats[cols].to_dict(orient="records")}
# ---------------------------------------------------------------------------
# StatsBomb Player Comparisons
# ---------------------------------------------------------------------------
def build_sb_player_comparisons(all_events: pd.DataFrame) -> dict:
    """Compare per-player percentile scores across event subsets.

    Produces three comparisons: historical tournaments vs Euros 2025,
    league vs tournament play, and Euros 2025 group stage vs knockouts.
    Each comparison lists, per metric, the top 15 players (ranked by the
    second/"newer" side of the split) who appear on both sides.
    """
    ev = all_events[all_events["player"].notna()].copy()
    ev["comp_type"] = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("type", "unknown"))
    ev["comp_label"] = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("label", c))
    def player_scores(subset):
        # Recompute component counts and percentile scores on an event
        # subset; mirrors build_sb_players but with fewer components.
        goals = subset[subset["shot_outcome"] == "Goal"].groupby("player").size().rename("goals")
        assists = subset[subset["pass_goal_assist"].notna()].groupby("player").size().rename("assists")
        key_passes = subset[subset["pass_shot_assist"].notna()].groupby("player").size().rename("key_passes")
        xg = subset[subset["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg")
        through_balls = subset[subset["pass_through_ball"].notna()].groupby("player").size().rename("through_balls")
        crosses = subset[subset["pass_cross"].notna()].groupby("player").size().rename("crosses")
        interceptions = subset[subset["type"] == "Interception"].groupby("player").size().rename("interceptions")
        tackles = subset[(subset["duel_type"] == "Tackle") & (subset["duel_outcome"].isin(["Won", "Success In Play", "Success Out"]))].groupby("player").size().rename("tackles_won")
        blocks = subset[subset["type"] == "Block"].groupby("player").size().rename("blocks")
        recoveries = subset[subset["type"] == "Ball Recovery"].groupby("player").size().rename("recoveries")
        stats = pd.DataFrame({"goals": goals, "assists": assists, "key_passes": key_passes,
                              "xg": xg, "through_balls": through_balls, "crosses": crosses,
                              "interceptions": interceptions, "tackles_won": tackles,
                              "blocks": blocks, "recoveries": recoveries}).fillna(0)
        # Empty subset -> empty frame without the score columns; callers
        # guard with `metric in ....columns` checks below.
        if len(stats) == 0:
            return stats
        stats["goal_threat"] = percentile_rank(stats[["goals", "xg", "assists", "key_passes"]].sum(axis=1))
        stats["playmaker"] = percentile_rank(stats[["assists", "key_passes", "through_balls", "crosses"]].sum(axis=1))
        stats["defensive"] = percentile_rank(stats[["interceptions", "tackles_won", "blocks", "recoveries"]].sum(axis=1))
        stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3
        return stats
    result = {}
    # 1. Historical tournaments vs Euros 2025 (players present in both).
    hist_tourn = ev[(ev["comp_type"] == "tournament") & (ev["competition"] != "UEFA_Womens_Euro_2025")]
    euros25 = ev[ev["competition"] == "UEFA_Womens_Euro_2025"]
    hist_scores = player_scores(hist_tourn)
    e25_scores = player_scores(euros25)
    comparison1 = []
    common = hist_scores.index.intersection(e25_scores.index)
    for metric in ["goal_threat", "playmaker", "defensive", "composite"]:
        # The `else 0` fallback fills a constant column when a side has no
        # scores (empty subset case above).
        merged = pd.DataFrame({
            "historical": hist_scores.loc[common, metric] if metric in hist_scores.columns else 0,
            "euros_2025": e25_scores.loc[common, metric] if metric in e25_scores.columns else 0,
        }).dropna()
        top = merged.nlargest(15, "euros_2025").reset_index().rename(columns={"index": "player"})
        comparison1.append({"metric": metric, "players": top.to_dict(orient="records")})
    result["historical_vs_euros2025"] = comparison1
    # 2. League vs tournament (players present in both).
    league_ev = ev[ev["comp_type"] == "league"]
    tourn_ev = ev[ev["comp_type"] == "tournament"]
    league_scores = player_scores(league_ev)
    tourn_scores = player_scores(tourn_ev)
    comparison2 = []
    common2 = league_scores.index.intersection(tourn_scores.index)
    for metric in ["goal_threat", "playmaker", "defensive", "composite"]:
        merged = pd.DataFrame({
            "league": league_scores.loc[common2, metric] if metric in league_scores.columns else 0,
            "tournament": tourn_scores.loc[common2, metric] if metric in tourn_scores.columns else 0,
        }).dropna()
        top = merged.nlargest(15, "tournament").reset_index().rename(columns={"index": "player"})
        comparison2.append({"metric": metric, "players": top.to_dict(orient="records")})
    result["league_vs_tournament"] = comparison2
    # 3. Euros 2025 group stage vs knockouts — needs the stage column from
    # the matches CSV, so skip silently when that file is absent.
    e25_matches_path = DATA / "statsbomb" / "UEFA_Womens_Euro_2025" / "matches.csv"
    if e25_matches_path.exists():
        e25m = pd.read_csv(e25_matches_path)
        # Knockout = any stage whose name does not contain "Group".
        group_match_ids = e25m[e25m["competition_stage"].str.contains("Group", case=False, na=False)]["match_id"].tolist()
        ko_match_ids = e25m[~e25m["competition_stage"].str.contains("Group", case=False, na=False)]["match_id"].tolist()
        group_ev = euros25[euros25["match_id"].isin(group_match_ids)]
        ko_ev = euros25[euros25["match_id"].isin(ko_match_ids)]
        group_scores = player_scores(group_ev)
        ko_scores = player_scores(ko_ev)
        comparison3 = []
        common3 = group_scores.index.intersection(ko_scores.index)
        for metric in ["goal_threat", "playmaker", "composite"]:
            merged = pd.DataFrame({
                "group_stage": group_scores.loc[common3, metric] if metric in group_scores.columns else 0,
                "knockout": ko_scores.loc[common3, metric] if metric in ko_scores.columns else 0,
            }).dropna()
            top = merged.nlargest(15, "knockout").reset_index().rename(columns={"index": "player"})
            comparison3.append({"metric": metric, "players": top.to_dict(orient="records")})
        result["euros2025_group_vs_knockout"] = comparison3
    return result
# ---------------------------------------------------------------------------
# FIFA Rankings
# ---------------------------------------------------------------------------
def build_fifa_rankings() -> dict:
    """Build FIFA women's world-ranking aggregates across the 2025 quarters.

    Returns top-25 averages, confederation averages per quarter, biggest
    rank/point movers between the earliest and latest available quarters,
    an H1-vs-H2 comparison, and the top-10 point trajectories. Returns {}
    when no ranking CSVs are present.
    """
    # (file suffix, display label) in chronological order — this list is
    # the source of truth for "earliest"/"latest" ordering below.
    quarters = [
        ("2025_03_06", "Mar 2025"),
        ("2025_06_12", "Jun 2025"),
        ("2025_08_07", "Aug 2025"),
        ("2025_12_11", "Dec 2025"),
    ]
    frames = {}
    for suffix, label in quarters:
        path = DATA / f"fifa_womens_world_ranking_{suffix}.csv"
        if path.exists():
            df = pd.read_csv(path)
            frames[label] = df
    if not frames:
        return {}
    # Per-country trajectory of points and ranks keyed by quarter label.
    countries = {}
    for label, df in frames.items():
        for _, row in df.iterrows():
            c = row["Country"]
            if c not in countries:
                countries[c] = {
                    "country": c,
                    "code": row.get("Country_Code", ""),
                    "confederation": row.get("Confederation", ""),
                    "points": {},
                    "ranks": {},
                }
            if pd.notna(row["Total_Points"]):
                countries[c]["points"][label] = float(row["Total_Points"])
            if pd.notna(row["Rank"]):
                countries[c]["ranks"][label] = int(row["Rank"])
    all_countries = list(countries.values())
    # Average points across all loaded quarters for the top-25 table.
    all_df = pd.concat(frames.values(), ignore_index=True)
    avg_points = all_df.groupby(["Country", "Country_Code", "Confederation"])["Total_Points"].mean().reset_index()
    avg_points = avg_points.sort_values("Total_Points", ascending=False)
    top25 = avg_points.head(25).rename(columns={"Total_Points": "Avg_Points"}).to_dict(orient="records")
    # Confederation average points per quarter.
    conf_avg = {}
    for label, df in frames.items():
        conf_avg[label] = df.groupby("Confederation")["Total_Points"].mean().round(1).to_dict()
    # BUG FIX: this previously used sorted(frames.keys()), which orders the
    # display labels alphabetically ("Aug" < "Dec" < "Jun" < "Mar"), not
    # chronologically — so "earliest"/"latest" and the H1/H2 split were
    # computed on the wrong quarters. Preserve the declared quarter order,
    # keeping only the quarters whose CSV was actually loaded.
    labels_in_order = [label for _, label in quarters if label in frames]
    first_label = labels_in_order[0]
    latest_label = labels_in_order[-1]
    movers = []
    for c in all_countries:
        if first_label in c["ranks"] and latest_label in c["ranks"]:
            # Positive rank_change means the country climbed the table
            # (its rank number went down).
            rank_change = c["ranks"][first_label] - c["ranks"][latest_label]
            point_change = c["points"].get(latest_label, 0) - c["points"].get(first_label, 0)
            movers.append({
                "country": c["country"],
                "code": c["code"],
                "confederation": c["confederation"],
                "rank_change": rank_change,
                "point_change": round(point_change, 1),
            })
    movers_df = pd.DataFrame(movers)
    top_climbers = movers_df.nlargest(15, "rank_change").to_dict(orient="records")
    top_fallers = movers_df.nsmallest(15, "rank_change").to_dict(orient="records")
    top_point_gainers = movers_df.nlargest(15, "point_change").to_dict(orient="records")
    # H1 vs H2: split the chronological quarters into first/second half of
    # the year and compare within-half movement.
    mid = len(labels_in_order) // 2
    h1_labels = labels_in_order[:mid]
    h2_labels = labels_in_order[mid:]
    h1h2 = []
    for c in all_countries:
        pts = c["points"]
        rnk = c["ranks"]
        # Only countries with complete data in every quarter qualify.
        if all(l in pts for l in h1_labels + h2_labels) and all(l in rnk for l in h1_labels + h2_labels):
            # Telescoping sum: equals last-quarter points minus first-quarter
            # points within each half.
            h1_point_delta = sum(pts[l] for l in h1_labels[1:]) - sum(pts[l] for l in h1_labels[:-1])
            h2_point_delta = sum(pts[l] for l in h2_labels[1:]) - sum(pts[l] for l in h2_labels[:-1])
            h1_rank_delta = rnk[h1_labels[0]] - rnk[h1_labels[-1]]
            h2_rank_delta = rnk[h2_labels[0]] - rnk[h2_labels[-1]]
            h1h2.append({
                "country": c["country"],
                "code": c["code"],
                "confederation": c["confederation"],
                "h1_point_delta": round(h1_point_delta, 1),
                "h2_point_delta": round(h2_point_delta, 1),
                "h1_rank_delta": h1_rank_delta,
                "h2_rank_delta": h2_rank_delta,
            })
    h1h2_df = pd.DataFrame(h1h2)
    # Top 10 trajectories by average points across all quarters.
    top10_countries = avg_points.head(10)["Country"].tolist()
    trajectories = [c for c in all_countries if c["country"] in top10_countries]
    return {
        "top25": top25,
        "confederation_avg": conf_avg,
        "top_climbers": top_climbers,
        "top_fallers": top_fallers,
        "top_point_gainers": top_point_gainers,
        "h1_vs_h2": h1h2_df.nlargest(20, "h2_point_delta").to_dict(orient="records") if len(h1h2_df) else [],
        "h1_vs_h2_risers": h1h2_df.assign(
            h2_improvement=h1h2_df["h2_point_delta"] - h1h2_df["h1_point_delta"]
        ).nlargest(15, "h2_improvement").to_dict(orient="records") if len(h1h2_df) else [],
        "trajectories": trajectories,
        "quarters": [q[1] for q in quarters],
    }
# ---------------------------------------------------------------------------
# WIFXScore (aggregated across years)
# ---------------------------------------------------------------------------
def build_wifx_scores() -> dict:
    """Aggregate per-player WIFXScores across seasons for the dashboard.

    For each player the component scores come from the single best-scoring
    entry (max WIFXScore, so merging entries from sources with different
    feature richness never penalises a player) while total_events are
    summed. Players with under 50 total events or a missing score are
    dropped.

    Returns a dict with top25 / bottom25 / all_players tables, a 30-bin
    histogram of the score distribution, and per-competition summaries.
    """
    path = DATA / "wifx_scores.csv"
    df = pd.read_csv(path)

    def _aggregate(g: pd.DataFrame) -> pd.Series:
        # Locate the best-scoring row once; the previous version recomputed
        # idxmax for every component column.
        best = g.loc[g["WIFXScore"].idxmax()]
        team_counts = g["team"].value_counts()
        comps = g["primary_comp"].unique()
        return pd.Series({
            "WIFXScore": best["WIFXScore"],
            "epm_raw": best["epm_raw"],
            "offensive_score": best["offensive_score"],
            "creative_score": best["creative_score"],
            "defensive_score": best["defensive_score"],
            "total_events": g["total_events"].sum(),
            "team": team_counts.index[0] if len(team_counts) > 0 else "Unknown",
            "primary_comp": ", ".join(comps[:3]) if len(comps) > 0 else "Unknown",
        })

    player_agg = df.groupby("player").apply(_aggregate).reset_index()
    # Minimum-sample and validity filters.
    player_agg = player_agg[player_agg["total_events"] >= 50]  # at least 50 events total
    player_agg = player_agg[player_agg["WIFXScore"].notna()]   # drop NaN scores
    out_cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw",
                "offensive_score", "creative_score", "defensive_score", "total_events"]
    top25 = player_agg.nlargest(25, "WIFXScore")[out_cols].to_dict(orient="records")
    bottom25 = player_agg.nsmallest(25, "WIFXScore")[out_cols].to_dict(orient="records")
    all_players = player_agg.sort_values("WIFXScore", ascending=False)[out_cols].to_dict(orient="records")
    # 30-bin histogram of the aggregated score distribution.
    hist_counts, hist_edges = np.histogram(player_agg["WIFXScore"], bins=30)
    distribution = {
        "counts": hist_counts.tolist(),
        "edges": [round(float(e), 2) for e in hist_edges.tolist()],
        "mean": round(float(player_agg["WIFXScore"].mean()), 2),
        "std": round(float(player_agg["WIFXScore"].std()), 2),
    }
    # Per-competition summary, computed on the raw (un-aggregated) rows.
    by_comp = df.groupby("primary_comp")["WIFXScore"].agg(["mean", "median", "std", "count", "min", "max"]).round(2)
    by_comp_list = []
    for comp, row in by_comp.iterrows():
        scores = df[df["primary_comp"] == comp]["WIFXScore"].tolist()
        by_comp_list.append({
            "competition": comp,
            "mean": row["mean"],
            "median": row["median"],
            "std": row["std"],
            "count": int(row["count"]),
            "scores": [round(s, 2) for s in scores],
        })
    return {
        "top25": top25,
        "bottom25": bottom25,
        "all_players": all_players,
        "distribution": distribution,
        "by_competition": by_comp_list,
    }
# ---------------------------------------------------------------------------
# WIFXScore Historical (retired/legend players)
# ---------------------------------------------------------------------------
def build_wifx_historical_scores() -> dict:
    """Aggregate WIFXScores for historical (retired/legend) players.

    Same aggregation as build_wifx_scores — best-scoring entry per player,
    summed total_events, min-50-events filter — plus a player category
    joined from retired_players.csv (default "retired").

    Returns a dict with "top25" and "all_players" tables.
    """
    path = DATA / "wifx_historical_scores.csv"
    retired_path = DATA / "retired_players.csv"
    df = pd.read_csv(path)
    retired_df = pd.read_csv(retired_path)
    # player -> category lookup from the retired-players sheet.
    category_map = dict(zip(retired_df["player"], retired_df["category"]))

    def _aggregate(g: pd.DataFrame) -> pd.Series:
        # Take component scores from the single best-scoring row; compute
        # idxmax once (previously recomputed per column).
        best = g.loc[g["WIFXScore"].idxmax()]
        team_counts = g["team"].value_counts()
        comps = g["primary_comp"].unique()
        return pd.Series({
            "WIFXScore": best["WIFXScore"],
            "epm_raw": best["epm_raw"],
            "offensive_score": best["offensive_score"],
            "creative_score": best["creative_score"],
            "defensive_score": best["defensive_score"],
            "total_events": g["total_events"].sum(),
            "team": team_counts.index[0] if len(team_counts) > 0 else "Unknown",
            "primary_comp": ", ".join(comps[:3]) if len(comps) > 0 else "Unknown",
        })

    player_agg = df.groupby("player").apply(_aggregate).reset_index()
    player_agg = player_agg[player_agg["total_events"] >= 50]
    player_agg = player_agg[player_agg["WIFXScore"].notna()]
    player_agg["category"] = player_agg["player"].map(category_map).fillna("retired")
    cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw",
            "offensive_score", "creative_score", "defensive_score",
            "total_events", "category"]
    all_players = player_agg.sort_values("WIFXScore", ascending=False)[cols].to_dict(orient="records")
    top25 = player_agg.nlargest(25, "WIFXScore")[cols].to_dict(orient="records")
    return {
        "top25": top25,
        "all_players": all_players,
    }
# ---------------------------------------------------------------------------
# Historical Match Results
# ---------------------------------------------------------------------------
def build_match_results() -> dict:
    """Aggregate the historical international results dataset.

    Reads results.csv and goalscorers.csv (dataset version 36), ranks
    national teams by a composite of points-per-game, Elo, and goal
    difference per game (teams with >= 10 matches only), and lists the
    top 30 all-time scorers.
    """
    results_path = DATA / "versions" / "36" / "results.csv"
    goals_path = DATA / "versions" / "36" / "goalscorers.csv"
    results = pd.read_csv(results_path)
    goalscorers = pd.read_csv(goals_path)
    # One record per team per match (home and away perspectives).
    records = []
    for _, m in results.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        for team, opp, gs, gc in [(home, away, hs, as_), (away, home, as_, hs)]:
            records.append({
                "team": team,
                "date": m["date"],
                "goals_scored": gs,
                "goals_conceded": gc,
                "points": 3 if gs > gc else (1 if gs == gc else 0),
            })
    df = pd.DataFrame(records)
    team_stats = df.groupby("team").agg(
        matches=("points", "count"),
        total_points=("points", "sum"),
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
    ).reset_index()
    # Minimum sample: at least 10 matches.
    team_stats = team_stats[team_stats["matches"] >= 10]
    team_stats["ppg"] = (team_stats["total_points"] / team_stats["matches"]).round(2)
    team_stats["gd_per_game"] = ((team_stats["goals_scored"] - team_stats["goals_conceded"]) / team_stats["matches"]).round(2)
    # Simple Elo over all results in date order (start 1500, K=40); the
    # sequential update makes the chronological sort essential.
    results_sorted = results.sort_values("date")
    elo = {}
    for _, m in results_sorted.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        eh = elo.get(home, 1500)
        ea = elo.get(away, 1500)
        exp_h = 1 / (1 + 10 ** ((ea - eh) / 400))
        actual_h = 1.0 if hs > as_ else (0.5 if hs == as_ else 0.0)
        K = 40
        elo[home] = eh + K * (actual_h - exp_h)
        elo[away] = ea + K * ((1 - actual_h) - (1 - exp_h))
    team_stats["elo"] = team_stats["team"].map(elo).round(0)
    # Composite = mean of the three percentile-ranked signals.
    for col in ["ppg", "elo", "gd_per_game"]:
        team_stats[f"{col}_pct"] = percentile_rank(team_stats[col])
    team_stats["composite"] = ((team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["gd_per_game_pct"]) / 3).round(1)
    team_stats = team_stats.sort_values("composite", ascending=False)
    top_teams = team_stats.head(30)[["team", "matches", "ppg", "elo", "gd_per_game", "composite"]].to_dict(orient="records")
    # Top 30 scorers with their teams and penalty counts.
    scorer_counts = goalscorers.groupby("scorer").agg(
        goals=("scorer", "count"),
        teams=("team", lambda x: ", ".join(x.unique())),
        penalties=("penalty", "sum"),
    ).reset_index().sort_values("goals", ascending=False)
    top_scorers = scorer_counts.head(30).to_dict(orient="records")
    return {
        "top_teams": top_teams,
        "top_scorers": top_scorers,
    }
# ---------------------------------------------------------------------------
# WIFX National Team Scores (aggregated across all years)
# ---------------------------------------------------------------------------
def build_wifx_national_team_scores():
    """Aggregate national-team WIFX ratings and write them to JSON.

    Unlike the other builders, this one does not return a dict: it calls
    write_json (defined elsewhere in this file, outside this view) as a
    side effect and returns None.
    """
    path = DATA / "wifx_national_team_scores.csv"
    df = pd.read_csv(path)
    # Extra weight per team for major-championship wins, added directly to
    # net_rating below.
    # NOTE(review): the title annotations and weights below look mutually
    # inconsistent in places (e.g. Germany's comment lists four Euro titles
    # but the weight is 2; Spain is annotated with Euro 2022) — confirm the
    # intended weighting scheme against an authoritative honours list.
    CHAMPIONSHIP_WINS = {
        "United States Women's": 4,  # WWC: 1991, 1999, 2015, 2019
        "United States": 4,
        "Germany Women's": 2,  # Euro: 1995, 2001, 2009, 2013
        "Germany": 2,
        "Norway Women's": 1,  # Euro: 1995, WWC: 2023
        "Norway": 1,
        "Japan Women's": 1,  # WWC: 2011
        "Japan": 1,
        "Spain Women's": 2,  # Euro: 2022, WWC: 2023
        "Spain": 2,
        "England Women's": 1,  # Euro: 2022
        "England": 1,
        "Netherlands Women's": 1,  # Euro: 2017
        "Netherlands": 1,
        "France Women's": 0,
        "France": 0,
        "Sweden Women's": 0,
        "Sweden": 0,
        "Canada Women's": 1,  # Olympics: 2020, 2024
        "Canada": 1,
        "Brazil Women's": 0,
        "Brazil": 0,
        "Australia Women's": 0,
        "Australia": 0,
    }
    # Teams not listed above get zero championship weight.
    df["championship_wins"] = df["team"].map(CHAMPIONSHIP_WINS).fillna(0)
    # Aggregate by team: mean ratings, summed counts.
    agg_cols = {
        "offensive_rating": "mean",
        "defensive_rating": "mean",
        "net_rating": "mean",
        "composite_rating": "mean",
        "matches": "sum",
        "goals_scored": "sum",
        "championship_wins": "max",  # keep the (constant per team) weight
    }
    # goals_conceded is optional in the source CSV.
    if "goals_conceded" in df.columns:
        agg_cols["goals_conceded"] = "sum"
    agg = df.groupby("team").agg(agg_cols).reset_index()
    # Global ranking = mean net rating plus the championship weight.
    agg["wifx_global_ranking"] = agg["net_rating"] + agg["championship_wins"]
    agg = agg.sort_values("wifx_global_ranking", ascending=False)
    result = {
        "all_teams": agg.to_dict(orient="records"),
    }
    # write_json is defined elsewhere in this file (not visible in this view).
    write_json("wifx_national_team_scores.json", result)
# ---------------------------------------------------------------------------
# WIFX Club Team Scores (aggregated across all years)
# ---------------------------------------------------------------------------
def build_wifx_club_team_scores():
    """Aggregate club-level WIFX scores across all seasons and sources.

    Pipeline:
      1. Match-weighted averages of StatsBomb per-season club ratings
         (data/wifx_club_team_scores.csv).
      2. Normalize the StatsBomb ratings onto a 0-100 scale.
      3. If ASA goals-added data exists (data/asa_nwsl/goals_added.csv),
         build per-season NWSL percentile ratings and aggregate them.
      4. Merge both sources per team (match-weighted), add a championship
         bonus (NWSL titles weighted 1.5x), and write
         wifx_club_team_scores.json sorted by the global club ranking.
    """

    def _num(value) -> float:
        # NOTE: `value or 0` is NOT a NaN guard — NaN is truthy, so it leaks
        # through `or`, poisons running sums, and int(nan) raises ValueError.
        return float(value) if pd.notna(value) else 0.0

    # ----- 1. StatsBomb: match-weighted sums per club ----------------------
    path = DATA / "wifx_club_team_scores.csv"
    df = pd.read_csv(path)
    agg = {}
    for _, row in df.iterrows():
        team = row['team']
        matches = row['matches']
        if team not in agg:
            agg[team] = {
                'team': team,
                'matches': 0,
                'goals_scored': 0,
                'offensive_rating_sum': 0,
                'defensive_rating_sum': 0,
                'net_rating_sum': 0,
                'composite_rating_sum': 0,
                'comps': set()
            }
        agg[team]['matches'] += matches
        agg[team]['goals_scored'] += int(_num(row.get('goals_scored', 0)))
        if 'goals_conceded' in row and pd.notna(row.get('goals_conceded')):
            agg[team].setdefault('goals_conceded', 0)
            agg[team]['goals_conceded'] += int(row['goals_conceded'])
        # Weight each season's rating by its match count before summing.
        agg[team]['offensive_rating_sum'] += _num(row['offensive_rating']) * matches
        agg[team]['defensive_rating_sum'] += _num(row['defensive_rating']) * matches
        agg[team]['net_rating_sum'] += _num(row['net_rating']) * matches
        agg[team]['composite_rating_sum'] += _num(row['composite_rating']) * matches
        if pd.notna(row.get('comp_label')):
            agg[team]['comps'].add(row['comp_label'])

    # Turn the weighted sums into per-team averages.
    sb_result = []
    for team, data in agg.items():
        denom = data['matches'] or 1  # avoid ZeroDivisionError on 0-match rows
        comps_str = ", ".join(sorted(data['comps'])) if data['comps'] else "FAWSL"
        result = {
            'team': team,
            'offensive_rating': round(data['offensive_rating_sum'] / denom, 1),
            'defensive_rating': round(data['defensive_rating_sum'] / denom, 1),
            'net_rating': round(data['net_rating_sum'] / denom, 1),
            'composite_rating': round(data['composite_rating_sum'] / denom, 1),
            'matches': data['matches'],
            'goals_scored': data['goals_scored'],
            'comp_label': comps_str,
            'source': 'statsbomb'
        }
        if 'goals_conceded' in data:
            result['goals_conceded'] = data['goals_conceded']
        sb_result.append(result)

    # ----- 2. Normalize StatsBomb ratings to a 0-100 scale (was 0-30) ------
    if sb_result:  # min()/max() would raise on an empty input CSV
        sb_off_min = min(t['offensive_rating'] for t in sb_result)
        sb_off_max = max(t['offensive_rating'] for t in sb_result)
        sb_def_min = min(t['defensive_rating'] for t in sb_result)
        sb_def_max = max(t['defensive_rating'] for t in sb_result)
        off_range = sb_off_max - sb_off_min
        def_range = sb_def_max - sb_def_min
        # Bug fix: the original gated BOTH divisions on the offensive range
        # only, so a zero defensive range raised ZeroDivisionError. Guard
        # each axis separately.
        if off_range > 0:
            for t in sb_result:
                t['offensive_rating'] = round((t['offensive_rating'] - sb_off_min) / off_range * 100, 1)
                if def_range > 0:
                    t['defensive_rating'] = round((t['defensive_rating'] - sb_def_min) / def_range * 100, 1)
                t['net_rating'] = round(t['offensive_rating'] - t['defensive_rating'], 1)
                t['composite_rating'] = round((t['offensive_rating'] + t['defensive_rating']) / 2, 1)

    # ASA opaque team ids -> display names.
    TEAM_MAP = {
        'KPqjw8PQ6v': 'Portland Thorns',
        'aDQ0lzvQEv': 'OL Reign',
        '4JMAk47qKg': 'Chicago Red Stars',
        'XVqKeVKM01': 'Washington Spirit',
        'raMyrr25d2': 'Houston Dash',
        'zeQZeazqKw': 'Orlando Pride',
        '7vQ7BBzqD1': 'FC Kansas City',
        '4wM4rZdqjB': 'North Carolina Courage',
        'Pk5LeeNqOW': 'Kansas City Current',
        '4wM4Ezg5jB': 'Sky Blue FC',
        '7VqG1lYMvW': 'NJ/NY Gotham',
        'eV5DR6YQKn': 'Angel City',
        'kRQa8JOqKZ': 'San Diego Wave',
        'eV5D2w9QKn': 'Bay FC',
        '315VnJ759x': 'Racing Louisville',
        'xW5pwDBMg1': 'Boston Breakers',
        'kRQaWa15KZ': 'Western New York Flash',
    }

    # ----- 3. ASA goals-added data (optional) ------------------------------
    ga_path = DATA / "asa_nwsl" / "goals_added.csv"
    if ga_path.exists():
        ga = pd.read_csv(ga_path)
        # Per team-season totals of the goals-added components.
        team_year = ga.groupby(['team_id_ga', 'season']).agg({
            'minutes_played_ga': 'sum',
            'ga_shooting_raw': 'sum',
            'ga_passing_raw': 'sum',
            'ga_dribbling_raw': 'sum',
            'ga_interrupting_raw': 'sum',
            'ga_receiving_raw': 'sum',
            'player_id': 'count',
        }).reset_index()
        team_year.columns = ['team_id', 'season', 'minutes', 'shooting', 'passing', 'dribbling', 'interrupting', 'receiving', 'players']
        team_year['team'] = team_year['team_id'].map(TEAM_MAP).fillna('Unknown')
        # Drop unmapped teams and partial seasons (< ~5000 team-minutes).
        team_year = team_year[(team_year['team'] != 'Unknown') & (team_year['minutes'] > 5000)]
        # Percentile ranking within each season (0-100).
        team_year['offensive_rating'] = team_year.groupby('season')['shooting'].transform(lambda x: (x.rank(pct=True) * 100).round(1))
        team_year['defensive_rating'] = team_year.groupby('season')['interrupting'].transform(lambda x: (x.rank(pct=True) * 100).round(1))
        team_year['net_rating'] = (team_year['offensive_rating'] - team_year['defensive_rating']).round(1)
        team_year['composite_rating'] = ((team_year['offensive_rating'] + team_year['defensive_rating']) / 2).round(1)
        # Convert minutes to matches (approx 90 min = 1 match)
        team_year['matches'] = (team_year['minutes'] / 90).astype(int)
        team_year['comp_label'] = 'NWSL ' + team_year['season'].astype(str)
        # Aggregate across all years, again weighting by match count.
        asa_agg = {}
        for _, row in team_year.iterrows():
            team = row['team']
            matches = row['matches']
            if team not in asa_agg:
                asa_agg[team] = {
                    'team': team,
                    'matches': 0,
                    'offensive_rating_sum': 0,
                    'defensive_rating_sum': 0,
                    'net_rating_sum': 0,
                    'composite_rating_sum': 0,
                }
            asa_agg[team]['matches'] += matches
            asa_agg[team]['offensive_rating_sum'] += row['offensive_rating'] * matches
            asa_agg[team]['defensive_rating_sum'] += row['defensive_rating'] * matches
            asa_agg[team]['net_rating_sum'] += row['net_rating'] * matches
            asa_agg[team]['composite_rating_sum'] += row['composite_rating'] * matches
        asa_result = []
        for team, data in asa_agg.items():
            denom = data['matches'] or 1  # avoid ZeroDivisionError
            asa_result.append({
                'team': team,
                'offensive_rating': round(data['offensive_rating_sum'] / denom, 1),
                'defensive_rating': round(data['defensive_rating_sum'] / denom, 1),
                'net_rating': round(data['net_rating_sum'] / denom, 1),
                'composite_rating': round(data['composite_rating_sum'] / denom, 1),
                'matches': data['matches'],
                'goals_scored': 0,  # Not available in ASA
                'goals_conceded': 0,
                'comp_label': 'NWSL 2016-2025',
                'source': 'asa'
            })
    else:
        asa_result = []

    # ----- 4. Combine sources, add championship bonus, rank ----------------
    combined = {}
    # Championship wins mapping for clubs (NWSL weighted slightly higher)
    CLUB_CHAMPIONSHIPS = {
        # NWSL (weighted 1.5x)
        "Portland Thorns": 3,  # 2017, 2022, 2024
        "North Carolina Courage": 3,  # 2018, 2019, 2023
        "Kansas City Current": 1,  # 2024 (as Current)
        "FC Kansas City": 2,  # 2014, 2015
        "Western New York Flash": 1,  # 2016
        "OL Reign": 1,  # 2020
        "Seattle Reign": 1,  # 2020
        "Chicago Red Stars": 0,
        "Washington Spirit": 1,  # 2021
        "Houston Dash": 0,
        "Angel City": 0,
        "NJ/NY Gotham": 0,
        "Boston Breakers": 0,
        "Sky Blue FC": 0,
        # FAWSL
        "Chelsea": 4,  # 2015-16, 2017-18, 2019-20, 2020-21
        "Manchester City Women": 2,  # 2016-17, 2020-21
        "Arsenal Women": 1,  # 2022-23
        "Liverpool FFC": 1,  # 2013-14
        "Everton Ladies": 0,
        "Bristol City WFC": 0,
        "Brighton & Hove Albion Women": 0,
        "Reading FC Women": 0,
        "Tottenham Hotspur Women": 0,
        "West Ham United LFC": 0,
        "Aston Villa": 0,
        "Yeovil Town LFC": 0,
        # UWCL
        "Lyon": 8,
        "OL Lyonnes": 8,  # 2016-2020 (5), 2021-22, 2022-23, 2023-24
        "Barcelona": 3,
        "Fútbol Club Barcelona": 3,  # 2020-21, 2021-22, 2022-23
        "Wolfsburg": 2,
        "VfL Wolfsburg": 2,  # 2013-14, 2015-16
        "Paris Saint-Germain": 0,
        "Olympique Lyonnais": 8,
        # Other leagues
        "Bay FC": 0,
        "Racing Louisville": 0,
        "San Diego Wave": 0,
        "FC Barcelona": 3,
    }
    # First add StatsBomb teams
    for t in sb_result:
        combined[t['team']] = t
    # Then merge ASA teams in; when a team exists in both sources, take the
    # match-weighted blend of the two ratings but keep StatsBomb goals data.
    for t in asa_result:
        if t['team'] in combined:
            existing = combined[t['team']]
            total_matches = existing['matches'] + t['matches']
            combined[t['team']] = {
                'team': t['team'],
                'offensive_rating': round((existing['offensive_rating'] * existing['matches'] + t['offensive_rating'] * t['matches']) / total_matches, 1),
                'defensive_rating': round((existing['defensive_rating'] * existing['matches'] + t['defensive_rating'] * t['matches']) / total_matches, 1),
                'net_rating': round((existing['net_rating'] * existing['matches'] + t['net_rating'] * t['matches']) / total_matches, 1),
                'composite_rating': round((existing['composite_rating'] * existing['matches'] + t['composite_rating'] * t['matches']) / total_matches, 1),
                'matches': total_matches,
                'goals_scored': existing.get('goals_scored', 0),
                'goals_conceded': existing.get('goals_conceded', 0),
                'comp_label': 'NWSL + FAWSL',
            }
        else:
            combined[t['team']] = t
    # Add championship wins and WIFX Global Club Ranking.
    nwsl_teams = ["Portland Thorns", "North Carolina Courage", "Kansas City Current", "FC Kansas City",
                  "Western New York Flash", "OL Reign", "Seattle Reign", "Chicago Red Stars",
                  "Washington Spirit", "Houston Dash", "Angel City", "NJ/NY Gotham", "Boston Breakers",
                  "Sky Blue FC", "Bay FC", "Racing Louisville", "San Diego Wave"]
    for team, data in combined.items():
        wins = CLUB_CHAMPIONSHIPS.get(team, 0)
        data['championship_wins'] = wins
        # NWSL championships are weighted 1.5x in the global ranking.
        bonus = wins * 1.5 if team in nwsl_teams else wins
        data['wifx_global_club_ranking'] = data['net_rating'] + bonus
    all_teams = list(combined.values())
    all_teams.sort(key=lambda x: x.get('wifx_global_club_ranking', x.get('composite_rating', 0)), reverse=True)
    write_json("wifx_club_team_scores.json", {"all_teams": all_teams})
# ---------------------------------------------------------------------------
# WIFX Confederation Scores (aggregated across years)
# ---------------------------------------------------------------------------
def build_wifx_confederation_scores():
    """Average per-year confederation club scores and write them ranked.

    Reads data/wifx_club_confederation_scores.csv, averages each team's
    club score across years, sums title/final counts, assigns a 1-based
    rank by descending score, and writes
    wifx_club_confederation_scores.json.
    """
    frame = pd.read_csv(DATA / "wifx_club_confederation_scores.csv")

    summary = (
        frame.groupby("team")
        .agg({
            "wifx_club_score": "mean",
            "country": "first",
            "confederation": "first",
            "championships_won": "sum",
            "finals_reached": "sum",
        })
        .reset_index()
        .sort_values("wifx_club_score", ascending=False)
    )
    # 1-based rank, in the already-sorted order.
    summary = summary.assign(rank=range(1, len(summary) + 1))

    write_json(
        "wifx_club_confederation_scores.json",
        {"club_confederation_scores": summary.to_dict(orient="records")},
    )
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Entry point: load the raw StatsBomb CSVs, then build and write every
    WIFX dashboard JSON artifact into OUT (see module top)."""
    print("Loading StatsBomb events (this may take a minute)...")
    all_events = pd.concat([load_events(c) for c in COMPETITIONS], ignore_index=True)
    print(f" Loaded {len(all_events):,} events")
    all_matches = pd.concat([load_matches(c) for c in COMPETITIONS], ignore_index=True)
    print(f" Loaded {len(all_matches):,} matches")
    all_lineups = pd.concat([load_lineups(c) for c in COMPETITIONS], ignore_index=True)
    print(f" Loaded {len(all_lineups):,} lineup entries")
    # NOTE(review): all_events/all_matches/all_lineups are not passed to any
    # builder below — they only feed the progress counts. Confirm whether the
    # loads are still needed before removing them.
    # Build WIFX dashboards only
    print("Building WIFX scores...")
    wifx = build_wifx_scores()
    write_json("wifx_scores.json", wifx)
    print("Building WIFX historical scores...")
    wifx_hist = build_wifx_historical_scores()
    write_json("wifx_historical_scores.json", wifx_hist)
    print("Building aggregated WIFX national team scores...")
    build_wifx_national_team_scores()
    print("Building aggregated WIFX club team scores...")
    build_wifx_club_team_scores()
    print("Building aggregated WIFX confederation scores...")
    build_wifx_confederation_scores()
    # Bug fix: the old message claimed "data/dashboard/", but everything is
    # written to OUT = ROOT / "output".
    print(f"Done! All JSON files written to {OUT}/")
def write_json(filename: str, data: dict):
    """Serialize *data* to OUT/filename as valid JSON and log the file size.

    json.dump would emit bare ``NaN``/``Infinity`` tokens (invalid JSON), so
    all non-finite floats — and the literal string "NaN" that some exports
    produce — are converted to None first. Unknown object types fall back to
    ``str()`` via ``default=str``.
    """
    import math
    path = OUT / filename

    def clean_nan(obj):
        # Recursively replace non-finite floats and "NaN" strings with None.
        if isinstance(obj, dict):
            return {k: clean_nan(v) for k, v in obj.items()}
        # Generalized: tuples also serialize as JSON arrays, so a NaN inside
        # a tuple would previously have escaped cleaning.
        if isinstance(obj, (list, tuple)):
            return [clean_nan(v) for v in obj]
        if isinstance(obj, float) and not math.isfinite(obj):
            return None
        if obj == "NaN":
            return None
        return obj

    data = clean_nan(data)
    with path.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, default=str)
    size = path.stat().st_size
    print(f" Wrote {path} ({size / 1024:.1f} KB)")
# Script entry point: build all dashboard JSON artifacts.
if __name__ == "__main__":
    main()