Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """Pre-aggregate raw data into small JSON files for the interactive dashboard. | |
| Usage: | |
| python scripts/build_dashboard_data.py | |
| Reads from data/ and writes JSON files to data/dashboard/. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import warnings | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
| warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning) | |
# Project layout, resolved from this script's location: the script sits one
# directory below the repository root (see the usage line in the docstring).
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent
DATA = ROOT / "data"
# NOTE(review): the module docstring names data/dashboard/ as the output
# location, while this resolves to <ROOT>/output -- confirm which is intended.
OUT = ROOT / "output"
# Created at import time.
OUT.mkdir(parents=True, exist_ok=True)

# StatsBomb competition directory name -> competition kind ("league" or
# "tournament") and the short label shown in the dashboard.
COMPETITIONS = {
    "FA_Womens_Super_League_2018-2019": dict(type="league", label="FAWSL 2018-19"),
    "FA_Womens_Super_League_2019-2020": dict(type="league", label="FAWSL 2019-20"),
    "FA_Womens_Super_League_2020-2021": dict(type="league", label="FAWSL 2020-21"),
    "NWSL_2018": dict(type="league", label="NWSL 2018"),
    "UEFA_Womens_Euro_2022": dict(type="tournament", label="Euros 2022"),
    "UEFA_Womens_Euro_2025": dict(type="tournament", label="Euros 2025"),
    "Womens_World_Cup_2019": dict(type="tournament", label="WWC 2019"),
    "Womens_World_Cup_2023": dict(type="tournament", label="WWC 2023"),
}

# Columns kept when reading events.csv; anything else is skipped at load time.
EVENT_COLS = [
    # Event identity and context
    "type",
    "player",
    "player_id",
    "team",
    "match_id",
    "minute",
    # Shots
    "shot_outcome",
    "shot_statsbomb_xg",
    # Passes
    "pass_goal_assist",
    "pass_shot_assist",
    "pass_through_ball",
    "pass_cross",
    "pass_switch",
    "pass_outcome",
    # Dribbles, interceptions, duels
    "dribble_outcome",
    "interception_outcome",
    "duel_type",
    "duel_outcome",
    # Player position at the time of the event
    "position",
]
def load_events(comp_dir: str) -> pd.DataFrame:
    """Read one competition's events.csv, keeping only the EVENT_COLS columns.

    Args:
        comp_dir: Competition directory name under data/statsbomb/.

    Returns:
        The events with an extra "competition" column set to ``comp_dir``.
    """
    events = pd.read_csv(
        DATA / "statsbomb" / comp_dir / "events.csv",
        usecols=lambda name: name in EVENT_COLS,
        low_memory=False,
    )
    # Tag every row with its source so frames can be concatenated later.
    return events.assign(competition=comp_dir)
def load_matches(comp_dir: str) -> pd.DataFrame:
    """Read one competition's matches.csv.

    Args:
        comp_dir: Competition directory name under data/statsbomb/.

    Returns:
        All columns of the file plus a "competition" column set to
        ``comp_dir``.
    """
    matches = pd.read_csv(DATA / "statsbomb" / comp_dir / "matches.csv")
    return matches.assign(competition=comp_dir)
def load_lineups(comp_dir: str) -> pd.DataFrame:
    """Read one competition's lineups.csv.

    Args:
        comp_dir: Competition directory name under data/statsbomb/.

    Returns:
        All columns of the file plus a "competition" column set to
        ``comp_dir``.
    """
    lineups = pd.read_csv(DATA / "statsbomb" / comp_dir / "lineups.csv")
    return lineups.assign(competition=comp_dir)
def percentile_rank(series: pd.Series) -> pd.Series:
    """Return each value's percentile rank on a 0-100 scale.

    Uses pandas' default ranking, so tied values share the average of
    their ranks.
    """
    fractional_rank = series.rank(pct=True)
    return fractional_rank * 100
| # --------------------------------------------------------------------------- | |
| # StatsBomb Player Aggregates | |
| # --------------------------------------------------------------------------- | |
def build_sb_players(all_events: pd.DataFrame, all_lineups: pd.DataFrame) -> dict:
    """Build player leaderboards from the combined StatsBomb events.

    Events without a player are ignored. Per-player event counts are turned
    into three percentile scores (goal threat, playmaker, defensive) and
    their mean (composite). Players are keyed by name.

    Args:
        all_events: Event rows with the EVENT_COLS columns plus "competition".
        all_lineups: Lineup rows with "player_name" and "positions" columns;
            "positions" is parsed as a Python-literal list.

    Returns:
        Dict with the top 30 players for each of "goal_threat", "playmaker",
        "defensive" and "composite"; the 25 players with the most goals plus
        assists split by competition kind ("league_vs_tournament"); and the
        top 10 composite scores per position group ("by_position").
    """
    import ast

    ev = all_events[all_events["player"].notna()].copy()

    def tally(mask, label):
        # Number of events selected by ``mask`` for each player.
        return ev[mask].groupby("player").size().rename(label)

    def most_frequent(values):
        # Most common value in the group (value_counts sorts descending).
        return values.value_counts().index[0]

    event_type = ev["type"]
    on_target_outcomes = ["Goal", "Saved", "Saved Off Target", "Saved to Post"]
    tackle_successes = ["Won", "Success In Play", "Success Out"]

    # Per-player counts, in the column order used for the stats table.
    components = [
        # Goal threat
        tally(ev["shot_outcome"] == "Goal", "goals"),
        tally(ev["shot_outcome"].isin(on_target_outcomes), "shots_on_target"),
        ev[ev["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg"),
        tally(ev["pass_goal_assist"].notna(), "assists"),
        tally(ev["pass_shot_assist"].notna(), "key_passes"),
        # Playmaking
        tally(ev["pass_through_ball"].notna(), "through_balls"),
        tally(ev["pass_cross"].notna(), "crosses"),
        tally(ev["pass_switch"].notna(), "switches"),
        tally((event_type == "Dribble") & (ev["dribble_outcome"] == "Complete"), "dribbles"),
        # Defending
        tally(event_type == "Interception", "interceptions"),
        tally((ev["duel_type"] == "Tackle") & (ev["duel_outcome"].isin(tackle_successes)), "tackles_won"),
        tally(event_type == "Block", "blocks"),
        tally(event_type == "Clearance", "clearances"),
        tally(event_type == "Pressure", "pressures"),
        tally(event_type == "Ball Recovery", "recoveries"),
        tally(event_type == "Foul Won", "fouls_won"),
        tally(event_type == "Foul Committed", "fouls_committed"),
    ]

    # Primary team / competition = the one the player has most events in.
    by_player = ev.groupby("player")
    player_team = by_player["team"].agg(most_frequent).rename("team")
    player_comp = by_player["competition"].agg(most_frequent)
    player_comp_label = player_comp.map(
        lambda c: COMPETITIONS.get(c, {}).get("label", c)
    ).rename("competition")

    # Primary position, taken from the lineups.
    def extract_primary_pos(pos_str):
        # Best effort: anything that cannot be parsed becomes "Unknown".
        try:
            parsed = ast.literal_eval(pos_str)
            if parsed and isinstance(parsed, list):
                first = parsed[0]
                if isinstance(first, dict):
                    return first.get("position", "Unknown")
                return str(first)
        except Exception:
            pass
        return "Unknown"

    def simplify_position(pos):
        # Collapse a detailed position name into GK / DF / MF / FW.
        name = str(pos).lower()
        if "goalkeeper" in name or name == "gk":
            return "GK"
        if "back" in name or "defender" in name or name in ("cb", "lb", "rb", "lwb", "rwb"):
            return "DF"
        if "midfield" in name or name in ("cm", "cdm", "cam", "lm", "rm", "dm", "am"):
            return "MF"
        if "forward" in name or "wing" in name or "striker" in name or name in ("st", "cf", "lw", "rw", "ss"):
            return "FW"
        # Unrecognised positions are treated as midfielders.
        return "MF"

    lineup_pos = all_lineups[["player_name", "positions"]].copy()
    lineup_pos = lineup_pos[lineup_pos["positions"].notna()]
    lineup_pos = lineup_pos.assign(
        primary_position=lineup_pos["positions"].apply(extract_primary_pos)
    )
    player_positions = (
        lineup_pos.groupby("player_name")["primary_position"].agg(most_frequent).rename("position")
    )
    player_pos_simple = player_positions.map(simplify_position).rename("position_group")

    # One row per player with every component.
    stats = pd.DataFrame({"team": player_team, "competition": player_comp_label})
    for component in components:
        stats = stats.join(component, how="left")
    stats = stats.join(player_pos_simple, how="left").fillna(0)
    # Players missing from the lineups were filled with 0 above; default to MF.
    stats["position_group"] = stats["position_group"].replace(0, "MF")

    # Each score is the percentile rank of the sum of its components.
    score_inputs = {
        "goal_threat": ["goals", "shots_on_target", "xg", "assists", "key_passes"],
        "playmaker": ["assists", "key_passes", "through_balls", "crosses", "switches", "dribbles"],
        "defensive": ["interceptions", "tackles_won", "blocks", "clearances", "pressures", "recoveries"],
    }
    for score_name, input_cols in score_inputs.items():
        stats[score_name] = percentile_rank(stats[input_cols].sum(axis=1))
    stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3
    stats = stats.reset_index().rename(columns={"index": "player"})

    # Top 30 per metric.
    display_cols = ["player", "team", "competition", "position_group",
                    "goals", "assists", "xg", "key_passes",
                    "interceptions", "tackles_won", "blocks"]
    result = {
        metric: stats.nlargest(30, metric)[display_cols + [metric]].to_dict(orient="records")
        for metric in ["goal_threat", "playmaker", "defensive", "composite"]
    }

    # Goals and assists split by competition kind.
    comp_type = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("type", "unknown"))
    scored = ev["shot_outcome"] == "Goal"
    assisted = ev["pass_goal_assist"].notna()
    in_league = comp_type == "league"
    in_tournament = comp_type == "tournament"
    lvt = pd.DataFrame({
        "league_goals": tally(scored & in_league, "league_goals"),
        "tournament_goals": tally(scored & in_tournament, "tournament_goals"),
        "league_assists": tally(assisted & in_league, "league_assists"),
        "tournament_assists": tally(assisted & in_tournament, "tournament_assists"),
    }).fillna(0)
    lvt["total"] = lvt.sum(axis=1)
    lvt = lvt.nlargest(25, "total").reset_index().rename(columns={"index": "player"})
    result["league_vs_tournament"] = lvt.to_dict(orient="records")

    # Top 10 composite scores per position group.
    position_cols = ["player", "team", "composite", "goal_threat", "playmaker", "defensive"]
    result["by_position"] = {
        pos: stats[stats["position_group"] == pos].nlargest(10, "composite")[position_cols].to_dict(orient="records")
        for pos in ["FW", "MF", "DF", "GK"]
    }
    return result
| # --------------------------------------------------------------------------- | |
| # StatsBomb Club/Country Aggregates | |
| # --------------------------------------------------------------------------- | |
def compute_team_rankings(all_matches: pd.DataFrame, all_events: pd.DataFrame, comp_type: str) -> dict:
    """Rank teams across every competition of one kind.

    Args:
        all_matches: Match rows with "competition", "match_id", "match_date",
            "home_team", "away_team", "home_score" and "away_score".
        all_events: Event rows with "competition", "match_id", "team" and
            "shot_statsbomb_xg".
        comp_type: Competition kind to include, matched against the "type"
            entries of COMPETITIONS (e.g. "league" or "tournament").

    Returns:
        ``{"teams": [...]}`` with one record per team, sorted by composite
        score (mean percentile of points per game, Elo and per-match xG
        difference), best first. The list is empty when no match belongs to
        the requested kind.
    """
    comps = [c for c, info in COMPETITIONS.items() if info["type"] == comp_type]
    matches = all_matches[all_matches["competition"].isin(comps)].copy()
    events = all_events[all_events["competition"].isin(comps)]
    if matches.empty:
        return {"teams": []}
    # Elo depends on processing matches in date order.
    matches = matches.sort_values("match_date")

    # Total xG per (match, team), as a dict so each lookup below is O(1)
    # instead of a scan over every shot row.
    shots = events[events["shot_statsbomb_xg"].notna()]
    xg_lookup = shots.groupby(["match_id", "team"])["shot_statsbomb_xg"].sum().to_dict()

    k_factor = 40
    elo = {}
    records = []
    for _, m in matches.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        mid = m["match_id"]
        comp_label = COMPETITIONS.get(m["competition"], {}).get("label", m["competition"])

        # One record per side of the match.
        for team, opp, gs, gc in [(home, away, hs, as_), (away, home, as_, hs)]:
            records.append({
                "team": team,
                "match_id": mid,
                "match_date": m["match_date"],
                "competition": comp_label,
                "goals_scored": gs,
                "goals_conceded": gc,
                "points": 3 if gs > gc else (1 if gs == gc else 0),
                # Teams without a recorded shot in the match get 0 xG.
                "xg_for": float(xg_lookup.get((mid, team), 0.0)),
                "xg_against": float(xg_lookup.get((mid, opp), 0.0)),
            })

        # Elo update: every team starts at 1500, no home advantage.
        eh = elo.get(home, 1500)
        ea = elo.get(away, 1500)
        exp_h = 1 / (1 + 10 ** ((ea - eh) / 400))
        actual_h = 1.0 if hs > as_ else (0.5 if hs == as_ else 0.0)
        elo[home] = eh + k_factor * (actual_h - exp_h)
        elo[away] = ea + k_factor * ((1 - actual_h) - (1 - exp_h))

    df = pd.DataFrame(records)

    # Aggregate across all competitions of this kind.
    team_stats = df.groupby("team").agg(
        matches=("match_id", "count"),
        total_points=("points", "sum"),
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
        xg_for=("xg_for", "sum"),
        xg_against=("xg_against", "sum"),
        # Show at most three competition labels per team.
        competition=("competition", lambda x: ", ".join(x.unique()[:3])),
    ).reset_index()
    team_stats["ppg"] = (team_stats["total_points"] / team_stats["matches"]).round(2)
    team_stats["gd_per_game"] = ((team_stats["goals_scored"] - team_stats["goals_conceded"]) / team_stats["matches"]).round(2)
    team_stats["xg_dominance"] = ((team_stats["xg_for"] - team_stats["xg_against"]) / team_stats["matches"]).round(3)
    team_stats["elo"] = team_stats["team"].map(elo).round(0)

    # Composite = mean of the three percentile ranks.
    for col in ["ppg", "elo", "xg_dominance"]:
        team_stats[f"{col}_pct"] = percentile_rank(team_stats[col])
    team_stats["composite"] = ((team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["xg_dominance_pct"]) / 3).round(1)
    team_stats = team_stats.sort_values("composite", ascending=False)

    cols = ["team", "competition", "matches", "ppg", "elo", "xg_dominance", "gd_per_game", "composite"]
    return {"teams": team_stats[cols].to_dict(orient="records")}
| # --------------------------------------------------------------------------- | |
| # StatsBomb Player Comparisons | |
| # --------------------------------------------------------------------------- | |
def build_sb_player_comparisons(all_events: pd.DataFrame) -> dict:
    """Compare players' percentile scores between pairs of event subsets.

    Scores are computed independently inside each subset, then players who
    appear in both subsets of a pair are listed side by side.

    Args:
        all_events: Event rows with the EVENT_COLS columns plus "competition".

    Returns:
        Dict with "historical_vs_euros2025" and "league_vs_tournament", and
        "euros2025_group_vs_knockout" when the Euro 2025 matches.csv exists.
        Each value is a list of ``{"metric": ..., "players": [...]}`` entries
        holding the 15 players with the highest score on the second side.
    """
    ev = all_events[all_events["player"].notna()].copy()
    ev["comp_type"] = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("type", "unknown"))

    all_metrics = ["goal_threat", "playmaker", "defensive", "composite"]
    tackle_successes = ["Won", "Success In Play", "Success Out"]

    def player_scores(subset):
        # Per-player counts and percentile scores within ``subset``.
        def tally(mask, label):
            return subset[mask].groupby("player").size().rename(label)

        event_type = subset["type"]
        stats = pd.DataFrame({
            "goals": tally(subset["shot_outcome"] == "Goal", "goals"),
            "assists": tally(subset["pass_goal_assist"].notna(), "assists"),
            "key_passes": tally(subset["pass_shot_assist"].notna(), "key_passes"),
            "xg": subset[subset["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg"),
            "through_balls": tally(subset["pass_through_ball"].notna(), "through_balls"),
            "crosses": tally(subset["pass_cross"].notna(), "crosses"),
            "interceptions": tally(event_type == "Interception", "interceptions"),
            "tackles_won": tally((subset["duel_type"] == "Tackle") & (subset["duel_outcome"].isin(tackle_successes)), "tackles_won"),
            "blocks": tally(event_type == "Block", "blocks"),
            "recoveries": tally(event_type == "Ball Recovery", "recoveries"),
        }).fillna(0)
        if len(stats) == 0:
            # No players: returned without the score columns.
            return stats
        stats["goal_threat"] = percentile_rank(stats[["goals", "xg", "assists", "key_passes"]].sum(axis=1))
        stats["playmaker"] = percentile_rank(stats[["assists", "key_passes", "through_balls", "crosses"]].sum(axis=1))
        stats["defensive"] = percentile_rank(stats[["interceptions", "tackles_won", "blocks", "recoveries"]].sum(axis=1))
        stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3
        return stats

    def compare(left, right, left_key, right_key, metrics):
        # Side-by-side scores for players present in both score tables.
        shared = left.index.intersection(right.index)
        entries = []
        for metric in metrics:
            # index=shared keeps the constructor valid even when both sides
            # fall back to the scalar 0 (both score tables empty).
            merged = pd.DataFrame(
                {
                    left_key: left.loc[shared, metric] if metric in left.columns else 0,
                    right_key: right.loc[shared, metric] if metric in right.columns else 0,
                },
                index=shared,
            ).dropna()
            top = merged.nlargest(15, right_key).reset_index().rename(columns={"index": "player"})
            entries.append({"metric": metric, "players": top.to_dict(orient="records")})
        return entries

    result = {}
    is_euros25 = ev["competition"] == "UEFA_Womens_Euro_2025"
    is_tournament = ev["comp_type"] == "tournament"
    euros25 = ev[is_euros25]

    # 1. Earlier tournaments vs Euros 2025
    result["historical_vs_euros2025"] = compare(
        player_scores(ev[is_tournament & ~is_euros25]),
        player_scores(euros25),
        "historical", "euros_2025", all_metrics,
    )

    # 2. League vs tournament (Euros 2025 included on the tournament side)
    result["league_vs_tournament"] = compare(
        player_scores(ev[ev["comp_type"] == "league"]),
        player_scores(ev[is_tournament]),
        "league", "tournament", all_metrics,
    )

    # 3. Euros 2025 group stage vs knockout rounds; the stage of each match
    # comes from the competition's matches.csv.
    e25_matches_path = DATA / "statsbomb" / "UEFA_Womens_Euro_2025" / "matches.csv"
    if e25_matches_path.exists():
        e25m = pd.read_csv(e25_matches_path)
        in_group = e25m["competition_stage"].str.contains("Group", case=False, na=False)
        group_match_ids = e25m[in_group]["match_id"].tolist()
        # Everything that is not a group match, including rows with no stage.
        ko_match_ids = e25m[~in_group]["match_id"].tolist()
        # NOTE(review): "defensive" is not reported in this section, unlike
        # the two sections above -- confirm that is intended.
        result["euros2025_group_vs_knockout"] = compare(
            player_scores(euros25[euros25["match_id"].isin(group_match_ids)]),
            player_scores(euros25[euros25["match_id"].isin(ko_match_ids)]),
            "group_stage", "knockout", ["goal_threat", "playmaker", "composite"],
        )
    return result
| # --------------------------------------------------------------------------- | |
| # FIFA Rankings | |
| # --------------------------------------------------------------------------- | |
def build_fifa_rankings() -> dict:
    """Summarise the FIFA women's world ranking snapshots found under DATA.

    Snapshot files that do not exist are skipped. All "earliest/latest" and
    first-half/second-half comparisons use the chronological order of the
    snapshots that were found.

    Returns:
        An empty dict when no snapshot file exists. Otherwise a dict with
        "top25" (by average points), "confederation_avg" (per snapshot),
        "top_climbers", "top_fallers", "top_point_gainers" (earliest vs
        latest snapshot), "h1_vs_h2", "h1_vs_h2_risers", "trajectories"
        (the ten countries with the highest average points) and "quarters"
        (all configured snapshot labels).
    """
    # (file suffix, display label), in chronological order.
    quarters = [
        ("2025_03_06", "Mar 2025"),
        ("2025_06_12", "Jun 2025"),
        ("2025_08_07", "Aug 2025"),
        ("2025_12_11", "Dec 2025"),
    ]
    frames = {}
    for suffix, label in quarters:
        path = DATA / f"fifa_womens_world_ranking_{suffix}.csv"
        if path.exists():
            frames[label] = pd.read_csv(path)
    if not frames:
        return {}

    # Labels such as "Mar 2025" do not sort chronologically as strings
    # ("Aug" < "Dec" < "Jun" < "Mar"), so take the order from ``quarters``.
    ordered_labels = [label for _, label in quarters if label in frames]

    # Per-country points and rank for every snapshot it appears in.
    countries = {}
    for label, df in frames.items():
        for _, row in df.iterrows():
            c = row["Country"]
            if c not in countries:
                countries[c] = {
                    "country": c,
                    "code": row.get("Country_Code", ""),
                    "confederation": row.get("Confederation", ""),
                    "points": {},
                    "ranks": {},
                }
            if pd.notna(row["Total_Points"]):
                countries[c]["points"][label] = float(row["Total_Points"])
            if pd.notna(row["Rank"]):
                countries[c]["ranks"][label] = int(row["Rank"])
    all_countries = list(countries.values())

    # Average points across all snapshots, for the top 25.
    all_df = pd.concat(frames.values(), ignore_index=True)
    avg_points = all_df.groupby(["Country", "Country_Code", "Confederation"])["Total_Points"].mean().reset_index()
    avg_points = avg_points.sort_values("Total_Points", ascending=False)
    top25 = avg_points.head(25).rename(columns={"Total_Points": "Avg_Points"}).to_dict(orient="records")

    # Mean points per confederation, per snapshot.
    conf_avg = {}
    for label, df in frames.items():
        conf_avg[label] = df.groupby("Confederation")["Total_Points"].mean().round(1).to_dict()

    # Movers: change from the earliest to the latest available snapshot.
    # A positive rank_change means the country climbed (its rank number fell).
    first_label = ordered_labels[0]
    latest_label = ordered_labels[-1]
    movers = []
    for c in all_countries:
        if first_label in c["ranks"] and latest_label in c["ranks"]:
            rank_change = c["ranks"][first_label] - c["ranks"][latest_label]
            point_change = c["points"].get(latest_label, 0) - c["points"].get(first_label, 0)
            movers.append({
                "country": c["country"],
                "code": c["code"],
                "confederation": c["confederation"],
                "rank_change": rank_change,
                "point_change": round(point_change, 1),
            })
    top_climbers, top_fallers, top_point_gainers = [], [], []
    if movers:
        movers_df = pd.DataFrame(movers)
        top_climbers = movers_df.nlargest(15, "rank_change").to_dict(orient="records")
        top_fallers = movers_df.nsmallest(15, "rank_change").to_dict(orient="records")
        top_point_gainers = movers_df.nlargest(15, "point_change").to_dict(orient="records")

    # H1 vs H2: movement inside the first and the second half of the
    # available snapshots. Needs at least one snapshot in each half.
    mid = len(ordered_labels) // 2
    h1_labels = ordered_labels[:mid]
    h2_labels = ordered_labels[mid:]
    h1h2 = []
    if h1_labels and h2_labels:
        needed = h1_labels + h2_labels
        for c in all_countries:
            pts = c["points"]
            rnk = c["ranks"]
            if all(l in pts for l in needed) and all(l in rnk for l in needed):
                # Sum of consecutive differences, i.e. last minus first.
                h1_point_delta = sum(pts[l] for l in h1_labels[1:]) - sum(pts[l] for l in h1_labels[:-1])
                h2_point_delta = sum(pts[l] for l in h2_labels[1:]) - sum(pts[l] for l in h2_labels[:-1])
                h1h2.append({
                    "country": c["country"],
                    "code": c["code"],
                    "confederation": c["confederation"],
                    "h1_point_delta": round(h1_point_delta, 1),
                    "h2_point_delta": round(h2_point_delta, 1),
                    "h1_rank_delta": rnk[h1_labels[0]] - rnk[h1_labels[-1]],
                    "h2_rank_delta": rnk[h2_labels[0]] - rnk[h2_labels[-1]],
                })
    h1h2_df = pd.DataFrame(h1h2)

    # Trajectories of the ten countries with the highest average points.
    top10_countries = avg_points.head(10)["Country"].tolist()
    trajectories = [c for c in all_countries if c["country"] in top10_countries]

    return {
        "top25": top25,
        "confederation_avg": conf_avg,
        "top_climbers": top_climbers,
        "top_fallers": top_fallers,
        "top_point_gainers": top_point_gainers,
        "h1_vs_h2": h1h2_df.nlargest(20, "h2_point_delta").to_dict(orient="records") if len(h1h2_df) else [],
        "h1_vs_h2_risers": h1h2_df.assign(
            h2_improvement=h1h2_df["h2_point_delta"] - h1h2_df["h1_point_delta"]
        ).nlargest(15, "h2_improvement").to_dict(orient="records") if len(h1h2_df) else [],
        "trajectories": trajectories,
        "quarters": [q[1] for q in quarters],
    }
| # --------------------------------------------------------------------------- | |
| # WIFXScore (aggregated across years) | |
| # --------------------------------------------------------------------------- | |
def build_wifx_scores() -> dict:
    """Aggregate WIFXScore rows from wifx_scores.csv into per-player summaries.

    Each player is represented by their best (maximum) WIFXScore row, so
    merging entries from several sources never penalises a player. Players
    with fewer than 50 events in total, or with no score at all, are left out.

    Returns:
        Dict with "top25" and "bottom25" (by WIFXScore), "all_players"
        (every retained player, best first), "distribution" (30-bin
        histogram plus mean and std) and "by_competition" (summary statistics
        and raw scores per "primary_comp" value, computed on the raw rows).
    """
    path = DATA / "wifx_scores.csv"
    df = pd.read_csv(path)

    component_cols = ["epm_raw", "offensive_score", "creative_score", "defensive_score"]

    def summarise(g):
        # One summary row per player, built from the row with the best score.
        scores = g["WIFXScore"]
        # idxmax cannot be used when every score is missing; such players get
        # NaN components and are dropped by the notna() filter below.
        has_score = scores.notna().any()
        best_idx = scores.idxmax() if has_score else None
        summary = {"WIFXScore": scores.max()}
        for col in component_cols:
            summary[col] = g.loc[best_idx, col] if has_score else np.nan
        summary["total_events"] = g["total_events"].sum()
        team_counts = g["team"].value_counts()
        summary["team"] = team_counts.index[0] if len(team_counts) > 0 else "Unknown"
        # Up to three distinct competitions; missing values are ignored.
        comps = g["primary_comp"].dropna().unique()[:3]
        summary["primary_comp"] = ", ".join(comps) if len(comps) > 0 else "Unknown"
        return pd.Series(summary)

    player_agg = df.groupby("player").apply(summarise).reset_index()

    # Keep players with at least 50 events in total and a usable score.
    player_agg = player_agg[player_agg["total_events"] >= 50]
    player_agg = player_agg[player_agg["WIFXScore"].notna()]

    out_cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw",
                "offensive_score", "creative_score", "defensive_score", "total_events"]

    # Top and bottom 25 by (best) WIFXScore, with all component metrics.
    top25 = player_agg.nlargest(25, "WIFXScore")[out_cols].to_dict(orient="records")
    bottom25 = player_agg.nsmallest(25, "WIFXScore")[out_cols].to_dict(orient="records")

    # Every retained player, best first.
    all_players = player_agg.sort_values("WIFXScore", ascending=False)[out_cols].to_dict(orient="records")

    # Distribution histogram
    hist_counts, hist_edges = np.histogram(player_agg["WIFXScore"], bins=30)
    distribution = {
        "counts": hist_counts.tolist(),
        "edges": [round(float(e), 2) for e in hist_edges.tolist()],
        "mean": round(float(player_agg["WIFXScore"].mean()), 2),
        "std": round(float(player_agg["WIFXScore"].std()), 2),
    }

    # Per-competition summary, computed on the raw (unaggregated) rows.
    by_comp = df.groupby("primary_comp")["WIFXScore"].agg(["mean", "median", "std", "count"]).round(2)
    by_comp_list = []
    for comp, row in by_comp.iterrows():
        scores = df[df["primary_comp"] == comp]["WIFXScore"].tolist()
        by_comp_list.append({
            "competition": comp,
            "mean": row["mean"],
            "median": row["median"],
            "std": row["std"],
            "count": int(row["count"]),
            "scores": [round(s, 2) for s in scores],
        })

    return {
        "top25": top25,
        "bottom25": bottom25,
        "all_players": all_players,
        "distribution": distribution,
        "by_competition": by_comp_list,
    }
| # --------------------------------------------------------------------------- | |
| # WIFXScore Historical (retired/legend players) | |
| # --------------------------------------------------------------------------- | |
def build_wifx_historical_scores() -> dict:
    """Aggregate wifx_historical_scores.csv into per-player summaries.

    Each player is represented by their best (maximum) WIFXScore row.
    Players with fewer than 50 events in total, or with no score at all, are
    left out. Every player gets a "category" looked up in
    retired_players.csv, defaulting to "retired" when not listed there.

    Returns:
        Dict with "top25" (by WIFXScore) and "all_players" (every retained
        player, best first).
    """
    path = DATA / "wifx_historical_scores.csv"
    retired_path = DATA / "retired_players.csv"
    df = pd.read_csv(path)
    retired_df = pd.read_csv(retired_path)
    category_map = dict(zip(retired_df["player"], retired_df["category"]))

    component_cols = ["epm_raw", "offensive_score", "creative_score", "defensive_score"]

    def summarise(g):
        # One summary row per player, built from the row with the best score.
        scores = g["WIFXScore"]
        # idxmax cannot be used when every score is missing; such players get
        # NaN components and are dropped by the notna() filter below.
        has_score = scores.notna().any()
        best_idx = scores.idxmax() if has_score else None
        summary = {"WIFXScore": scores.max()}
        for col in component_cols:
            summary[col] = g.loc[best_idx, col] if has_score else np.nan
        summary["total_events"] = g["total_events"].sum()
        team_counts = g["team"].value_counts()
        summary["team"] = team_counts.index[0] if len(team_counts) > 0 else "Unknown"
        # Up to three distinct competitions; missing values are ignored.
        comps = g["primary_comp"].dropna().unique()[:3]
        summary["primary_comp"] = ", ".join(comps) if len(comps) > 0 else "Unknown"
        return pd.Series(summary)

    player_agg = df.groupby("player").apply(summarise).reset_index()
    player_agg = player_agg[player_agg["total_events"] >= 50]
    player_agg = player_agg[player_agg["WIFXScore"].notna()]
    player_agg["category"] = player_agg["player"].map(category_map).fillna("retired")

    cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw",
            "offensive_score", "creative_score", "defensive_score",
            "total_events", "category"]
    all_players = player_agg.sort_values("WIFXScore", ascending=False)[cols].to_dict(orient="records")
    top25 = player_agg.nlargest(25, "WIFXScore")[cols].to_dict(orient="records")
    return {
        "top25": top25,
        "all_players": all_players,
    }
| # --------------------------------------------------------------------------- | |
| # Historical Match Results | |
| # --------------------------------------------------------------------------- | |
def build_match_results() -> dict:
    """Rank teams and scorers from the historical results and goalscorers files.

    Matches with a missing home or away score are ignored: they have no
    result, so they count neither as a played match nor towards Elo.

    Returns:
        Dict with "top_teams" (the 30 best teams with at least 10 matches,
        ranked by the mean percentile of points per game, Elo and goal
        difference per game) and "top_scorers" (the 30 players with the most
        goals, with their teams and penalty count).
    """
    results_path = DATA / "versions" / "36" / "results.csv"
    goals_path = DATA / "versions" / "36" / "goalscorers.csv"
    results = pd.read_csv(results_path)
    goalscorers = pd.read_csv(goals_path)

    # Only matches with a known result take part in the team statistics.
    played = results.dropna(subset=["home_score", "away_score"])

    # One row per team per match: the home view and the away view.
    home_rows = pd.DataFrame({
        "team": played["home_team"],
        "goals_scored": played["home_score"],
        "goals_conceded": played["away_score"],
    })
    away_rows = pd.DataFrame({
        "team": played["away_team"],
        "goals_scored": played["away_score"],
        "goals_conceded": played["home_score"],
    })
    df = pd.concat([home_rows, away_rows], ignore_index=True)
    # 3 points for a win, 1 for a draw, 0 for a loss.
    df["points"] = np.select(
        [df["goals_scored"] > df["goals_conceded"], df["goals_scored"] == df["goals_conceded"]],
        [3, 1],
        default=0,
    )

    # Team aggregates (min 10 matches)
    team_stats = df.groupby("team").agg(
        matches=("points", "count"),
        total_points=("points", "sum"),
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
    ).reset_index()
    team_stats = team_stats[team_stats["matches"] >= 10]
    team_stats["ppg"] = (team_stats["total_points"] / team_stats["matches"]).round(2)
    team_stats["gd_per_game"] = ((team_stats["goals_scored"] - team_stats["goals_conceded"]) / team_stats["matches"]).round(2)

    # Elo over all played matches in date order: every team starts at 1500,
    # K = 40, no home advantage.
    k_factor = 40
    elo = {}
    ordered = played.sort_values("date")
    for home, away, hs, as_ in zip(ordered["home_team"], ordered["away_team"],
                                   ordered["home_score"], ordered["away_score"]):
        eh = elo.get(home, 1500)
        ea = elo.get(away, 1500)
        exp_h = 1 / (1 + 10 ** ((ea - eh) / 400))
        actual_h = 1.0 if hs > as_ else (0.5 if hs == as_ else 0.0)
        elo[home] = eh + k_factor * (actual_h - exp_h)
        elo[away] = ea + k_factor * ((1 - actual_h) - (1 - exp_h))
    team_stats["elo"] = team_stats["team"].map(elo).round(0)

    # Composite = mean of the three percentile ranks.
    for col in ["ppg", "elo", "gd_per_game"]:
        team_stats[f"{col}_pct"] = percentile_rank(team_stats[col])
    team_stats["composite"] = ((team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["gd_per_game_pct"]) / 3).round(1)
    team_stats = team_stats.sort_values("composite", ascending=False)
    top_teams = team_stats.head(30)[["team", "matches", "ppg", "elo", "gd_per_game", "composite"]].to_dict(orient="records")

    # Top scorers
    scorer_counts = goalscorers.groupby("scorer").agg(
        goals=("scorer", "count"),
        teams=("team", lambda x: ", ".join(x.unique())),
        penalties=("penalty", "sum"),
    ).reset_index().sort_values("goals", ascending=False)
    top_scorers = scorer_counts.head(30).to_dict(orient="records")

    return {
        "top_teams": top_teams,
        "top_scorers": top_scorers,
    }
| # --------------------------------------------------------------------------- | |
| # WIFX National Team Scores (aggregated across all years) | |
| # --------------------------------------------------------------------------- | |
def build_wifx_national_team_scores():
    """Aggregate national-team ratings and write wifx_national_team_scores.json.

    Rows of wifx_national_team_scores.csv are combined per team (ratings
    averaged; matches and goals summed). The "wifx_global_ranking" is the
    mean net rating plus the team's championship-win weight, and teams are
    written best first under the "all_teams" key.

    Returns:
        None. The result is passed to ``write_json``.
    """
    ratings = pd.read_csv(DATA / "wifx_national_team_scores.csv")

    # Major-tournament wins used as a ranking bonus.
    # NOTE(review): these counts are hand-maintained and have not been
    # verified against tournament records -- confirm before relying on them.
    wins_by_nation = {
        "United States": 4,
        "Germany": 2,
        "Norway": 1,
        "Japan": 1,
        "Spain": 2,
        "England": 1,
        "Netherlands": 1,
        "France": 0,
        "Sweden": 0,
        "Canada": 1,
        "Brazil": 0,
        "Australia": 0,
    }
    # Team names appear both with and without the "Women's" suffix.
    CHAMPIONSHIP_WINS = {}
    for nation, wins in wins_by_nation.items():
        CHAMPIONSHIP_WINS[f"{nation} Women's"] = wins
        CHAMPIONSHIP_WINS[nation] = wins

    # Teams without an entry get no bonus.
    ratings["championship_wins"] = ratings["team"].map(CHAMPIONSHIP_WINS).fillna(0)

    # How each column is combined across a team's rows.
    how_to_combine = {
        "offensive_rating": "mean",
        "defensive_rating": "mean",
        "net_rating": "mean",
        "composite_rating": "mean",
        "matches": "sum",
        "goals_scored": "sum",
        "championship_wins": "max",
    }
    # goals_conceded is optional in the input file.
    if "goals_conceded" in ratings.columns:
        how_to_combine["goals_conceded"] = "sum"

    per_team = ratings.groupby("team").agg(how_to_combine).reset_index()

    # Ranking value: mean net rating plus one point per championship win.
    per_team["wifx_global_ranking"] = per_team["net_rating"] + per_team["championship_wins"]
    per_team = per_team.sort_values("wifx_global_ranking", ascending=False)

    write_json(
        "wifx_national_team_scores.json",
        {"all_teams": per_team.to_dict(orient="records")},
    )
| # --------------------------------------------------------------------------- | |
| # WIFX Club Team Scores (aggregated across all years) | |
| # --------------------------------------------------------------------------- | |
# Rating fields carried by every club record, in output key order.
_RATING_KEYS = ("offensive_rating", "defensive_rating", "net_rating", "composite_rating")

# ASA ``team_id_ga`` -> club display name; ids not listed here are dropped.
_ASA_TEAM_NAMES = {
    "KPqjw8PQ6v": "Portland Thorns",
    "aDQ0lzvQEv": "OL Reign",
    "4JMAk47qKg": "Chicago Red Stars",
    "XVqKeVKM01": "Washington Spirit",
    "raMyrr25d2": "Houston Dash",
    "zeQZeazqKw": "Orlando Pride",
    "7vQ7BBzqD1": "FC Kansas City",
    "4wM4rZdqjB": "North Carolina Courage",
    "Pk5LeeNqOW": "Kansas City Current",
    "4wM4Ezg5jB": "Sky Blue FC",
    "7VqG1lYMvW": "NJ/NY Gotham",
    "eV5DR6YQKn": "Angel City",
    "kRQa8JOqKZ": "San Diego Wave",
    "eV5D2w9QKn": "Bay FC",
    "315VnJ759x": "Racing Louisville",
    "xW5pwDBMg1": "Boston Breakers",
    "kRQaWa15KZ": "Western New York Flash",
}

# Hand-maintained title counts per club; clubs not listed count as 0.
# NOTE(review): the inline season annotations are the original author's and are
# unverified - e.g. "2020-21" is attributed to both Chelsea and Manchester City
# Women. Confirm the counts against a reliable source.
_CLUB_CHAMPIONSHIPS = {
    # NWSL (weighted 1.5x)
    "Portland Thorns": 3,  # 2017, 2022, 2024
    "North Carolina Courage": 3,  # 2018, 2019, 2023
    "Kansas City Current": 1,  # 2024 (as Current)
    "FC Kansas City": 2,  # 2014, 2015
    "Western New York Flash": 1,  # 2016
    "OL Reign": 1,  # 2020
    "Seattle Reign": 1,  # 2020
    "Chicago Red Stars": 0,
    "Washington Spirit": 1,  # 2021
    "Houston Dash": 0,
    "Angel City": 0,
    "NJ/NY Gotham": 0,
    "Boston Breakers": 0,
    "Sky Blue FC": 0,
    # FAWSL
    "Chelsea": 4,  # 2015-16, 2017-18, 2019-20, 2020-21
    "Manchester City Women": 2,  # 2016-17, 2020-21
    "Arsenal Women": 1,  # 2022-23
    "Liverpool FFC": 1,  # 2013-14
    "Everton Ladies": 0,
    "Bristol City WFC": 0,
    "Brighton & Hove Albion Women": 0,
    "Reading FC Women": 0,
    "Tottenham Hotspur Women": 0,
    "West Ham United LFC": 0,
    "Aston Villa": 0,
    "Yeovil Town LFC": 0,
    # UWCL
    "Lyon": 8,
    "OL Lyonnes": 8,  # 2016-2020 (5), 2021-22, 2022-23, 2023-24
    "Barcelona": 3,
    "Fútbol Club Barcelona": 3,  # 2020-21, 2021-22, 2022-23
    "Wolfsburg": 2,
    "VfL Wolfsburg": 2,  # 2013-14, 2015-16
    "Paris Saint-Germain": 0,
    "Olympique Lyonnais": 8,
    # Additional entries
    "Bay FC": 0,
    "Racing Louisville": 0,
    "San Diego Wave": 0,
    "FC Barcelona": 3,
}

# Clubs whose title count is multiplied by _NWSL_TITLE_WEIGHT in the ranking.
# NOTE(review): "Orlando Pride" is in _ASA_TEAM_NAMES but not here; that has no
# effect while the club has no entry in _CLUB_CHAMPIONSHIPS.
_NWSL_CLUBS = frozenset({
    "Portland Thorns", "North Carolina Courage", "Kansas City Current", "FC Kansas City",
    "Western New York Flash", "OL Reign", "Seattle Reign", "Chicago Red Stars",
    "Washington Spirit", "Houston Dash", "Angel City", "NJ/NY Gotham", "Boston Breakers",
    "Sky Blue FC", "Bay FC", "Racing Louisville", "San Diego Wave",
})
_NWSL_TITLE_WEIGHT = 1.5


def _zero_if_missing(value):
    """Return 0 for ``None``/NaN and *value* unchanged otherwise."""
    # ``value or 0`` is not enough here: NaN is truthy, so it would pass through.
    return 0 if value is None or pd.isna(value) else value


def _weighted_mean(weighted_sum, weight):
    """Return ``weighted_sum / weight`` rounded to one decimal, or 0.0 when *weight* is 0."""
    return round(weighted_sum / weight, 1) if weight else 0.0


def _aggregate_statsbomb_clubs(df: pd.DataFrame) -> list[dict]:
    """Build one match-weighted record per team from the StatsBomb-derived rows."""
    totals = {}
    for _, row in df.iterrows():
        team = row["team"]
        matches = _zero_if_missing(row["matches"])
        entry = totals.get(team)
        if entry is None:
            entry = totals[team] = {
                "matches": 0,
                "goals_scored": 0,
                "offensive_rating_sum": 0,
                "defensive_rating_sum": 0,
                "net_rating_sum": 0,
                "composite_rating_sum": 0,
                "comps": set(),
            }
        entry["matches"] += matches
        entry["goals_scored"] += int(_zero_if_missing(row.get("goals_scored", 0)))
        # ``goals_conceded`` only appears on a team once a row supplies a value.
        if "goals_conceded" in row and pd.notna(row.get("goals_conceded")):
            entry["goals_conceded"] = entry.get("goals_conceded", 0) + int(row["goals_conceded"])
        for key in _RATING_KEYS:
            entry[f"{key}_sum"] += _zero_if_missing(row[key]) * matches
        if pd.notna(row.get("comp_label")):
            entry["comps"].add(row["comp_label"])

    teams = []
    for team, entry in totals.items():
        record = {"team": team}
        for key in _RATING_KEYS:
            record[key] = _weighted_mean(entry[f"{key}_sum"], entry["matches"])
        record["matches"] = entry["matches"]
        record["goals_scored"] = entry["goals_scored"]
        record["comp_label"] = ", ".join(sorted(entry["comps"])) if entry["comps"] else "FAWSL"
        record["source"] = "statsbomb"
        if "goals_conceded" in entry:
            record["goals_conceded"] = entry["goals_conceded"]
        teams.append(record)
    return teams


def _rescale_statsbomb_ratings(teams: list[dict]) -> None:
    """Min-max rescale offensive/defensive ratings to 0-100 in place and rederive net/composite."""
    if not teams:
        return  # nothing to scale; min()/max() would raise on an empty list
    offensive = [t["offensive_rating"] for t in teams]
    defensive = [t["defensive_rating"] for t in teams]
    off_min, off_max = min(offensive), max(offensive)
    def_min, def_max = min(defensive), max(defensive)
    if not off_max > off_min:
        return  # constant offensive column: leave every rating on its raw scale
    off_span = off_max - off_min
    def_span = def_max - def_min
    for t in teams:
        t["offensive_rating"] = round((t["offensive_rating"] - off_min) / off_span * 100, 1)
        # A constant defensive column has zero range; keep it unscaled instead
        # of dividing by zero (same treatment the offensive column gets above).
        if def_span > 0:
            t["defensive_rating"] = round((t["defensive_rating"] - def_min) / def_span * 100, 1)
        t["net_rating"] = round(t["offensive_rating"] - t["defensive_rating"], 1)
        t["composite_rating"] = round((t["offensive_rating"] + t["defensive_rating"]) / 2, 1)


def _aggregate_asa_clubs(ga: pd.DataFrame) -> list[dict]:
    """Build one record per mapped club from the ASA goals-added table."""
    team_year = ga.groupby(["team_id_ga", "season"]).agg({
        "minutes_played_ga": "sum",
        "ga_shooting_raw": "sum",
        "ga_passing_raw": "sum",
        "ga_dribbling_raw": "sum",
        "ga_interrupting_raw": "sum",
        "ga_receiving_raw": "sum",
        "player_id": "count",
    }).reset_index()
    team_year.columns = [
        "team_id", "season", "minutes", "shooting", "passing",
        "dribbling", "interrupting", "receiving", "players",
    ]
    team_year["team"] = team_year["team_id"].map(_ASA_TEAM_NAMES).fillna("Unknown")
    keep = (team_year["team"] != "Unknown") & (team_year["minutes"] > 5000)
    # Explicit copy so the column assignments below act on an independent frame.
    team_year = team_year[keep].copy()

    # Percentile rank within each season: shooting -> offence, interrupting -> defence.
    by_season = team_year.groupby("season")
    team_year["offensive_rating"] = by_season["shooting"].transform(
        lambda x: (x.rank(pct=True) * 100).round(1)
    )
    team_year["defensive_rating"] = by_season["interrupting"].transform(
        lambda x: (x.rank(pct=True) * 100).round(1)
    )
    team_year["net_rating"] = (team_year["offensive_rating"] - team_year["defensive_rating"]).round(1)
    team_year["composite_rating"] = (
        (team_year["offensive_rating"] + team_year["defensive_rating"]) / 2
    ).round(1)
    # NOTE(review): ``minutes`` is the sum of ``minutes_played_ga`` over all rows
    # of a team-season. If those rows are per player, minutes / 90 counts
    # player-matches rather than team matches and inflates the ASA weight when
    # blended with StatsBomb - TODO confirm the intended unit.
    team_year["matches"] = (team_year["minutes"] / 90).astype(int)

    totals = {}
    for _, row in team_year.iterrows():
        matches = row["matches"]
        entry = totals.get(row["team"])
        if entry is None:
            entry = totals[row["team"]] = {
                "matches": 0,
                "offensive_rating_sum": 0,
                "defensive_rating_sum": 0,
                "net_rating_sum": 0,
                "composite_rating_sum": 0,
            }
        entry["matches"] += matches
        for key in _RATING_KEYS:
            entry[f"{key}_sum"] += row[key] * matches

    teams = []
    for team, entry in totals.items():
        record = {"team": team}
        for key in _RATING_KEYS:
            record[key] = _weighted_mean(entry[f"{key}_sum"], entry["matches"])
        record["matches"] = entry["matches"]
        record["goals_scored"] = 0  # Not available in ASA
        record["goals_conceded"] = 0
        record["comp_label"] = "NWSL 2016-2025"
        record["source"] = "asa"
        teams.append(record)
    return teams


def _merge_club_sources(sb_teams: list[dict], asa_teams: list[dict]) -> dict[str, dict]:
    """Index records by team name, blending ratings by matches when a club is in both sources."""
    combined = {t["team"]: t for t in sb_teams}
    for asa in asa_teams:
        name = asa["team"]
        existing = combined.get(name)
        if existing is None:
            combined[name] = asa
            continue
        total_matches = existing["matches"] + asa["matches"]
        merged = {"team": name}
        for key in _RATING_KEYS:
            merged[key] = _weighted_mean(
                existing[key] * existing["matches"] + asa[key] * asa["matches"],
                total_matches,
            )
        merged["matches"] = total_matches
        # Goal totals exist only on the StatsBomb side.
        merged["goals_scored"] = existing.get("goals_scored", 0)
        merged["goals_conceded"] = existing.get("goals_conceded", 0)
        # NOTE(review): the label is fixed regardless of which competitions the
        # StatsBomb record covered, and merged records carry no "source" key
        # (unlike single-source records) - confirm this is what the dashboard expects.
        merged["comp_label"] = "NWSL + FAWSL"
        combined[name] = merged
    return combined


def _apply_club_championships(combined: dict[str, dict]) -> None:
    """Add ``championship_wins`` and ``wifx_global_club_ranking`` to every record in place."""
    for team, record in combined.items():
        wins = _CLUB_CHAMPIONSHIPS.get(team, 0)
        weight = _NWSL_TITLE_WEIGHT if team in _NWSL_CLUBS else 1
        record["championship_wins"] = wins
        record["wifx_global_club_ranking"] = record["net_rating"] + wins * weight


def build_wifx_club_team_scores(data_dir: "Path | None" = None, *, write: bool = True) -> dict:
    """Build the aggregated club ranking and (by default) write it as JSON.

    Steps:
      1. Read ``wifx_club_team_scores.csv`` and reduce it to one record per
         team, averaging the four ratings weighted by ``matches``.
      2. Min-max rescale the offensive and defensive ratings to 0-100 and
         rederive net (offence - defence) and composite (their mean).
      3. If ``asa_nwsl/goals_added.csv`` exists, derive per-season percentile
         ratings from it and reduce those to one record per club.
      4. Combine both sets by team name (match-weighted blend when a club is
         in both), add championship counts, and sort by
         ``wifx_global_club_ranking`` descending.

    Missing (NaN) ratings and goal counts are treated as 0, and degenerate
    inputs (no rows, zero matches, all-equal defensive ratings) no longer raise.

    Args:
        data_dir: Directory holding the input files; defaults to ``DATA``.
        write: When true, write ``wifx_club_team_scores.json`` via ``write_json``.

    Returns:
        ``{"all_teams": [...]}`` - the same payload that is written.
    """
    base = DATA if data_dir is None else Path(data_dir)

    sb_teams = _aggregate_statsbomb_clubs(pd.read_csv(base / "wifx_club_team_scores.csv"))
    _rescale_statsbomb_ratings(sb_teams)

    ga_path = base / "asa_nwsl" / "goals_added.csv"
    asa_teams = _aggregate_asa_clubs(pd.read_csv(ga_path)) if ga_path.exists() else []

    combined = _merge_club_sources(sb_teams, asa_teams)
    _apply_club_championships(combined)

    all_teams = list(combined.values())
    all_teams.sort(key=lambda t: t["wifx_global_club_ranking"], reverse=True)

    payload = {"all_teams": all_teams}
    if write:
        write_json("wifx_club_team_scores.json", payload)
    return payload
| # --------------------------------------------------------------------------- | |
| # WIFX Confederation Scores (aggregated across years) | |
| # --------------------------------------------------------------------------- | |
def build_wifx_confederation_scores():
    """Collapse the club confederation table to one ranked record per team and write it.

    Reads ``wifx_club_confederation_scores.csv`` from ``DATA`` and groups by
    ``team``: ``wifx_club_score`` is averaged, ``country`` and ``confederation``
    take the first value seen, and ``championships_won`` / ``finals_reached``
    are summed over the team's rows. Teams are ordered by the averaged score
    (highest first), numbered from 1 in a ``rank`` column, and written under
    the ``club_confederation_scores`` key of
    ``wifx_club_confederation_scores.json``.
    """
    raw = pd.read_csv(DATA / "wifx_club_confederation_scores.csv")

    how = {
        "wifx_club_score": "mean",
        "country": "first",
        "confederation": "first",
        "championships_won": "sum",
        "finals_reached": "sum",
    }
    per_team = raw.groupby("team").agg(how).reset_index()

    ranked = per_team.sort_values("wifx_club_score", ascending=False)
    # Position after sorting is the rank (1 = best score).
    ranked["rank"] = range(1, len(ranked) + 1)

    payload = {"club_confederation_scores": ranked.to_dict(orient="records")}
    write_json("wifx_club_confederation_scores.json", payload)
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
def main():
    """Run the dashboard build: load the StatsBomb tables, then build every WIFX JSON file.

    Progress is reported on stdout. The final message names ``OUT``, the
    directory ``write_json`` actually writes to.
    """
    print("Loading StatsBomb events (this may take a minute)...")
    all_events = pd.concat([load_events(c) for c in COMPETITIONS], ignore_index=True)
    print(f" Loaded {len(all_events):,} events")
    all_matches = pd.concat([load_matches(c) for c in COMPETITIONS], ignore_index=True)
    print(f" Loaded {len(all_matches):,} matches")
    all_lineups = pd.concat([load_lineups(c) for c in COMPETITIONS], ignore_index=True)
    print(f" Loaded {len(all_lineups):,} lineup entries")
    # NOTE(review): none of the builders below receives the three frames loaded
    # above, so apart from the row counts printed they are unused here. If the
    # non-WIFX dashboards are not coming back, these loads can be dropped.

    # Build WIFX dashboards only
    print("Building WIFX scores...")
    wifx = build_wifx_scores()
    write_json("wifx_scores.json", wifx)
    print("Building WIFX historical scores...")
    wifx_hist = build_wifx_historical_scores()
    write_json("wifx_historical_scores.json", wifx_hist)
    print("Building aggregated WIFX national team scores...")
    build_wifx_national_team_scores()
    print("Building aggregated WIFX club team scores...")
    build_wifx_club_team_scores()
    print("Building aggregated WIFX confederation scores...")
    build_wifx_confederation_scores()
    # Report the real output directory rather than a hard-coded path.
    print(f"Done! All JSON files written to {OUT}")
def write_json(filename: str, data: dict, out_dir: "Path | None" = None):
    """Write *data* as JSON to ``<out_dir>/<filename>`` and print the file size.

    Before serialising, missing or non-finite values are replaced with
    ``None`` (JSON ``null``): float NaN / +-inf, the literal string ``"NaN"``,
    ``pd.NA`` and ``pd.NaT``. Dicts, lists and tuples are walked recursively;
    tuples come out as lists, which is how JSON would encode them anyway.
    Anything else the encoder cannot handle is stringified via ``default=str``.

    Args:
        filename: Name of the file to create inside the output directory.
        data: JSON-like structure to write.
        out_dir: Target directory; defaults to the module-level ``OUT``. It is
            created if it does not exist.
    """
    import math

    def clean_nan(obj):
        if isinstance(obj, dict):
            return {k: clean_nan(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [clean_nan(v) for v in obj]
        if isinstance(obj, float):
            return obj if math.isfinite(obj) else None
        # Compare against "NaN" only for real strings: ``pd.NA == "NaN"`` is
        # itself NA and raises when used as a condition.
        if isinstance(obj, str):
            return None if obj == "NaN" else obj
        if obj is pd.NA or obj is pd.NaT:
            return None
        return obj

    target_dir = OUT if out_dir is None else Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    path = target_dir / filename

    payload = clean_nan(data)
    with path.open("w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, default=str)
    size = path.stat().st_size
    print(f" Wrote {path} ({size / 1024:.1f} KB)")
# Run the build only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()