# WIFX / scripts/build_dashboard_data.py
# (GitHub page scrape residue preserved as a comment: author "amadabhu",
#  commit 36f80b6, "updated to historical" — these lines were not valid Python.)
#!/usr/bin/env python3
"""Pre-aggregate raw data into small JSON files for the interactive dashboard.
Usage:
python scripts/build_dashboard_data.py
Reads from data/ and writes JSON files to output/ (see OUT below).
"""
from __future__ import annotations
import json
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
# Silence pandas mixed-dtype warnings triggered by the large raw CSVs.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)
# Repository root (this file lives under scripts/).
ROOT = Path(__file__).resolve().parent.parent
DATA = ROOT / "data"
# NOTE(review): the module docstring says output goes to data/dashboard/,
# but OUT points at output/ — confirm which location the dashboard reads.
OUT = ROOT / "output"
OUT.mkdir(parents=True, exist_ok=True)
# StatsBomb competition directories under data/statsbomb/, each tagged with
# a competition type ("league" vs "tournament") and a short display label.
COMPETITIONS = {
    "FA_Womens_Super_League_2018-2019": {"type": "league", "label": "FAWSL 2018-19"},
    "FA_Womens_Super_League_2019-2020": {"type": "league", "label": "FAWSL 2019-20"},
    "FA_Womens_Super_League_2020-2021": {"type": "league", "label": "FAWSL 2020-21"},
    "NWSL_2018": {"type": "league", "label": "NWSL 2018"},
    "UEFA_Womens_Euro_2022": {"type": "tournament", "label": "Euros 2022"},
    "UEFA_Womens_Euro_2025": {"type": "tournament", "label": "Euros 2025"},
    "Womens_World_Cup_2019": {"type": "tournament", "label": "WWC 2019"},
    "Womens_World_Cup_2023": {"type": "tournament", "label": "WWC 2023"},
}
# Only these event columns are loaded (see load_events) to keep memory down;
# all aggregations below use this subset exclusively.
EVENT_COLS = [
    "type", "player", "player_id", "team", "match_id", "minute",
    "shot_outcome", "shot_statsbomb_xg",
    "pass_goal_assist", "pass_shot_assist", "pass_through_ball",
    "pass_cross", "pass_switch", "pass_outcome",
    "dribble_outcome",
    "interception_outcome",
    "duel_type", "duel_outcome",
    "position",
]
def load_events(comp_dir: str) -> pd.DataFrame:
    """Read one competition's events CSV, restricted to EVENT_COLS.

    A ``competition`` column carrying *comp_dir* is appended so frames
    from several competitions can be concatenated and told apart later.
    """
    csv_path = DATA / "statsbomb" / comp_dir / "events.csv"
    events = pd.read_csv(
        csv_path,
        usecols=lambda col: col in EVENT_COLS,
        low_memory=False,
    )
    events["competition"] = comp_dir
    return events
def load_matches(comp_dir: str) -> pd.DataFrame:
    """Read one competition's matches CSV and tag rows with *comp_dir*."""
    frame = pd.read_csv(DATA / "statsbomb" / comp_dir / "matches.csv")
    frame["competition"] = comp_dir
    return frame
def load_lineups(comp_dir: str) -> pd.DataFrame:
    """Read one competition's lineups CSV and tag rows with *comp_dir*."""
    frame = pd.read_csv(DATA / "statsbomb" / comp_dir / "lineups.csv")
    frame["competition"] = comp_dir
    return frame
def percentile_rank(series: pd.Series) -> pd.Series:
    """Map a numeric series onto a 0-100 percentile scale (ties averaged)."""
    return 100 * series.rank(pct=True)
# ---------------------------------------------------------------------------
# StatsBomb Player Aggregates
# ---------------------------------------------------------------------------
def build_sb_players(all_events: pd.DataFrame, all_lineups: pd.DataFrame) -> dict:
    """Aggregate StatsBomb events into per-player scores and leaderboards.

    Counts per-player components (goals, key passes, tackles, ...), folds
    three component groups into percentile scores (goal_threat, playmaker,
    defensive) plus their mean (composite), and returns top-N tables.

    Returns a dict with: one top-30 list per metric, a
    "league_vs_tournament" goals/assists split, and "by_position" top-10s.
    """
    ev = all_events[all_events["player"].notna()].copy()
    # --- Goal threat components (each a Series indexed by player) ---
    goals = ev[ev["shot_outcome"] == "Goal"].groupby("player").size().rename("goals")
    shots_on = ev[ev["shot_outcome"].isin(["Goal", "Saved", "Saved Off Target", "Saved to Post"])].groupby("player").size().rename("shots_on_target")
    xg = ev[ev["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg")
    # Flag columns (pass_goal_assist etc.) are treated as set-when-non-null,
    # so notna() counts the flagged events.
    assists = ev[ev["pass_goal_assist"].notna()].groupby("player").size().rename("assists")
    key_passes = ev[ev["pass_shot_assist"].notna()].groupby("player").size().rename("key_passes")
    # --- Playmaker components ---
    through_balls = ev[ev["pass_through_ball"].notna()].groupby("player").size().rename("through_balls")
    crosses = ev[ev["pass_cross"].notna()].groupby("player").size().rename("crosses")
    switches = ev[ev["pass_switch"].notna()].groupby("player").size().rename("switches")
    dribbles_ok = ev[(ev["type"] == "Dribble") & (ev["dribble_outcome"] == "Complete")].groupby("player").size().rename("dribbles")
    # --- Defensive components ---
    interceptions = ev[ev["type"] == "Interception"].groupby("player").size().rename("interceptions")
    tackles_won = ev[(ev["duel_type"] == "Tackle") & (ev["duel_outcome"].isin(["Won", "Success In Play", "Success Out"]))].groupby("player").size().rename("tackles_won")
    blocks = ev[ev["type"] == "Block"].groupby("player").size().rename("blocks")
    clearances = ev[ev["type"] == "Clearance"].groupby("player").size().rename("clearances")
    pressures = ev[ev["type"] == "Pressure"].groupby("player").size().rename("pressures")
    recoveries = ev[ev["type"] == "Ball Recovery"].groupby("player").size().rename("recoveries")
    fouls_won = ev[ev["type"] == "Foul Won"].groupby("player").size().rename("fouls_won")
    fouls_committed = ev[ev["type"] == "Foul Committed"].groupby("player").size().rename("fouls_committed")
    # Primary team/competition per player = mode (most frequent value).
    player_team = ev.groupby("player")["team"].agg(lambda x: x.value_counts().index[0]).rename("team")
    player_comp = ev.groupby("player")["competition"].agg(lambda x: x.value_counts().index[0])
    player_comp_label = player_comp.map(lambda c: COMPETITIONS.get(c, {}).get("label", c)).rename("competition")
    # Primary position comes from lineups; the "positions" column appears to
    # hold a Python-literal list of dicts (hence ast.literal_eval below).
    pos_df = all_lineups[["player_name", "positions"]].copy()
    pos_df = pos_df[pos_df["positions"].notna()]
    def extract_primary_pos(pos_str):
        # Parse the serialized positions list; fall back to "Unknown" on any
        # malformed entry rather than failing the whole build.
        try:
            import ast
            positions = ast.literal_eval(pos_str)
            if positions and isinstance(positions, list):
                return positions[0].get("position", "Unknown") if isinstance(positions[0], dict) else str(positions[0])
        except Exception:
            pass
        return "Unknown"
    pos_df["primary_position"] = pos_df["positions"].apply(extract_primary_pos)
    player_positions = pos_df.groupby("player_name")["primary_position"].agg(
        lambda x: x.value_counts().index[0]
    ).rename("position")
    def simplify_position(pos):
        # Collapse detailed position names into GK/DF/MF/FW buckets;
        # anything unrecognised defaults to MF.
        pos = str(pos).lower()
        if "goalkeeper" in pos or pos == "gk":
            return "GK"
        elif "back" in pos or "defender" in pos or pos in ("cb", "lb", "rb", "lwb", "rwb"):
            return "DF"
        elif "midfield" in pos or pos in ("cm", "cdm", "cam", "lm", "rm", "dm", "am"):
            return "MF"
        elif "forward" in pos or "wing" in pos or "striker" in pos or pos in ("st", "cf", "lw", "rw", "ss"):
            return "FW"
        return "MF"
    player_pos_simple = player_positions.map(simplify_position).rename("position_group")
    # Combine every component Series into one frame indexed by player.
    stats = pd.DataFrame({
        "team": player_team,
        "competition": player_comp_label,
    })
    for s in [goals, shots_on, xg, assists, key_passes, through_balls, crosses,
              switches, dribbles_ok, interceptions, tackles_won, blocks, clearances,
              pressures, recoveries, fouls_won, fouls_committed]:
        stats = stats.join(s, how="left")
    stats = stats.join(player_pos_simple, how="left")
    stats = stats.fillna(0)
    # fillna(0) above also zeroed missing position groups; patch those to MF.
    stats["position_group"] = stats["position_group"].replace(0, "MF")
    # Each score = percentile rank of the plain sum of its component group.
    stats["goal_threat"] = percentile_rank(
        stats[["goals", "shots_on_target", "xg", "assists", "key_passes"]].sum(axis=1)
    )
    stats["playmaker"] = percentile_rank(
        stats[["assists", "key_passes", "through_balls", "crosses", "switches", "dribbles"]].sum(axis=1)
    )
    stats["defensive"] = percentile_rank(
        stats[["interceptions", "tackles_won", "blocks", "clearances", "pressures", "recoveries"]].sum(axis=1)
    )
    stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3
    # NOTE(review): the index is named "player" after the groupbys, so the
    # {"index": "player"} rename is likely a no-op — confirm the column name.
    stats = stats.reset_index().rename(columns={"index": "player"})
    # Top 30 per metric.
    result = {}
    for metric in ["goal_threat", "playmaker", "defensive", "composite"]:
        top = stats.nlargest(30, metric)
        result[metric] = top[["player", "team", "competition", "position_group",
                              "goals", "assists", "xg", "key_passes",
                              "interceptions", "tackles_won", "blocks",
                              metric]].to_dict(orient="records")
    # League vs tournament goal/assist split for the 25 biggest totals.
    ev_with_type = ev.copy()
    ev_with_type["comp_type"] = ev_with_type["competition"].map(
        lambda c: COMPETITIONS.get(c, {}).get("type", "unknown")
    )
    league_goals = ev_with_type[(ev_with_type["shot_outcome"] == "Goal") & (ev_with_type["comp_type"] == "league")].groupby("player").size().rename("league_goals")
    tourn_goals = ev_with_type[(ev_with_type["shot_outcome"] == "Goal") & (ev_with_type["comp_type"] == "tournament")].groupby("player").size().rename("tournament_goals")
    league_assists = ev_with_type[(ev_with_type["pass_goal_assist"].notna()) & (ev_with_type["comp_type"] == "league")].groupby("player").size().rename("league_assists")
    tourn_assists = ev_with_type[(ev_with_type["pass_goal_assist"].notna()) & (ev_with_type["comp_type"] == "tournament")].groupby("player").size().rename("tournament_assists")
    lvt = pd.DataFrame({"league_goals": league_goals, "tournament_goals": tourn_goals,
                        "league_assists": league_assists, "tournament_assists": tourn_assists}).fillna(0)
    lvt["total"] = lvt.sum(axis=1)
    lvt = lvt.nlargest(25, "total").reset_index().rename(columns={"index": "player"})
    result["league_vs_tournament"] = lvt.to_dict(orient="records")
    # Top 10 by composite within each simplified position bucket.
    by_pos = {}
    for pos in ["FW", "MF", "DF", "GK"]:
        subset = stats[stats["position_group"] == pos].nlargest(10, "composite")
        by_pos[pos] = subset[["player", "team", "composite", "goal_threat", "playmaker", "defensive"]].to_dict(orient="records")
    result["by_position"] = by_pos
    return result
# ---------------------------------------------------------------------------
# StatsBomb Club/Country Aggregates
# ---------------------------------------------------------------------------
def compute_team_rankings(all_matches: pd.DataFrame, all_events: pd.DataFrame, comp_type: str) -> dict:
    """Rank teams within one competition type ("league" or "tournament").

    Combines points-per-game, a simple Elo rating, and per-game xG
    dominance into a composite percentile score.

    Parameters
    ----------
    all_matches : concatenated matches frames (must carry a ``competition``
        column plus home/away team, score, date, and match_id columns).
    all_events : concatenated events frames (only ``shot_statsbomb_xg`` and
        its match/team keys are used).
    comp_type : selects which COMPETITIONS entries to include.

    Returns
    -------
    dict with a single ``"teams"`` key: per-team records sorted by the
    composite score (empty list when no matches are available).
    """
    comps = [c for c, info in COMPETITIONS.items() if info["type"] == comp_type]
    matches = all_matches[all_matches["competition"].isin(comps)].copy()
    events = all_events[all_events["competition"].isin(comps)]
    if matches.empty:
        return {"teams": []}
    # Date order matters for the Elo pass below.
    matches = matches.sort_values("match_date")
    # Per-(match_id, team) xG totals as a plain dict. The previous version
    # re-filtered the whole xG frame twice per match inside the loop, which
    # was accidentally quadratic; a single groupby + dict gives O(1) lookups.
    xg_lookup = (
        events[events["shot_statsbomb_xg"].notna()]
        .groupby(["match_id", "team"])["shot_statsbomb_xg"]
        .sum()
        .to_dict()
    )
    # One record per team per match (home and away perspectives).
    records = []
    for _, m in matches.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        mid = m["match_id"]
        comp_label = COMPETITIONS.get(m["competition"], {}).get("label", m["competition"])
        for team, opp, gs, gc in [(home, away, hs, as_), (away, home, as_, hs)]:
            records.append({
                "team": team,
                "match_id": mid,
                "match_date": m["match_date"],
                "competition": comp_label,
                "goals_scored": gs,
                "goals_conceded": gc,
                "points": 3 if gs > gc else (1 if gs == gc else 0),
                "xg_for": float(xg_lookup.get((mid, team), 0.0)),
                "xg_against": float(xg_lookup.get((mid, opp), 0.0)),
            })
    df = pd.DataFrame(records)
    # Aggregate across all selected competitions.
    team_stats = df.groupby("team").agg(
        matches=("match_id", "count"),
        total_points=("points", "sum"),
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
        xg_for=("xg_for", "sum"),
        xg_against=("xg_against", "sum"),
        competition=("competition", lambda x: ", ".join(x.unique()[:3])),  # cap label at 3 comps
    ).reset_index()
    team_stats["ppg"] = (team_stats["total_points"] / team_stats["matches"]).round(2)
    team_stats["gd_per_game"] = ((team_stats["goals_scored"] - team_stats["goals_conceded"]) / team_stats["matches"]).round(2)
    team_stats["xg_dominance"] = ((team_stats["xg_for"] - team_stats["xg_against"]) / team_stats["matches"]).round(3)
    # Simple Elo over the date-sorted matches (start 1500, K=40, no
    # home-advantage term).
    elo = {}
    K = 40
    for _, m in matches.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        eh = elo.get(home, 1500)
        ea = elo.get(away, 1500)
        exp_h = 1 / (1 + 10 ** ((ea - eh) / 400))
        actual_h = 1.0 if hs > as_ else (0.5 if hs == as_ else 0.0)
        elo[home] = eh + K * (actual_h - exp_h)
        elo[away] = ea + K * ((1 - actual_h) - (1 - exp_h))
    team_stats["elo"] = team_stats["team"].map(elo).round(0)
    # Composite = mean of the three percentile-ranked signals.
    for col in ["ppg", "elo", "xg_dominance"]:
        team_stats[f"{col}_pct"] = percentile_rank(team_stats[col])
    team_stats["composite"] = ((team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["xg_dominance_pct"]) / 3).round(1)
    team_stats = team_stats.sort_values("composite", ascending=False)
    cols = ["team", "competition", "matches", "ppg", "elo", "xg_dominance", "gd_per_game", "composite"]
    return {"teams": team_stats[cols].to_dict(orient="records")}
# ---------------------------------------------------------------------------
# StatsBomb Player Comparisons
# ---------------------------------------------------------------------------
def build_sb_player_comparisons(all_events: pd.DataFrame) -> dict:
    """Compare per-player percentile scores across event subsets.

    Produces three comparisons: historical tournaments vs Euros 2025,
    league vs tournament play, and Euros 2025 group stage vs knockouts.
    Each comparison lists, per metric, the top 15 players (ranked by the
    second/"newer" side of the split) who appear on both sides.
    """
    ev = all_events[all_events["player"].notna()].copy()
    ev["comp_type"] = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("type", "unknown"))
    ev["comp_label"] = ev["competition"].map(lambda c: COMPETITIONS.get(c, {}).get("label", c))
    def player_scores(subset):
        # Recompute component counts and percentile scores on an event
        # subset; mirrors build_sb_players but with fewer components.
        goals = subset[subset["shot_outcome"] == "Goal"].groupby("player").size().rename("goals")
        assists = subset[subset["pass_goal_assist"].notna()].groupby("player").size().rename("assists")
        key_passes = subset[subset["pass_shot_assist"].notna()].groupby("player").size().rename("key_passes")
        xg = subset[subset["shot_statsbomb_xg"].notna()].groupby("player")["shot_statsbomb_xg"].sum().rename("xg")
        through_balls = subset[subset["pass_through_ball"].notna()].groupby("player").size().rename("through_balls")
        crosses = subset[subset["pass_cross"].notna()].groupby("player").size().rename("crosses")
        interceptions = subset[subset["type"] == "Interception"].groupby("player").size().rename("interceptions")
        tackles = subset[(subset["duel_type"] == "Tackle") & (subset["duel_outcome"].isin(["Won", "Success In Play", "Success Out"]))].groupby("player").size().rename("tackles_won")
        blocks = subset[subset["type"] == "Block"].groupby("player").size().rename("blocks")
        recoveries = subset[subset["type"] == "Ball Recovery"].groupby("player").size().rename("recoveries")
        stats = pd.DataFrame({"goals": goals, "assists": assists, "key_passes": key_passes,
                              "xg": xg, "through_balls": through_balls, "crosses": crosses,
                              "interceptions": interceptions, "tackles_won": tackles,
                              "blocks": blocks, "recoveries": recoveries}).fillna(0)
        # Empty subset -> empty frame without the score columns; callers
        # guard with `metric in ....columns` checks below.
        if len(stats) == 0:
            return stats
        stats["goal_threat"] = percentile_rank(stats[["goals", "xg", "assists", "key_passes"]].sum(axis=1))
        stats["playmaker"] = percentile_rank(stats[["assists", "key_passes", "through_balls", "crosses"]].sum(axis=1))
        stats["defensive"] = percentile_rank(stats[["interceptions", "tackles_won", "blocks", "recoveries"]].sum(axis=1))
        stats["composite"] = (stats["goal_threat"] + stats["playmaker"] + stats["defensive"]) / 3
        return stats
    result = {}
    # 1. Historical tournaments vs Euros 2025 (players present in both).
    hist_tourn = ev[(ev["comp_type"] == "tournament") & (ev["competition"] != "UEFA_Womens_Euro_2025")]
    euros25 = ev[ev["competition"] == "UEFA_Womens_Euro_2025"]
    hist_scores = player_scores(hist_tourn)
    e25_scores = player_scores(euros25)
    comparison1 = []
    common = hist_scores.index.intersection(e25_scores.index)
    for metric in ["goal_threat", "playmaker", "defensive", "composite"]:
        # The `else 0` fallback fills a constant column when a side has no
        # scores (empty subset case above).
        merged = pd.DataFrame({
            "historical": hist_scores.loc[common, metric] if metric in hist_scores.columns else 0,
            "euros_2025": e25_scores.loc[common, metric] if metric in e25_scores.columns else 0,
        }).dropna()
        top = merged.nlargest(15, "euros_2025").reset_index().rename(columns={"index": "player"})
        comparison1.append({"metric": metric, "players": top.to_dict(orient="records")})
    result["historical_vs_euros2025"] = comparison1
    # 2. League vs tournament (players present in both).
    league_ev = ev[ev["comp_type"] == "league"]
    tourn_ev = ev[ev["comp_type"] == "tournament"]
    league_scores = player_scores(league_ev)
    tourn_scores = player_scores(tourn_ev)
    comparison2 = []
    common2 = league_scores.index.intersection(tourn_scores.index)
    for metric in ["goal_threat", "playmaker", "defensive", "composite"]:
        merged = pd.DataFrame({
            "league": league_scores.loc[common2, metric] if metric in league_scores.columns else 0,
            "tournament": tourn_scores.loc[common2, metric] if metric in tourn_scores.columns else 0,
        }).dropna()
        top = merged.nlargest(15, "tournament").reset_index().rename(columns={"index": "player"})
        comparison2.append({"metric": metric, "players": top.to_dict(orient="records")})
    result["league_vs_tournament"] = comparison2
    # 3. Euros 2025 group stage vs knockouts — needs the stage column from
    # the matches CSV, so skip silently when that file is absent.
    e25_matches_path = DATA / "statsbomb" / "UEFA_Womens_Euro_2025" / "matches.csv"
    if e25_matches_path.exists():
        e25m = pd.read_csv(e25_matches_path)
        # Knockout = any stage whose name does not contain "Group".
        group_match_ids = e25m[e25m["competition_stage"].str.contains("Group", case=False, na=False)]["match_id"].tolist()
        ko_match_ids = e25m[~e25m["competition_stage"].str.contains("Group", case=False, na=False)]["match_id"].tolist()
        group_ev = euros25[euros25["match_id"].isin(group_match_ids)]
        ko_ev = euros25[euros25["match_id"].isin(ko_match_ids)]
        group_scores = player_scores(group_ev)
        ko_scores = player_scores(ko_ev)
        comparison3 = []
        common3 = group_scores.index.intersection(ko_scores.index)
        for metric in ["goal_threat", "playmaker", "composite"]:
            merged = pd.DataFrame({
                "group_stage": group_scores.loc[common3, metric] if metric in group_scores.columns else 0,
                "knockout": ko_scores.loc[common3, metric] if metric in ko_scores.columns else 0,
            }).dropna()
            top = merged.nlargest(15, "knockout").reset_index().rename(columns={"index": "player"})
            comparison3.append({"metric": metric, "players": top.to_dict(orient="records")})
        result["euros2025_group_vs_knockout"] = comparison3
    return result
# ---------------------------------------------------------------------------
# FIFA Rankings
# ---------------------------------------------------------------------------
def build_fifa_rankings() -> dict:
    """Build FIFA women's world-ranking aggregates across the 2025 quarters.

    Returns top-25 averages, confederation averages per quarter, biggest
    rank/point movers between the earliest and latest available quarters,
    an H1-vs-H2 comparison, and the top-10 point trajectories. Returns {}
    when no ranking CSVs are present.
    """
    # (file suffix, display label) in chronological order — this list is
    # the source of truth for "earliest"/"latest" ordering below.
    quarters = [
        ("2025_03_06", "Mar 2025"),
        ("2025_06_12", "Jun 2025"),
        ("2025_08_07", "Aug 2025"),
        ("2025_12_11", "Dec 2025"),
    ]
    frames = {}
    for suffix, label in quarters:
        path = DATA / f"fifa_womens_world_ranking_{suffix}.csv"
        if path.exists():
            df = pd.read_csv(path)
            frames[label] = df
    if not frames:
        return {}
    # Per-country trajectory of points and ranks keyed by quarter label.
    countries = {}
    for label, df in frames.items():
        for _, row in df.iterrows():
            c = row["Country"]
            if c not in countries:
                countries[c] = {
                    "country": c,
                    "code": row.get("Country_Code", ""),
                    "confederation": row.get("Confederation", ""),
                    "points": {},
                    "ranks": {},
                }
            if pd.notna(row["Total_Points"]):
                countries[c]["points"][label] = float(row["Total_Points"])
            if pd.notna(row["Rank"]):
                countries[c]["ranks"][label] = int(row["Rank"])
    all_countries = list(countries.values())
    # Average points across all loaded quarters for the top-25 table.
    all_df = pd.concat(frames.values(), ignore_index=True)
    avg_points = all_df.groupby(["Country", "Country_Code", "Confederation"])["Total_Points"].mean().reset_index()
    avg_points = avg_points.sort_values("Total_Points", ascending=False)
    top25 = avg_points.head(25).rename(columns={"Total_Points": "Avg_Points"}).to_dict(orient="records")
    # Confederation average points per quarter.
    conf_avg = {}
    for label, df in frames.items():
        conf_avg[label] = df.groupby("Confederation")["Total_Points"].mean().round(1).to_dict()
    # BUG FIX: this previously used sorted(frames.keys()), which orders the
    # display labels alphabetically ("Aug" < "Dec" < "Jun" < "Mar"), not
    # chronologically — so "earliest"/"latest" and the H1/H2 split were
    # computed on the wrong quarters. Preserve the declared quarter order,
    # keeping only the quarters whose CSV was actually loaded.
    labels_in_order = [label for _, label in quarters if label in frames]
    first_label = labels_in_order[0]
    latest_label = labels_in_order[-1]
    movers = []
    for c in all_countries:
        if first_label in c["ranks"] and latest_label in c["ranks"]:
            # Positive rank_change means the country climbed the table
            # (its rank number went down).
            rank_change = c["ranks"][first_label] - c["ranks"][latest_label]
            point_change = c["points"].get(latest_label, 0) - c["points"].get(first_label, 0)
            movers.append({
                "country": c["country"],
                "code": c["code"],
                "confederation": c["confederation"],
                "rank_change": rank_change,
                "point_change": round(point_change, 1),
            })
    movers_df = pd.DataFrame(movers)
    top_climbers = movers_df.nlargest(15, "rank_change").to_dict(orient="records")
    top_fallers = movers_df.nsmallest(15, "rank_change").to_dict(orient="records")
    top_point_gainers = movers_df.nlargest(15, "point_change").to_dict(orient="records")
    # H1 vs H2: split the chronological quarters into first/second half of
    # the year and compare within-half movement.
    mid = len(labels_in_order) // 2
    h1_labels = labels_in_order[:mid]
    h2_labels = labels_in_order[mid:]
    h1h2 = []
    for c in all_countries:
        pts = c["points"]
        rnk = c["ranks"]
        # Only countries with complete data in every quarter qualify.
        if all(l in pts for l in h1_labels + h2_labels) and all(l in rnk for l in h1_labels + h2_labels):
            # Telescoping sum: equals last-quarter points minus first-quarter
            # points within each half.
            h1_point_delta = sum(pts[l] for l in h1_labels[1:]) - sum(pts[l] for l in h1_labels[:-1])
            h2_point_delta = sum(pts[l] for l in h2_labels[1:]) - sum(pts[l] for l in h2_labels[:-1])
            h1_rank_delta = rnk[h1_labels[0]] - rnk[h1_labels[-1]]
            h2_rank_delta = rnk[h2_labels[0]] - rnk[h2_labels[-1]]
            h1h2.append({
                "country": c["country"],
                "code": c["code"],
                "confederation": c["confederation"],
                "h1_point_delta": round(h1_point_delta, 1),
                "h2_point_delta": round(h2_point_delta, 1),
                "h1_rank_delta": h1_rank_delta,
                "h2_rank_delta": h2_rank_delta,
            })
    h1h2_df = pd.DataFrame(h1h2)
    # Top 10 trajectories by average points across all quarters.
    top10_countries = avg_points.head(10)["Country"].tolist()
    trajectories = [c for c in all_countries if c["country"] in top10_countries]
    return {
        "top25": top25,
        "confederation_avg": conf_avg,
        "top_climbers": top_climbers,
        "top_fallers": top_fallers,
        "top_point_gainers": top_point_gainers,
        "h1_vs_h2": h1h2_df.nlargest(20, "h2_point_delta").to_dict(orient="records") if len(h1h2_df) else [],
        "h1_vs_h2_risers": h1h2_df.assign(
            h2_improvement=h1h2_df["h2_point_delta"] - h1h2_df["h1_point_delta"]
        ).nlargest(15, "h2_improvement").to_dict(orient="records") if len(h1h2_df) else [],
        "trajectories": trajectories,
        "quarters": [q[1] for q in quarters],
    }
# ---------------------------------------------------------------------------
# WIFXScore (aggregated across years)
# ---------------------------------------------------------------------------
def build_wifx_scores() -> dict:
    """Aggregate per-player WIFXScores across seasons for the dashboard.

    For each player the component scores come from the single best-scoring
    entry (max WIFXScore, so merging entries from sources with different
    feature richness never penalises a player) while total_events are
    summed. Players with under 50 total events or a missing score are
    dropped.

    Returns a dict with top25 / bottom25 / all_players tables, a 30-bin
    histogram of the score distribution, and per-competition summaries.
    """
    path = DATA / "wifx_scores.csv"
    df = pd.read_csv(path)

    def _aggregate(g: pd.DataFrame) -> pd.Series:
        # Locate the best-scoring row once; the previous version recomputed
        # idxmax for every component column.
        best = g.loc[g["WIFXScore"].idxmax()]
        team_counts = g["team"].value_counts()
        comps = g["primary_comp"].unique()
        return pd.Series({
            "WIFXScore": best["WIFXScore"],
            "epm_raw": best["epm_raw"],
            "offensive_score": best["offensive_score"],
            "creative_score": best["creative_score"],
            "defensive_score": best["defensive_score"],
            "total_events": g["total_events"].sum(),
            "team": team_counts.index[0] if len(team_counts) > 0 else "Unknown",
            "primary_comp": ", ".join(comps[:3]) if len(comps) > 0 else "Unknown",
        })

    player_agg = df.groupby("player").apply(_aggregate).reset_index()
    # Minimum-sample and validity filters.
    player_agg = player_agg[player_agg["total_events"] >= 50]  # at least 50 events total
    player_agg = player_agg[player_agg["WIFXScore"].notna()]   # drop NaN scores
    out_cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw",
                "offensive_score", "creative_score", "defensive_score", "total_events"]
    top25 = player_agg.nlargest(25, "WIFXScore")[out_cols].to_dict(orient="records")
    bottom25 = player_agg.nsmallest(25, "WIFXScore")[out_cols].to_dict(orient="records")
    all_players = player_agg.sort_values("WIFXScore", ascending=False)[out_cols].to_dict(orient="records")
    # 30-bin histogram of the aggregated score distribution.
    hist_counts, hist_edges = np.histogram(player_agg["WIFXScore"], bins=30)
    distribution = {
        "counts": hist_counts.tolist(),
        "edges": [round(float(e), 2) for e in hist_edges.tolist()],
        "mean": round(float(player_agg["WIFXScore"].mean()), 2),
        "std": round(float(player_agg["WIFXScore"].std()), 2),
    }
    # Per-competition summary, computed on the raw (un-aggregated) rows.
    by_comp = df.groupby("primary_comp")["WIFXScore"].agg(["mean", "median", "std", "count", "min", "max"]).round(2)
    by_comp_list = []
    for comp, row in by_comp.iterrows():
        scores = df[df["primary_comp"] == comp]["WIFXScore"].tolist()
        by_comp_list.append({
            "competition": comp,
            "mean": row["mean"],
            "median": row["median"],
            "std": row["std"],
            "count": int(row["count"]),
            "scores": [round(s, 2) for s in scores],
        })
    return {
        "top25": top25,
        "bottom25": bottom25,
        "all_players": all_players,
        "distribution": distribution,
        "by_competition": by_comp_list,
    }
# ---------------------------------------------------------------------------
# WIFXScore Historical (retired/legend players)
# ---------------------------------------------------------------------------
def build_wifx_historical_scores() -> dict:
    """Aggregate WIFXScores for historical (retired/legend) players.

    Same aggregation as build_wifx_scores — best-scoring entry per player,
    summed total_events, min-50-events filter — plus a player category
    joined from retired_players.csv (default "retired").

    Returns a dict with "top25" and "all_players" tables.
    """
    path = DATA / "wifx_historical_scores.csv"
    retired_path = DATA / "retired_players.csv"
    df = pd.read_csv(path)
    retired_df = pd.read_csv(retired_path)
    # player -> category lookup from the retired-players sheet.
    category_map = dict(zip(retired_df["player"], retired_df["category"]))

    def _aggregate(g: pd.DataFrame) -> pd.Series:
        # Take component scores from the single best-scoring row; compute
        # idxmax once (previously recomputed per column).
        best = g.loc[g["WIFXScore"].idxmax()]
        team_counts = g["team"].value_counts()
        comps = g["primary_comp"].unique()
        return pd.Series({
            "WIFXScore": best["WIFXScore"],
            "epm_raw": best["epm_raw"],
            "offensive_score": best["offensive_score"],
            "creative_score": best["creative_score"],
            "defensive_score": best["defensive_score"],
            "total_events": g["total_events"].sum(),
            "team": team_counts.index[0] if len(team_counts) > 0 else "Unknown",
            "primary_comp": ", ".join(comps[:3]) if len(comps) > 0 else "Unknown",
        })

    player_agg = df.groupby("player").apply(_aggregate).reset_index()
    player_agg = player_agg[player_agg["total_events"] >= 50]
    player_agg = player_agg[player_agg["WIFXScore"].notna()]
    player_agg["category"] = player_agg["player"].map(category_map).fillna("retired")
    cols = ["player", "team", "primary_comp", "WIFXScore", "epm_raw",
            "offensive_score", "creative_score", "defensive_score",
            "total_events", "category"]
    all_players = player_agg.sort_values("WIFXScore", ascending=False)[cols].to_dict(orient="records")
    top25 = player_agg.nlargest(25, "WIFXScore")[cols].to_dict(orient="records")
    return {
        "top25": top25,
        "all_players": all_players,
    }
# ---------------------------------------------------------------------------
# Historical Match Results
# ---------------------------------------------------------------------------
def build_match_results() -> dict:
    """Aggregate the historical international results dataset.

    Reads results.csv and goalscorers.csv (dataset version 36), ranks
    national teams by a composite of points-per-game, Elo, and goal
    difference per game (teams with >= 10 matches only), and lists the
    top 30 all-time scorers.
    """
    results_path = DATA / "versions" / "36" / "results.csv"
    goals_path = DATA / "versions" / "36" / "goalscorers.csv"
    results = pd.read_csv(results_path)
    goalscorers = pd.read_csv(goals_path)
    # One record per team per match (home and away perspectives).
    records = []
    for _, m in results.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        for team, opp, gs, gc in [(home, away, hs, as_), (away, home, as_, hs)]:
            records.append({
                "team": team,
                "date": m["date"],
                "goals_scored": gs,
                "goals_conceded": gc,
                "points": 3 if gs > gc else (1 if gs == gc else 0),
            })
    df = pd.DataFrame(records)
    team_stats = df.groupby("team").agg(
        matches=("points", "count"),
        total_points=("points", "sum"),
        goals_scored=("goals_scored", "sum"),
        goals_conceded=("goals_conceded", "sum"),
    ).reset_index()
    # Minimum sample: at least 10 matches.
    team_stats = team_stats[team_stats["matches"] >= 10]
    team_stats["ppg"] = (team_stats["total_points"] / team_stats["matches"]).round(2)
    team_stats["gd_per_game"] = ((team_stats["goals_scored"] - team_stats["goals_conceded"]) / team_stats["matches"]).round(2)
    # Simple Elo over all results in date order (start 1500, K=40); the
    # sequential update makes the chronological sort essential.
    results_sorted = results.sort_values("date")
    elo = {}
    for _, m in results_sorted.iterrows():
        home, away = m["home_team"], m["away_team"]
        hs, as_ = m["home_score"], m["away_score"]
        eh = elo.get(home, 1500)
        ea = elo.get(away, 1500)
        exp_h = 1 / (1 + 10 ** ((ea - eh) / 400))
        actual_h = 1.0 if hs > as_ else (0.5 if hs == as_ else 0.0)
        K = 40
        elo[home] = eh + K * (actual_h - exp_h)
        elo[away] = ea + K * ((1 - actual_h) - (1 - exp_h))
    team_stats["elo"] = team_stats["team"].map(elo).round(0)
    # Composite = mean of the three percentile-ranked signals.
    for col in ["ppg", "elo", "gd_per_game"]:
        team_stats[f"{col}_pct"] = percentile_rank(team_stats[col])
    team_stats["composite"] = ((team_stats["ppg_pct"] + team_stats["elo_pct"] + team_stats["gd_per_game_pct"]) / 3).round(1)
    team_stats = team_stats.sort_values("composite", ascending=False)
    top_teams = team_stats.head(30)[["team", "matches", "ppg", "elo", "gd_per_game", "composite"]].to_dict(orient="records")
    # Top 30 scorers with their teams and penalty counts.
    scorer_counts = goalscorers.groupby("scorer").agg(
        goals=("scorer", "count"),
        teams=("team", lambda x: ", ".join(x.unique())),
        penalties=("penalty", "sum"),
    ).reset_index().sort_values("goals", ascending=False)
    top_scorers = scorer_counts.head(30).to_dict(orient="records")
    return {
        "top_teams": top_teams,
        "top_scorers": top_scorers,
    }
# ---------------------------------------------------------------------------
# WIFX National Team Scores (aggregated across all years)
# ---------------------------------------------------------------------------
def build_wifx_national_team_scores():
    """Aggregate national-team WIFX ratings and write them to JSON.

    Unlike the other builders, this one does not return a dict: it calls
    write_json (defined elsewhere in this file, outside this view) as a
    side effect and returns None.
    """
    path = DATA / "wifx_national_team_scores.csv"
    df = pd.read_csv(path)
    # Extra weight per team for major-championship wins, added directly to
    # net_rating below.
    # NOTE(review): the title annotations and weights below look mutually
    # inconsistent in places (e.g. Germany's comment lists four Euro titles
    # but the weight is 2; Spain is annotated with Euro 2022) — confirm the
    # intended weighting scheme against an authoritative honours list.
    CHAMPIONSHIP_WINS = {
        "United States Women's": 4,  # WWC: 1991, 1999, 2015, 2019
        "United States": 4,
        "Germany Women's": 2,  # Euro: 1995, 2001, 2009, 2013
        "Germany": 2,
        "Norway Women's": 1,  # Euro: 1995, WWC: 2023
        "Norway": 1,
        "Japan Women's": 1,  # WWC: 2011
        "Japan": 1,
        "Spain Women's": 2,  # Euro: 2022, WWC: 2023
        "Spain": 2,
        "England Women's": 1,  # Euro: 2022
        "England": 1,
        "Netherlands Women's": 1,  # Euro: 2017
        "Netherlands": 1,
        "France Women's": 0,
        "France": 0,
        "Sweden Women's": 0,
        "Sweden": 0,
        "Canada Women's": 1,  # Olympics: 2020, 2024
        "Canada": 1,
        "Brazil Women's": 0,
        "Brazil": 0,
        "Australia Women's": 0,
        "Australia": 0,
    }
    # Teams not listed above get zero championship weight.
    df["championship_wins"] = df["team"].map(CHAMPIONSHIP_WINS).fillna(0)
    # Aggregate by team: mean ratings, summed counts.
    agg_cols = {
        "offensive_rating": "mean",
        "defensive_rating": "mean",
        "net_rating": "mean",
        "composite_rating": "mean",
        "matches": "sum",
        "goals_scored": "sum",
        "championship_wins": "max",  # keep the (constant per team) weight
    }
    # goals_conceded is optional in the source CSV.
    if "goals_conceded" in df.columns:
        agg_cols["goals_conceded"] = "sum"
    agg = df.groupby("team").agg(agg_cols).reset_index()
    # Global ranking = mean net rating plus the championship weight.
    agg["wifx_global_ranking"] = agg["net_rating"] + agg["championship_wins"]
    agg = agg.sort_values("wifx_global_ranking", ascending=False)
    result = {
        "all_teams": agg.to_dict(orient="records"),
    }
    # write_json is defined elsewhere in this file (not visible in this view).
    write_json("wifx_national_team_scores.json", result)
# ---------------------------------------------------------------------------
# WIFX Club Team Scores (aggregated across all years)
# ---------------------------------------------------------------------------
def build_wifx_club_team_scores():
    """Aggregate club-level WIFX scores across all seasons and sources.

    Pipeline:
      1. Match-weighted averages of StatsBomb per-season club ratings
         (data/wifx_club_team_scores.csv).
      2. Normalize the StatsBomb ratings onto a 0-100 scale.
      3. If ASA goals-added data exists (data/asa_nwsl/goals_added.csv),
         build per-season NWSL percentile ratings and aggregate them.
      4. Merge both sources per team (match-weighted), add a championship
         bonus (NWSL titles weighted 1.5x), and write
         wifx_club_team_scores.json sorted by the global club ranking.
    """

    def _num(value) -> float:
        # NOTE: `value or 0` is NOT a NaN guard — NaN is truthy, so it leaks
        # through `or`, poisons running sums, and int(nan) raises ValueError.
        return float(value) if pd.notna(value) else 0.0

    # ----- 1. StatsBomb: match-weighted sums per club ----------------------
    path = DATA / "wifx_club_team_scores.csv"
    df = pd.read_csv(path)
    agg = {}
    for _, row in df.iterrows():
        team = row['team']
        matches = row['matches']
        if team not in agg:
            agg[team] = {
                'team': team,
                'matches': 0,
                'goals_scored': 0,
                'offensive_rating_sum': 0,
                'defensive_rating_sum': 0,
                'net_rating_sum': 0,
                'composite_rating_sum': 0,
                'comps': set()
            }
        agg[team]['matches'] += matches
        agg[team]['goals_scored'] += int(_num(row.get('goals_scored', 0)))
        if 'goals_conceded' in row and pd.notna(row.get('goals_conceded')):
            agg[team].setdefault('goals_conceded', 0)
            agg[team]['goals_conceded'] += int(row['goals_conceded'])
        # Weight each season's rating by its match count before summing.
        agg[team]['offensive_rating_sum'] += _num(row['offensive_rating']) * matches
        agg[team]['defensive_rating_sum'] += _num(row['defensive_rating']) * matches
        agg[team]['net_rating_sum'] += _num(row['net_rating']) * matches
        agg[team]['composite_rating_sum'] += _num(row['composite_rating']) * matches
        if pd.notna(row.get('comp_label')):
            agg[team]['comps'].add(row['comp_label'])

    # Turn the weighted sums into per-team averages.
    sb_result = []
    for team, data in agg.items():
        denom = data['matches'] or 1  # avoid ZeroDivisionError on 0-match rows
        comps_str = ", ".join(sorted(data['comps'])) if data['comps'] else "FAWSL"
        result = {
            'team': team,
            'offensive_rating': round(data['offensive_rating_sum'] / denom, 1),
            'defensive_rating': round(data['defensive_rating_sum'] / denom, 1),
            'net_rating': round(data['net_rating_sum'] / denom, 1),
            'composite_rating': round(data['composite_rating_sum'] / denom, 1),
            'matches': data['matches'],
            'goals_scored': data['goals_scored'],
            'comp_label': comps_str,
            'source': 'statsbomb'
        }
        if 'goals_conceded' in data:
            result['goals_conceded'] = data['goals_conceded']
        sb_result.append(result)

    # ----- 2. Normalize StatsBomb ratings to a 0-100 scale (was 0-30) ------
    if sb_result:  # min()/max() would raise on an empty input CSV
        sb_off_min = min(t['offensive_rating'] for t in sb_result)
        sb_off_max = max(t['offensive_rating'] for t in sb_result)
        sb_def_min = min(t['defensive_rating'] for t in sb_result)
        sb_def_max = max(t['defensive_rating'] for t in sb_result)
        off_range = sb_off_max - sb_off_min
        def_range = sb_def_max - sb_def_min
        # Bug fix: the original gated BOTH divisions on the offensive range
        # only, so a zero defensive range raised ZeroDivisionError. Guard
        # each axis separately.
        if off_range > 0:
            for t in sb_result:
                t['offensive_rating'] = round((t['offensive_rating'] - sb_off_min) / off_range * 100, 1)
                if def_range > 0:
                    t['defensive_rating'] = round((t['defensive_rating'] - sb_def_min) / def_range * 100, 1)
                t['net_rating'] = round(t['offensive_rating'] - t['defensive_rating'], 1)
                t['composite_rating'] = round((t['offensive_rating'] + t['defensive_rating']) / 2, 1)

    # ASA opaque team ids -> display names.
    TEAM_MAP = {
        'KPqjw8PQ6v': 'Portland Thorns',
        'aDQ0lzvQEv': 'OL Reign',
        '4JMAk47qKg': 'Chicago Red Stars',
        'XVqKeVKM01': 'Washington Spirit',
        'raMyrr25d2': 'Houston Dash',
        'zeQZeazqKw': 'Orlando Pride',
        '7vQ7BBzqD1': 'FC Kansas City',
        '4wM4rZdqjB': 'North Carolina Courage',
        'Pk5LeeNqOW': 'Kansas City Current',
        '4wM4Ezg5jB': 'Sky Blue FC',
        '7VqG1lYMvW': 'NJ/NY Gotham',
        'eV5DR6YQKn': 'Angel City',
        'kRQa8JOqKZ': 'San Diego Wave',
        'eV5D2w9QKn': 'Bay FC',
        '315VnJ759x': 'Racing Louisville',
        'xW5pwDBMg1': 'Boston Breakers',
        'kRQaWa15KZ': 'Western New York Flash',
    }

    # ----- 3. ASA goals-added data (optional) ------------------------------
    ga_path = DATA / "asa_nwsl" / "goals_added.csv"
    if ga_path.exists():
        ga = pd.read_csv(ga_path)
        # Per team-season totals of the goals-added components.
        team_year = ga.groupby(['team_id_ga', 'season']).agg({
            'minutes_played_ga': 'sum',
            'ga_shooting_raw': 'sum',
            'ga_passing_raw': 'sum',
            'ga_dribbling_raw': 'sum',
            'ga_interrupting_raw': 'sum',
            'ga_receiving_raw': 'sum',
            'player_id': 'count',
        }).reset_index()
        team_year.columns = ['team_id', 'season', 'minutes', 'shooting', 'passing', 'dribbling', 'interrupting', 'receiving', 'players']
        team_year['team'] = team_year['team_id'].map(TEAM_MAP).fillna('Unknown')
        # Drop unmapped teams and partial seasons (< ~5000 team-minutes).
        team_year = team_year[(team_year['team'] != 'Unknown') & (team_year['minutes'] > 5000)]
        # Percentile ranking within each season (0-100).
        team_year['offensive_rating'] = team_year.groupby('season')['shooting'].transform(lambda x: (x.rank(pct=True) * 100).round(1))
        team_year['defensive_rating'] = team_year.groupby('season')['interrupting'].transform(lambda x: (x.rank(pct=True) * 100).round(1))
        team_year['net_rating'] = (team_year['offensive_rating'] - team_year['defensive_rating']).round(1)
        team_year['composite_rating'] = ((team_year['offensive_rating'] + team_year['defensive_rating']) / 2).round(1)
        # Convert minutes to matches (approx 90 min = 1 match)
        team_year['matches'] = (team_year['minutes'] / 90).astype(int)
        team_year['comp_label'] = 'NWSL ' + team_year['season'].astype(str)
        # Aggregate across all years, again weighting by match count.
        asa_agg = {}
        for _, row in team_year.iterrows():
            team = row['team']
            matches = row['matches']
            if team not in asa_agg:
                asa_agg[team] = {
                    'team': team,
                    'matches': 0,
                    'offensive_rating_sum': 0,
                    'defensive_rating_sum': 0,
                    'net_rating_sum': 0,
                    'composite_rating_sum': 0,
                }
            asa_agg[team]['matches'] += matches
            asa_agg[team]['offensive_rating_sum'] += row['offensive_rating'] * matches
            asa_agg[team]['defensive_rating_sum'] += row['defensive_rating'] * matches
            asa_agg[team]['net_rating_sum'] += row['net_rating'] * matches
            asa_agg[team]['composite_rating_sum'] += row['composite_rating'] * matches
        asa_result = []
        for team, data in asa_agg.items():
            denom = data['matches'] or 1  # avoid ZeroDivisionError
            asa_result.append({
                'team': team,
                'offensive_rating': round(data['offensive_rating_sum'] / denom, 1),
                'defensive_rating': round(data['defensive_rating_sum'] / denom, 1),
                'net_rating': round(data['net_rating_sum'] / denom, 1),
                'composite_rating': round(data['composite_rating_sum'] / denom, 1),
                'matches': data['matches'],
                'goals_scored': 0,  # Not available in ASA
                'goals_conceded': 0,
                'comp_label': 'NWSL 2016-2025',
                'source': 'asa'
            })
    else:
        asa_result = []

    # ----- 4. Combine sources, add championship bonus, rank ----------------
    combined = {}
    # Championship wins mapping for clubs (NWSL weighted slightly higher)
    CLUB_CHAMPIONSHIPS = {
        # NWSL (weighted 1.5x)
        "Portland Thorns": 3,  # 2017, 2022, 2024
        "North Carolina Courage": 3,  # 2018, 2019, 2023
        "Kansas City Current": 1,  # 2024 (as Current)
        "FC Kansas City": 2,  # 2014, 2015
        "Western New York Flash": 1,  # 2016
        "OL Reign": 1,  # 2020
        "Seattle Reign": 1,  # 2020
        "Chicago Red Stars": 0,
        "Washington Spirit": 1,  # 2021
        "Houston Dash": 0,
        "Angel City": 0,
        "NJ/NY Gotham": 0,
        "Boston Breakers": 0,
        "Sky Blue FC": 0,
        # FAWSL
        "Chelsea": 4,  # 2015-16, 2017-18, 2019-20, 2020-21
        "Manchester City Women": 2,  # 2016-17, 2020-21
        "Arsenal Women": 1,  # 2022-23
        "Liverpool FFC": 1,  # 2013-14
        "Everton Ladies": 0,
        "Bristol City WFC": 0,
        "Brighton & Hove Albion Women": 0,
        "Reading FC Women": 0,
        "Tottenham Hotspur Women": 0,
        "West Ham United LFC": 0,
        "Aston Villa": 0,
        "Yeovil Town LFC": 0,
        # UWCL
        "Lyon": 8,
        "OL Lyonnes": 8,  # 2016-2020 (5), 2021-22, 2022-23, 2023-24
        "Barcelona": 3,
        "Fútbol Club Barcelona": 3,  # 2020-21, 2021-22, 2022-23
        "Wolfsburg": 2,
        "VfL Wolfsburg": 2,  # 2013-14, 2015-16
        "Paris Saint-Germain": 0,
        "Olympique Lyonnais": 8,
        # Other leagues
        "Bay FC": 0,
        "Racing Louisville": 0,
        "San Diego Wave": 0,
        "FC Barcelona": 3,
    }
    # First add StatsBomb teams
    for t in sb_result:
        combined[t['team']] = t
    # Then merge ASA teams in; when a team exists in both sources, take the
    # match-weighted blend of the two ratings but keep StatsBomb goals data.
    for t in asa_result:
        if t['team'] in combined:
            existing = combined[t['team']]
            total_matches = existing['matches'] + t['matches']
            combined[t['team']] = {
                'team': t['team'],
                'offensive_rating': round((existing['offensive_rating'] * existing['matches'] + t['offensive_rating'] * t['matches']) / total_matches, 1),
                'defensive_rating': round((existing['defensive_rating'] * existing['matches'] + t['defensive_rating'] * t['matches']) / total_matches, 1),
                'net_rating': round((existing['net_rating'] * existing['matches'] + t['net_rating'] * t['matches']) / total_matches, 1),
                'composite_rating': round((existing['composite_rating'] * existing['matches'] + t['composite_rating'] * t['matches']) / total_matches, 1),
                'matches': total_matches,
                'goals_scored': existing.get('goals_scored', 0),
                'goals_conceded': existing.get('goals_conceded', 0),
                'comp_label': 'NWSL + FAWSL',
            }
        else:
            combined[t['team']] = t
    # Add championship wins and WIFX Global Club Ranking.
    nwsl_teams = ["Portland Thorns", "North Carolina Courage", "Kansas City Current", "FC Kansas City",
                  "Western New York Flash", "OL Reign", "Seattle Reign", "Chicago Red Stars",
                  "Washington Spirit", "Houston Dash", "Angel City", "NJ/NY Gotham", "Boston Breakers",
                  "Sky Blue FC", "Bay FC", "Racing Louisville", "San Diego Wave"]
    for team, data in combined.items():
        wins = CLUB_CHAMPIONSHIPS.get(team, 0)
        data['championship_wins'] = wins
        # NWSL championships are weighted 1.5x in the global ranking.
        bonus = wins * 1.5 if team in nwsl_teams else wins
        data['wifx_global_club_ranking'] = data['net_rating'] + bonus
    all_teams = list(combined.values())
    all_teams.sort(key=lambda x: x.get('wifx_global_club_ranking', x.get('composite_rating', 0)), reverse=True)
    write_json("wifx_club_team_scores.json", {"all_teams": all_teams})
# ---------------------------------------------------------------------------
# WIFX Confederation Scores (aggregated across years)
# ---------------------------------------------------------------------------
def build_wifx_confederation_scores():
    """Average per-year confederation club scores and write them ranked.

    Reads data/wifx_club_confederation_scores.csv, averages each team's
    club score across years, sums title/final counts, assigns a 1-based
    rank by descending score, and writes
    wifx_club_confederation_scores.json.
    """
    frame = pd.read_csv(DATA / "wifx_club_confederation_scores.csv")

    summary = (
        frame.groupby("team")
        .agg({
            "wifx_club_score": "mean",
            "country": "first",
            "confederation": "first",
            "championships_won": "sum",
            "finals_reached": "sum",
        })
        .reset_index()
        .sort_values("wifx_club_score", ascending=False)
    )
    # 1-based rank, in the already-sorted order.
    summary = summary.assign(rank=range(1, len(summary) + 1))

    write_json(
        "wifx_club_confederation_scores.json",
        {"club_confederation_scores": summary.to_dict(orient="records")},
    )
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Entry point: load the raw StatsBomb CSVs, then build and write every
    WIFX dashboard JSON artifact into OUT (see module top)."""
    print("Loading StatsBomb events (this may take a minute)...")
    all_events = pd.concat([load_events(c) for c in COMPETITIONS], ignore_index=True)
    print(f" Loaded {len(all_events):,} events")
    all_matches = pd.concat([load_matches(c) for c in COMPETITIONS], ignore_index=True)
    print(f" Loaded {len(all_matches):,} matches")
    all_lineups = pd.concat([load_lineups(c) for c in COMPETITIONS], ignore_index=True)
    print(f" Loaded {len(all_lineups):,} lineup entries")
    # NOTE(review): all_events/all_matches/all_lineups are not passed to any
    # builder below — they only feed the progress counts. Confirm whether the
    # loads are still needed before removing them.
    # Build WIFX dashboards only
    print("Building WIFX scores...")
    wifx = build_wifx_scores()
    write_json("wifx_scores.json", wifx)
    print("Building WIFX historical scores...")
    wifx_hist = build_wifx_historical_scores()
    write_json("wifx_historical_scores.json", wifx_hist)
    print("Building aggregated WIFX national team scores...")
    build_wifx_national_team_scores()
    print("Building aggregated WIFX club team scores...")
    build_wifx_club_team_scores()
    print("Building aggregated WIFX confederation scores...")
    build_wifx_confederation_scores()
    # Bug fix: the old message claimed "data/dashboard/", but everything is
    # written to OUT = ROOT / "output".
    print(f"Done! All JSON files written to {OUT}/")
def write_json(filename: str, data: dict):
    """Serialize *data* to OUT/filename as valid JSON and log the file size.

    json.dump would emit bare ``NaN``/``Infinity`` tokens (invalid JSON), so
    all non-finite floats — and the literal string "NaN" that some exports
    produce — are converted to None first. Unknown object types fall back to
    ``str()`` via ``default=str``.
    """
    import math
    path = OUT / filename

    def clean_nan(obj):
        # Recursively replace non-finite floats and "NaN" strings with None.
        if isinstance(obj, dict):
            return {k: clean_nan(v) for k, v in obj.items()}
        # Generalized: tuples also serialize as JSON arrays, so a NaN inside
        # a tuple would previously have escaped cleaning.
        if isinstance(obj, (list, tuple)):
            return [clean_nan(v) for v in obj]
        if isinstance(obj, float) and not math.isfinite(obj):
            return None
        if obj == "NaN":
            return None
        return obj

    data = clean_nan(data)
    with path.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, default=str)
    size = path.stat().st_size
    print(f" Wrote {path} ({size / 1024:.1f} KB)")
# Script entry point: build all dashboard JSON artifacts.
if __name__ == "__main__":
    main()