Spaces:

scfive
/

socr

Configuration error

File size: 13,153 Bytes

d6ea71e

"""Implements the VAEP framework.

Attributes
----------
xfns_default : list(callable)
    The default VAEP features.

"""

import math
from typing import Any, Optional

import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.metrics import brier_score_loss, roc_auc_score

import socceraction.spadl as spadlcfg

from . import features as fs
from . import formula as vaep
from . import labels as lab

try:
    import xgboost
except ImportError:
    xgboost = None  # type: ignore
try:
    import catboost
except ImportError:
    catboost = None  # type: ignore
try:
    import lightgbm
except ImportError:
    lightgbm = None  # type: ignore


# Default feature transformers used by :class:`VAEP` when no ``xfns`` argument
# is given. ``VAEP.compute_features`` applies every transformer to the game
# states and concatenates the results column-wise in the order listed here.
xfns_default = [
    fs.actiontype_onehot,
    fs.result_onehot,
    fs.actiontype_result_onehot,
    fs.bodypart_onehot,
    fs.time,
    fs.startlocation,
    fs.endlocation,
    fs.startpolar,
    fs.endpolar,
    fs.movement,
    fs.team,
    fs.time_delta,
    fs.space_delta,
    fs.goalscore,
]


class VAEP:
    """
    An implementation of the VAEP framework.

    VAEP (Valuing Actions by Estimating Probabilities) [1]_ defines the
    problem of valuing a soccer player's contributions within a match as
    a binary classification problem and rates actions by estimating its effect
    on the short-term probabilities that a team will both score and concede.

    Parameters
    ----------
    xfns : list
        List of feature transformers (see :mod:`socceraction.vaep.features`)
        used to describe the game states. Uses :attr:`~socceraction.vaep.base.xfns_default`
        if None.
    nb_prev_actions : int, default=3  # noqa: DAR103
        Number of previous actions used to describe the game state.


    References
    ----------
    .. [1] Tom Decroos, Lotte Bransen, Jan Van Haaren, and Jesse Davis.
        "Actions speak louder than goals: Valuing player actions in soccer." In
        Proceedings of the 25th ACM SIGKDD International Conference on Knowledge
        Discovery & Data Mining, pp. 1851-1861. 2019.
    """

    # The helper modules are bound as class attributes and always accessed
    # through ``self``, so a subclass can substitute its own implementations.
    _spadlcfg = spadlcfg
    _fs = fs
    _lab = lab
    _vaep = vaep

    def __init__(
        self,
        xfns: Optional[list[fs.FeatureTransfomer]] = None,
        nb_prev_actions: int = 3,
    ) -> None:
        # One fitted classifier per label column, filled in by `fit`.
        self.__models: dict[str, Any] = {}
        self.xfns = xfns_default if xfns is None else xfns
        self.yfns = [self._lab.scores, self._lab.concedes]
        self.nb_prev_actions = nb_prev_actions

    def compute_features(self, game: pd.Series, game_actions: fs.Actions) -> pd.DataFrame:
        """
        Transform actions to the feature-based representation of game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        features : pd.DataFrame
            Returns the feature-based representation of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)  # type: ignore
        gamestates = self._fs.gamestates(game_actions_with_names, self.nb_prev_actions)
        gamestates = self._fs.play_left_to_right(gamestates, game.home_team_id)
        # Every transformer yields a block of columns; stack them side by side.
        return pd.concat([fn(gamestates) for fn in self.xfns], axis=1)

    def compute_labels(
        self,
        game: pd.Series,
        game_actions: fs.Actions,  # pylint: disable=W0613
    ) -> pd.DataFrame:
        """
        Compute the labels for each game state in the given game.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.

        Returns
        -------
        labels : pd.DataFrame
            Returns the labels of each game state in the game.
        """
        game_actions_with_names = self._spadlcfg.add_names(game_actions)  # type: ignore
        return pd.concat([fn(game_actions_with_names) for fn in self.yfns], axis=1)

    def _select_feature_columns(self, X: pd.DataFrame) -> list[str]:
        """Return the model's feature column names, verifying `X` has them all.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe.

        Returns
        -------
        list
            The feature column names derived from ``self.xfns`` and
            ``self.nb_prev_actions``.
        """
        cols = self._fs.feature_column_names(self.xfns, self.nb_prev_actions)
        missing = set(cols).difference(X.columns)
        if missing:
            # Sorted so that the error message is deterministic.
            missing_cols = " and ".join(sorted(missing))
            raise ValueError(f"{missing_cols} are not available in the features dataframe")
        return cols

    def fit(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
        learner: str = "xgboost",
        val_size: float = 0.25,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "VAEP":
        """
        Fit the model according to the given training data.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.
        learner : string, default='xgboost'  # noqa: DAR103
            Gradient boosting implementation which should be used to learn the
            model. The supported learners are 'xgboost', 'catboost' and 'lightgbm'.
        val_size : float, default=0.25  # noqa: DAR103
            Fraction of the dataset that will be used as the validation set
            for early stopping. When zero, no validation data will be used.
        tree_params : dict
            Parameters passed to the constructor of the learner.
        fit_params : dict
            Parameters passed to the fit method of the learner.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe, or
            if `learner` is not one of the supported learners.
        ImportError
            If the package of the selected learner is not installed.

        Returns
        -------
        self
            Fitted VAEP model.

        """
        nb_states = len(X)
        idx = np.random.permutation(nb_states)
        # Cut the shuffled positions at a single point so that every sample
        # ends up in exactly one of the two sets.
        split = math.floor(nb_states * (1 - val_size))
        train_idx = idx[:split]
        val_idx = idx[split:]

        # filter feature columns
        cols = self._select_feature_columns(X)

        # resolve the learner once, before any model is trained
        fitters = {
            "xgboost": self._fit_xgboost,
            "catboost": self._fit_catboost,
            "lightgbm": self._fit_lightgbm,
        }
        if learner not in fitters:
            raise ValueError(f"A {learner} learner is not supported")
        fit_model = fitters[learner]

        # split train and validation data
        X_train, y_train = X.iloc[train_idx][cols], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx][cols], y.iloc[val_idx]

        # train classifiers F(X) = Y, one binary classifier per label column
        for col in list(y.columns):
            eval_set = [(X_val, y_val[col])] if val_size > 0 else None
            self.__models[col] = fit_model(
                X_train, y_train[col], eval_set, tree_params, fit_params
            )
        return self

    def _fit_xgboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "xgboost.XGBClassifier":
        """Train an XGBoost classifier for a single label.

        Default constructor and fit parameters are used when `tree_params`
        or `fit_params` is None. When `eval_set` is given, it is added to
        the fit parameters.

        Raises
        ------
        ImportError
            If xgboost is not installed.
        """
        if xgboost is None:
            raise ImportError("xgboost is not installed.")
        # Default settings
        if tree_params is None:
            tree_params = {
                "n_estimators": 100,
                "max_depth": 3,
                "eval_metric": "auc",
                "early_stopping_rounds": 10,
                "enable_categorical": True,
            }
        if fit_params is None:
            fit_params = {"verbose": True}
        if eval_set is not None:
            # Build a new dict so the caller's `fit_params` is not mutated.
            fit_params = {**fit_params, "eval_set": eval_set}
        # Train the model
        model = xgboost.XGBClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_catboost(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "catboost.CatBoostClassifier":
        """Train a CatBoost classifier for a single label.

        Default constructor and fit parameters are used when `tree_params`
        or `fit_params` is None. The default fit parameters mark every
        column of `X` with a ``category`` dtype as a categorical feature.
        When `eval_set` is given, it is added to the fit parameters together
        with early stopping after 10 rounds.

        Raises
        ------
        ImportError
            If catboost is not installed.
        """
        if catboost is None:
            raise ImportError("catboost is not installed.")
        # Default settings
        if tree_params is None:
            tree_params = {
                "eval_metric": "BrierScore",
                "loss_function": "Logloss",
                "iterations": 100,
            }
        if fit_params is None:
            # `DataFrame.items` replaces `iteritems`, which pandas 2.0 removed.
            is_cat_feature = [c.dtype.name == "category" for (_, c) in X.items()]
            fit_params = {
                "cat_features": np.nonzero(is_cat_feature)[0].tolist(),
                "verbose": True,
            }
        if eval_set is not None:
            val_params = {"early_stopping_rounds": 10, "eval_set": eval_set}
            # Build a new dict so the caller's `fit_params` is not mutated.
            fit_params = {**fit_params, **val_params}
        # Train the model
        model = catboost.CatBoostClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _fit_lightgbm(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
        tree_params: Optional[dict[str, Any]] = None,
        fit_params: Optional[dict[str, Any]] = None,
    ) -> "lightgbm.LGBMClassifier":
        """Train a LightGBM classifier for a single label.

        Default constructor and fit parameters are used when `tree_params`
        or `fit_params` is None. When `eval_set` is given, it is added to
        the fit parameters together with early stopping after 10 rounds.

        Raises
        ------
        ImportError
            If lightgbm is not installed.
        """
        if lightgbm is None:
            raise ImportError("lightgbm is not installed.")
        # NOTE(review): the `verbose` and `early_stopping_rounds` keywords of
        # `fit` appear to have been removed in LightGBM 4.0 in favour of
        # callbacks -- confirm against the LightGBM versions this project
        # supports before relying on the defaults below.
        if tree_params is None:
            tree_params = {"n_estimators": 100, "max_depth": 3}
        if fit_params is None:
            fit_params = {"eval_metric": "auc", "verbose": True}
        if eval_set is not None:
            val_params = {"early_stopping_rounds": 10, "eval_set": eval_set}
            # Build a new dict so the caller's `fit_params` is not mutated.
            fit_params = {**fit_params, **val_params}
        # Train the model
        model = lightgbm.LGBMClassifier(**tree_params)
        return model.fit(X, y, **fit_params)

    def _estimate_probabilities(self, X: pd.DataFrame) -> pd.DataFrame:
        """Predict the positive-class probability of every fitted label.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.

        Raises
        ------
        ValueError
            If one of the features is missing in the provided dataframe.

        Returns
        -------
        pd.DataFrame
            One column per fitted model. The rows follow the row order of
            `X` but carry a fresh default index rather than the index of `X`.
        """
        # filter feature columns (once, the selection is the same for all models)
        features = X[self._select_feature_columns(X)]

        Y_hat = pd.DataFrame()
        for col, model in self.__models.items():
            # Column 1 of `predict_proba` holds the positive-class probability.
            Y_hat[col] = [p[1] for p in model.predict_proba(features)]
        return Y_hat

    def rate(
        self,
        game: pd.Series,
        game_actions: fs.Actions,
        game_states: Optional[fs.Features] = None,
    ) -> pd.DataFrame:
        """
        Compute the VAEP rating for the given game states.

        Parameters
        ----------
        game : pd.Series
            The SPADL representation of a single game.
        game_actions : pd.DataFrame
            The actions performed during `game` in the SPADL representation.
        game_states : pd.DataFrame, default=None
            DataFrame with the game state representation of each action. If
            `None`, these will be computed on-the-fly.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.
        ValueError
            If one of the features is missing in the game state representation.

        Returns
        -------
        ratings : pd.DataFrame
            Returns the VAEP rating for each given action, as well as the
            offensive and defensive value of each action.
        """
        if not self.__models:
            raise NotFittedError()

        game_actions_with_names = self._spadlcfg.add_names(game_actions)  # type: ignore
        if game_states is None:
            game_states = self.compute_features(game, game_actions)

        y_hat = self._estimate_probabilities(game_states)
        p_scores, p_concedes = y_hat["scores"], y_hat["concedes"]
        return self._vaep.value(game_actions_with_names, p_scores, p_concedes)

    def score(self, X: pd.DataFrame, y: pd.DataFrame) -> dict[str, dict[str, float]]:
        """Evaluate the fit of the model on the given test data and labels.

        Parameters
        ----------
        X : pd.DataFrame
            Feature representation of the game states.
        y : pd.DataFrame
            Scoring and conceding labels for each game state.

        Raises
        ------
        NotFittedError
            If the model is not fitted yet.
        ValueError
            If one of the features is missing in the provided dataframe.

        Returns
        -------
        score : dict
            The Brier and AUROC scores for both binary classification problems.
        """
        if not self.__models:
            raise NotFittedError()

        y_hat = self._estimate_probabilities(X)

        scores: dict[str, dict[str, float]] = {}
        for col in self.__models:
            scores[col] = {
                "brier": brier_score_loss(y[col], y_hat[col]),
                "auroc": roc_auc_score(y[col], y_hat[col]),
            }

        return scores