phong.dao committed on
Commit
9e6c24e
1 Parent(s): 38b12ed
Files changed (7)
  1. app.py +107 -49
  2. configs/config.py +1 -1
  3. configs/constants.py +1 -1
  4. ml/data_prepare.py +249 -70
  5. ml/model.py +135 -67
  6. ml/predictor.py +135 -31
  7. ml/utils.py +2 -2
app.py CHANGED
@@ -1,3 +1,4 @@
+import math
 import os.path
 import shutil
 
@@ -9,7 +10,25 @@ import requests
 from configs.config import cfg
 from ml.model import base_df, ml_model
 from ml.predictor import Predictor
-from ml.utils import load_pickle
+
+
+def get_result(team1, prob1, score1, team2, prob2, score2, probtie):
+    if prob1 > prob2 and prob1 > probtie:
+        winner = {"name": team1, "probability": prob1, "goals": score1}
+        loser = {"name": team2, "probability": prob2, "goals": score2}
+
+    elif prob1 < prob2 and prob2 > probtie:
+        loser = {"name": team1, "probability": prob1, "goals": score1}
+        winner = {"name": team2, "probability": prob2, "goals": score2}
+    else:
+        loser = {"name": None, "probability": 0.0, "goals": score1}
+        winner = {"name": None, "probability": 0.0, "goals": score2}
+    result = {
+        "winner": winner,
+        "loser": loser,
+        "draw": {"probability": probtie},
+    }
+    return result
 
 
 def function(team1, team2):
@@ -23,65 +42,104 @@ def function(team1, team2):
     response = requests.get(cfg.live_prediction)
     if response.status_code == 200:
         five_thirty_eight_predict = response.json()
-        for match in five_thirty_eight_predict['matches']:
-            if (team1 == match['team1'] and team2 == match['team2']) \
-                    or (team1 == match['team2'] and team2 == match['team1']):
-                if match['status'] != 'live':
-                    probability = {
-                        match['team1']: match['prob1'],
-                        match['team2']: match['prob2'],
-                        'draw': match['probtie'],
-                    }
-                else:
-                    probability = {
-                        match['team1']: match['live_winprobs']['winprobs'][-1]['prob1'],
-                        match['team2']: match['live_winprobs']['winprobs'][-1]['prob2'],
-                        'draw': match['live_winprobs']['winprobs'][-1]['probtie'],
-                    }
-                if match['probtie'] < match['prob1'] or match['probtie'] < match['prob2']:
-                    if match['prob1'] > match['prob2']:
-                        winner = match['team1']
-                    else:
-                        winner = match['team2']
-                else:
-                    return {
-                        "result": 'Draw!',
-                        "probability": probability
-                    }
-                return {
-                    "winner": winner,
-                    "probability": probability
-                }
+        for match in five_thirty_eight_predict["matches"]:
+            if not (
+                (team1 == match["team1"] and team2 == match["team2"])
+                or (team1 == match["team2"] and team2 == match["team1"])
+            ):
+                continue
+
+            if match["status"] != "live":
+                result = get_result(
+                    match["team1"],
+                    match["prob1"],
+                    math.ceil(match["adj_score1"])
+                    if "adj_score1" in match
+                    else math.ceil(match["o1"] - match["d2"]),
+                    match["team2"],
+                    match["prob2"],
+                    math.ceil(match["adj_score2"])
+                    if "adj_score2" in match
+                    else math.ceil(match["o2"] - match["d1"]),
+                    match["probtie"],
+                )
+            else:
+                result = get_result(
+                    match["team1"],
+                    match["live_winprobs"]["winprobs"][-1]["prob1"],
+                    math.ceil(match["adj_score1"])
+                    if "adj_score1" in match
+                    else math.ceil(match["o1"] - match["d2"]),
+                    match["team2"],
+                    match["live_winprobs"]["winprobs"][-1]["prob2"],
+                    math.ceil(match["adj_score2"])
+                    if "adj_score2" in match
+                    else math.ceil(match["o2"] - match["d1"]),
+                    match["probtie"],
+                )
+            return result
 
     draw, winner, winner_proba = predictor.predict(team1, team2)
     if draw:
+        draw_prob = round(random.uniform(0.7, 0.9), 10)
+        winner_proba = round(random.uniform(0, 1 - draw_prob), 10)
+        loser_proba = 1 - draw_prob - winner_proba
         return {
-            'result': "Draw!",
-            'probability': round(random.uniform(0.7, 0.9), 10)
+            "winner": {"name": team1, "probability": winner_proba, "goals": None},
+            "loser": {"name": team2, "probability": loser_proba, "goals": None},
+            "draw": {"probability": draw_prob},
         }
     else:
+        loser_proba = round(random.uniform(0, 1 - winner_proba), 10)
        return {
-            'winner': winner,
-            'probability': winner_proba
+            "winner": {"name": winner, "probability": winner_proba, "goals": None},
+            "loser": {
+                "name": team1 if winner == team2 else team2,
+                "probability": loser_proba,
+                "goals": None,
+            },
+            "draw": {"probability": 1 - winner_proba - loser_proba},
        }
 
 
-shutil.copytree("static", os.path.abspath(os.path.join(
-    os.path.dirname(gr.__file__), "templates/frontend/static")), dirs_exist_ok=True)
-shutil.copy("templates/asset.html", os.path.abspath(os.path.join(
-    os.path.dirname(gr.__file__), "templates/frontend/static/asset.html")))
-shutil.copytree("templates/asset", os.path.abspath(os.path.join(
-    os.path.dirname(gr.__file__), "templates/frontend/static/asset")), dirs_exist_ok=True)
+shutil.copytree(
+    "static",
+    os.path.abspath(
+        os.path.join(os.path.dirname(gr.__file__), "templates/frontend/static")
+    ),
+    dirs_exist_ok=True,
+)
+shutil.copy(
+    "templates/asset.html",
+    os.path.abspath(
+        os.path.join(
+            os.path.dirname(gr.__file__), "templates/frontend/static/asset.html"
+        )
+    ),
+)
+shutil.copytree(
+    "templates/asset",
+    os.path.abspath(
+        os.path.join(os.path.dirname(gr.__file__), "templates/frontend/static/asset")
+    ),
+    dirs_exist_ok=True,
+)
 predictor = Predictor(base_df, ml_model)
-examples = random.choices([x[1:3] for x in load_pickle("data/table_match.pkl")['matches']], k=20)
+examples = (
+    ("Croatia", "Argentina"),
+    ("Morocco", "France"),
+    ("Argentina", "France"),
+    ("Morocco", "Croatia"),
+)
 examples = [list(x) for x in examples]
-iface = gr.Interface(fn=function,
-                     inputs=[gr.Textbox(placeholder="Qatar"), gr.Textbox(placeholder="Ecuador")],
-                     outputs="json",
-                     title="WorldCup-Prediction \n\n "
-                           "Predicting the 2022 FIFA World Cup results with Machine Learning!",
-                     examples=examples,
-                     article=f'<iframe style="width: 100%; height: 2000px" src=\'./static/asset.html\' ></iframe>',
-                     )
+iface = gr.Interface(
+    fn=function,
+    inputs=[gr.Textbox(placeholder="Qatar"), gr.Textbox(placeholder="Ecuador")],
+    outputs="json",
+    title="WorldCup-Prediction \n\n "
+    "Predicting the 2022 FIFA World Cup results with Machine Learning!",
+    examples=examples,
+    article=f"<iframe style=\"width: 100%; height: 2000px\" src='./static/asset.html' ></iframe>",
)
 iface.queue(concurrency_count=5)
 iface.launch()
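
Note on the new response shape: get_result picks the winner from whichever team probability dominates both the other team's probability and the tie probability, and returns null names when neither does; the goals fall back to math.ceil(o1 - d2) (presumably FiveThirtyEight offensive/defensive ratings) when no adjusted score is present. A minimal sketch of the non-live path, assuming the get_result defined above and using invented numbers rather than real API output:

# Hedged example: probabilities and scores are made up, not from the API.
payload = get_result("Brazil", 0.55, 2, "Serbia", 0.20, 0, 0.25)
# prob1 beats both prob2 and probtie, so Brazil is resolved as the winner:
# {"winner": {"name": "Brazil", "probability": 0.55, "goals": 2},
#  "loser": {"name": "Serbia", "probability": 0.20, "goals": 0},
#  "draw": {"probability": 0.25}}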
configs/config.py CHANGED
@@ -4,7 +4,7 @@ from typing import Text, Union
 from omegaconf import OmegaConf, DictConfig, ListConfig
 
 
-def get_config(config_file: Text = 'base') -> Union[DictConfig, ListConfig]:
+def get_config(config_file: Text = "base") -> Union[DictConfig, ListConfig]:
     if not config_file.endswith(".yaml") or not config_file.endswith(".yml"):
         config_file += ".yaml"
     root_configs_dir = os.path.abspath(os.path.join(__file__, ".."))
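
One detail of the suffix check this hunk touches: since no name can end with both ".yaml" and ".yml" at once, the `or` of two negated endswith calls is true for every input, so ".yaml" is appended unconditionally. A standalone sketch (plain Python, no project imports) of the behavior as written; an `and` would express a "has neither suffix" test:

for name in ("base", "base.yaml", "base.yml"):
    resolved = name
    if not resolved.endswith(".yaml") or not resolved.endswith(".yml"):
        resolved += ".yaml"
    print(name, "->", resolved)
# base -> base.yaml
# base.yaml -> base.yaml.yaml
# base.yml -> base.yml.yaml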
configs/constants.py CHANGED
@@ -11,6 +11,6 @@ SUPPORT_MODEL = (
     "RandomForestClassifier",
     "LGBMClassifier",
     "XGBClassifier",
-    "GradientBoostingClassifier"
+    "GradientBoostingClassifier",
 )
 DEFAULT_MODEL = "GradientBoostingClassifier"
ml/data_prepare.py CHANGED
@@ -33,7 +33,8 @@ def create_dataset(df: pd.DataFrame):
     """
     x_, y = df.iloc[:, 3:], df[["target"]]
     x_train, x_test, y_train, y_test = train_test_split(
-        x_, y, test_size=0.22, random_state=100)
+        x_, y, test_size=0.22, random_state=100
+    )
     return x_train, x_test, y_train, y_test
 
 
@@ -55,28 +56,59 @@ def data_preparing():
     rank = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.rank_file))
     rank["rank_date"] = pd.to_datetime(rank["rank_date"])
     rank = rank[(rank["rank_date"] >= cfg.day_get_rank)].reset_index(drop=True)
-    rank["country_full"] = rank["country_full"].str.replace("IR Iran", "Iran").str.replace("Korea Republic",
-                                                                                           "South Korea").str.replace(
-        "USA", "United States")
+    rank["country_full"] = (
+        rank["country_full"]
+        .str.replace("IR Iran", "Iran")
+        .str.replace("Korea Republic", "South Korea")
+        .str.replace("USA", "United States")
+    )
 
     # The merge is made in order to get a dataset FIFA games and its rankings.
-    rank = rank.set_index(['rank_date']).groupby(['country_full'], group_keys=False).resample('D').first().fillna(
-        method='ffill').reset_index()
+    rank = (
+        rank.set_index(["rank_date"])
+        .groupby(["country_full"], group_keys=False)
+        .resample("D")
+        .first()
+        .fillna(method="ffill")
+        .reset_index()
+    )
     df_wc_ranked = df.merge(
-        rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
-        left_on=["date", "home_team"], right_on=["rank_date", "country_full"]).drop(["rank_date", "country_full"],
-                                                                                    axis=1)
+        rank[
+            [
+                "country_full",
+                "total_points",
+                "previous_points",
+                "rank",
+                "rank_change",
+                "rank_date",
+            ]
+        ],
+        left_on=["date", "home_team"],
+        right_on=["rank_date", "country_full"],
+    ).drop(["rank_date", "country_full"], axis=1)
 
     df_wc_ranked = df_wc_ranked.merge(
-        rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
-        left_on=["date", "away_team"], right_on=["rank_date", "country_full"], suffixes=("_home", "_away")).drop(
-        ["rank_date", "country_full"], axis=1)
+        rank[
+            [
+                "country_full",
+                "total_points",
+                "previous_points",
+                "rank",
+                "rank_change",
+                "rank_date",
+            ]
+        ],
+        left_on=["date", "away_team"],
+        right_on=["rank_date", "country_full"],
+        suffixes=("_home", "_away"),
+    ).drop(["rank_date", "country_full"], axis=1)
 
     # Featuring
     df = df_wc_ranked
 
     df[["result", "home_team_points", "away_team_points"]] = df.apply(
-        lambda x: result_finder(x["home_score"], x["away_score"]), axis=1)
+        lambda x: result_finder(x["home_score"], x["away_score"]), axis=1
+    )
 
     # we create columns that will help in the creation of the features: ranking difference,
     # points won at the game vs. team faced rank, and goals difference in the game.
@@ -90,16 +122,54 @@ def data_preparing():
     # unify them and calculate the past game values.
     # After that, I'll separate again and merge them, retrieving the original dataset.
     # This process optimizes the creation of the features.
-    home_team = df[["date", "home_team", "home_score", "away_score", "rank_home", "rank_away", "rank_change_home",
-                    "total_points_home", "result", "rank_dif", "points_home_by_rank", "home_team_points"]]
-
-    away_team = df[["date", "away_team", "away_score", "home_score", "rank_away", "rank_home", "rank_change_away",
-                    "total_points_away", "result", "rank_dif", "points_away_by_rank", "away_team_points"]]
-    home_team.columns = [h.replace("home_", "").replace("_home", "").replace("away_", "suf_").replace("_away", "_suf")
-                         for h in home_team.columns]
-
-    away_team.columns = [a.replace("away_", "").replace("_away", "").replace("home_", "suf_").replace("_home", "_suf")
-                         for a in away_team.columns]
+    home_team = df[
+        [
+            "date",
+            "home_team",
+            "home_score",
+            "away_score",
+            "rank_home",
+            "rank_away",
+            "rank_change_home",
+            "total_points_home",
+            "result",
+            "rank_dif",
+            "points_home_by_rank",
+            "home_team_points",
+        ]
+    ]
+
+    away_team = df[
+        [
+            "date",
+            "away_team",
+            "away_score",
+            "home_score",
+            "rank_away",
+            "rank_home",
+            "rank_change_away",
+            "total_points_away",
+            "result",
+            "rank_dif",
+            "points_away_by_rank",
+            "away_team_points",
+        ]
+    ]
+    home_team.columns = [
+        h.replace("home_", "")
+        .replace("_home", "")
+        .replace("away_", "suf_")
+        .replace("_away", "_suf")
+        for h in home_team.columns
+    ]
+
+    away_team.columns = [
+        a.replace("away_", "")
+        .replace("_away", "")
+        .replace("home_", "suf_")
+        .replace("_home", "_suf")
+        for a in away_team.columns
+    ]
     team_stats = home_team.append(away_team)
 
     stats_val = []
@@ -109,7 +179,7 @@ def data_preparing():
         date = row["date"]
         past_games = team_stats.loc[
             (team_stats["team"] == team) & (team_stats["date"] < date)
-        ].sort_values(by=['date'], ascending=False)
+        ].sort_values(by=["date"], ascending=False)
         last5 = past_games.head(5)
 
         goals = past_games["score"].mean()
@@ -122,9 +192,13 @@ def data_preparing():
         rank_l5 = last5["rank_suf"].mean()
 
         if len(last5) > 0:
-            points = past_games["total_points"].values[0] - past_games["total_points"].values[
-                -1]  # amount of points earned
-            points_l5 = last5["total_points"].values[0] - last5["total_points"].values[-1]
+            points = (
+                past_games["total_points"].values[0]
+                - past_games["total_points"].values[-1]
+            )  # amount of points earned
+            points_l5 = (
+                last5["total_points"].values[0] - last5["total_points"].values[-1]
+            )
         else:
             points = 0
             points_l5 = 0
@@ -136,49 +210,109 @@ def data_preparing():
         gp_rank_l5 = last5["points_by_rank"].mean()
 
         stats_val.append(
-            [goals, goals_l5, goals_suf, goals_suf_l5, rank, rank_l5, points, points_l5, gp, gp_l5, gp_rank,
-             gp_rank_l5])
-
-    stats_cols = ["goals_mean", "goals_mean_l5", "goals_suf_mean", "goals_suf_mean_l5", "rank_mean", "rank_mean_l5",
-                  "points_mean", "points_mean_l5", "game_points_mean", "game_points_mean_l5",
-                  "game_points_rank_mean", "game_points_rank_mean_l5"]
+            [
+                goals,
+                goals_l5,
+                goals_suf,
+                goals_suf_l5,
+                rank,
+                rank_l5,
+                points,
+                points_l5,
+                gp,
+                gp_l5,
+                gp_rank,
+                gp_rank_l5,
+            ]
+        )
+
+    stats_cols = [
+        "goals_mean",
+        "goals_mean_l5",
+        "goals_suf_mean",
+        "goals_suf_mean_l5",
+        "rank_mean",
+        "rank_mean_l5",
+        "points_mean",
+        "points_mean_l5",
+        "game_points_mean",
+        "game_points_mean_l5",
+        "game_points_rank_mean",
+        "game_points_rank_mean_l5",
+    ]
 
     stats_df = pd.DataFrame(stats_val, columns=stats_cols)
 
-    full_df = pd.concat([team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False)
+    full_df = pd.concat(
+        [team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False
+    )
 
-    home_team_stats = full_df.iloc[:int(full_df.shape[0] / 2), :]
-    away_team_stats = full_df.iloc[int(full_df.shape[0] / 2):, :]
+    home_team_stats = full_df.iloc[: int(full_df.shape[0] / 2), :]
+    away_team_stats = full_df.iloc[int(full_df.shape[0] / 2) :, :]
 
     home_team_stats = home_team_stats[home_team_stats.columns[-12:]]
     away_team_stats = away_team_stats[away_team_stats.columns[-12:]]
 
-    home_team_stats.columns = ['home_' + str(col) for col in home_team_stats.columns]
-    away_team_stats.columns = ['away_' + str(col) for col in away_team_stats.columns]
+    home_team_stats.columns = ["home_" + str(col) for col in home_team_stats.columns]
+    away_team_stats.columns = ["away_" + str(col) for col in away_team_stats.columns]
 
     # In order to unify the database, is needed to add home and away suffix for each column.
     # After that, the data is ready to be merged.
-    match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)
+    match_stats = pd.concat(
+        [home_team_stats, away_team_stats.reset_index(drop=True)],
+        axis=1,
+        ignore_index=False,
+    )
 
-    full_df = pd.concat([df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False)
+    full_df = pd.concat(
+        [df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False
+    )
 
     # Drop friendly game
     full_df["is_friendly"] = full_df["tournament"].apply(lambda x: find_friendly(x))
     full_df = pd.get_dummies(full_df, columns=["is_friendly"])
 
     base_df = full_df[
-        ["date", "home_team", "away_team", "rank_home", "rank_away", "home_score", "away_score", "result",
-         "rank_dif", "rank_change_home", "rank_change_away", 'home_goals_mean',
-         'home_goals_mean_l5', 'home_goals_suf_mean', 'home_goals_suf_mean_l5',
-         'home_rank_mean', 'home_rank_mean_l5', 'home_points_mean',
-         'home_points_mean_l5', 'away_goals_mean', 'away_goals_mean_l5',
-         'away_goals_suf_mean', 'away_goals_suf_mean_l5', 'away_rank_mean',
-         'away_rank_mean_l5', 'away_points_mean', 'away_points_mean_l5', 'home_game_points_mean',
-         'home_game_points_mean_l5',
-         'home_game_points_rank_mean', 'home_game_points_rank_mean_l5', 'away_game_points_mean',
-         'away_game_points_mean_l5', 'away_game_points_rank_mean',
-         'away_game_points_rank_mean_l5',
-         'is_friendly_0', 'is_friendly_1']]
+        [
+            "date",
+            "home_team",
+            "away_team",
+            "rank_home",
+            "rank_away",
+            "home_score",
+            "away_score",
+            "result",
+            "rank_dif",
+            "rank_change_home",
+            "rank_change_away",
+            "home_goals_mean",
+            "home_goals_mean_l5",
+            "home_goals_suf_mean",
+            "home_goals_suf_mean_l5",
+            "home_rank_mean",
+            "home_rank_mean_l5",
+            "home_points_mean",
+            "home_points_mean_l5",
+            "away_goals_mean",
+            "away_goals_mean_l5",
+            "away_goals_suf_mean",
+            "away_goals_suf_mean_l5",
+            "away_rank_mean",
+            "away_rank_mean_l5",
+            "away_points_mean",
+            "away_points_mean_l5",
+            "home_game_points_mean",
+            "home_game_points_mean_l5",
+            "home_game_points_rank_mean",
+            "home_game_points_rank_mean_l5",
+            "away_game_points_mean",
+            "away_game_points_mean_l5",
+            "away_game_points_rank_mean",
+            "away_game_points_rank_mean_l5",
+            "is_friendly_0",
+            "is_friendly_1",
+        ]
+    ]
 
     df = base_df.dropna()
 
@@ -207,30 +341,75 @@ def create_db(df):
     :param df:
     :return:
     """
-    columns = ["home_team", "away_team", "target", "rank_dif", "home_goals_mean",
-               "home_rank_mean", "away_goals_mean", "away_rank_mean", "home_rank_mean_l5", "away_rank_mean_l5",
-               "home_goals_suf_mean", "away_goals_suf_mean", "home_goals_mean_l5", "away_goals_mean_l5",
-               "home_goals_suf_mean_l5", "away_goals_suf_mean_l5", "home_game_points_rank_mean",
-               "home_game_points_rank_mean_l5", "away_game_points_rank_mean", "away_game_points_rank_mean_l5",
-               "is_friendly_0", "is_friendly_1"]
+    columns = [
+        "home_team",
+        "away_team",
+        "target",
+        "rank_dif",
+        "home_goals_mean",
+        "home_rank_mean",
+        "away_goals_mean",
+        "away_rank_mean",
+        "home_rank_mean_l5",
+        "away_rank_mean_l5",
+        "home_goals_suf_mean",
+        "away_goals_suf_mean",
+        "home_goals_mean_l5",
+        "away_goals_mean_l5",
+        "home_goals_suf_mean_l5",
+        "away_goals_suf_mean_l5",
+        "home_game_points_rank_mean",
+        "home_game_points_rank_mean_l5",
+        "away_game_points_rank_mean",
+        "away_game_points_rank_mean_l5",
+        "is_friendly_0",
+        "is_friendly_1",
+    ]
 
     base = df.loc[:, columns]
     base.loc[:, "goals_dif"] = base["home_goals_mean"] - base["away_goals_mean"]
-    base.loc[:, "goals_dif_l5"] = base["home_goals_mean_l5"] - base["away_goals_mean_l5"]
-    base.loc[:, "goals_suf_dif"] = base["home_goals_suf_mean"] - base["away_goals_suf_mean"]
-    base.loc[:, "goals_suf_dif_l5"] = base["home_goals_suf_mean_l5"] - base["away_goals_suf_mean_l5"]
-    base.loc[:, "goals_per_ranking_dif"] = (base["home_goals_mean"] / base["home_rank_mean"]) - (
-        base["away_goals_mean"] / base["away_rank_mean"])
+    base.loc[:, "goals_dif_l5"] = (
+        base["home_goals_mean_l5"] - base["away_goals_mean_l5"]
+    )
+    base.loc[:, "goals_suf_dif"] = (
+        base["home_goals_suf_mean"] - base["away_goals_suf_mean"]
+    )
+    base.loc[:, "goals_suf_dif_l5"] = (
+        base["home_goals_suf_mean_l5"] - base["away_goals_suf_mean_l5"]
+    )
+    base.loc[:, "goals_per_ranking_dif"] = (
+        base["home_goals_mean"] / base["home_rank_mean"]
+    ) - (base["away_goals_mean"] / base["away_rank_mean"])
     base.loc[:, "dif_rank_agst"] = base["home_rank_mean"] - base["away_rank_mean"]
-    base.loc[:, "dif_rank_agst_l5"] = base["home_rank_mean_l5"] - base["away_rank_mean_l5"]
-    base.loc[:, "dif_points_rank"] = base["home_game_points_rank_mean"] - base["away_game_points_rank_mean"]
-    base.loc[:, "dif_points_rank_l5"] = base["home_game_points_rank_mean_l5"] - base[
-        "away_game_points_rank_mean_l5"]
+    base.loc[:, "dif_rank_agst_l5"] = (
+        base["home_rank_mean_l5"] - base["away_rank_mean_l5"]
+    )
+    base.loc[:, "dif_points_rank"] = (
+        base["home_game_points_rank_mean"] - base["away_game_points_rank_mean"]
+    )
+    base.loc[:, "dif_points_rank_l5"] = (
+        base["home_game_points_rank_mean_l5"] - base["away_game_points_rank_mean_l5"]
+    )
 
     model_df = base[
-        ["home_team", "away_team", "target", "rank_dif", "goals_dif", "goals_dif_l5",
-         "goals_suf_dif", "goals_suf_dif_l5", "goals_per_ranking_dif", "dif_rank_agst", "dif_rank_agst_l5",
-         "dif_points_rank", "dif_points_rank_l5", "is_friendly_0", "is_friendly_1"]]
+        [
+            "home_team",
+            "away_team",
+            "target",
+            "rank_dif",
+            "goals_dif",
+            "goals_dif_l5",
+            "goals_suf_dif",
+            "goals_suf_dif_l5",
+            "goals_per_ranking_dif",
+            "dif_rank_agst",
+            "dif_rank_agst_l5",
+            "dif_points_rank",
+            "dif_points_rank_l5",
+            "is_friendly_0",
+            "is_friendly_1",
+        ]
+    ]
     return model_df
 
 
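
The past-game features reformatted above all follow one pattern: filter a team's earlier games, sort them newest first, and average over all of them and over the last five. A self-contained toy version of that loop body (column names mirror team_stats; the data is invented):

import pandas as pd

team_stats = pd.DataFrame(
    {
        "team": ["Brazil"] * 7,
        "date": pd.date_range("2022-01-01", periods=7, freq="30D"),
        "score": [1, 3, 0, 2, 2, 4, 1],
    }
)
row = {"team": "Brazil", "date": pd.Timestamp("2022-07-01")}
past_games = team_stats.loc[
    (team_stats["team"] == row["team"]) & (team_stats["date"] < row["date"])
].sort_values(by=["date"], ascending=False)
last5 = past_games.head(5)
print(past_games["score"].mean(), last5["score"].mean())  # ~1.86 and 1.8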
ml/model.py CHANGED
@@ -7,8 +7,14 @@ import numpy as np
 import xgboost as xgb
 from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
 from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, plot_confusion_matrix, roc_curve, \
-    classification_report
+from sklearn.metrics import (
+    accuracy_score,
+    roc_auc_score,
+    cohen_kappa_score,
+    plot_confusion_matrix,
+    roc_curve,
+    classification_report,
+)
 from sklearn.model_selection import GridSearchCV
 from sklearn.neural_network import MLPClassifier
 from sklearn.tree import DecisionTreeClassifier
@@ -23,11 +29,11 @@ def plot_roc_cur(fper, tper):
     :param fper:
     :param tper:
     """
-    plt.plot(fper, tper, color='orange', label='ROC')
-    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
-    plt.xlabel('False Positive Rate')
-    plt.ylabel('True Positive Rate')
-    plt.title('Receiver Operating Characteristic (ROC) Curve')
+    plt.plot(fper, tper, color="orange", label="ROC")
+    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--")
+    plt.xlabel("False Positive Rate")
+    plt.ylabel("True Positive Rate")
+    plt.title("Receiver Operating Characteristic (ROC) Curve")
     plt.legend()
     plt.show()
 
@@ -39,8 +45,11 @@ class MLModel:
 
     def __init__(self, model_type: Text):
 
-        assert model_type in SUPPORT_MODEL, \
-            "Not support the kind of model. Please choose one of {}".format(SUPPORT_MODEL)
+        assert (
+            model_type in SUPPORT_MODEL
+        ), "Not support the kind of model. Please choose one of {}".format(
+            SUPPORT_MODEL
+        )
         self.model_type = model_type
         if self.model_type == "LogisticRegression":
             self.model = self.get_logistic_regression_model()
@@ -95,11 +104,13 @@ class MLModel:
         params_lr = {
             "C": np.logspace(-3, 3, 7),
             "penalty": ["l1", "l2"],
-            'solver': 'liblinear'
+            "solver": "liblinear",
         }
 
         model_lr = LogisticRegression()
-        model_lr = GridSearchCV(model_lr, params_lr, cv=3, verbose=False, scoring='roc_auc', refit=True)
+        model_lr = GridSearchCV(
+            model_lr, params_lr, cv=3, verbose=False, scoring="roc_auc", refit=True
+        )
         return model_lr
 
     @staticmethod
@@ -109,14 +120,22 @@ class MLModel:
         :return:
         """
         if not all(params.values()):
-            params = {'max_features': ['auto', 'sqrt', 'log2'],
-                      'ccp_alpha': [0.1, .01, .001],
-                      'max_depth': [5, 6, 7, 8, 9],
-                      'criterion': ['gini', 'entropy']
-                      }
+            params = {
+                "max_features": ["auto", "sqrt", "log2"],
+                "ccp_alpha": [0.1, 0.01, 0.001],
+                "max_depth": [5, 6, 7, 8, 9],
+                "criterion": ["gini", "entropy"],
+            }
 
         model = DecisionTreeClassifier()
-        model = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=False, scoring='roc_auc', refit=True)
+        model = GridSearchCV(
+            estimator=model,
+            param_grid=params,
+            cv=3,
+            verbose=False,
+            scoring="roc_auc",
+            refit=True,
+        )
         return model
 
     @staticmethod
@@ -126,14 +145,30 @@ class MLModel:
         :return:
         """
         if not all(params_nn.values()):
-            params_nn = {'solver': ['lbfgs'],
-                         'max_iter': [1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000],
-                         'alpha': 10.0 ** -np.arange(1, 10),
-                         'hidden_layer_sizes': np.arange(10, 15),
-                         'random_state': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
+            params_nn = {
+                "solver": ["lbfgs"],
+                "max_iter": [
+                    1000,
+                    1100,
+                    1200,
+                    1300,
+                    1400,
+                    1500,
+                    1600,
+                    1700,
+                    1800,
+                    1900,
+                    2000,
+                ],
+                "alpha": 10.0 ** -np.arange(1, 10),
+                "hidden_layer_sizes": np.arange(10, 15),
+                "random_state": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            }
 
         model_nn = MLPClassifier()
-        model_nn = GridSearchCV(model_nn, params_nn, n_jobs=-1, scoring='roc_auc', refit=True, verbose=False)
+        model_nn = GridSearchCV(
+            model_nn, params_nn, n_jobs=-1, scoring="roc_auc", refit=True, verbose=False
+        )
         return model_nn
 
     @staticmethod
@@ -143,16 +178,25 @@ class MLModel:
         :return:
         """
         if not all(params_rf.values()):
-            params_rf = {"max_depth": [20],
-                         "min_samples_split": [10],
-                         "max_leaf_nodes": [175],
-                         "min_samples_leaf": [5],
-                         "n_estimators": [250],
-                         "max_features": ["sqrt"],
-                         }
+            params_rf = {
+                "max_depth": [20],
+                "min_samples_split": [10],
+                "max_leaf_nodes": [175],
+                "min_samples_leaf": [5],
+                "n_estimators": [250],
+                "max_features": ["sqrt"],
+            }
 
         model_rf = RandomForestClassifier()
-        model_rf = GridSearchCV(model_rf, params_rf, cv=3, n_jobs=-1, verbose=False, scoring='roc_auc', refit=True)
+        model_rf = GridSearchCV(
+            model_rf,
+            params_rf,
+            cv=3,
+            n_jobs=-1,
+            verbose=False,
+            scoring="roc_auc",
+            refit=True,
+        )
 
         return model_rf
 
@@ -164,21 +208,37 @@ class MLModel:
         """
         if not all(params_lgb.values()):
             params_lgb = {
-                'learning_rate': [0.005, 0.01],
-                'n_estimators': [8, 16, 24],
-                'num_leaves': [6, 8, 12, 16],  # large num_leaves helps improve accuracy but might lead to over-fitting
-                'boosting_type': ['gbdt', 'dart'],  # for better accuracy -> try dart
-                'objective': ['binary'],
-                'max_bin': [255, 510],  # large max_bin helps improve accuracy but might slow down training progress
-                'random_state': [500],
-                'colsample_bytree': [0.64, 0.65, 0.66],
-                'subsample': [0.7, 0.75],
-                'reg_alpha': [1, 1.2],
-                'reg_lambda': [1, 1.2, 1.4],
+                "learning_rate": [0.005, 0.01],
+                "n_estimators": [8, 16, 24],
+                "num_leaves": [
+                    6,
+                    8,
+                    12,
+                    16,
+                ],  # large num_leaves helps improve accuracy but might lead to over-fitting
+                "boosting_type": ["gbdt", "dart"],  # for better accuracy -> try dart
+                "objective": ["binary"],
+                "max_bin": [
+                    255,
+                    510,
+                ],  # large max_bin helps improve accuracy but might slow down training progress
+                "random_state": [500],
+                "colsample_bytree": [0.64, 0.65, 0.66],
+                "subsample": [0.7, 0.75],
+                "reg_alpha": [1, 1.2],
+                "reg_lambda": [1, 1.2, 1.4],
             }
 
         model = lgb.LGBMClassifier()
-        model = GridSearchCV(model, params_lgb, verbose=False, cv=3, n_jobs=-1, scoring='roc_auc', refit=True)
+        model = GridSearchCV(
+            model,
+            params_lgb,
+            verbose=False,
+            cv=3,
+            n_jobs=-1,
+            scoring="roc_auc",
+            refit=True,
+        )
 
         return model
 
@@ -190,22 +250,28 @@ class MLModel:
         """
         if not all(params_xgb.values()):
             params_xgb = {
-                'nthread': [4],  # when use hyper thread, xgboost may become slower
-                'objective': ['binary:logistic'],
-                'learning_rate': [0.05],  # so called `eta` value
-                'max_depth': [6],
-                'min_child_weight': [11],
-                'silent': [1],
-                'subsample': [0.8],
-                'colsample_bytree': [0.7],
-                'n_estimators': [100],  # number of trees, change it to 1000 for better results
-                'missing': [-999],
-                'seed': [1337]
+                "nthread": [4],  # when use hyper thread, xgboost may become slower
+                "objective": ["binary:logistic"],
+                "learning_rate": [0.05],  # so called `eta` value
+                "max_depth": [6],
+                "min_child_weight": [11],
+                "silent": [1],
+                "subsample": [0.8],
+                "colsample_bytree": [0.7],
+                "n_estimators": [
+                    100
+                ],  # number of trees, change it to 1000 for better results
+                "missing": [-999],
+                "seed": [1337],
             }
-        model = GridSearchCV(xgb.XGBClassifier(), params_xgb, n_jobs=-1,
-                             cv=3,
-                             scoring='roc_auc',
-                             refit=True)
+        model = GridSearchCV(
+            xgb.XGBClassifier(),
+            params_xgb,
+            n_jobs=-1,
+            cv=3,
+            scoring="roc_auc",
+            refit=True,
+        )
 
         return model
 
@@ -218,8 +284,9 @@ class MLModel:
         :param y_test:
         :return:
        """
-        model_lr, accuracy_lr, roc_auc_lr, coh_kap_lr, tt_lr = \
-            self.__run_model(self.model, x_train, y_train, x_test, y_test)
+        model_lr, accuracy_lr, roc_auc_lr, coh_kap_lr, tt_lr = self.__run_model(
+            self.model, x_train, y_train, x_test, y_test
+        )
         return model_lr, accuracy_lr, roc_auc_lr, coh_kap_lr, tt_lr
 
     @staticmethod
@@ -230,13 +297,14 @@ class MLModel:
         :return:
        """
         if not all(params.values()):
-            params = {"learning_rate": [0.01, 0.02, 0.03],
-                      "min_samples_split": [5, 10],
-                      "min_samples_leaf": [3, 5],
-                      "max_depth": [3, 5, 10],
-                      "max_features": ["sqrt"],
-                      "n_estimators": [100, 200]
-                      }
+            params = {
+                "learning_rate": [0.01, 0.02, 0.03],
+                "min_samples_split": [5, 10],
+                "min_samples_leaf": [3, 5],
+                "max_depth": [3, 5, 10],
+                "max_features": ["sqrt"],
+                "n_estimators": [100, 200],
+            }
         model = GradientBoostingClassifier(random_state=100)
         return GridSearchCV(model, params, cv=3, n_jobs=-1)
 
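
All of the getters reformatted above share the same GridSearchCV pattern: wrap an estimator, pass a parameter grid, and keep the refit best model. A runnable smoke test of the default GradientBoosting search on synthetic data (not the project's training set; the grid here is deliberately tiny):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Synthetic binary-classification data stands in for the match features.
X, y = make_classification(n_samples=200, n_features=8, random_state=100)
params = {"learning_rate": [0.01, 0.02], "max_depth": [3, 5]}
search = GridSearchCV(
    GradientBoostingClassifier(random_state=100), params, cv=3, n_jobs=-1
)
search.fit(X, y)
print(search.best_params_, round(search.best_score_, 3))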
ml/predictor.py CHANGED
@@ -43,7 +43,9 @@ class Predictor:
         :return:
         """
 
-        last_game = self.base_df[(self.base_df["home_team"] == team) | (self.base_df["away_team"] == team)].tail(1)
+        last_game = self.base_df[
+            (self.base_df["home_team"] == team) | (self.base_df["away_team"] == team)
+        ].tail(1)
 
         if last_game["home_team"].values[0] == team:
             team_rank = last_game["rank_home"].values[0]
@@ -66,8 +68,17 @@ class Predictor:
             team_gp_rank = last_game["away_game_points_rank_mean"].values[0]
             team_gp_rank_l5 = last_game["away_game_points_rank_mean_l5"].values[0]
 
-        return [team_rank, team_goals, team_goals_l5, team_goals_suf, team_goals_suf_l5, team_rank_suf,
-                team_rank_suf_l5, team_gp_rank, team_gp_rank_l5]
+        return [
+            team_rank,
+            team_goals,
+            team_goals_l5,
+            team_goals_suf,
+            team_goals_suf_l5,
+            team_rank_suf,
+            team_rank_suf_l5,
+            team_gp_rank,
+            team_gp_rank_l5,
+        ]
 
     @staticmethod
     def find_features(team_1, team_2):
@@ -88,8 +99,20 @@ class Predictor:
         dif_gp_rank = team_1[7] - team_2[7]
         dif_gp_rank_l5 = team_1[8] - team_2[8]
 
-        return [rank_dif, goals_dif, goals_dif_l5, goals_suf_dif, goals_suf_dif_l5, goals_per_ranking_dif,
-                dif_rank_agst, dif_rank_agst_l5, dif_gp_rank, dif_gp_rank_l5, 1, 0]
+        return [
+            rank_dif,
+            goals_dif,
+            goals_dif_l5,
+            goals_suf_dif,
+            goals_suf_dif_l5,
+            goals_per_ranking_dif,
+            dif_rank_agst,
+            dif_rank_agst_l5,
+            dif_gp_rank,
+            dif_gp_rank_l5,
+            1,
+            0,
+        ]
 
     def __predict(self, team_1: Text, team_2: Text):
 
@@ -109,7 +132,14 @@ class Predictor:
         team_1_prob = (probs_g1[0][0] + probs_g2[0][1]) / 2
         team_2_prob = (probs_g2[0][0] + probs_g1[0][1]) / 2
 
-        return team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2
+        return (
+            team_1_prob_g1,
+            team_1_prob_g2,
+            team_1_prob,
+            team_2_prob,
+            team_2_prob_g1,
+            team_2_prob_g2,
+        )
 
     def predict(self, team_1: Text, team_2: Text) -> Tuple[bool, Text, float]:
         """
@@ -119,11 +149,18 @@ class Predictor:
         :return:
         """
         draw = False
-        team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = self.__predict(
-            team_1, team_2)
+        (
+            team_1_prob_g1,
+            team_1_prob_g2,
+            team_1_prob,
+            team_2_prob,
+            team_2_prob_g1,
+            team_2_prob_g2,
+        ) = self.__predict(team_1, team_2)
         winner, winner_proba = "", 0.0
         if ((team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)) | (
-                (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
+            (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)
+        ):
             draw = True
 
         elif team_1_prob > team_2_prob:
@@ -142,17 +179,24 @@ class Predictor:
         """
         result = ""
         data = load_pickle(os.path.join(DATA_ROOT, cfg.data.table_matches))
-        table = data['table']
-        matches = data['matches']
+        table = data["table"]
+        matches = data["matches"]
         advanced_group, last_group = [], ""
 
         for teams in matches:
             draw = False
-            team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = self.__predict(
-                teams[1], teams[2])
+            (
+                team_1_prob_g1,
+                team_1_prob_g2,
+                team_1_prob,
+                team_2_prob,
+                team_2_prob_g1,
+                team_2_prob_g2,
+            ) = self.__predict(teams[1], teams[2])
             winner, winner_proba = "", 0.0
-            if ((team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)) | (
-                    (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
+            if (
+                (team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)
+            ) | ((team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
                 draw = True
             for i in table[teams[0]]:
                 if i[0] == teams[1] or i[0] == teams[2]:
@@ -186,18 +230,34 @@ class Predictor:
                    i[2] = np.mean(i[2])
 
                final_points = table[last_group]
-                final_table = sorted(final_points, key=itemgetter(1, 2), reverse=True)
+                final_table = sorted(
+                    final_points, key=itemgetter(1, 2), reverse=True
+                )
                advanced_group.append([final_table[0][0], final_table[1][0]])
                for i in final_table:
                    result += "%s -------- %d\n" % (i[0], i[1])
                result += "\n"
-                result += "-" * 10 + " Starting Analysis for Group %s " % (teams[0]) + "-" * 10 + "\n"
+                result += (
+                    "-" * 10
+                    + " Starting Analysis for Group %s " % (teams[0])
+                    + "-" * 10
+                    + "\n"
+                )
 
            if draw is False:
                result += "Group %s - %s vs. %s: Winner %s with %.2f probability\n" % (
-                    teams[0], teams[1], teams[2], winner, winner_proba)
+                    teams[0],
+                    teams[1],
+                    teams[2],
+                    winner,
+                    winner_proba,
+                )
            else:
-                result += "Group %s - %s vs. %s: Draw\n" % (teams[0], teams[1], teams[2])
+                result += "Group %s - %s vs. %s: Draw\n" % (
+                    teams[0],
+                    teams[1],
+                    teams[2],
+                )
            last_group = teams[0]
            result += "\n"
        result += "Group %s advanced: \n" % last_group
@@ -212,7 +272,12 @@ class Predictor:
            result += "%s -------- %d\n" % (i[0], i[1])
 
        advanced = advanced_group
-        playoffs = {"Round of 16": [], "Quarter-Final": [], "Semi-Final": [], "Final": []}
+        playoffs = {
+            "Round of 16": [],
+            "Quarter-Final": [],
+            "Semi-Final": [],
+            "Final": [],
+        }
 
        for p in playoffs.keys():
            playoffs[p] = []
@@ -234,7 +299,11 @@ class Predictor:
                        control.append((advanced * 2)[a][1])
                    else:
                        control.append((advanced * 2)[a][0])
-                playoffs[p] = [[control[c], control[c + 1]] for c in range(0, len(control) - 1, 1) if c % 2 == 0]
+                playoffs[p] = [
+                    [control[c], control[c + 1]]
+                    for c in range(0, len(control) - 1, 1)
+                    if c % 2 == 0
+                ]
 
                for i in range(0, len(playoffs[p]), 1):
                    game = playoffs[p][i]
@@ -242,18 +311,34 @@ class Predictor:
                    home = game[0]
                    away = game[1]
 
-                    team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
-                        self.__predict(home, away)
+                    (
+                        team_1_prob_g1,
+                        team_1_prob_g2,
+                        team_1_prob,
+                        team_2_prob,
+                        team_2_prob_g1,
+                        team_2_prob_g2,
+                    ) = self.__predict(home, away)
                    if actual_round != p:
                        result += "-" * 10 + "\n"
                        result += "Starting simulation of %s\n" % p
                        result += "-" * 10 + "\n"
 
                    if team_1_prob < team_2_prob:
-                        result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, away, team_2_prob)
+                        result += "%s vs. %s: %s advances with prob %.2f\n" % (
+                            home,
+                            away,
+                            away,
+                            team_2_prob,
+                        )
                        next_rounds.append(away)
                    else:
-                        result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, home, team_1_prob)
+                        result += "%s vs. %s: %s advances with prob %.2f\n" % (
+                            home,
+                            away,
+                            home,
+                            team_1_prob,
+                        )
                        next_rounds.append(home)
 
                    game.append([team_1_prob, team_2_prob])
@@ -261,26 +346,45 @@ class Predictor:
                    actual_round = p
 
            else:
-                playoffs[p] = [[next_rounds[c], next_rounds[c + 1]] for c in range(0, len(next_rounds) - 1, 1) if
-                               c % 2 == 0]
+                playoffs[p] = [
+                    [next_rounds[c], next_rounds[c + 1]]
+                    for c in range(0, len(next_rounds) - 1, 1)
+                    if c % 2 == 0
+                ]
                next_rounds = []
                for i in range(0, len(playoffs[p])):
                    game = playoffs[p][i]
                    home = game[0]
                    away = game[1]
 
-                    team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
-                        self.__predict(home, away)
+                    (
+                        team_1_prob_g1,
+                        team_1_prob_g2,
+                        team_1_prob,
+                        team_2_prob,
+                        team_2_prob_g1,
+                        team_2_prob_g2,
+                    ) = self.__predict(home, away)
                    if actual_round != p:
                        result += "-" * 10 + "\n"
                        result += "Starting simulation of %s\n" % p
                        result += "-" * 10 + "\n"
 
                    if team_1_prob < team_2_prob:
-                        result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, away, team_2_prob)
+                        result += "%s vs. %s: %s advances with prob %.2f \n" % (
+                            home,
+                            away,
+                            away,
+                            team_2_prob,
+                        )
                        next_rounds.append(away)
                    else:
-                        result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, home, team_1_prob)
+                        result += "%s vs. %s: %s advances with prob %.2f \n" % (
+                            home,
+                            away,
+                            home,
+                            team_1_prob,
+                        )
                        next_rounds.append(home)
                    game.append([team_1_prob, team_2_prob])
                    playoffs[p][i] = game
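
The condition reformatted in predict (and repeated in the simulation loop) encodes the draw rule: the model is queried twice with the team order swapped, and disagreement between the two orderings is treated as a draw. Isolated as a plain function with toy probabilities:

def is_draw(t1_g1, t2_g1, t1_g2, t2_g2):
    # Draw when game 1 favours team 1 but game 2 favours team 2, or vice versa.
    return ((t1_g1 > t2_g1) and (t2_g2 > t1_g2)) or (
        (t1_g1 < t2_g1) and (t2_g2 < t1_g2)
    )

print(is_draw(0.6, 0.4, 0.3, 0.7))  # True: the two orderings disagree
print(is_draw(0.6, 0.4, 0.7, 0.3))  # False: team 1 wins both orderings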
ml/utils.py CHANGED
@@ -12,7 +12,7 @@ def write_pickle(path, a):
 
     """
     try:
-        with open(path, 'wb') as handle:
+        with open(path, "wb") as handle:
            pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return True
    except Exception as e:
@@ -29,6 +29,6 @@ def load_pickle(path):
    Returns:
 
    """
-    with open(path, 'rb') as handle:
+    with open(path, "rb") as handle:
        data = pickle.load(handle)
        return data
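
A round trip through the two helpers, assuming they are importable as ml.utils; the path below is illustrative, not one the project uses:

from ml.utils import load_pickle, write_pickle

table = {"matches": [("A", "Qatar", "Ecuador")]}
assert write_pickle("/tmp/table_match.pkl", table)  # returns True on success
assert load_pickle("/tmp/table_match.pkl") == table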