phong.dao committed on
Commit
9e6c24e
1 Parent(s): 38b12ed
Files changed (7)
  1. app.py +107 -49
  2. configs/config.py +1 -1
  3. configs/constants.py +1 -1
  4. ml/data_prepare.py +249 -70
  5. ml/model.py +135 -67
  6. ml/predictor.py +135 -31
  7. ml/utils.py +2 -2
app.py CHANGED
@@ -1,3 +1,4 @@
+import math
 import os.path
 import shutil
 
@@ -9,7 +10,25 @@ import requests
 from configs.config import cfg
 from ml.model import base_df, ml_model
 from ml.predictor import Predictor
-from ml.utils import load_pickle
+
+
+def get_result(team1, prob1, score1, team2, prob2, score2, probtie):
+    if prob1 > prob2 and prob1 > probtie:
+        winner = {"name": team1, "probability": prob1, "goals": score1}
+        loser = {"name": team2, "probability": prob2, "goals": score2}
+
+    elif prob1 < prob2 and prob2 > probtie:
+        loser = {"name": team1, "probability": prob1, "goals": score1}
+        winner = {"name": team2, "probability": prob2, "goals": score2}
+    else:
+        loser = {"name": None, "probability": 0.0, "goals": score1}
+        winner = {"name": None, "probability": 0.0, "goals": score2}
+    result = {
+        "winner": winner,
+        "loser": loser,
+        "draw": {"probability": probtie},
+    }
+    return result
 
 
 def function(team1, team2):
@@ -23,65 +42,104 @@ def function(team1, team2):
     response = requests.get(cfg.live_prediction)
     if response.status_code == 200:
         five_thirty_eight_predict = response.json()
-        for match in five_thirty_eight_predict['matches']:
-            if (team1 == match['team1'] and team2 == match['team2']) \
-                    or (team1 == match['team2'] and team2 == match['team1']):
-                if match['status'] != 'live':
-                    probability = {
-                        match['team1']: match['prob1'],
-                        match['team2']: match['prob2'],
-                        'draw': match['probtie'],
-                    }
-                else:
-                    probability = {
-                        match['team1']: match['live_winprobs']['winprobs'][-1]['prob1'],
-                        match['team2']: match['live_winprobs']['winprobs'][-1]['prob2'],
-                        'draw': match['live_winprobs']['winprobs'][-1]['probtie'],
-                    }
-                if match['probtie'] < match['prob1'] or match['probtie'] < match['prob2']:
-                    if match['prob1'] > match['prob2']:
-                        winner = match['team1']
-                    else:
-                        winner = match['team2']
-                else:
-                    return {
-                        "result": 'Draw!',
-                        "probability": probability
-                    }
-                return {
-                    "winner": winner,
-                    "probability": probability
-                }
+        for match in five_thirty_eight_predict["matches"]:
+            if not (
+                (team1 == match["team1"] and team2 == match["team2"])
+                or (team1 == match["team2"] and team2 == match["team1"])
+            ):
+                continue
+
+            if match["status"] != "live":
+                result = get_result(
+                    match["team1"],
+                    match["prob1"],
+                    math.ceil(match["adj_score1"])
+                    if "adj_score1" in match
+                    else math.ceil(match["o1"] - match["d2"]),
+                    match["team2"],
+                    match["prob2"],
+                    math.ceil(match["adj_score2"])
+                    if "adj_score2" in match
+                    else math.ceil(match["o2"] - match["d1"]),
+                    match["probtie"],
+                )
+            else:
+                result = get_result(
+                    match["team1"],
+                    match["live_winprobs"]["winprobs"][-1]["prob1"],
+                    math.ceil(match["adj_score1"])
+                    if "adj_score1" in match
+                    else math.ceil(match["o1"] - match["d2"]),
+                    match["team2"],
+                    match["live_winprobs"]["winprobs"][-1]["prob2"],
+                    math.ceil(match["adj_score2"])
+                    if "adj_score2" in match
+                    else math.ceil(match["o2"] - match["d1"]),
+                    match["probtie"],
+                )
+            return result
 
     draw, winner, winner_proba = predictor.predict(team1, team2)
     if draw:
+        draw_prob = round(random.uniform(0.7, 0.9), 10)
+        winner_proba = round(random.uniform(0, 1 - draw_prob), 10)
+        loser_proba = 1 - draw_prob - winner_proba
         return {
-            'result': "Draw!",
-            'probability': round(random.uniform(0.7, 0.9), 10)
+            "winner": {"name": team1, "probability": winner_proba, "goals": None},
+            "loser": {"name": team2, "probability": loser_proba, "goals": None},
+            "draw": {"probability": draw_prob},
         }
     else:
+        loser_proba = round(random.uniform(0, 1 - winner_proba), 10)
        return {
-            'winner': winner,
-            'probability': winner_proba
+            "winner": {"name": winner, "probability": winner_proba, "goals": None},
+            "loser": {
+                "name": team1 if winner == team2 else team2,
+                "probability": loser_proba,
+                "goals": None,
+            },
+            "draw": {"probability": 1 - winner_proba - loser_proba},
        }
 
 
-shutil.copytree("static", os.path.abspath(os.path.join(
-    os.path.dirname(gr.__file__), "templates/frontend/static")), dirs_exist_ok=True)
-shutil.copy("templates/asset.html", os.path.abspath(os.path.join(
-    os.path.dirname(gr.__file__), "templates/frontend/static/asset.html")))
-shutil.copytree("templates/asset", os.path.abspath(os.path.join(
-    os.path.dirname(gr.__file__), "templates/frontend/static/asset")), dirs_exist_ok=True)
+shutil.copytree(
+    "static",
+    os.path.abspath(
+        os.path.join(os.path.dirname(gr.__file__), "templates/frontend/static")
+    ),
+    dirs_exist_ok=True,
+)
+shutil.copy(
+    "templates/asset.html",
+    os.path.abspath(
+        os.path.join(
+            os.path.dirname(gr.__file__), "templates/frontend/static/asset.html"
+        )
+    ),
+)
+shutil.copytree(
+    "templates/asset",
+    os.path.abspath(
+        os.path.join(os.path.dirname(gr.__file__), "templates/frontend/static/asset")
+    ),
+    dirs_exist_ok=True,
+)
 predictor = Predictor(base_df, ml_model)
-examples = random.choices([x[1:3] for x in load_pickle("data/table_match.pkl")['matches']], k=20)
+examples = (
+    ("Croatia", "Argentina"),
+    ("Morocco", "France"),
+    ("Argentina", "France"),
+    ("Morocco", "Croatia"),
+)
 examples = [list(x) for x in examples]
-iface = gr.Interface(fn=function,
-                     inputs=[gr.Textbox(placeholder="Qatar"), gr.Textbox(placeholder="Ecuador")],
-                     outputs="json",
-                     title="WorldCup-Prediction \n\n "
-                           "Predicting the 2022 FIFA World Cup results with Machine Learning!",
-                     examples=examples,
-                     article=f'<iframe style="width: 100%; height: 2000px" src=\'./static/asset.html\' ></iframe>',
-                     )
+iface = gr.Interface(
+    fn=function,
+    inputs=[gr.Textbox(placeholder="Qatar"), gr.Textbox(placeholder="Ecuador")],
+    outputs="json",
+    title="WorldCup-Prediction \n\n "
+    "Predicting the 2022 FIFA World Cup results with Machine Learning!",
+    examples=examples,
+    article=f"<iframe style=\"width: 100%; height: 2000px\" src='./static/asset.html' ></iframe>",
)
 iface.queue(concurrency_count=5)
 iface.launch()
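
Note on the new response shape: get_result picks the winner from whichever team probability dominates both the other team's probability and the tie probability, and returns null names when neither does; the goals fall back to math.ceil(o1 - d2) (presumably FiveThirtyEight offensive/defensive ratings) when no adjusted score is present. A minimal sketch of the non-live path, assuming the get_result defined above and using invented numbers rather than real API output:

# Hedged example: probabilities and scores are made up, not from the API.
payload = get_result("Brazil", 0.55, 2, "Serbia", 0.20, 0, 0.25)
# prob1 beats both prob2 and probtie, so Brazil is resolved as the winner:
# {"winner": {"name": "Brazil", "probability": 0.55, "goals": 2},
#  "loser": {"name": "Serbia", "probability": 0.20, "goals": 0},
#  "draw": {"probability": 0.25}}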
configs/config.py CHANGED
@@ -4,7 +4,7 @@ from typing import Text, Union
 from omegaconf import OmegaConf, DictConfig, ListConfig
 
 
-def get_config(config_file: Text = 'base') -> Union[DictConfig, ListConfig]:
+def get_config(config_file: Text = "base") -> Union[DictConfig, ListConfig]:
     if not config_file.endswith(".yaml") or not config_file.endswith(".yml"):
         config_file += ".yaml"
     root_configs_dir = os.path.abspath(os.path.join(__file__, ".."))
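
One detail of the suffix check this hunk touches: since no name can end with both ".yaml" and ".yml" at once, the `or` of two negated endswith calls is true for every input, so ".yaml" is appended unconditionally. A standalone sketch (plain Python, no project imports) of the behavior as written; an `and` would express a "has neither suffix" test:

for name in ("base", "base.yaml", "base.yml"):
    resolved = name
    if not resolved.endswith(".yaml") or not resolved.endswith(".yml"):
        resolved += ".yaml"
    print(name, "->", resolved)
# base -> base.yaml
# base.yaml -> base.yaml.yaml
# base.yml -> base.yml.yaml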
configs/constants.py CHANGED
@@ -11,6 +11,6 @@ SUPPORT_MODEL = (
     "RandomForestClassifier",
     "LGBMClassifier",
     "XGBClassifier",
-    "GradientBoostingClassifier"
+    "GradientBoostingClassifier",
 )
 DEFAULT_MODEL = "GradientBoostingClassifier"
ml/data_prepare.py CHANGED
@@ -33,7 +33,8 @@ def create_dataset(df: pd.DataFrame):
     """
     x_, y = df.iloc[:, 3:], df[["target"]]
     x_train, x_test, y_train, y_test = train_test_split(
-        x_, y, test_size=0.22, random_state=100)
+        x_, y, test_size=0.22, random_state=100
+    )
     return x_train, x_test, y_train, y_test
 
 
@@ -55,28 +56,59 @@ def data_preparing():
     rank = pd.read_csv(os.path.join(DATA_ROOT, cfg.data.rank_file))
     rank["rank_date"] = pd.to_datetime(rank["rank_date"])
     rank = rank[(rank["rank_date"] >= cfg.day_get_rank)].reset_index(drop=True)
-    rank["country_full"] = rank["country_full"].str.replace("IR Iran", "Iran").str.replace("Korea Republic",
-                                                                                           "South Korea").str.replace(
-        "USA", "United States")
+    rank["country_full"] = (
+        rank["country_full"]
+        .str.replace("IR Iran", "Iran")
+        .str.replace("Korea Republic", "South Korea")
+        .str.replace("USA", "United States")
+    )
 
     # The merge is made in order to get a dataset FIFA games and its rankings.
-    rank = rank.set_index(['rank_date']).groupby(['country_full'], group_keys=False).resample('D').first().fillna(
-        method='ffill').reset_index()
+    rank = (
+        rank.set_index(["rank_date"])
+        .groupby(["country_full"], group_keys=False)
+        .resample("D")
+        .first()
+        .fillna(method="ffill")
+        .reset_index()
+    )
     df_wc_ranked = df.merge(
-        rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
-        left_on=["date", "home_team"], right_on=["rank_date", "country_full"]).drop(["rank_date", "country_full"],
-                                                                                    axis=1)
+        rank[
+            [
+                "country_full",
+                "total_points",
+                "previous_points",
+                "rank",
+                "rank_change",
+                "rank_date",
+            ]
+        ],
+        left_on=["date", "home_team"],
+        right_on=["rank_date", "country_full"],
+    ).drop(["rank_date", "country_full"], axis=1)
 
     df_wc_ranked = df_wc_ranked.merge(
-        rank[["country_full", "total_points", "previous_points", "rank", "rank_change", "rank_date"]],
-        left_on=["date", "away_team"], right_on=["rank_date", "country_full"], suffixes=("_home", "_away")).drop(
-        ["rank_date", "country_full"], axis=1)
+        rank[
+            [
+                "country_full",
+                "total_points",
+                "previous_points",
+                "rank",
+                "rank_change",
+                "rank_date",
+            ]
+        ],
+        left_on=["date", "away_team"],
+        right_on=["rank_date", "country_full"],
+        suffixes=("_home", "_away"),
+    ).drop(["rank_date", "country_full"], axis=1)
 
     # Featuring
     df = df_wc_ranked
 
     df[["result", "home_team_points", "away_team_points"]] = df.apply(
-        lambda x: result_finder(x["home_score"], x["away_score"]), axis=1)
+        lambda x: result_finder(x["home_score"], x["away_score"]), axis=1
+    )
 
     # we create columns that will help in the creation of the features: ranking difference,
     # points won at the game vs. team faced rank, and goals difference in the game.
@@ -90,16 +122,54 @@ def data_preparing():
     # unify them and calculate the past game values.
     # After that, I'll separate again and merge them, retrieving the original dataset.
     # This process optimizes the creation of the features.
-    home_team = df[["date", "home_team", "home_score", "away_score", "rank_home", "rank_away", "rank_change_home",
-                    "total_points_home", "result", "rank_dif", "points_home_by_rank", "home_team_points"]]
-
-    away_team = df[["date", "away_team", "away_score", "home_score", "rank_away", "rank_home", "rank_change_away",
-                    "total_points_away", "result", "rank_dif", "points_away_by_rank", "away_team_points"]]
-    home_team.columns = [h.replace("home_", "").replace("_home", "").replace("away_", "suf_").replace("_away", "_suf")
-                         for h in home_team.columns]
-
-    away_team.columns = [a.replace("away_", "").replace("_away", "").replace("home_", "suf_").replace("_home", "_suf")
-                         for a in away_team.columns]
+    home_team = df[
+        [
+            "date",
+            "home_team",
+            "home_score",
+            "away_score",
+            "rank_home",
+            "rank_away",
+            "rank_change_home",
+            "total_points_home",
+            "result",
+            "rank_dif",
+            "points_home_by_rank",
+            "home_team_points",
+        ]
+    ]
+
+    away_team = df[
+        [
+            "date",
+            "away_team",
+            "away_score",
+            "home_score",
+            "rank_away",
+            "rank_home",
+            "rank_change_away",
+            "total_points_away",
+            "result",
+            "rank_dif",
+            "points_away_by_rank",
+            "away_team_points",
+        ]
+    ]
+    home_team.columns = [
+        h.replace("home_", "")
+        .replace("_home", "")
+        .replace("away_", "suf_")
+        .replace("_away", "_suf")
+        for h in home_team.columns
+    ]
+
+    away_team.columns = [
+        a.replace("away_", "")
+        .replace("_away", "")
+        .replace("home_", "suf_")
+        .replace("_home", "_suf")
+        for a in away_team.columns
+    ]
     team_stats = home_team.append(away_team)
 
     stats_val = []
@@ -109,7 +179,7 @@ def data_preparing():
         date = row["date"]
         past_games = team_stats.loc[
             (team_stats["team"] == team) & (team_stats["date"] < date)
-        ].sort_values(by=['date'], ascending=False)
+        ].sort_values(by=["date"], ascending=False)
         last5 = past_games.head(5)
 
         goals = past_games["score"].mean()
@@ -122,9 +192,13 @@ def data_preparing():
         rank_l5 = last5["rank_suf"].mean()
 
         if len(last5) > 0:
-            points = past_games["total_points"].values[0] - past_games["total_points"].values[
-                -1]  # amount of points earned
-            points_l5 = last5["total_points"].values[0] - last5["total_points"].values[-1]
+            points = (
+                past_games["total_points"].values[0]
+                - past_games["total_points"].values[-1]
+            )  # amount of points earned
+            points_l5 = (
+                last5["total_points"].values[0] - last5["total_points"].values[-1]
+            )
         else:
             points = 0
             points_l5 = 0
@@ -136,49 +210,109 @@ def data_preparing():
         gp_rank_l5 = last5["points_by_rank"].mean()
 
         stats_val.append(
-            [goals, goals_l5, goals_suf, goals_suf_l5, rank, rank_l5, points, points_l5, gp, gp_l5, gp_rank,
-             gp_rank_l5])
-
-    stats_cols = ["goals_mean", "goals_mean_l5", "goals_suf_mean", "goals_suf_mean_l5", "rank_mean", "rank_mean_l5",
-                  "points_mean", "points_mean_l5", "game_points_mean", "game_points_mean_l5",
-                  "game_points_rank_mean", "game_points_rank_mean_l5"]
+            [
+                goals,
+                goals_l5,
+                goals_suf,
+                goals_suf_l5,
+                rank,
+                rank_l5,
+                points,
+                points_l5,
+                gp,
+                gp_l5,
+                gp_rank,
+                gp_rank_l5,
+            ]
+        )
+
+    stats_cols = [
+        "goals_mean",
+        "goals_mean_l5",
+        "goals_suf_mean",
+        "goals_suf_mean_l5",
+        "rank_mean",
+        "rank_mean_l5",
+        "points_mean",
+        "points_mean_l5",
+        "game_points_mean",
+        "game_points_mean_l5",
+        "game_points_rank_mean",
+        "game_points_rank_mean_l5",
+    ]
 
     stats_df = pd.DataFrame(stats_val, columns=stats_cols)
 
-    full_df = pd.concat([team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False)
+    full_df = pd.concat(
+        [team_stats.reset_index(drop=True), stats_df], axis=1, ignore_index=False
+    )
 
-    home_team_stats = full_df.iloc[:int(full_df.shape[0] / 2), :]
-    away_team_stats = full_df.iloc[int(full_df.shape[0] / 2):, :]
+    home_team_stats = full_df.iloc[: int(full_df.shape[0] / 2), :]
+    away_team_stats = full_df.iloc[int(full_df.shape[0] / 2) :, :]
 
     home_team_stats = home_team_stats[home_team_stats.columns[-12:]]
     away_team_stats = away_team_stats[away_team_stats.columns[-12:]]
 
-    home_team_stats.columns = ['home_' + str(col) for col in home_team_stats.columns]
-    away_team_stats.columns = ['away_' + str(col) for col in away_team_stats.columns]
+    home_team_stats.columns = ["home_" + str(col) for col in home_team_stats.columns]
+    away_team_stats.columns = ["away_" + str(col) for col in away_team_stats.columns]
 
     # In order to unify the database, is needed to add home and away suffix for each column.
     # After that, the data is ready to be merged.
-    match_stats = pd.concat([home_team_stats, away_team_stats.reset_index(drop=True)], axis=1, ignore_index=False)
+    match_stats = pd.concat(
+        [home_team_stats, away_team_stats.reset_index(drop=True)],
+        axis=1,
+        ignore_index=False,
+    )
 
-    full_df = pd.concat([df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False)
+    full_df = pd.concat(
+        [df, match_stats.reset_index(drop=True)], axis=1, ignore_index=False
+    )
 
     # Drop friendly game
     full_df["is_friendly"] = full_df["tournament"].apply(lambda x: find_friendly(x))
     full_df = pd.get_dummies(full_df, columns=["is_friendly"])
 
     base_df = full_df[
-        ["date", "home_team", "away_team", "rank_home", "rank_away", "home_score", "away_score", "result",
-         "rank_dif", "rank_change_home", "rank_change_away", 'home_goals_mean',
-         'home_goals_mean_l5', 'home_goals_suf_mean', 'home_goals_suf_mean_l5',
-         'home_rank_mean', 'home_rank_mean_l5', 'home_points_mean',
-         'home_points_mean_l5', 'away_goals_mean', 'away_goals_mean_l5',
-         'away_goals_suf_mean', 'away_goals_suf_mean_l5', 'away_rank_mean',
-         'away_rank_mean_l5', 'away_points_mean', 'away_points_mean_l5', 'home_game_points_mean',
-         'home_game_points_mean_l5',
-         'home_game_points_rank_mean', 'home_game_points_rank_mean_l5', 'away_game_points_mean',
-         'away_game_points_mean_l5', 'away_game_points_rank_mean',
-         'away_game_points_rank_mean_l5',
-         'is_friendly_0', 'is_friendly_1']]
+        [
+            "date",
+            "home_team",
+            "away_team",
+            "rank_home",
+            "rank_away",
+            "home_score",
+            "away_score",
+            "result",
+            "rank_dif",
+            "rank_change_home",
+            "rank_change_away",
+            "home_goals_mean",
+            "home_goals_mean_l5",
+            "home_goals_suf_mean",
+            "home_goals_suf_mean_l5",
+            "home_rank_mean",
+            "home_rank_mean_l5",
+            "home_points_mean",
+            "home_points_mean_l5",
+            "away_goals_mean",
+            "away_goals_mean_l5",
+            "away_goals_suf_mean",
+            "away_goals_suf_mean_l5",
+            "away_rank_mean",
+            "away_rank_mean_l5",
+            "away_points_mean",
+            "away_points_mean_l5",
+            "home_game_points_mean",
+            "home_game_points_mean_l5",
+            "home_game_points_rank_mean",
+            "home_game_points_rank_mean_l5",
+            "away_game_points_mean",
+            "away_game_points_mean_l5",
+            "away_game_points_rank_mean",
+            "away_game_points_rank_mean_l5",
+            "is_friendly_0",
+            "is_friendly_1",
+        ]
+    ]
 
     df = base_df.dropna()
 
@@ -207,30 +341,75 @@ def create_db(df):
     :param df:
     :return:
     """
-    columns = ["home_team", "away_team", "target", "rank_dif", "home_goals_mean",
-               "home_rank_mean", "away_goals_mean", "away_rank_mean", "home_rank_mean_l5", "away_rank_mean_l5",
-               "home_goals_suf_mean", "away_goals_suf_mean", "home_goals_mean_l5", "away_goals_mean_l5",
-               "home_goals_suf_mean_l5", "away_goals_suf_mean_l5", "home_game_points_rank_mean",
-               "home_game_points_rank_mean_l5", "away_game_points_rank_mean", "away_game_points_rank_mean_l5",
-               "is_friendly_0", "is_friendly_1"]
+    columns = [
+        "home_team",
+        "away_team",
+        "target",
+        "rank_dif",
+        "home_goals_mean",
+        "home_rank_mean",
+        "away_goals_mean",
+        "away_rank_mean",
+        "home_rank_mean_l5",
+        "away_rank_mean_l5",
+        "home_goals_suf_mean",
+        "away_goals_suf_mean",
+        "home_goals_mean_l5",
+        "away_goals_mean_l5",
+        "home_goals_suf_mean_l5",
+        "away_goals_suf_mean_l5",
+        "home_game_points_rank_mean",
+        "home_game_points_rank_mean_l5",
+        "away_game_points_rank_mean",
+        "away_game_points_rank_mean_l5",
+        "is_friendly_0",
+        "is_friendly_1",
+    ]
 
     base = df.loc[:, columns]
     base.loc[:, "goals_dif"] = base["home_goals_mean"] - base["away_goals_mean"]
-    base.loc[:, "goals_dif_l5"] = base["home_goals_mean_l5"] - base["away_goals_mean_l5"]
-    base.loc[:, "goals_suf_dif"] = base["home_goals_suf_mean"] - base["away_goals_suf_mean"]
-    base.loc[:, "goals_suf_dif_l5"] = base["home_goals_suf_mean_l5"] - base["away_goals_suf_mean_l5"]
-    base.loc[:, "goals_per_ranking_dif"] = (base["home_goals_mean"] / base["home_rank_mean"]) - (
-        base["away_goals_mean"] / base["away_rank_mean"])
+    base.loc[:, "goals_dif_l5"] = (
+        base["home_goals_mean_l5"] - base["away_goals_mean_l5"]
+    )
+    base.loc[:, "goals_suf_dif"] = (
+        base["home_goals_suf_mean"] - base["away_goals_suf_mean"]
+    )
+    base.loc[:, "goals_suf_dif_l5"] = (
+        base["home_goals_suf_mean_l5"] - base["away_goals_suf_mean_l5"]
+    )
+    base.loc[:, "goals_per_ranking_dif"] = (
+        base["home_goals_mean"] / base["home_rank_mean"]
+    ) - (base["away_goals_mean"] / base["away_rank_mean"])
     base.loc[:, "dif_rank_agst"] = base["home_rank_mean"] - base["away_rank_mean"]
-    base.loc[:, "dif_rank_agst_l5"] = base["home_rank_mean_l5"] - base["away_rank_mean_l5"]
-    base.loc[:, "dif_points_rank"] = base["home_game_points_rank_mean"] - base["away_game_points_rank_mean"]
-    base.loc[:, "dif_points_rank_l5"] = base["home_game_points_rank_mean_l5"] - base[
-        "away_game_points_rank_mean_l5"]
+    base.loc[:, "dif_rank_agst_l5"] = (
+        base["home_rank_mean_l5"] - base["away_rank_mean_l5"]
+    )
+    base.loc[:, "dif_points_rank"] = (
+        base["home_game_points_rank_mean"] - base["away_game_points_rank_mean"]
+    )
+    base.loc[:, "dif_points_rank_l5"] = (
+        base["home_game_points_rank_mean_l5"] - base["away_game_points_rank_mean_l5"]
+    )
 
     model_df = base[
-        ["home_team", "away_team", "target", "rank_dif", "goals_dif", "goals_dif_l5",
-         "goals_suf_dif", "goals_suf_dif_l5", "goals_per_ranking_dif", "dif_rank_agst", "dif_rank_agst_l5",
-         "dif_points_rank", "dif_points_rank_l5", "is_friendly_0", "is_friendly_1"]]
+        [
+            "home_team",
+            "away_team",
+            "target",
+            "rank_dif",
+            "goals_dif",
+            "goals_dif_l5",
+            "goals_suf_dif",
+            "goals_suf_dif_l5",
+            "goals_per_ranking_dif",
+            "dif_rank_agst",
+            "dif_rank_agst_l5",
+            "dif_points_rank",
+            "dif_points_rank_l5",
+            "is_friendly_0",
+            "is_friendly_1",
+        ]
+    ]
     return model_df
 
 
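
The past-game features reformatted above all follow one pattern: filter a team's earlier games, sort them newest first, and average over all of them and over the last five. A self-contained toy version of that loop body (column names mirror team_stats; the data is invented):

import pandas as pd

team_stats = pd.DataFrame(
    {
        "team": ["Brazil"] * 7,
        "date": pd.date_range("2022-01-01", periods=7, freq="30D"),
        "score": [1, 3, 0, 2, 2, 4, 1],
    }
)
row = {"team": "Brazil", "date": pd.Timestamp("2022-07-01")}
past_games = team_stats.loc[
    (team_stats["team"] == row["team"]) & (team_stats["date"] < row["date"])
].sort_values(by=["date"], ascending=False)
last5 = past_games.head(5)
print(past_games["score"].mean(), last5["score"].mean())  # ~1.86 and 1.8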
ml/model.py CHANGED
@@ -7,8 +7,14 @@ import numpy as np
 import xgboost as xgb
 from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
 from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, plot_confusion_matrix, roc_curve, \
-    classification_report
+from sklearn.metrics import (
+    accuracy_score,
+    roc_auc_score,
+    cohen_kappa_score,
+    plot_confusion_matrix,
+    roc_curve,
+    classification_report,
+)
 from sklearn.model_selection import GridSearchCV
 from sklearn.neural_network import MLPClassifier
 from sklearn.tree import DecisionTreeClassifier
@@ -23,11 +29,11 @@ def plot_roc_cur(fper, tper):
     :param fper:
     :param tper:
     """
-    plt.plot(fper, tper, color='orange', label='ROC')
-    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
-    plt.xlabel('False Positive Rate')
-    plt.ylabel('True Positive Rate')
-    plt.title('Receiver Operating Characteristic (ROC) Curve')
+    plt.plot(fper, tper, color="orange", label="ROC")
+    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--")
+    plt.xlabel("False Positive Rate")
+    plt.ylabel("True Positive Rate")
+    plt.title("Receiver Operating Characteristic (ROC) Curve")
     plt.legend()
     plt.show()
 
@@ -39,8 +45,11 @@ class MLModel:
 
     def __init__(self, model_type: Text):
 
-        assert model_type in SUPPORT_MODEL, \
-            "Not support the kind of model. Please choose one of {}".format(SUPPORT_MODEL)
+        assert (
+            model_type in SUPPORT_MODEL
+        ), "Not support the kind of model. Please choose one of {}".format(
+            SUPPORT_MODEL
+        )
         self.model_type = model_type
         if self.model_type == "LogisticRegression":
             self.model = self.get_logistic_regression_model()
@@ -95,11 +104,13 @@ class MLModel:
         params_lr = {
             "C": np.logspace(-3, 3, 7),
             "penalty": ["l1", "l2"],
-            'solver': 'liblinear'
+            "solver": "liblinear",
         }
 
         model_lr = LogisticRegression()
-        model_lr = GridSearchCV(model_lr, params_lr, cv=3, verbose=False, scoring='roc_auc', refit=True)
+        model_lr = GridSearchCV(
+            model_lr, params_lr, cv=3, verbose=False, scoring="roc_auc", refit=True
+        )
         return model_lr
 
     @staticmethod
@@ -109,14 +120,22 @@ class MLModel:
         :return:
         """
         if not all(params.values()):
-            params = {'max_features': ['auto', 'sqrt', 'log2'],
-                      'ccp_alpha': [0.1, .01, .001],
-                      'max_depth': [5, 6, 7, 8, 9],
-                      'criterion': ['gini', 'entropy']
-                      }
+            params = {
+                "max_features": ["auto", "sqrt", "log2"],
+                "ccp_alpha": [0.1, 0.01, 0.001],
+                "max_depth": [5, 6, 7, 8, 9],
+                "criterion": ["gini", "entropy"],
+            }
 
         model = DecisionTreeClassifier()
-        model = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=False, scoring='roc_auc', refit=True)
+        model = GridSearchCV(
+            estimator=model,
+            param_grid=params,
+            cv=3,
+            verbose=False,
+            scoring="roc_auc",
+            refit=True,
+        )
         return model
 
     @staticmethod
@@ -126,14 +145,30 @@ class MLModel:
         :return:
         """
         if not all(params_nn.values()):
-            params_nn = {'solver': ['lbfgs'],
-                         'max_iter': [1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000],
-                         'alpha': 10.0 ** -np.arange(1, 10),
-                         'hidden_layer_sizes': np.arange(10, 15),
-                         'random_state': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
+            params_nn = {
+                "solver": ["lbfgs"],
+                "max_iter": [
+                    1000,
+                    1100,
+                    1200,
+                    1300,
+                    1400,
+                    1500,
+                    1600,
+                    1700,
+                    1800,
+                    1900,
+                    2000,
+                ],
+                "alpha": 10.0 ** -np.arange(1, 10),
+                "hidden_layer_sizes": np.arange(10, 15),
+                "random_state": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            }
 
         model_nn = MLPClassifier()
-        model_nn = GridSearchCV(model_nn, params_nn, n_jobs=-1, scoring='roc_auc', refit=True, verbose=False)
+        model_nn = GridSearchCV(
+            model_nn, params_nn, n_jobs=-1, scoring="roc_auc", refit=True, verbose=False
+        )
         return model_nn
 
     @staticmethod
@@ -143,16 +178,25 @@ class MLModel:
         :return:
         """
         if not all(params_rf.values()):
-            params_rf = {"max_depth": [20],
-                         "min_samples_split": [10],
-                         "max_leaf_nodes": [175],
-                         "min_samples_leaf": [5],
-                         "n_estimators": [250],
-                         "max_features": ["sqrt"],
-                         }
+            params_rf = {
+                "max_depth": [20],
+                "min_samples_split": [10],
+                "max_leaf_nodes": [175],
+                "min_samples_leaf": [5],
+                "n_estimators": [250],
+                "max_features": ["sqrt"],
+            }
 
         model_rf = RandomForestClassifier()
-        model_rf = GridSearchCV(model_rf, params_rf, cv=3, n_jobs=-1, verbose=False, scoring='roc_auc', refit=True)
+        model_rf = GridSearchCV(
+            model_rf,
+            params_rf,
+            cv=3,
+            n_jobs=-1,
+            verbose=False,
+            scoring="roc_auc",
+            refit=True,
+        )
 
         return model_rf
 
@@ -164,21 +208,37 @@ class MLModel:
         """
         if not all(params_lgb.values()):
             params_lgb = {
-                'learning_rate': [0.005, 0.01],
-                'n_estimators': [8, 16, 24],
-                'num_leaves': [6, 8, 12, 16],  # large num_leaves helps improve accuracy but might lead to over-fitting
-                'boosting_type': ['gbdt', 'dart'],  # for better accuracy -> try dart
-                'objective': ['binary'],
-                'max_bin': [255, 510],  # large max_bin helps improve accuracy but might slow down training progress
-                'random_state': [500],
-                'colsample_bytree': [0.64, 0.65, 0.66],
-                'subsample': [0.7, 0.75],
-                'reg_alpha': [1, 1.2],
-                'reg_lambda': [1, 1.2, 1.4],
+                "learning_rate": [0.005, 0.01],
+                "n_estimators": [8, 16, 24],
+                "num_leaves": [
+                    6,
+                    8,
+                    12,
+                    16,
+                ],  # large num_leaves helps improve accuracy but might lead to over-fitting
+                "boosting_type": ["gbdt", "dart"],  # for better accuracy -> try dart
+                "objective": ["binary"],
+                "max_bin": [
+                    255,
+                    510,
+                ],  # large max_bin helps improve accuracy but might slow down training progress
+                "random_state": [500],
+                "colsample_bytree": [0.64, 0.65, 0.66],
+                "subsample": [0.7, 0.75],
+                "reg_alpha": [1, 1.2],
+                "reg_lambda": [1, 1.2, 1.4],
             }
 
         model = lgb.LGBMClassifier()
-        model = GridSearchCV(model, params_lgb, verbose=False, cv=3, n_jobs=-1, scoring='roc_auc', refit=True)
+        model = GridSearchCV(
+            model,
+            params_lgb,
+            verbose=False,
+            cv=3,
+            n_jobs=-1,
+            scoring="roc_auc",
+            refit=True,
+        )
 
         return model
 
@@ -190,22 +250,28 @@ class MLModel:
         """
         if not all(params_xgb.values()):
             params_xgb = {
-                'nthread': [4],  # when use hyper thread, xgboost may become slower
-                'objective': ['binary:logistic'],
-                'learning_rate': [0.05],  # so called `eta` value
-                'max_depth': [6],
-                'min_child_weight': [11],
-                'silent': [1],
-                'subsample': [0.8],
-                'colsample_bytree': [0.7],
-                'n_estimators': [100],  # number of trees, change it to 1000 for better results
-                'missing': [-999],
-                'seed': [1337]
+                "nthread": [4],  # when use hyper thread, xgboost may become slower
+                "objective": ["binary:logistic"],
+                "learning_rate": [0.05],  # so called `eta` value
+                "max_depth": [6],
+                "min_child_weight": [11],
+                "silent": [1],
+                "subsample": [0.8],
+                "colsample_bytree": [0.7],
+                "n_estimators": [
+                    100
+                ],  # number of trees, change it to 1000 for better results
+                "missing": [-999],
+                "seed": [1337],
             }
-        model = GridSearchCV(xgb.XGBClassifier(), params_xgb, n_jobs=-1,
-                             cv=3,
-                             scoring='roc_auc',
-                             refit=True)
+        model = GridSearchCV(
+            xgb.XGBClassifier(),
+            params_xgb,
+            n_jobs=-1,
+            cv=3,
+            scoring="roc_auc",
+            refit=True,
+        )
 
         return model
 
@@ -218,8 +284,9 @@ class MLModel:
         :param y_test:
         :return:
        """
-        model_lr, accuracy_lr, roc_auc_lr, coh_kap_lr, tt_lr = \
-            self.__run_model(self.model, x_train, y_train, x_test, y_test)
+        model_lr, accuracy_lr, roc_auc_lr, coh_kap_lr, tt_lr = self.__run_model(
+            self.model, x_train, y_train, x_test, y_test
+        )
         return model_lr, accuracy_lr, roc_auc_lr, coh_kap_lr, tt_lr
 
     @staticmethod
@@ -230,13 +297,14 @@ class MLModel:
         :return:
        """
         if not all(params.values()):
-            params = {"learning_rate": [0.01, 0.02, 0.03],
-                      "min_samples_split": [5, 10],
-                      "min_samples_leaf": [3, 5],
-                      "max_depth": [3, 5, 10],
-                      "max_features": ["sqrt"],
-                      "n_estimators": [100, 200]
-                      }
+            params = {
+                "learning_rate": [0.01, 0.02, 0.03],
+                "min_samples_split": [5, 10],
+                "min_samples_leaf": [3, 5],
+                "max_depth": [3, 5, 10],
+                "max_features": ["sqrt"],
+                "n_estimators": [100, 200],
+            }
         model = GradientBoostingClassifier(random_state=100)
         return GridSearchCV(model, params, cv=3, n_jobs=-1)
 
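
All of the getters reformatted above share the same GridSearchCV pattern: wrap an estimator, pass a parameter grid, and keep the refit best model. A runnable smoke test of the default GradientBoosting search on synthetic data (not the project's training set; the grid here is deliberately tiny):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Synthetic binary-classification data stands in for the match features.
X, y = make_classification(n_samples=200, n_features=8, random_state=100)
params = {"learning_rate": [0.01, 0.02], "max_depth": [3, 5]}
search = GridSearchCV(
    GradientBoostingClassifier(random_state=100), params, cv=3, n_jobs=-1
)
search.fit(X, y)
print(search.best_params_, round(search.best_score_, 3))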
ml/predictor.py CHANGED
@@ -43,7 +43,9 @@ class Predictor:
         :return:
         """
 
-        last_game = self.base_df[(self.base_df["home_team"] == team) | (self.base_df["away_team"] == team)].tail(1)
+        last_game = self.base_df[
+            (self.base_df["home_team"] == team) | (self.base_df["away_team"] == team)
+        ].tail(1)
 
         if last_game["home_team"].values[0] == team:
             team_rank = last_game["rank_home"].values[0]
@@ -66,8 +68,17 @@ class Predictor:
             team_gp_rank = last_game["away_game_points_rank_mean"].values[0]
             team_gp_rank_l5 = last_game["away_game_points_rank_mean_l5"].values[0]
 
-        return [team_rank, team_goals, team_goals_l5, team_goals_suf, team_goals_suf_l5, team_rank_suf,
-                team_rank_suf_l5, team_gp_rank, team_gp_rank_l5]
+        return [
+            team_rank,
+            team_goals,
+            team_goals_l5,
+            team_goals_suf,
+            team_goals_suf_l5,
+            team_rank_suf,
+            team_rank_suf_l5,
+            team_gp_rank,
+            team_gp_rank_l5,
+        ]
 
     @staticmethod
     def find_features(team_1, team_2):
@@ -88,8 +99,20 @@ class Predictor:
         dif_gp_rank = team_1[7] - team_2[7]
         dif_gp_rank_l5 = team_1[8] - team_2[8]
 
-        return [rank_dif, goals_dif, goals_dif_l5, goals_suf_dif, goals_suf_dif_l5, goals_per_ranking_dif,
-                dif_rank_agst, dif_rank_agst_l5, dif_gp_rank, dif_gp_rank_l5, 1, 0]
+        return [
+            rank_dif,
+            goals_dif,
+            goals_dif_l5,
+            goals_suf_dif,
+            goals_suf_dif_l5,
+            goals_per_ranking_dif,
+            dif_rank_agst,
+            dif_rank_agst_l5,
+            dif_gp_rank,
+            dif_gp_rank_l5,
+            1,
+            0,
+        ]
 
     def __predict(self, team_1: Text, team_2: Text):
 
@@ -109,7 +132,14 @@ class Predictor:
         team_1_prob = (probs_g1[0][0] + probs_g2[0][1]) / 2
         team_2_prob = (probs_g2[0][0] + probs_g1[0][1]) / 2
 
-        return team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2
+        return (
+            team_1_prob_g1,
+            team_1_prob_g2,
+            team_1_prob,
+            team_2_prob,
+            team_2_prob_g1,
+            team_2_prob_g2,
+        )
 
     def predict(self, team_1: Text, team_2: Text) -> Tuple[bool, Text, float]:
         """
@@ -119,11 +149,18 @@ class Predictor:
         :return:
         """
         draw = False
-        team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = self.__predict(
-            team_1, team_2)
+        (
+            team_1_prob_g1,
+            team_1_prob_g2,
+            team_1_prob,
+            team_2_prob,
+            team_2_prob_g1,
+            team_2_prob_g2,
+        ) = self.__predict(team_1, team_2)
         winner, winner_proba = "", 0.0
         if ((team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)) | (
-                (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
+            (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)
+        ):
             draw = True
 
         elif team_1_prob > team_2_prob:
@@ -142,17 +179,24 @@ class Predictor:
         """
         result = ""
         data = load_pickle(os.path.join(DATA_ROOT, cfg.data.table_matches))
-        table = data['table']
-        matches = data['matches']
+        table = data["table"]
+        matches = data["matches"]
         advanced_group, last_group = [], ""
 
         for teams in matches:
             draw = False
-            team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = self.__predict(
-                teams[1], teams[2])
+            (
+                team_1_prob_g1,
+                team_1_prob_g2,
+                team_1_prob,
+                team_2_prob,
+                team_2_prob_g1,
+                team_2_prob_g2,
+            ) = self.__predict(teams[1], teams[2])
             winner, winner_proba = "", 0.0
-            if ((team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)) | (
-                    (team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
+            if (
+                (team_1_prob_g1 > team_2_prob_g1) & (team_2_prob_g2 > team_1_prob_g2)
+            ) | ((team_1_prob_g1 < team_2_prob_g1) & (team_2_prob_g2 < team_1_prob_g2)):
                 draw = True
             for i in table[teams[0]]:
                 if i[0] == teams[1] or i[0] == teams[2]:
@@ -186,18 +230,34 @@ class Predictor:
                    i[2] = np.mean(i[2])
 
                final_points = table[last_group]
-                final_table = sorted(final_points, key=itemgetter(1, 2), reverse=True)
+                final_table = sorted(
+                    final_points, key=itemgetter(1, 2), reverse=True
+                )
                advanced_group.append([final_table[0][0], final_table[1][0]])
                for i in final_table:
                    result += "%s -------- %d\n" % (i[0], i[1])
                result += "\n"
-                result += "-" * 10 + " Starting Analysis for Group %s " % (teams[0]) + "-" * 10 + "\n"
+                result += (
+                    "-" * 10
+                    + " Starting Analysis for Group %s " % (teams[0])
+                    + "-" * 10
+                    + "\n"
+                )
 
            if draw is False:
                result += "Group %s - %s vs. %s: Winner %s with %.2f probability\n" % (
-                    teams[0], teams[1], teams[2], winner, winner_proba)
+                    teams[0],
+                    teams[1],
+                    teams[2],
+                    winner,
+                    winner_proba,
+                )
            else:
-                result += "Group %s - %s vs. %s: Draw\n" % (teams[0], teams[1], teams[2])
+                result += "Group %s - %s vs. %s: Draw\n" % (
+                    teams[0],
+                    teams[1],
+                    teams[2],
+                )
            last_group = teams[0]
            result += "\n"
        result += "Group %s advanced: \n" % last_group
@@ -212,7 +272,12 @@ class Predictor:
            result += "%s -------- %d\n" % (i[0], i[1])
 
        advanced = advanced_group
-        playoffs = {"Round of 16": [], "Quarter-Final": [], "Semi-Final": [], "Final": []}
+        playoffs = {
+            "Round of 16": [],
+            "Quarter-Final": [],
+            "Semi-Final": [],
+            "Final": [],
+        }
 
        for p in playoffs.keys():
            playoffs[p] = []
@@ -234,7 +299,11 @@ class Predictor:
                        control.append((advanced * 2)[a][1])
                    else:
                        control.append((advanced * 2)[a][0])
-                playoffs[p] = [[control[c], control[c + 1]] for c in range(0, len(control) - 1, 1) if c % 2 == 0]
+                playoffs[p] = [
+                    [control[c], control[c + 1]]
+                    for c in range(0, len(control) - 1, 1)
+                    if c % 2 == 0
+                ]
 
                for i in range(0, len(playoffs[p]), 1):
                    game = playoffs[p][i]
@@ -242,18 +311,34 @@ class Predictor:
                    home = game[0]
                    away = game[1]
 
-                    team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
-                        self.__predict(home, away)
+                    (
+                        team_1_prob_g1,
+                        team_1_prob_g2,
+                        team_1_prob,
+                        team_2_prob,
+                        team_2_prob_g1,
+                        team_2_prob_g2,
+                    ) = self.__predict(home, away)
                    if actual_round != p:
                        result += "-" * 10 + "\n"
                        result += "Starting simulation of %s\n" % p
                        result += "-" * 10 + "\n"
 
                    if team_1_prob < team_2_prob:
-                        result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, away, team_2_prob)
+                        result += "%s vs. %s: %s advances with prob %.2f\n" % (
+                            home,
+                            away,
+                            away,
+                            team_2_prob,
+                        )
                        next_rounds.append(away)
                    else:
-                        result += "%s vs. %s: %s advances with prob %.2f\n" % (home, away, home, team_1_prob)
+                        result += "%s vs. %s: %s advances with prob %.2f\n" % (
+                            home,
+                            away,
+                            home,
+                            team_1_prob,
+                        )
                        next_rounds.append(home)
 
                    game.append([team_1_prob, team_2_prob])
@@ -261,26 +346,45 @@ class Predictor:
                    actual_round = p
 
            else:
-                playoffs[p] = [[next_rounds[c], next_rounds[c + 1]] for c in range(0, len(next_rounds) - 1, 1) if
-                               c % 2 == 0]
+                playoffs[p] = [
+                    [next_rounds[c], next_rounds[c + 1]]
+                    for c in range(0, len(next_rounds) - 1, 1)
+                    if c % 2 == 0
+                ]
                next_rounds = []
                for i in range(0, len(playoffs[p])):
                    game = playoffs[p][i]
                    home = game[0]
                    away = game[1]
 
-                    team_1_prob_g1, team_1_prob_g2, team_1_prob, team_2_prob, team_2_prob_g1, team_2_prob_g2 = \
-                        self.__predict(home, away)
+                    (
+                        team_1_prob_g1,
+                        team_1_prob_g2,
+                        team_1_prob,
+                        team_2_prob,
+                        team_2_prob_g1,
+                        team_2_prob_g2,
+                    ) = self.__predict(home, away)
                    if actual_round != p:
                        result += "-" * 10 + "\n"
                        result += "Starting simulation of %s\n" % p
                        result += "-" * 10 + "\n"
 
                    if team_1_prob < team_2_prob:
-                        result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, away, team_2_prob)
+                        result += "%s vs. %s: %s advances with prob %.2f \n" % (
+                            home,
+                            away,
+                            away,
+                            team_2_prob,
+                        )
                        next_rounds.append(away)
                    else:
-                        result += "%s vs. %s: %s advances with prob %.2f \n" % (home, away, home, team_1_prob)
+                        result += "%s vs. %s: %s advances with prob %.2f \n" % (
+                            home,
+                            away,
+                            home,
+                            team_1_prob,
+                        )
                        next_rounds.append(home)
                    game.append([team_1_prob, team_2_prob])
                    playoffs[p][i] = game
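
The condition reformatted in predict (and repeated in the simulation loop) encodes the draw rule: the model is queried twice with the team order swapped, and disagreement between the two orderings is treated as a draw. Isolated as a plain function with toy probabilities:

def is_draw(t1_g1, t2_g1, t1_g2, t2_g2):
    # Draw when game 1 favours team 1 but game 2 favours team 2, or vice versa.
    return ((t1_g1 > t2_g1) and (t2_g2 > t1_g2)) or (
        (t1_g1 < t2_g1) and (t2_g2 < t1_g2)
    )

print(is_draw(0.6, 0.4, 0.3, 0.7))  # True: the two orderings disagree
print(is_draw(0.6, 0.4, 0.7, 0.3))  # False: team 1 wins both orderings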
ml/utils.py CHANGED
@@ -12,7 +12,7 @@ def write_pickle(path, a):
 
     """
     try:
-        with open(path, 'wb') as handle:
+        with open(path, "wb") as handle:
            pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return True
    except Exception as e:
@@ -29,6 +29,6 @@ def load_pickle(path):
    Returns:
 
    """
-    with open(path, 'rb') as handle:
+    with open(path, "rb") as handle:
        data = pickle.load(handle)
        return data
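
A round trip through the two helpers, assuming they are importable as ml.utils; the path below is illustrative, not one the project uses:

from ml.utils import load_pickle, write_pickle

table = {"matches": [("A", "Qatar", "Ecuador")]}
assert write_pickle("/tmp/table_match.pkl", table)  # returns True on success
assert load_pickle("/tmp/table_match.pkl") == table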