# cricket-prophet / model.py  (first commit, 56f6887)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import math
import matplotlib.pyplot as plt, joblib
# from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# from sklearn.tree import DecisionTreeRegressor
# from catboost import CatBoostRegressor
import warnings, random
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
# from sklearn import tree
# from sklearn.svm import SVR
# from sklearn.ensemble import VotingRegressor
import os
warnings.filterwarnings("ignore")
# Feature columns fed to the regressor. Commented-out entries were tried
# during experimentation and dropped; kept for reference.
features = [
    "batting_team",
    # "bowling_team",
    # "balls",
    # "runs",
    # "wickets",
    "wkt_last_5_overs",
    # "runrate_last_5_overs",
    "current_RR",
    # "average",
    "balls_left",
    "wkts_left",
    # "required_RR",
    # "projected_score_more",
    # "min_score_more",
    # "max_score_more",
    # "projected_avg_score_more",
    "runrate_last_5_overs-current_RR",
]
# Regression target: how far the final score deviates from the projection.
target = "deviation_from_projected"
# evaluate
def evaluate(model, featuresdf, x_test, fname):
predictdf = featuresdf.loc[x_test.index].copy()
# print(predictdf.columns)
predictdf["h_deviation_from_projected"] = model.predict(
featuresdf.loc[x_test.index][features]
)
predictdf["error"] = (
predictdf["h_deviation_from_projected"] - predictdf["deviation_from_projected"]
)
predictdf["abs_error"] = predictdf["error"].abs()
ax = plt.gca()
plt.plot(predictdf.groupby("balls").aggregate({"abs_error": "mean"}))
plt.legend("Abs deviation")
# ax.set_ylim([-50, 50])
plt.title(type(model).__name__)
plt.xlabel("Balls on which prediction was made")
plt.ylabel("Mean Abs Prediction error")
plt.savefig("result/" + fname + ".png")
plt.clf()
predictdf.sample(frac=0.0001).to_csv("result/" + fname + "_sample.csv")
# fig = plt.figure(figsize=(25, 20))
# tree.plot_tree(model)
# fig.savefig(fname + ".png")
# plt.clf()batting_teamsort_values("overs", ascending=False).to_string(index=False))
def train_test_split_matchid(df, matchids, split=0.2):
    """Split ``df`` train/test by whole matches so no match leaks across sets.

    ``split`` is the fraction of unique match ids assigned to the test set.
    Returns ``(x_train, x_test, y_train, y_test)`` using the module-level
    ``features`` / ``target`` column lists.
    """
    unique_match_ids = set(matchids)
    print(f"{len(unique_match_ids)=}")
    # random.sample() rejects sets on Python 3.11+; sample from a sorted list
    # (sorting also makes the draw reproducible under a fixed random seed).
    testids = set(
        random.sample(sorted(unique_match_ids), int(len(unique_match_ids) * split))
    )
    test_mask = df.matchid.isin(testids)
    return (
        df.loc[~test_mask, features],
        df.loc[test_mask, features],
        df.loc[~test_mask, target],
        df.loc[test_mask, target],
    )
def encode_teams(series):
    """Fit a LabelEncoder over all team names and persist its classes to model/team.npy."""
    enc = LabelEncoder().fit(series)
    np.save("model/team.npy", enc.classes_)
def transform_teams(series):
    """Map team names to integer labels using the classes saved by encode_teams()."""
    encoder = LabelEncoder()
    encoder.classes_ = np.load("model/team.npy", allow_pickle=True)
    # LabelEncoder.transform expects a 1-D array; the old reshape(-1, 1)
    # produced a column vector that sklearn only warned about and ravelled
    # back, so passing the flat array directly is equivalent and clean.
    return encoder.transform(np.asarray(series))
def plot_feature_importance(f, imp, fname):
    """Save a bar chart of normalized feature importances.

    ``f``: feature names; ``imp``: raw importance scores (same order);
    output goes to ``result/<fname>featureimp.png``.
    """
    importance = (
        pd.DataFrame(zip(f, imp), columns=["feature", "importance"])
        .sort_values("importance", ascending=False)
        .set_index("feature")
    )
    # Normalize so importances sum to 1.
    importance["importance"] = importance["importance"] / importance["importance"].sum()
    fig, ax = plt.subplots()
    importance.plot.bar(ax=ax)
    # Label each bar with its feature name in the *sorted* order; passing the
    # raw ``f`` here (as before) mislabels bars after sort_values above.
    ax.bar_label(
        ax.containers[0], labels=importance.index, rotation=90, label_type="center"
    )
    ax.set_xticks([])
    ax.set_title("Feature importances for predicted score " + fname)
    ax.set_ylabel("Significance")
    ax.set_xlabel("Features")
    plt.savefig("result/" + fname + "featureimp.png")
    plt.clf()
def train(fname, max_depth=-1):
    """Train a RandomForestRegressor on one feather feature file.

    Splits by match id, prints train/test RMSE, saves diagnostic plots under
    ``result/``, then refits on the full dataset and persists the model to
    ``model/<basename>.joblib``.

    ``max_depth``: tree depth cap; -1 keeps the historical default of 8.
    Returns the fitted model.
    """
    print("training on", fname, "...")
    featuresdf = pd.read_feather(fname)
    # Only the second innings is modelled (the chasing side's deviation).
    featuresdf = featuresdf[featuresdf["inning"] == 2]
    encode_teams(
        featuresdf["batting_team"].to_list() + featuresdf["bowling_team"].to_list()
    )
    featuresdf["batting_team"] = transform_teams(featuresdf["batting_team"])
    featuresdf["bowling_team"] = transform_teams(featuresdf["bowling_team"])
    x_train, x_test, y_train, y_test = train_test_split_matchid(
        featuresdf, featuresdf["matchid"], 0.2
    )
    print(f"{len(x_train)=} {len(x_test)=}")
    # Honour the max_depth argument — it was previously ignored and the depth
    # hard-coded to 8; -1 preserves that historical default.
    model = RandomForestRegressor(max_depth=8 if max_depth == -1 else max_depth)
    model.fit(x_train, y_train)
    # For the random forest, rank features by the spread (std) of per-tree
    # importances across the ensemble.
    plot_feature_importance(
        features,
        np.std([tree.feature_importances_ for tree in model.estimators_], axis=0),
        os.path.basename(fname),
    )
    print("Depth:", [e.tree_.max_depth for e in model.estimators_])
    # squared=False -> RMSE (MSE is symmetric, so the (pred, true) order is fine).
    print(
        f"{mse(model.predict(x_train), y_train, squared=False)=}, {mse(model.predict(x_test), y_test, squared=False)=}"
    )
    evaluate(model, featuresdf, x_test, os.path.basename(fname))
    # Refit on the full dataset before persisting the deployed model.
    model.fit(featuresdf[features], featuresdf[target])
    joblib.dump(model, f"model/{os.path.basename(fname)}.joblib")
    return model
if __name__ == "__main__":
    # Train one model per match format.
    for dataset in ("data/t20features.feather", "data/odifeatures.feather"):
        train(dataset)