import math
import os
import random
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Alternative models tried earlier:
# from sklearn.linear_model import LinearRegression
# from sklearn.tree import DecisionTreeRegressor
# from catboost import CatBoostRegressor
# from sklearn import tree
# from sklearn.svm import SVR
# from sklearn.ensemble import VotingRegressor

warnings.filterwarnings("ignore")

# Output artifacts are written to result/ and model/; both directories are assumed to exist.

# Features used to predict the target; commented entries were tried and dropped.
features = [
    "batting_team",
    # "bowling_team",
    # "balls",
    # "runs",
    # "wickets",
    "wkt_last_5_overs",
    # "runrate_last_5_overs",
    "current_RR",
    # "average",
    "balls_left",
    "wkts_left",
    # "required_RR",
    # "projected_score_more",
    # "min_score_more",
    # "max_score_more",
    # "projected_avg_score_more",
    "runrate_last_5_overs-current_RR",
]
target = "deviation_from_projected"


def evaluate(model, featuresdf, x_test, fname):
    """Plot mean absolute prediction error against balls bowled and dump a small sample CSV."""
    predictdf = featuresdf.loc[x_test.index].copy()
    # print(predictdf.columns)
    predictdf["h_deviation_from_projected"] = model.predict(
        featuresdf.loc[x_test.index][features]
    )
    predictdf["error"] = (
        predictdf["h_deviation_from_projected"] - predictdf["deviation_from_projected"]
    )
    predictdf["abs_error"] = predictdf["error"].abs()
    ax = plt.gca()
    plt.plot(predictdf.groupby("balls").aggregate({"abs_error": "mean"}))
    plt.legend(["Abs deviation"])
    # ax.set_ylim([-50, 50])
    plt.title(type(model).__name__)
    plt.xlabel("Balls on which prediction was made")
    plt.ylabel("Mean Abs Prediction error")
    plt.savefig("result/" + fname + ".png")
    plt.clf()
    predictdf.sample(frac=0.0001).to_csv("result/" + fname + "_sample.csv")
    # fig = plt.figure(figsize=(25, 20))
    # tree.plot_tree(model)
    # fig.savefig(fname + ".png")
    # plt.clf()


def train_test_split_matchid(df, matchids, split=0.2):
    """Split by match id so balls from the same match never appear in both train and test."""
    unique_match_ids = set(matchids)
    print(f"{len(unique_match_ids)=}")
    # random.sample needs a sequence, not a set, on Python 3.11+.
    testids = random.sample(list(unique_match_ids), int(len(unique_match_ids) * split))
    trainids = list(unique_match_ids.difference(testids))
    return (
        df[features][df.matchid.isin(trainids)],
        df[features][df.matchid.isin(testids)],
        df[target][df.matchid.isin(trainids)],
        df[target][df.matchid.isin(testids)],
    )


def encode_teams(series):
    """Fit a LabelEncoder on team names and persist its classes for reuse at prediction time."""
    encoder = LabelEncoder()
    encoder.fit(series)
    np.save("model/team.npy", encoder.classes_)


def transform_teams(series):
    """Encode team names with the classes saved by encode_teams()."""
    encoder = LabelEncoder()
    encoder.classes_ = np.load("model/team.npy", allow_pickle=True)
    return encoder.transform(np.asarray(series))  # LabelEncoder expects a 1-D array


def plot_feature_importance(f, imp, fname):
    """Bar chart of normalised feature importances, labelled with feature names."""
    importance = (
        pd.DataFrame(
            zip(f, imp),
            columns=["feature", "importance"],
        )
        .sort_values("importance", ascending=False)
        .set_index("feature")
    )
    importance["importance"] = importance["importance"] / importance["importance"].sum()
    fig, ax = plt.subplots()
    importance.plot.bar(ax=ax)
    # Use the sorted index for labels so they line up with the sorted bars.
    ax.bar_label(ax.containers[0], labels=importance.index, rotation=90, label_type="center")
    ax.set_xticks([])
    ax.set_title("Feature importances for predicted score " + fname)
    ax.set_ylabel("Significance")
    ax.set_xlabel("Features")
    plt.savefig("result/" + fname + "featureimp.png")
    plt.clf()


def train(fname, max_depth=8):
    """Train on second-innings data from a feather file, report RMSE, then refit on all rows and save."""
    print("training on", fname, "...")
    featuresdf = pd.read_feather(fname)
    featuresdf = featuresdf[featuresdf["inning"] == 2]
    encode_teams(
        featuresdf["batting_team"].to_list() + featuresdf["bowling_team"].to_list()
    )
    featuresdf["batting_team"] = transform_teams(featuresdf["batting_team"])
    featuresdf["bowling_team"] = transform_teams(featuresdf["bowling_team"])
    x_train, x_test, y_train, y_test = train_test_split_matchid(
        featuresdf, featuresdf["matchid"], 0.2
    )
    print(f"{len(x_train)=} {len(x_test)=}")
    model = RandomForestRegressor(max_depth=max_depth)
    model.fit(x_train, y_train)
    # for xgb
    # plot_feature_importance(
    #     model.get_booster().get_score(importance_type="gain").keys(),
    #     model.get_booster().get_score(importance_type="gain").values(),
    #     fname,
    # )
    # for rf: plot the spread of per-tree importances across the forest
    plot_feature_importance(
        features,
        np.std([tree.feature_importances_ for tree in model.estimators_], axis=0),
        os.path.basename(fname),
    )
    print("Depth:", [e.tree_.max_depth for e in model.estimators_])
    # for dt
    # plot_feature_importance(
    #     features,
    #     model.feature_importances_,
    #     fname,
    # )
    # print(model.tree_.max_depth)
    # print(f"{model.score(x_train, y_train)=}, {model.score(x_test, y_test)=}")
    print(
        f"{mse(y_train, model.predict(x_train), squared=False)=}, "
        f"{mse(y_test, model.predict(x_test), squared=False)=}"
    )
    evaluate(model, featuresdf, x_test, os.path.basename(fname))
    # Refit on all second-innings rows before saving the final model.
    model.fit(featuresdf[features], featuresdf[target])
    joblib.dump(model, f"model/{os.path.basename(fname)}.joblib")
    return model


if __name__ == "__main__":
    train("data/t20features.feather")
    train("data/odifeatures.feather")