# cricket-prophet / features.py
# Repository page header: "first commit" (56f6887), uploaded by Instantaneous1.
import subprocess, sys
from multiprocessing import Pool
import pandas as pd, json, os, math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from cricksheet import get_all_matches
# import ydata_profiling
## Module-level configuration
# Wickets a batting side starts with; subtracted from to derive "wkts_left".
total_wickets = 10
# Worker-process count handed to multiprocessing.Pool in save_features.
n_pools = 100

## Feature selection/creation and ngram creation
# Column names of the feature table, in the exact order of the tuples that
# extract_features emits. Columns that are currently disabled: average,
# required_RR, projected_score_more, min_score_more, max_score_more,
# projected_avg_score_more.
features = [
    # --- identification of the innings ---
    "matchid",
    "format",
    "inning",
    "batting_team",
    "bowling_team",
    # --- state after the balls seen so far ---
    "balls",
    "runs",
    "wickets",
    "wkt_last_5_overs",
    "runrate_last_5_overs",
    "runrate_last_5_overs-current_RR",
    "current_RR",
    "balls_left",
    "wkts_left",
    # --- outcome columns derived from the final score ---
    "final_score",
    "final_score_more",
    "deviation_from_projected",
]

# Integer code stored in the "format" column for each supported match format.
getformat = dict(ODI=1, T20=2)
def extract_features(inning):
    """Build one feature row for every ball-by-ball prefix of an innings.

    Each row of ``inning.df`` is treated as one ball. For every prefix length
    ``balls`` in ``1 .. len(inning.df) - 1`` a tuple is emitted that describes
    the innings after those balls, together with outcome columns derived from
    ``inning.final_score``. The complete innings (prefix of full length) is
    not emitted.

    All "last 5 overs" quantities are computed over the most recent 30 balls
    *of the prefix* (fewer when the prefix is shorter), so no row contains
    information from balls that had not been bowled yet. Run rates are
    expressed in runs per over (6 balls), which makes
    ``runrate_last_5_overs`` directly comparable with ``current_RR``.

    Args:
        inning: Object exposing ``df`` (a DataFrame with ``run`` and
            ``wicket`` columns), ``matchid``, ``format`` (a key of the
            module-level ``getformat``), ``inning``, ``battingteam``,
            ``bowlingteam`` and ``final_score``.

    Returns:
        A list of tuples whose fields follow the order of the module-level
        ``features`` list. Empty when ``inning.df`` has fewer than two rows.

    Raises:
        KeyError: If ``inning.format`` is not a key of ``getformat`` (only
            when at least one row is produced).
    """
    balls_per_over = 6
    window_balls = 5 * balls_per_over  # "last 5 overs"

    df = inning.df
    # "balls_left" counts against the balls actually bowled in this innings,
    # not against the nominal allotment of the format.
    total_balls = len(df)
    if total_balls < 2:
        # No proper prefix exists; return before touching any column so an
        # empty frame without "run"/"wicket" columns is still accepted.
        return []

    run_col = df["run"]
    wicket_col = df["wicket"]
    format_code = getformat[inning.format]
    final_score = inning.final_score

    data = []
    for balls in range(1, total_balls):
        window_start = max(0, balls - window_balls)
        balls_in_window = balls - window_start

        runs = run_col.iloc[:balls].sum()
        wickets = wicket_col.iloc[:balls].sum()
        # Both windows end at the current ball, never at the end of the innings.
        run_last_5_overs = run_col.iloc[window_start:balls].sum()
        wkt_last_5_overs = wicket_col.iloc[window_start:balls].sum()

        current_RR = (runs * balls_per_over) / balls
        runrate_last_5_overs = (run_last_5_overs * balls_per_over) / balls_in_window
        rr_diff = runrate_last_5_overs - current_RR

        balls_left = total_balls - balls
        wk_left = total_wickets - wickets

        # Runs still to come if the current run rate were simply maintained.
        projected_score_more = current_RR * balls_left / balls_per_over
        final_score_more = final_score - runs
        deviation_from_projected = final_score_more - projected_score_more

        data.append(
            (
                inning.matchid,
                format_code,
                inning.inning,
                inning.battingteam,
                inning.bowlingteam,
                balls,
                runs,
                wickets,
                wkt_last_5_overs,
                round(runrate_last_5_overs, 2),
                round(rr_diff, 2),
                round(current_RR, 2),
                balls_left,
                wk_left,
                final_score,
                final_score_more,
                round(deviation_from_projected),
            )
        )
    return data
def save_features(innings, fname):
    """Extract features for all innings in parallel and write them to disk.

    Every innings is passed to ``extract_features`` in a worker process; the
    resulting rows are concatenated into one DataFrame with the module-level
    ``features`` as column names and written twice: as Feather to ``fname``
    and as CSV to ``fname + ".csv"``.

    Args:
        innings: Sized sequence of innings objects accepted by
            ``extract_features``.
        fname: Destination path of the Feather file.
    """
    print("Feature enggineering and ngram creation...")
    n_innings = len(innings)
    print(f"{n_innings=}")

    # Never start more workers than there is work for; Pool rejects a process
    # count below 1, hence the lower bound for an empty input.
    n_workers = max(1, min(n_pools, n_innings))
    # The context manager shuts the workers down once map() has returned, so
    # repeated calls do not accumulate idle processes.
    with Pool(processes=n_workers) as pool:
        rows_per_inning = pool.map(extract_features, innings)

    # Flatten the per-innings row lists into one list of rows.
    Xy = [row for inning_rows in rows_per_inning for row in inning_rows]
    print(f"{len(Xy)=}")

    featuresdf = pd.DataFrame(Xy, columns=features)
    # ydata_profiling.ProfileReport(featuresdf, title=fname).to_file(fname + ".html")
    featuresdf.to_feather(fname)
    featuresdf.to_csv(fname + ".csv")
if __name__ == "__main__":
    # One pass per match format: load the innings played since 2021, then
    # write their features under data/.
    _jobs = (
        ("t20", "T20", "data/t20features.feather"),
        ("odi", "ODI", "data/odifeatures.feather"),
    )
    for _label, _match_format, _target in _jobs:
        print(f"Loading {_label} data...")
        innings = get_all_matches(format=_match_format, since=2021)
        print(f"Saving {_label} data")
        save_features(innings, _target)