"""Build per-ball feature rows from cricket innings and save them as Feather and CSV files."""
import json
import math
import os
import subprocess
import sys
from multiprocessing import Pool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from cricksheet import get_all_matches

# import ydata_profiling
# Module-level constants used by the feature extraction below.
# Wickets available to a batting side; "wkts_left" is derived from it.
total_wickets = 10
# Number of worker processes requested from multiprocessing.Pool.
# NOTE(review): 100 is likely far more than the available CPU cores - confirm.
n_pools = 100
# Column names of the feature table. The order must match, position by
# position, the tuple appended for each row in extract_features().
# Commented-out entries are features that are currently not emitted.
features = [
    "matchid",
    "format",
    "inning",
    "batting_team",
    "bowling_team",
    "balls",
    "runs",
    "wickets",
    "wkt_last_5_overs",
    "runrate_last_5_overs",
    "runrate_last_5_overs-current_RR",
    "current_RR",
    # "average",
    "balls_left",
    "wkts_left",
    # "required_RR",
    # "projected_score_more",
    # "min_score_more",
    # "max_score_more",
    # "projected_avg_score_more",
    "final_score",
    "final_score_more",
    "deviation_from_projected",
]

# Numeric code stored in the "format" column for each supported match format.
getformat = {"ODI": 1, "T20": 2}
def extract_features(inning):
    """Build one feature row for every proper prefix of an innings.

    For each prefix length ``balls`` from 1 to ``len(inning.df) - 1`` a tuple
    is produced whose fields line up with the module-level ``features`` list.

    Args:
        inning: Object exposing ``df`` (a DataFrame with ``"run"`` and
            ``"wicket"`` columns, presumably one row per delivery - confirm
            against ``cricksheet``), ``format`` (a key of ``getformat``),
            ``inning``, ``matchid``, ``battingteam``, ``bowlingteam`` and
            ``final_score``.

    Returns:
        A list of tuples, one per prefix. Empty when the innings has fewer
        than two rows.

    Raises:
        KeyError: If ``inning.format`` is not a key of ``getformat`` and the
            innings has at least two rows.
    """
    df = inning.df
    # The innings length actually bowled is used as the ball budget, not the
    # nominal 120/300 of the format, so "balls_left" counts down to the real
    # end of the innings.
    total_balls = len(df)
    if total_balls < 2:
        return []

    # Loop-invariant: the numeric format code is the same for every row.
    format_code = getformat[inning.format]

    data = []
    for balls in range(1, total_balls):
        seen = df.iloc[:balls]
        # Trailing window of at most 30 rows (5 overs) taken from the balls
        # already seen, so no information from later in the innings leaks in.
        recent = seen.iloc[-30:]

        runs = seen["run"].sum()
        wickets = seen["wicket"].sum()
        wkt_last_5_overs = recent["wicket"].sum()

        # Both rates are runs per over (6 balls). The windowed rate is scaled
        # by the real window length so it stays comparable to current_RR even
        # when fewer than 30 balls have been bowled.
        runrate_last_5_overs = recent["run"].sum() * 6 / len(recent)
        current_RR = (runs * 6) / balls
        rr_diff = runrate_last_5_overs - current_RR

        balls_left = total_balls - balls
        wk_left = total_wickets - wickets

        # Runs still to come versus what the current run rate would project.
        projected_score_more = current_RR * balls_left / 6
        final_score_more = inning.final_score - runs
        deviation_from_projected = final_score_more - projected_score_more

        data.append(
            (
                inning.matchid,
                format_code,
                inning.inning,
                inning.battingteam,
                inning.bowlingteam,
                balls,
                runs,
                wickets,
                wkt_last_5_overs,
                round(runrate_last_5_overs, 2),
                round(rr_diff, 2),
                round(current_RR, 2),
                balls_left,
                wk_left,
                inning.final_score,
                final_score_more,
                round(deviation_from_projected),
            )
        )
    return data
def save_features(innings, fname):
    """Extract features for every innings in parallel and write them to disk.

    Each innings is passed to ``extract_features`` in a worker process; the
    per-innings row lists are flattened into one DataFrame whose columns are
    the module-level ``features`` list. The table is written twice: as Feather
    to ``fname`` and as CSV to ``fname + ".csv"``.

    Args:
        innings: Sized iterable of innings objects accepted by
            ``extract_features``.
        fname: Output path of the Feather file.
    """
    print("Feature enggineering and ngram creation...")
    n_innings = len(innings)
    print(f"{n_innings=}")

    # The context manager tears the worker processes down once the blocking
    # map() call has returned, so no workers are left behind.
    with Pool(processes=n_pools) as pool:
        rows_per_inning = pool.map(extract_features, innings)

    # Flatten the list of per-innings row lists into one list of rows.
    Xy = [row for rows in rows_per_inning for row in rows]
    print(f"{len(Xy)=}")

    featuresdf = pd.DataFrame(Xy, columns=features)
    # ydata_profiling.ProfileReport(featuresdf, title=fname).to_file(fname + ".html")
    featuresdf.to_feather(fname)
    featuresdf.to_csv(fname + ".csv")
if __name__ == "__main__":
    # One entry per match format: (format passed to get_all_matches,
    # label used in the progress messages, output Feather path).
    jobs = (
        ("T20", "t20", "data/t20features.feather"),
        ("ODI", "odi", "data/odifeatures.feather"),
    )
    for match_format, label, out_path in jobs:
        print(f"Loading {label} data...")
        innings = get_all_matches(format=match_format, since=2021)
        print(f"Saving {label} data")
        save_features(innings, out_path)