diff --git "a/Notebook.ipynb" "b/Notebook.ipynb" new file mode 100644--- /dev/null +++ "b/Notebook.ipynb" @@ -0,0 +1,1030 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build Data" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata": {}, + "outputs": [], + "source": [ + "import nfl_data_py.nfl_data_py as nfl\n", + "from tqdm import tqdm\n", + "import numpy as np\n", + "import pandas as pd\n", + "pd.set_option('chained_assignment',None)\n", + "pd.set_option('display.max_columns',None)\n", + "\n", + "\n", + "def get_pbp_data(years,overwrite=False):\n", + " \"\"\"\n", + " Pull data from nflFastR's Github repo. \n", + " If you choose to overwrite, it will replace the existing pbp data with the data you pull.\n", + "\n", + " \"\"\"\n", + " pbp = nfl.import_pbp_data(years)\n", + " pbp['TOP_seconds'] = pbp['drive_time_of_possession'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]) if pd.notnull(x) else 0)\n", + " \n", + " if overwrite:\n", + " pbp.to_csv('pbp.csv')\n", + "\n", + " return pbp\n", + "\n", + "\n", + "def build_gbg_data(update_seasons=[]):\n", + " \"\"\"\n", + " Using pbp.csv, build a game-by-game dataset to use for prediction models.\n", + " Populate update_seasons with the current year to only update this season's data while preserving historical data.\n", + "\n", + " \"\"\"\n", + " print('Loading play-by-play data.')\n", + " pbp = pd.read_csv('pbp.csv', index_col=0, low_memory=False)\n", + " game_date_dict = dict(pbp[['game_id','game_date']].values)\n", + "\n", + " if update_seasons:\n", + " print('Getting data for', update_seasons)\n", + " pbp = get_pbp_data(update_seasons)\n", + "\n", + " teams = pbp['home_team'].unique()\n", + " seasons = pbp['season'].unique()\n", + " \n", + " print('Building game-by-game data.')\n", + " data = pd.DataFrame()\n", + " for season in seasons:\n", + " print(season)\n", + " for team_name in tqdm(teams):\n", + " # create features\n", + " team = pbp.loc[((pbp['home_team']==team_name) | (pbp['away_team']==team_name)) & (pbp['season']==season)] \n", + " team['GP'] = team['week']\n", + " team['W'] = [1 if r>0 and team_name==h else 1 if r<0 and team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]\n", + " team['L'] = [0 if r>0 and team_name==h else 0 if r<0 and team_name==a else 1 for r,a,h in team[['result','away_team','home_team']].values]\n", + " team['W_PCT'] = team['W']/team['GP']\n", + " team['TOP'] = [t if team_name==p else 0 for t,p in team[['TOP_seconds','posteam']].values]\n", + " team['FGA'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','field_goal_attempt']].values]\n", + " team['FGM'] = [1 if team_name==p and f=='made' else 0 for p,f in team[['posteam','field_goal_result']].values]\n", + " team['FG_PCT'] = team['FGM']/team['FGA']\n", + " team['PassTD'] = np.where((team['posteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)\n", + " team['RushTD'] = np.where((team['posteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)\n", + " team['PassTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)\n", + " team['RushTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)\n", + " team['PassYds'] = [y if p==team_name else 0 for p,y in team[['posteam','passing_yards']].values]\n", + " team['RushYds'] = [y if p==team_name else 0 for p,y in team[['posteam','rushing_yards']].values]\n", + " team['PassYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','passing_yards']].values]\n", + " team['RushYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','rushing_yards']].values]\n", + " team['Fum'] = np.where((team['defteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)\n", + " team['Fum_Allowed'] = np.where((team['posteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)\n", + " team['INT'] = np.where((team['defteam'] == team_name) & (team['interception'] == 1), 1, 0)\n", + " team['INT_Allowed'] = np.where((team['posteam'] == team_name) & (team['interception'] == 1), 1, 0)\n", + " team['Sacks'] = np.where((team['defteam'] == team_name) & (team['sack'] == 1), 1, 0)\n", + " team['Sacks_Allowed'] = np.where((team['posteam'] == team_name) & (team['sack'] == 1), 1, 0)\n", + " team['Penalties'] = np.where((team['penalty_team'] == team_name), 1, 0)\n", + " team['FirstDowns'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','first_down']].values]\n", + " team['3rdDownConverted'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_converted']].values]\n", + " team['3rdDownFailed'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_failed']].values]\n", + " team['3rdDownAllowed'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_converted']].values]\n", + " team['3rdDownDefended'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_failed']].values]\n", + " team['PTS'] = [ap if at==team_name else hp if ht==team_name else None for ht,at,hp,ap in team[['home_team','away_team','home_score','away_score']].values]\n", + " team['PointDiff'] = [r if team_name==h else -r if team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]\n", + "\n", + " # aggregate from play-by-play to game-by-game\n", + " features = {\n", + " 'GP':'mean',\n", + " 'W':'mean',\n", + " 'L':'mean',\n", + " 'W_PCT':'mean',\n", + " 'TOP':'sum',\n", + " 'FGA':'sum',\n", + " 'FGM':'sum',\n", + " 'FG_PCT':'mean',\n", + " 'PassTD':'sum',\n", + " 'RushTD':'sum',\n", + " 'PassTD_Allowed':'sum',\n", + " 'RushTD_Allowed':'sum',\n", + " 'PassYds':'sum',\n", + " 'RushYds':'sum',\n", + " 'PassYds_Allowed':'sum',\n", + " 'RushYds_Allowed':'sum',\n", + " 'Fum':'sum',\n", + " 'Fum_Allowed':'sum',\n", + " 'INT':'sum',\n", + " 'INT_Allowed':'sum',\n", + " 'Sacks':'sum',\n", + " 'Sacks_Allowed':'sum',\n", + " 'Penalties':'sum',\n", + " 'FirstDowns':'sum',\n", + " '3rdDownConverted':'sum',\n", + " '3rdDownFailed':'sum',\n", + " '3rdDownAllowed':'sum',\n", + " '3rdDownDefended':'sum',\n", + " 'PTS':'mean',\n", + " 'PointDiff':'mean'\n", + " }\n", + "\n", + " game = team.groupby('game_id').agg(features).reset_index()\n", + " game[['W','L']] = game[['W','L']].expanding().sum()\n", + " game[game.columns[4:]] = game[game.columns[4:]].expanding().mean()\n", + " game[game.columns[1:]] = game[game.columns[1:]].shift()\n", + " game['TEAM'] = team_name\n", + " game['Season'] = season\n", + "\n", + " data = pd.concat([data,game])\n", + "\n", + " # get rank data for each feature\n", + " #feature_columns = list(features.keys())\n", + " #for col in feature_columns:\n", + " # rank_col = f\"{col}_Rank\"\n", + " # data[rank_col] = data.groupby(['Season', 'GP', 'TEAM'])[col].rank(ascending=False)\n", + "\n", + " # separate home and away data and merge\n", + " data = data.merge(pbp[['game_id','home_team','away_team']].drop_duplicates())\n", + " home = data.loc[data['home_team']==data['TEAM']]\n", + " away = data.loc[data['away_team']==data['TEAM']]\n", + " away.columns = [f'{i}.Away' for i in away.columns]\n", + " gbg = home.merge(away,left_on='game_id',right_on='game_id.Away')\n", + " gbg.drop(columns=['TEAM','TEAM.Away','home_team.Away','away_team.Away','Season.Away','game_id.Away'], inplace=True)\n", + " gbg['game_date'] = gbg['game_id'].map(game_date_dict)\n", + "\n", + " if update_seasons:\n", + " old = pd.read_csv('gbg.csv', index_col=0, low_memory=False)\n", + " old = old.loc[~old['Season'].isin(seasons)]\n", + " gbg = pd.concat([old,gbg])\n", + "\n", + " gbg.to_csv('gbg.csv')\n", + "\n", + "\n", + "def add_odds_data():\n", + " \"\"\"\n", + " Get odds from Australian Sports Betting's free online dataset and merge it with game-by-game data.\n", + "\n", + " \"\"\"\n", + " # load gbg data\n", + " gbg = pd.read_csv('gbg.csv', index_col=0)\n", + " \n", + " # get team abbreviations\n", + " team_descriptions = nfl.import_team_desc()\n", + " team_abbreviation_dict = dict(team_descriptions[['team_name','team_abbr']].values)\n", + " \n", + " # get odds\n", + " odds = pd.read_excel('https://www.aussportsbetting.com/historical_data/nfl.xlsx')\n", + " odds['Home Team'] = odds['Home Team'].str.replace('Washington Redskins','Washington Commanders').str.replace('Washington Football Team','Washington Commanders')\n", + " odds['Away Team'] = odds['Away Team'].str.replace('Washington Redskins','Washington Commanders').str.replace('Washington Football Team','Washington Commanders')\n", + " odds['Season'] = [i.year if i.month in [8,9,10,11,12] else i.year-1 for i in odds['Date']]\n", + " odds['Home Team Abbrev'] = odds['Home Team'].map(team_abbreviation_dict)\n", + " odds['Away Team Abbrev'] = odds['Away Team'].map(team_abbreviation_dict)\n", + " odds = odds[['Date','Home Score','Away Score','Home Team Abbrev','Away Team Abbrev','Home Odds Close','Away Odds Close','Total Score Close']]\n", + " odds['Key'] = odds['Date'].astype(str) + odds['Home Team Abbrev'] + odds['Away Team Abbrev']\n", + " odds = odds.drop(columns=['Date','Home Team Abbrev','Away Team Abbrev']).dropna()\n", + " odds['Home Odds'] = [round((i-1)*100) if i>= 2 else round(-100/(i-1)) for i in odds['Home Odds Close']]\n", + " odds['Away Odds'] = [round((i-1)*100) if i>= 2 else round(-100/(i-1)) for i in odds['Away Odds Close']]\n", + " odds['Home Winnings'] = [ho-1 if h>a else -1 if a>h else 0 for ho,h,a in odds[['Home Odds Close','Home Score','Away Score']].values]\n", + " odds['Away Winnings'] = [ao-1 if a>h else -1 if h>a else 0 for ao,h,a in odds[['Away Odds Close','Home Score','Away Score']].values]\n", + "\n", + " # merge with gbg\n", + " gbg['Key'] = gbg['game_date'].astype(str) + gbg['home_team'] + gbg['away_team']\n", + " gbg_and_odds = gbg.merge(odds, left_on='Key', right_on='Key')\n", + " gbg_and_odds['Home-Team-Win'] = (gbg_and_odds['Home Score']>gbg_and_odds['Away Score']).astype(int)\n", + " gbg_and_odds['Over'] = ((gbg_and_odds['Home Score'] + gbg_and_odds['Away Score'])>gbg_and_odds['Total Score Close']).astype(int)\n", + " gbg_and_odds.to_csv('gbg_and_odds.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ML" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [01:15<00:00, 1.33it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best accuracy: 75.4%\n", + "Done\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import xgboost as xgb\n", + "import pickle as pkl\n", + "from IPython.display import clear_output\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "data = pd.read_csv('gbg_and_odds.csv', index_col=0).dropna()\n", + "\n", + "margin = data['Home-Team-Win']\n", + "data.drop(columns=['Home-Team-Win','Over','Season','home_team','away_team','game_date','Key','Home Score','Away Score','Home Odds Close','Away Odds Close','Home Winnings','Away Winnings'], inplace=True)\n", + "\n", + "acc_results = []\n", + "for x in tqdm(range(100)):\n", + " X_train, X_test, y_train, y_test = train_test_split(data, margin, test_size=.1)\n", + "\n", + " train_games = X_train['game_id']\n", + " test_games = X_test['game_id']\n", + "\n", + " X_train.drop(columns=['game_id'], inplace=True)\n", + " X_test.drop(columns=['game_id'], inplace=True)\n", + "\n", + " train = xgb.DMatrix(X_train.astype(float).values, label=y_train)\n", + " test = xgb.DMatrix(X_test.astype(float).values, label=y_test)\n", + "\n", + " param = {\n", + " 'max_depth': 2,\n", + " 'eta': 0.01,\n", + " 'objective': 'multi:softprob',\n", + " 'num_class': 2\n", + " }\n", + " epochs = 500\n", + "\n", + " model = xgb.train(param, train, epochs)\n", + " predictions = model.predict(test)\n", + " y = []\n", + " for z in predictions:\n", + " y.append(np.argmax(z))\n", + "\n", + " acc = round(accuracy_score(y_test, y)*100, 1)\n", + " acc_results.append(acc)\n", + " clear_output(wait=True)\n", + " print(f\"Best accuracy: {max(acc_results)}%\")\n", + "\n", + " # only save results if they are the best so far\n", + " if acc == max(acc_results):\n", + " with open('train_games_ML.pkl','wb') as f:\n", + " pkl.dump(train_games,f)\n", + " with open('test_games_ML.pkl','wb') as f:\n", + " pkl.dump(test_games,f)\n", + " model.save_model(f'models/xgboost_ML_{acc}%.json')\n", + "\n", + "print('Done')" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import xgboost as xgb\n", + "xgb_ml = xgb.Booster()\n", + "xgb_ml.load_model('models/xgboost_ML_75.4%.json')\n", + "\n", + "with open('test_games_ML.pkl','rb') as f:\n", + " test_games = pkl.load(f).tolist()\n", + "\n", + "gbg_and_odds = pd.read_csv('gbg_and_odds.csv', index_col=0)\n", + "test_data = gbg_and_odds.loc[gbg_and_odds['game_id'].isin(test_games)]\n", + "test_data_matrix = xgb.DMatrix(test_data.drop(columns=['game_id','Over','Home-Team-Win','Season','home_team','away_team','game_date','Key','Home Score','Away Score','Home Odds Close','Away Odds Close','Home Winnings','Away Winnings']).astype(float).values)\n", + "\n", + "predicted_probas = xgb_ml.predict(test_data_matrix)\n", + "predictions = np.argmax(predicted_probas, axis=1)\n", + "test_data['predicted_proba'] = [i[1] for i in predicted_probas]\n", + "test_data['prediction'] = (test_data['predicted_proba']>0.5).astype(int)\n", + "test_data['correct'] = test_data['Home-Team-Win']==test_data['prediction']\n", + "test_data['home_ev'] = [((h-1)*p)+((-1)*p-1) for h,a,p in test_data[['Home Odds Close','Away Odds Close','predicted_proba']].values]\n", + "test_data['away_ev'] = [((-1)*p)+((a-1)*p-1) for h,a,p in test_data[['Home Odds Close','Away Odds Close','predicted_proba']].values]\n", + "\n", + "bets = test_data.loc[(test_data['predicted_proba']>0.6) | (test_data['predicted_proba']<0.4)]\n", + "bets['winnings'] = [h if he>ae else a for h,a,he,ae in bets[['Home Winnings','Away Winnings','home_ev','away_ev']].values]\n", + "bets['winnings'] = [h if c else a for h,a,c in bets[['Home Winnings','Away Winnings','correct']].values]\n", + "\n", + "import matplotlib.pyplot as plt\n", + "plt.plot(bets['winnings'].cumsum().values*100, linewidth=3)\n", + "plt.title('MARCI 3.0 - MoneyLine w/ 10% Threshold')\n", + "plt.xlabel('Games Bet On')\n", + "plt.ylabel('Return (%)')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## OU" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [02:17<00:00, 1.37s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best accuracy: 59.3%\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import xgboost as xgb\n", + "import pickle as pkl\n", + "from IPython.display import clear_output\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "data = pd.read_csv('gbg_and_odds.csv', index_col=0).dropna()\n", + "\n", + "OU = data['Over']\n", + "data.drop(columns=['Home-Team-Win','Over','Season','home_team','away_team','game_date','Key','Home Score','Away Score','Home Odds Close','Away Odds Close','Home Winnings','Away Winnings'], inplace=True)\n", + "\n", + "acc_results = []\n", + "\n", + "for x in tqdm(range(100)):\n", + " X_train, X_test, y_train, y_test = train_test_split(data, OU, test_size=.1)\n", + "\n", + " train_games = X_train['game_id']\n", + " test_games = X_test['game_id']\n", + "\n", + " X_train.drop(columns=['game_id'], inplace=True)\n", + " X_test.drop(columns=['game_id'], inplace=True)\n", + "\n", + " train = xgb.DMatrix(X_train.astype(float).values, label=y_train)\n", + " test = xgb.DMatrix(X_test.astype(float).values, label=y_test)\n", + "\n", + " param = {\n", + " 'max_depth': 6,\n", + " 'eta': 0.05,\n", + " 'objective': 'multi:softprob',\n", + " 'num_class': 3\n", + " }\n", + " epochs = 300\n", + "\n", + " model = xgb.train(param, train, epochs)\n", + "\n", + " predictions = model.predict(test)\n", + " y = []\n", + "\n", + " for z in predictions:\n", + " y.append(np.argmax(z))\n", + "\n", + " acc = round(accuracy_score(y_test, y)*100, 1)\n", + " acc_results.append(acc)\n", + " clear_output(wait=True)\n", + " print(f\"Best accuracy: {max(acc_results)}%\")\n", + " \n", + " # only save results if they are the best so far\n", + " if acc == max(acc_results):\n", + " with open('train_games_OU.pkl','wb') as f:\n", + " pkl.dump(train_games,f)\n", + " with open('test_games_OU.pkl','wb') as f:\n", + " pkl.dump(test_games,f)\n", + " model.save_model(f'models/xgboost_OU_{acc}%.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import xgboost as xgb\n", + "xgb_ou = xgb.Booster()\n", + "xgb_ou.load_model('models/xgboost_OU_59.3%.json')\n", + "\n", + "with open('test_games_OU.pkl','rb') as f:\n", + " test_games = pkl.load(f).tolist()\n", + "\n", + "gbg_and_odds = pd.read_csv('gbg_and_odds.csv', index_col=0)\n", + "test_data = gbg_and_odds.loc[gbg_and_odds['game_id'].isin(test_games)]\n", + "test_data_matrix = xgb.DMatrix(test_data.drop(columns=['game_id','Over','Home-Team-Win','Season','home_team','away_team','game_date','Key','Home Score','Away Score','Home Odds Close','Away Odds Close','Home Winnings','Away Winnings']).astype(float).values)\n", + "\n", + "predicted_probas = xgb_ou.predict(test_data_matrix)\n", + "predictions = np.argmax(predicted_probas, axis=1)\n", + "test_data['predicted_proba'] = [i[1] for i in predicted_probas]\n", + "test_data['prediction'] = (test_data['predicted_proba']>0.5).astype(int)\n", + "test_data['correct'] = test_data['Over']==test_data['prediction']\n", + "\n", + "bets = test_data\n", + "bets['winnings'] = [0.91 if c else -1 for c in bets[['correct']].values]\n", + "\n", + "import matplotlib.pyplot as plt\n", + "plt.plot(bets['winnings'].cumsum().values*100, linewidth=3)\n", + "plt.title('MARCI 3.0 - Over/Under')\n", + "plt.xlabel('Games Bet On')\n", + "plt.ylabel('Return (%)')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Predict" + ] + }, + { + "cell_type": "code", + "execution_count": 321, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Brayden\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\dateutil\\parser\\_parser.py:1207: UnknownTimezoneWarning: tzname EST identified but not understood. Pass `tzinfos` argument in order to correctly return a timezone-aware datetime. In a future version, this will raise an exception.\n", + " warnings.warn(\"tzname {tzname} identified but not understood. \"\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Away TeamAway TeamHome TeamHome TeamGame TimeGame TimeNetworkNetworkVenueVenueAway TeamHome TeamDate
0DET Detroit LionsKC Kansas City Chiefs09/07/23 8:20 PM ESTNBCGEHA Field at Arrowhead StadiumDetroit LionsKansas City ChiefsThursday 7/9 08:20
1CIN Cincinnati BengalsCLE Cleveland Browns09/10/23 1:00 PM ESTCBSCleveland Browns StadiumCincinnati BengalsCleveland BrownsSunday 10/9 01:00
2Jax Jacksonville JaguarsIND Indianapolis Colts09/10/23 1:00 PM ESTFOXLucas Oil StadiumJacksonville JaguarsIndianapolis ColtsSunday 10/9 01:00
3TB Tampa Bay BuccaneersMIN Minnesota Vikings09/10/23 1:00 PM ESTCBSU.S. Bank StadiumTampa Bay BuccaneersMinnesota VikingsSunday 10/9 01:00
4TEN Tennessee TitansNO New Orleans Saints09/10/23 1:00 PM ESTCBSCaesars SuperdomeTennessee TitansNew Orleans SaintsSunday 10/9 01:00
5CAR Carolina PanthersATL Atlanta Falcons09/10/23 1:00 PM ESTFOXMercedes-Benz StadiumCarolina PanthersAtlanta FalconsSunday 10/9 01:00
6HOU Houston TexansBAL Baltimore Ravens09/10/23 1:00 PM ESTCBSM&T Bank StadiumHouston TexansBaltimore RavensSunday 10/9 01:00
7SF San Francisco 49ersPIT Pittsburgh Steelers09/10/23 1:00 PM ESTFOXAcrisure StadiumSan Francisco 49ersPittsburgh SteelersSunday 10/9 01:00
8ARI Arizona CardinalsWAS Washington Commanders09/10/23 1:00 PM ESTFOXFedEx FieldArizona CardinalsWashington CommandersSunday 10/9 01:00
9GB Green Bay PackersCHI Chicago Bears09/10/23 4:25 PM ESTFOXSoldier FieldGreen Bay PackersChicago BearsSunday 10/9 04:25
10MIA Miami DolphinsLAC Los Angeles Chargers09/10/23 4:25 PM ESTCBSSoFi StadiumMiami DolphinsLos Angeles ChargersSunday 10/9 04:25
11LV Las Vegas RaidersDEN Denver Broncos09/10/23 4:25 PM ESTCBSEmpower Field at Mile HighLas Vegas RaidersDenver BroncosSunday 10/9 04:25
12PHI Philadelphia EaglesNE New England Patriots09/10/23 4:25 PM ESTCBSGillette StadiumPhiladelphia EaglesNew England PatriotsSunday 10/9 04:25
13LAR Los Angeles RamsSEA Seattle Seahawks09/10/23 4:25 PM ESTFOXLumen FieldLos Angeles RamsSeattle SeahawksSunday 10/9 04:25
14DAL Dallas CowboysNYG New York Giants09/10/23 8:20 PM ESTNBC, PEAKMetLife StadiumDallas CowboysNew York GiantsSunday 10/9 08:20
15BUF Buffalo BillsNYJ New York Jets09/11/23 8:15 PM ESTESPN, ABC, ESP+MetLife StadiumBuffalo BillsNew York JetsMonday 11/9 08:15
\n", + "
" + ], + "text/plain": [ + " Away TeamAway Team Home TeamHome Team Game TimeGame Time \\\n", + "0 DET Detroit Lions KC Kansas City Chiefs 09/07/23 8:20 PM EST \n", + "1 CIN Cincinnati Bengals CLE Cleveland Browns 09/10/23 1:00 PM EST \n", + "2 Jax Jacksonville Jaguars IND Indianapolis Colts 09/10/23 1:00 PM EST \n", + "3 TB Tampa Bay Buccaneers MIN Minnesota Vikings 09/10/23 1:00 PM EST \n", + "4 TEN Tennessee Titans NO New Orleans Saints 09/10/23 1:00 PM EST \n", + "5 CAR Carolina Panthers ATL Atlanta Falcons 09/10/23 1:00 PM EST \n", + "6 HOU Houston Texans BAL Baltimore Ravens 09/10/23 1:00 PM EST \n", + "7 SF San Francisco 49ers PIT Pittsburgh Steelers 09/10/23 1:00 PM EST \n", + "8 ARI Arizona Cardinals WAS Washington Commanders 09/10/23 1:00 PM EST \n", + "9 GB Green Bay Packers CHI Chicago Bears 09/10/23 4:25 PM EST \n", + "10 MIA Miami Dolphins LAC Los Angeles Chargers 09/10/23 4:25 PM EST \n", + "11 LV Las Vegas Raiders DEN Denver Broncos 09/10/23 4:25 PM EST \n", + "12 PHI Philadelphia Eagles NE New England Patriots 09/10/23 4:25 PM EST \n", + "13 LAR Los Angeles Rams SEA Seattle Seahawks 09/10/23 4:25 PM EST \n", + "14 DAL Dallas Cowboys NYG New York Giants 09/10/23 8:20 PM EST \n", + "15 BUF Buffalo Bills NYJ New York Jets 09/11/23 8:15 PM EST \n", + "\n", + " NetworkNetwork VenueVenue Away Team \\\n", + "0 NBC GEHA Field at Arrowhead Stadium Detroit Lions \n", + "1 CBS Cleveland Browns Stadium Cincinnati Bengals \n", + "2 FOX Lucas Oil Stadium Jacksonville Jaguars \n", + "3 CBS U.S. Bank Stadium Tampa Bay Buccaneers \n", + "4 CBS Caesars Superdome Tennessee Titans \n", + "5 FOX Mercedes-Benz Stadium Carolina Panthers \n", + "6 CBS M&T Bank Stadium Houston Texans \n", + "7 FOX Acrisure Stadium San Francisco 49ers \n", + "8 FOX FedEx Field Arizona Cardinals \n", + "9 FOX Soldier Field Green Bay Packers \n", + "10 CBS SoFi Stadium Miami Dolphins \n", + "11 CBS Empower Field at Mile High Las Vegas Raiders \n", + "12 CBS Gillette Stadium Philadelphia Eagles \n", + "13 FOX Lumen Field Los Angeles Rams \n", + "14 NBC, PEAK MetLife Stadium Dallas Cowboys \n", + "15 ESPN, ABC, ESP+ MetLife Stadium Buffalo Bills \n", + "\n", + " Home Team Date \n", + "0 Kansas City Chiefs Thursday 7/9 08:20 \n", + "1 Cleveland Browns Sunday 10/9 01:00 \n", + "2 Indianapolis Colts Sunday 10/9 01:00 \n", + "3 Minnesota Vikings Sunday 10/9 01:00 \n", + "4 New Orleans Saints Sunday 10/9 01:00 \n", + "5 Atlanta Falcons Sunday 10/9 01:00 \n", + "6 Baltimore Ravens Sunday 10/9 01:00 \n", + "7 Pittsburgh Steelers Sunday 10/9 01:00 \n", + "8 Washington Commanders Sunday 10/9 01:00 \n", + "9 Chicago Bears Sunday 10/9 04:25 \n", + "10 Los Angeles Chargers Sunday 10/9 04:25 \n", + "11 Denver Broncos Sunday 10/9 04:25 \n", + "12 New England Patriots Sunday 10/9 04:25 \n", + "13 Seattle Seahawks Sunday 10/9 04:25 \n", + "14 New York Giants Sunday 10/9 08:20 \n", + "15 New York Jets Monday 11/9 08:15 " + ] + }, + "execution_count": 321, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = 'https://www.nbcsports.com/nfl/schedule'\n", + "df = pd.read_html(url)[0]\n", + "df['Away Team'] = [' '.join(i.split('\\xa0')[1:]) for i in df['Away TeamAway Team']]\n", + "df['Home Team'] = [' '.join(i.split('\\xa0')[1:]) for i in df['Home TeamHome Team']]\n", + "df['Date'] = pd.to_datetime(df['Game TimeGame Time'])\n", + "df['Date'] = df['Date'].dt.strftime('%A %m/%d %I:%M %p')\n", + "df['Date'] = df['Date'].apply(lambda x: f\"{x.split()[0]} {int(x.split()[1].split('/')[1])}/{int(x.split()[1].split('/')[0])} {x.split()[2]}\".capitalize())\n", + "\n", + "return df[['Away Team','Home Team','Date']]" + ] + }, + { + "cell_type": "code", + "execution_count": 243, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Brayden\\AppData\\Local\\Temp\\ipykernel_21628\\2050820634.py:1: DtypeWarning: Columns (181,182,184,185,191,192,195,196,199,200,205,206,207,208,220,221,222,224,226,228,245,246,247,255,256,257,262,264,265,285,286,303) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " pbp = pd.read_csv('Data/pbp_this_year.csv')\n" + ] + } + ], + "source": [ + "pbp = pd.read_csv('Data/pbp_this_year.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 314, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Brayden\\AppData\\Local\\Temp\\ipykernel_21628\\414576273.py:8: DtypeWarning: Columns (181,182,184,185,191,192,195,196,199,200,205,206,207,208,220,221,222,224,226,228,245,246,247,255,256,257,262,264,265,285,286,303) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " pbp = pd.read_csv(file_path)\n" + ] + } + ], + "source": [ + "import os\n", + "current_directory = os.path.dirname('')\n", + "parent_directory = os.path.dirname(current_directory)\n", + "data_directory = os.path.join(parent_directory, 'Data')\n", + "model_directory = os.path.join(parent_directory, 'Models')\n", + "\n", + "file_path = os.path.join(data_directory, f'pbp_this_year.csv')\n", + "pbp = pd.read_csv(file_path)\n", + "\n", + "def get_one_week(team_name,season,week):\n", + " # create columns\n", + " team = pbp.loc[((pbp['home_team']==team_name) | (pbp['away_team']==team_name)) & (pbp['season']==season)] \n", + " team['GP'] = team['week']\n", + " team['W'] = [1 if r>0 and team_name==h else 1 if r<0 and team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]\n", + " team['L'] = [0 if r>0 and team_name==h else 0 if r<0 and team_name==a else 1 for r,a,h in team[['result','away_team','home_team']].values]\n", + " team['W_PCT'] = team['W']/team['GP']\n", + " team['TOP'] = [t if team_name==p else 0 for t,p in team[['TOP_seconds','posteam']].values]\n", + " team['FGA'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','field_goal_attempt']].values]\n", + " team['FGM'] = [1 if team_name==p and f=='made' else 0 for p,f in team[['posteam','field_goal_result']].values]\n", + " team['FG_PCT'] = team['FGM']/team['FGA']\n", + " team['PassTD'] = np.where((team['posteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)\n", + " team['RushTD'] = np.where((team['posteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)\n", + " team['PassTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)\n", + " team['RushTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)\n", + " team['PassYds'] = [y if p==team_name else 0 for p,y in team[['posteam','passing_yards']].values]\n", + " team['RushYds'] = [y if p==team_name else 0 for p,y in team[['posteam','rushing_yards']].values]\n", + " team['PassYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','passing_yards']].values]\n", + " team['RushYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','rushing_yards']].values]\n", + " team['Fum'] = np.where((team['defteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)\n", + " team['Fum_Allowed'] = np.where((team['posteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)\n", + " team['INT'] = np.where((team['defteam'] == team_name) & (team['interception'] == 1), 1, 0)\n", + " team['INT_Allowed'] = np.where((team['posteam'] == team_name) & (team['interception'] == 1), 1, 0)\n", + " team['Sacks'] = np.where((team['defteam'] == team_name) & (team['sack'] == 1), 1, 0)\n", + " team['Sacks_Allowed'] = np.where((team['posteam'] == team_name) & (team['sack'] == 1), 1, 0)\n", + " team['Penalties'] = np.where((team['penalty_team'] == team_name), 1, 0)\n", + " team['FirstDowns'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','first_down']].values]\n", + " team['3rdDownConverted'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_converted']].values]\n", + " team['3rdDownFailed'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_failed']].values]\n", + " team['3rdDownAllowed'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_converted']].values]\n", + " team['3rdDownDefended'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_failed']].values]\n", + " team['PTS'] = [ap if at==team_name else hp if ht==team_name else None for ht,at,hp,ap in team[['home_team','away_team','home_score','away_score']].values]\n", + " team['PointDiff'] = [r if team_name==h else -r if team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]\n", + "\n", + " # aggregate from play-by-play to game-by-game\n", + " features = {\n", + " 'GP':'mean',\n", + " 'W':'mean',\n", + " 'L':'mean',\n", + " 'W_PCT':'mean',\n", + " 'TOP':'sum',\n", + " 'FGA':'sum',\n", + " 'FGM':'sum',\n", + " 'FG_PCT':'mean',\n", + " 'PassTD':'sum',\n", + " 'RushTD':'sum',\n", + " 'PassTD_Allowed':'sum',\n", + " 'RushTD_Allowed':'sum',\n", + " 'PassYds':'sum',\n", + " 'RushYds':'sum',\n", + " 'PassYds_Allowed':'sum',\n", + " 'RushYds_Allowed':'sum',\n", + " 'Fum':'sum',\n", + " 'Fum_Allowed':'sum',\n", + " 'INT':'sum',\n", + " 'INT_Allowed':'sum',\n", + " 'Sacks':'sum',\n", + " 'Sacks_Allowed':'sum',\n", + " 'Penalties':'sum',\n", + " 'FirstDowns':'sum',\n", + " '3rdDownConverted':'sum',\n", + " '3rdDownFailed':'sum',\n", + " '3rdDownAllowed':'sum',\n", + " '3rdDownDefended':'sum',\n", + " 'PTS':'mean',\n", + " 'PointDiff':'mean'\n", + " }\n", + "\n", + " game = team.groupby('game_id').agg(features).reset_index()\n", + " game[['W','L']] = game[['W','L']].expanding().sum()\n", + " game[game.columns[4:]] = game[game.columns[4:]].expanding().mean()\n", + " game['TEAM'] = team_name\n", + " game['Season'] = season\n", + " return game.loc[game['GP']==week]\n", + "\n", + "\n", + "def get_one_week_home_and_away(home,away,season,week):\n", + " home = get_one_week(home,season,week)\n", + " away = get_one_week(away,season,week)\n", + " away.columns = [f'{i}.Away' for i in away.columns]\n", + " gbg = home.merge(away,left_index=True,right_index=True)\n", + " gbg.drop(columns=['TEAM','TEAM.Away','Season.Away','game_id.Away'], inplace=True)\n", + " return gbg\n", + "\n", + "\n", + "def predict(home,away,season,week,total,home_odds,away_odds):\n", + " # finish preparing data\n", + " gbg = get_one_week_home_and_away(home,away,season,week)\n", + " gbg['Total Score Close'] = total\n", + " gbg['Home Odds'] = home_odds\n", + " gbg['Away Odds'] = away_odds\n", + " matrix = xgb.DMatrix(gbg.drop(columns=['game_id','Season']).astype(float).values)\n", + "\n", + " # moneyline\n", + " model = 'xgboost_ML_75.4%'\n", + " file_path = os.path.join(model_directory, f'{model}.json')\n", + " xgb_ml = xgb.Booster()\n", + " xgb_ml.load_model(file_path)\n", + " ml_predicted_proba = xgb_ml.predict(matrix)[0][1]\n", + "\n", + " # over/under\n", + " model = 'xgboost_OU_59.3%'\n", + " file_path = os.path.join(model_directory, f'{model}.json')\n", + " xgb_ou = xgb.Booster()\n", + " xgb_ou.load_model(file_path)\n", + " ou_predicted_proba = xgb_ou.predict(matrix)[0][1]\n", + "\n", + " # return dict\n", + " predictions = {'Moneyline':{\n", + " home: ml_predicted_proba,\n", + " away: 1-ml_predicted_proba},\n", + " 'Over/Under':{\n", + " 'Over':ou_predicted_proba,\n", + " 'Under':1-ou_predicted_proba}\n", + " }\n", + " \n", + " return predictions\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 315, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Moneyline': {'PHI': 0.4090659, 'KC': 0.5909340977668762},\n", + " 'Over/Under': {'Over': 0.8390681, 'Under': 0.16093188524246216}}" + ] + }, + "execution_count": 315, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predict('PHI','KC',2022,22,51,-125,105)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import Source.Build.nfl_data_py.nfl_data_py as nfl\n", + "# get team abbreviations\n", + "team_descriptions = nfl.import_team_desc()\n", + "team_abbreviation_dict = dict(team_descriptions[['team_name','team_abbr']].values)\n", + "team_name_dict = dict(team_descriptions[['team_abbr','team_name']].values)\n", + "\n", + "import pickle as pkl\n", + "\n", + "with open('Source/Pickles/team_name_to_abbreviation.pkl', 'wb') as f:\n", + " pkl.dump(team_abbreviation_dict,f)\n", + "\n", + "with open('Source/Pickles/team_abbreviation_to_name.pkl', 'wb') as f:\n", + " pkl.dump(team_name_dict,f)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1782" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pickle as pkl\n", + "with open('Source/Pickles/train_games_OU.pkl', 'rb') as f:\n", + " train_games_ML = pkl.load(f)\n", + "len(train_games_ML)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}