{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "\n", "DATA_DIR = os.path.join(\"..\", \"data\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SeasonDayNumWTeamIDWScoreLTeamIDLScoreWLocNumOTWFGMWFGA...LFTMLFTALORLDRLAstLTOLStlLBlkLPFLeague
8082015137132071146154N02350...3941814122317M
6822021146325760333242N02663...4514237145610W
11142019154143885140377N12759...1315923986318M
7182022138326183323877H03074...1929122811155423W
12172022138111653130848N01451...6104328174415M
\n", "

5 rows × 35 columns

\n", "
" ], "text/plain": [ " Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM \\\n", "808 2015 137 1320 71 1461 54 N 0 23 \n", "682 2021 146 3257 60 3332 42 N 0 26 \n", "1114 2019 154 1438 85 1403 77 N 1 27 \n", "718 2022 138 3261 83 3238 77 H 0 30 \n", "1217 2022 138 1116 53 1308 48 N 0 14 \n", "\n", " WFGA ... LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF League \n", "808 50 ... 3 9 4 18 14 12 2 3 17 M \n", "682 63 ... 4 5 14 23 7 14 5 6 10 W \n", "1114 59 ... 13 15 9 23 9 8 6 3 18 M \n", "718 74 ... 19 29 12 28 11 15 5 4 23 W \n", "1217 51 ... 6 10 4 32 8 17 4 4 15 M \n", "\n", "[5 rows x 35 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "detailed_tourney_games_df = pd.concat(\n", " [\n", " pd.read_csv(os.path.join(DATA_DIR, \"MNCAATourneyDetailedResults.csv\")).assign(\n", " League=\"M\"\n", " ),\n", " pd.read_csv(os.path.join(DATA_DIR, \"WNCAATourneyDetailedResults.csv\")).assign(\n", " League=\"W\"\n", " ),\n", " ]\n", ")\n", "\n", "detailed_tourney_games_df.sample(5, random_state=1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SeasonDayNumWTeamIDWScoreLTeamIDLScoreWLocNumOTWFGMWFGA...LFTMLFTALORLDRLAstLTOLStlLBlkLPFLeague
279482008110119360118051A02252...712142013104316M
88315202019134581124049H03156...122113211084019M
1536200359127272111667A02350...142051781310225M
104465202366118692134080H03051...10131217151111220M
85458201986129271141265H02557...152252771111214M
\n", "

5 rows × 35 columns

\n", "
" ], "text/plain": [ " Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM \\\n", "27948 2008 110 1193 60 1180 51 A 0 22 \n", "88315 2020 19 1345 81 1240 49 H 0 31 \n", "1536 2003 59 1272 72 1116 67 A 0 23 \n", "104465 2023 66 1186 92 1340 80 H 0 30 \n", "85458 2019 86 1292 71 1412 65 H 0 25 \n", "\n", " WFGA ... LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF League \n", "27948 52 ... 7 12 14 20 13 10 4 3 16 M \n", "88315 56 ... 12 21 13 21 10 8 4 0 19 M \n", "1536 50 ... 14 20 5 17 8 13 10 2 25 M \n", "104465 51 ... 10 13 12 17 15 11 11 2 20 M \n", "85458 57 ... 15 22 5 27 7 11 11 2 14 M \n", "\n", "[5 rows x 35 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "detailed_reg_games_df = pd.concat(\n", " [\n", " pd.read_csv(os.path.join(DATA_DIR, \"MRegularSeasonDetailedResults.csv\")).assign(\n", " League=\"M\"\n", " ),\n", " pd.read_csv(os.path.join(DATA_DIR, \"WRegularSeasonDetailedResults.csv\")).assign(\n", " League=\"W\"\n", " ),\n", " ]\n", ")\n", "\n", "detailed_reg_games_df.sample(5, random_state=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concat" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# here we are making it such that each game has two rows, where each one is a team view of the game with\n", "# opposing metrics.\n", "\n", "detailed_metrics = {\n", " \"Score\",\n", " # \"Loc\",\n", " \"FGM\",\n", " \"FGA\",\n", " \"FGM3\",\n", " \"FTM\",\n", " \"FTA\",\n", " \"OR\",\n", " \"DR\",\n", " \"Ast\",\n", " \"Blk\",\n", " \"TO\",\n", " \"Stl\",\n", " \"PF\",\n", "}\n", "\n", "w_renamed_cols = {f\"W{col}\": f\"Team{col}\" for col in detailed_metrics} | {\n", " f\"L{col}\": f\"Opp{col}\" for col in detailed_metrics\n", "}\n", "l_renamed_cols = {f\"L{col}\": f\"Team{col}\" for col in detailed_metrics} | {\n", " f\"W{col}\": f\"Opp{col}\" for col in detailed_metrics\n", "}" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 373324 entries, 0 to 373323\n", "Data columns (total 36 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Season 373324 non-null int64 \n", " 1 DayNum 373324 non-null int64 \n", " 2 TeamID 373324 non-null int64 \n", " 3 TeamScore 373324 non-null int64 \n", " 4 OppTeamID 373324 non-null int64 \n", " 5 OppScore 373324 non-null int64 \n", " 6 WLoc 373324 non-null object\n", " 7 NumOT 373324 non-null int64 \n", " 8 TeamFGM 373324 non-null int64 \n", " 9 TeamFGA 373324 non-null int64 \n", " 10 TeamFGM3 373324 non-null int64 \n", " 11 WFGA3 373324 non-null int64 \n", " 12 TeamFTM 373324 non-null int64 \n", " 13 TeamFTA 373324 non-null int64 \n", " 14 TeamOR 373324 non-null int64 \n", " 15 TeamDR 373324 non-null int64 \n", " 16 TeamAst 373324 non-null int64 \n", " 17 TeamTO 373324 non-null int64 \n", " 18 TeamStl 373324 non-null int64 \n", " 19 TeamBlk 373324 non-null int64 \n", " 20 TeamPF 373324 non-null int64 \n", " 21 OppFGM 373324 non-null int64 \n", " 22 OppFGA 373324 non-null int64 \n", " 23 OppFGM3 373324 non-null int64 \n", " 24 LFGA3 373324 non-null int64 \n", " 25 OppFTM 373324 non-null int64 \n", " 26 OppFTA 373324 non-null int64 \n", " 27 OppOR 373324 non-null int64 \n", " 28 OppDR 373324 non-null int64 \n", " 29 OppAst 373324 non-null int64 \n", " 30 OppTO 373324 non-null int64 \n", " 31 OppStl 373324 non-null int64 \n", " 32 OppBlk 373324 non-null int64 \n", " 33 OppPF 373324 non-null int64 \n", " 34 League 373324 non-null object\n", " 35 GameResult 373324 non-null object\n", "dtypes: int64(33), object(3)\n", "memory usage: 102.5+ MB\n" ] } ], "source": [ "detailed_reg_games_df = pd.concat(\n", " [\n", " (\n", " # detailed_reg_games_df[[col for col in detailed_reg_games_df.columns if col != \"LTeamID\"]]\n", " detailed_reg_games_df[[col for col in detailed_reg_games_df.columns]]\n", " .assign(GameResult=\"W\")\n", " .rename(\n", " columns=w_renamed_cols | {\"WTeamID\": \"TeamID\", \"LTeamID\": \"OppTeamID\"}\n", " )\n", " ),\n", " (\n", " # detailed_reg_games_df[[col for col in detailed_reg_games_df.columns if col != \"WTeamID\"]]\n", " detailed_reg_games_df[[col for col in detailed_reg_games_df.columns]]\n", " .assign(GameResult=\"L\")\n", " .rename(\n", " columns=l_renamed_cols | {\"LTeamID\": \"TeamID\", \"WTeamID\": \"OppTeamID\"}\n", " )\n", " ),\n", " ]\n", ").reset_index(drop=True)\n", "\n", "detailed_reg_games_df.info()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 4284 entries, 0 to 4283\n", "Data columns (total 36 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Season 4284 non-null int64 \n", " 1 DayNum 4284 non-null int64 \n", " 2 TeamID 4284 non-null int64 \n", " 3 TeamScore 4284 non-null int64 \n", " 4 OppTeamID 4284 non-null int64 \n", " 5 OppScore 4284 non-null int64 \n", " 6 WLoc 4284 non-null object\n", " 7 NumOT 4284 non-null int64 \n", " 8 TeamFGM 4284 non-null int64 \n", " 9 TeamFGA 4284 non-null int64 \n", " 10 TeamFGM3 4284 non-null int64 \n", " 11 WFGA3 4284 non-null int64 \n", " 12 TeamFTM 4284 non-null int64 \n", " 13 TeamFTA 4284 non-null int64 \n", " 14 TeamOR 4284 non-null int64 \n", " 15 TeamDR 4284 non-null int64 \n", " 16 TeamAst 4284 non-null int64 \n", " 17 TeamTO 4284 non-null int64 \n", " 18 TeamStl 4284 non-null int64 \n", " 19 TeamBlk 4284 non-null int64 \n", " 20 TeamPF 4284 non-null int64 \n", " 21 OppFGM 4284 non-null int64 \n", " 22 OppFGA 4284 non-null int64 \n", " 23 OppFGM3 4284 non-null int64 \n", " 24 LFGA3 4284 non-null int64 \n", " 25 OppFTM 4284 non-null int64 \n", " 26 OppFTA 4284 non-null int64 \n", " 27 OppOR 4284 non-null int64 \n", " 28 OppDR 4284 non-null int64 \n", " 29 OppAst 4284 non-null int64 \n", " 30 OppTO 4284 non-null int64 \n", " 31 OppStl 4284 non-null int64 \n", " 32 OppBlk 4284 non-null int64 \n", " 33 OppPF 4284 non-null int64 \n", " 34 League 4284 non-null object\n", " 35 GameResult 4284 non-null object\n", "dtypes: int64(33), object(3)\n", "memory usage: 1.2+ MB\n" ] } ], "source": [ "# do the same thing for the tournament games\n", "detailed_tourney_games_df = pd.concat(\n", " [\n", " (\n", " # detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns if col != \"LTeamID\"]]\n", " detailed_tourney_games_df[\n", " [col for col in detailed_tourney_games_df.columns]\n", " ]\n", " .assign(GameResult=\"W\")\n", " .rename(\n", " columns=w_renamed_cols | {\"WTeamID\": \"TeamID\", \"LTeamID\": \"OppTeamID\"}\n", " )\n", " ),\n", " (\n", " # detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns if col != \"WTeamID\"]]\n", " detailed_tourney_games_df[\n", " [col for col in detailed_tourney_games_df.columns]\n", " ]\n", " .assign(GameResult=\"L\")\n", " .rename(\n", " columns=l_renamed_cols | {\"LTeamID\": \"TeamID\", \"WTeamID\": \"OppTeamID\"}\n", " )\n", " ),\n", " ]\n", ").reset_index(drop=True)\n", "\n", "detailed_tourney_games_df.info()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "for col in detailed_metrics:\n", " detailed_reg_games_df[f\"{col}Diff\"] = detailed_reg_games_df.apply(\n", " lambda row: row[f\"Team{col}\"] - row[f\"Opp{col}\"],\n", " axis=1,\n", " )\n", "\n", " detailed_tourney_games_df[f\"{col}Diff\"] = detailed_tourney_games_df.apply(\n", " lambda row: row[f\"Team{col}\"] - row[f\"Opp{col}\"],\n", " axis=1,\n", " )" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SeasonDayNumTeamIDTeamScoreOppTeamIDOppScoreWLocNumOTTeamFGMTeamFGA...FTADiffPFDiffScoreDiffFGADiffBlkDiffFGM3DiffORDiffStlDiffAstDiffDRDiff
337067201774315856318984A02155...-119-28-121-3-11-7-1-4
1007322022103143971139359H02360...17-912-42-2-14111
83150201926118082135269H02758...10-513-6214-5113
34500920194343558329265H01955...-117-713-3-12-34-7
3187072013128332245327063N02051...-112-1831-32-72-3
\n", "

5 rows × 49 columns

\n", "
" ], "text/plain": [ " Season DayNum TeamID TeamScore OppTeamID OppScore WLoc NumOT \\\n", "337067 2017 74 3158 56 3189 84 A 0 \n", "100732 2022 103 1439 71 1393 59 H 0 \n", "83150 2019 26 1180 82 1352 69 H 0 \n", "345009 2019 4 3435 58 3292 65 H 0 \n", "318707 2013 128 3322 45 3270 63 N 0 \n", "\n", " TeamFGM TeamFGA ... FTADiff PFDiff ScoreDiff FGADiff BlkDiff \\\n", "337067 21 55 ... -11 9 -28 -12 1 \n", "100732 23 60 ... 17 -9 12 -4 2 \n", "83150 27 58 ... 10 -5 13 -6 2 \n", "345009 19 55 ... -11 7 -7 13 -3 \n", "318707 20 51 ... -11 2 -18 3 1 \n", "\n", " FGM3Diff ORDiff StlDiff AstDiff DRDiff \n", "337067 -3 -11 -7 -1 -4 \n", "100732 -2 -1 4 11 1 \n", "83150 1 4 -5 1 13 \n", "345009 -1 2 -3 4 -7 \n", "318707 -3 2 -7 2 -3 \n", "\n", "[5 rows x 49 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "detailed_reg_games_df.sample(5, random_state=1)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "detailed_reg_games_df[\"Win\"] = detailed_reg_games_df.apply(\n", " lambda row: 0 if row[\"GameResult\"] == \"L\" else 1,\n", " axis=1,\n", ")\n", "\n", "detailed_reg_games_df[\"OppWin\"] = detailed_reg_games_df.apply(\n", " lambda row: 1 if row[\"GameResult\"] == \"L\" else 0,\n", " axis=1,\n", ")\n", "\n", "detailed_tourney_games_df[\"Win\"] = detailed_tourney_games_df.apply(\n", " lambda row: 0 if row[\"GameResult\"] == \"L\" else 1,\n", " axis=1,\n", ")\n", "\n", "detailed_tourney_games_df[\"OppWin\"] = detailed_tourney_games_df.apply(\n", " lambda row: 1 if row[\"GameResult\"] == \"L\" else 0,\n", " axis=1,\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# combine the two detailed game dataframes into one for future use\n", "\n", "all_detailed_games_df = pd.concat(\n", " [\n", " detailed_reg_games_df.assign(GameType=\"reg\"),\n", " detailed_tourney_games_df.assign(GameType=\"tourney\"),\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1328\n", "1 1393\n", "2 1437\n", "3 1457\n", "4 1208\n", " ... \n", "4279 3376\n", "4280 3439\n", "4281 3234\n", "4282 3261\n", "4283 3261\n", "Name: OppTeamID, Length: 377608, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_detailed_games_df[\"OppTeamID\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Aggregation" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "exclude_agg_cols = {\n", " \"TeamID\",\n", " \"Season\",\n", " \"League\",\n", " \"GameResult\",\n", " \"OppLoc\",\n", " \"TeamLoc\",\n", " \"Season\",\n", " \"DayNum\",\n", " # \"OppTeamID\",\n", "}\n", "\n", "agg_funcs = [\n", " np.min,\n", " np.max,\n", " np.std,\n", " np.median,\n", " np.mean,\n", "]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamIDSeasonLeagueTeamScore minTeamScore maxTeamScore stdTeamScore medianTeamScore meanOppTeamID minOppTeamID max...Win minWin maxWin stdWin medianWin meanOppWin minOppWin maxOppWin stdOppWin medianOppWin mean
1234834302012W417810.80833961.058.96551731293451...010.5085480.00.482759010.5085481.00.517241
690014312018M338812.28324767.066.46666711111450...010.4794630.00.333333010.4794631.00.666667
440613152014M439510.01998072.073.00000011321458...010.5080010.00.483871010.5080011.00.516129
423313072005M5310112.91186077.075.87096811021461...010.4016101.00.806452010.4016100.00.193548
340712662008M5110011.84131575.575.90625011531458...010.4568031.00.718750010.4568030.00.281250
519013522016M448910.29856767.065.06250011021464...010.4709290.00.312500010.4709291.00.687500
189211942005M4510414.19461876.076.77777811251424...010.4921030.00.370370010.4921031.00.629630
1002032702021W248013.38513753.055.47619031243418...010.4629100.00.285714010.4629101.00.714286
956732402014W438411.31900962.563.59375031203404...010.5040160.00.437500010.5040161.00.562500
1261734522011W399012.51837465.065.75000031483438...010.4568031.00.718750010.4568030.00.281250
\n", "

10 rows × 228 columns

\n", "
" ], "text/plain": [ " TeamID Season League TeamScore min TeamScore max TeamScore std \\\n", "12348 3430 2012 W 41 78 10.808339 \n", "6900 1431 2018 M 33 88 12.283247 \n", "4406 1315 2014 M 43 95 10.019980 \n", "4233 1307 2005 M 53 101 12.911860 \n", "3407 1266 2008 M 51 100 11.841315 \n", "5190 1352 2016 M 44 89 10.298567 \n", "1892 1194 2005 M 45 104 14.194618 \n", "10020 3270 2021 W 24 80 13.385137 \n", "9567 3240 2014 W 43 84 11.319009 \n", "12617 3452 2011 W 39 90 12.518374 \n", "\n", " TeamScore median TeamScore mean OppTeamID min OppTeamID max ... \\\n", "12348 61.0 58.965517 3129 3451 ... \n", "6900 67.0 66.466667 1111 1450 ... \n", "4406 72.0 73.000000 1132 1458 ... \n", "4233 77.0 75.870968 1102 1461 ... \n", "3407 75.5 75.906250 1153 1458 ... \n", "5190 67.0 65.062500 1102 1464 ... \n", "1892 76.0 76.777778 1125 1424 ... \n", "10020 53.0 55.476190 3124 3418 ... \n", "9567 62.5 63.593750 3120 3404 ... \n", "12617 65.0 65.750000 3148 3438 ... \n", "\n", " Win min Win max Win std Win median Win mean OppWin min \\\n", "12348 0 1 0.508548 0.0 0.482759 0 \n", "6900 0 1 0.479463 0.0 0.333333 0 \n", "4406 0 1 0.508001 0.0 0.483871 0 \n", "4233 0 1 0.401610 1.0 0.806452 0 \n", "3407 0 1 0.456803 1.0 0.718750 0 \n", "5190 0 1 0.470929 0.0 0.312500 0 \n", "1892 0 1 0.492103 0.0 0.370370 0 \n", "10020 0 1 0.462910 0.0 0.285714 0 \n", "9567 0 1 0.504016 0.0 0.437500 0 \n", "12617 0 1 0.456803 1.0 0.718750 0 \n", "\n", " OppWin max OppWin std OppWin median OppWin mean \n", "12348 1 0.508548 1.0 0.517241 \n", "6900 1 0.479463 1.0 0.666667 \n", "4406 1 0.508001 1.0 0.516129 \n", "4233 1 0.401610 0.0 0.193548 \n", "3407 1 0.456803 0.0 0.281250 \n", "5190 1 0.470929 1.0 0.687500 \n", "1892 1 0.492103 1.0 0.629630 \n", "10020 1 0.462910 1.0 0.714286 \n", "9567 1 0.504016 1.0 0.562500 \n", "12617 1 0.456803 0.0 0.281250 \n", "\n", "[10 rows x 228 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "team_reg_agg = (\n", " detailed_reg_games_df.groupby([\"TeamID\", \"Season\", \"League\"])\n", " .agg(\n", " {\n", " col: agg_funcs\n", " for col in detailed_reg_games_df.select_dtypes(\"number\").columns\n", " if col not in exclude_agg_cols\n", " }\n", " )\n", " .reset_index()\n", ")\n", "\n", "team_reg_agg.columns = [\" \".join(col).strip() for col in team_reg_agg.columns.values]\n", "\n", "team_reg_agg.sample(10, random_state=1)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamIDSeasonLeagueTeamScore minTeamScore maxTeamScore stdTeamScore medianTeamScore meanOppTeamID minOppTeamID max...Win minWin maxWin stdWin medianWin meanOppWin minOppWin maxOppWin stdOppWin medianOppWin mean
99513902008M628210.40833077.073.66666711651400...010.5773501.00.666667010.5773500.00.333333
160132262021W6363NaN63.063.00000032463246...00NaN0.00.00000011NaN1.01.000000
180533012023W6363NaN63.063.00000033433343...00NaN0.00.00000011NaN1.01.000000
95213732009M72741.41421473.073.00000012571326...010.7071070.50.500000010.7071070.50.500000
92413612012M6565NaN65.065.00000013011301...00NaN0.00.00000011NaN1.01.000000
138131242014W69909.91211481.080.25000031433443...010.5000001.00.750000010.5000000.00.250000
126614522021M72848.48528178.078.00000012871393...010.7071070.50.500000010.7071070.50.500000
181033042015W6969NaN69.069.00000033933393...00NaN0.00.00000011NaN1.01.000000
69713012023M6363NaN63.063.00000011661166...00NaN0.00.00000011NaN1.01.000000
76313232003M68711.52752570.069.66666711121454...010.5773501.00.666667010.5773500.00.333333
\n", "

10 rows × 228 columns

\n", "
" ], "text/plain": [ " TeamID Season League TeamScore min TeamScore max TeamScore std \\\n", "995 1390 2008 M 62 82 10.408330 \n", "1601 3226 2021 W 63 63 NaN \n", "1805 3301 2023 W 63 63 NaN \n", "952 1373 2009 M 72 74 1.414214 \n", "924 1361 2012 M 65 65 NaN \n", "1381 3124 2014 W 69 90 9.912114 \n", "1266 1452 2021 M 72 84 8.485281 \n", "1810 3304 2015 W 69 69 NaN \n", "697 1301 2023 M 63 63 NaN \n", "763 1323 2003 M 68 71 1.527525 \n", "\n", " TeamScore median TeamScore mean OppTeamID min OppTeamID max ... \\\n", "995 77.0 73.666667 1165 1400 ... \n", "1601 63.0 63.000000 3246 3246 ... \n", "1805 63.0 63.000000 3343 3343 ... \n", "952 73.0 73.000000 1257 1326 ... \n", "924 65.0 65.000000 1301 1301 ... \n", "1381 81.0 80.250000 3143 3443 ... \n", "1266 78.0 78.000000 1287 1393 ... \n", "1810 69.0 69.000000 3393 3393 ... \n", "697 63.0 63.000000 1166 1166 ... \n", "763 70.0 69.666667 1112 1454 ... \n", "\n", " Win min Win max Win std Win median Win mean OppWin min \\\n", "995 0 1 0.577350 1.0 0.666667 0 \n", "1601 0 0 NaN 0.0 0.000000 1 \n", "1805 0 0 NaN 0.0 0.000000 1 \n", "952 0 1 0.707107 0.5 0.500000 0 \n", "924 0 0 NaN 0.0 0.000000 1 \n", "1381 0 1 0.500000 1.0 0.750000 0 \n", "1266 0 1 0.707107 0.5 0.500000 0 \n", "1810 0 0 NaN 0.0 0.000000 1 \n", "697 0 0 NaN 0.0 0.000000 1 \n", "763 0 1 0.577350 1.0 0.666667 0 \n", "\n", " OppWin max OppWin std OppWin median OppWin mean \n", "995 1 0.577350 0.0 0.333333 \n", "1601 1 NaN 1.0 1.000000 \n", "1805 1 NaN 1.0 1.000000 \n", "952 1 0.707107 0.5 0.500000 \n", "924 1 NaN 1.0 1.000000 \n", "1381 1 0.500000 0.0 0.250000 \n", "1266 1 0.707107 0.5 0.500000 \n", "1810 1 NaN 1.0 1.000000 \n", "697 1 NaN 1.0 1.000000 \n", "763 1 0.577350 0.0 0.333333 \n", "\n", "[10 rows x 228 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# aggregate the same metrics for the tournament dataset\n", "\n", "team_tourney_agg = (\n", " detailed_tourney_games_df.groupby([\"TeamID\", \"Season\", \"League\"])\n", " .agg(\n", " {\n", " col: agg_funcs\n", " for col in detailed_tourney_games_df.select_dtypes(\"number\").columns\n", " if col not in exclude_agg_cols\n", " }\n", " )\n", " .reset_index()\n", ")\n", "\n", "team_tourney_agg.columns = [\n", " \" \".join(col).strip() for col in team_tourney_agg.columns.values\n", "]\n", "\n", "team_tourney_agg.sample(10, random_state=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Join Aggregated w/ Attributes" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SeasonSeedTeamIDLeagueConfAbbrevTeamNameFirstD1SeasonLastD1SeasonChalkSeed
35912004X023243Wbig_twelveKansas StNaNNaN2
35282013Y013124Wbig_twelveBaylorNaNNaN1
18912003W021448MaccWake Forest1985.02024.02
7782019Y011314MaccNorth Carolina1985.02024.01
29322019X053266Wbig_eastMarquetteNaNNaN5
\n", "
" ], "text/plain": [ " Season Seed TeamID League ConfAbbrev TeamName FirstD1Season \\\n", "3591 2004 X02 3243 W big_twelve Kansas St NaN \n", "3528 2013 Y01 3124 W big_twelve Baylor NaN \n", "1891 2003 W02 1448 M acc Wake Forest 1985.0 \n", "778 2019 Y01 1314 M acc North Carolina 1985.0 \n", "2932 2019 X05 3266 W big_east Marquette NaN \n", "\n", " LastD1Season ChalkSeed \n", "3591 NaN 2 \n", "3528 NaN 1 \n", "1891 2024.0 2 \n", "778 2024.0 1 \n", "2932 NaN 5 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "conference_df = pd.concat(\n", " [\n", " pd.read_csv(os.path.join(DATA_DIR, \"MNCAATourneySeeds.csv\")).assign(League=\"M\"),\n", " pd.read_csv(os.path.join(DATA_DIR, \"WNCAATourneySeeds.csv\")).assign(League=\"W\"),\n", " ]\n", ")\n", "\n", "team_conf_seeds_df = conference_df.merge(\n", " right=(\n", " pd.concat(\n", " [\n", " pd.read_csv(os.path.join(DATA_DIR, \"MTeamConferences.csv\")).assign(\n", " League=\"M\"\n", " ),\n", " pd.read_csv(os.path.join(DATA_DIR, \"WTeamConferences.csv\")).assign(\n", " League=\"W\"\n", " ),\n", " ]\n", " )\n", " ),\n", " on=[\"League\", \"Season\", \"TeamID\"],\n", " how=\"left\",\n", ").merge(right=(\n", " pd.concat([\n", " pd.read_csv(os.path.join(DATA_DIR, \"MTeams.csv\")),\n", " pd.read_csv(os.path.join(DATA_DIR, \"WTeams.csv\")),\n", " ])),\n", " on=\"TeamID\",\n", ")\n", "\n", "team_conf_seeds_df[\"ChalkSeed\"] = team_conf_seeds_df.apply(\n", " lambda row: int(row[\"Seed\"][1:].replace(\"a\", \"\").replace(\"b\", \"\")),\n", " axis=1,\n", ")\n", "\n", "team_conf_seeds_df.sample(5, random_state=1)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamIDSeasonLeagueTeamScore min regTeamScore max regTeamScore std regTeamScore median regTeamScore mean regOppTeamID min regOppTeamID max reg...Win min tourneyWin max tourneyWin std tourneyWin median tourneyWin mean tourneyOppWin min tourneyOppWin max tourneyOppWin std tourneyOppWin median tourneyOppWin mean tourney
1234834302012W417810.80833961.058.96551731293451...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
690014312018M338812.28324767.066.46666711111450...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
440613152014M439510.01998072.073.00000011321458...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
423313072005M5310112.91186077.075.87096811021461...0.00.0NaN0.00.01.01.0NaN1.01.0
340712662008M5110011.84131575.575.90625011531458...0.01.00.7071070.50.50.01.00.7071070.50.5
519013522016M448910.29856767.065.06250011021464...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
189211942005M4510414.19461876.076.77777811251424...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1002032702021W248013.38513753.055.47619031243418...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
956732402014W438411.31900962.563.59375031203404...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1261734522011W399012.51837465.065.75000031483438...0.01.00.7071070.50.50.01.00.7071070.50.5
\n", "

10 rows × 453 columns

\n", "
" ], "text/plain": [ " TeamID Season League TeamScore min reg TeamScore max reg \\\n", "12348 3430 2012 W 41 78 \n", "6900 1431 2018 M 33 88 \n", "4406 1315 2014 M 43 95 \n", "4233 1307 2005 M 53 101 \n", "3407 1266 2008 M 51 100 \n", "5190 1352 2016 M 44 89 \n", "1892 1194 2005 M 45 104 \n", "10020 3270 2021 W 24 80 \n", "9567 3240 2014 W 43 84 \n", "12617 3452 2011 W 39 90 \n", "\n", " TeamScore std reg TeamScore median reg TeamScore mean reg \\\n", "12348 10.808339 61.0 58.965517 \n", "6900 12.283247 67.0 66.466667 \n", "4406 10.019980 72.0 73.000000 \n", "4233 12.911860 77.0 75.870968 \n", "3407 11.841315 75.5 75.906250 \n", "5190 10.298567 67.0 65.062500 \n", "1892 14.194618 76.0 76.777778 \n", "10020 13.385137 53.0 55.476190 \n", "9567 11.319009 62.5 63.593750 \n", "12617 12.518374 65.0 65.750000 \n", "\n", " OppTeamID min reg OppTeamID max reg ... Win min tourney \\\n", "12348 3129 3451 ... NaN \n", "6900 1111 1450 ... NaN \n", "4406 1132 1458 ... NaN \n", "4233 1102 1461 ... 0.0 \n", "3407 1153 1458 ... 0.0 \n", "5190 1102 1464 ... NaN \n", "1892 1125 1424 ... NaN \n", "10020 3124 3418 ... NaN \n", "9567 3120 3404 ... NaN \n", "12617 3148 3438 ... 0.0 \n", "\n", " Win max tourney Win std tourney Win median tourney Win mean tourney \\\n", "12348 NaN NaN NaN NaN \n", "6900 NaN NaN NaN NaN \n", "4406 NaN NaN NaN NaN \n", "4233 0.0 NaN 0.0 0.0 \n", "3407 1.0 0.707107 0.5 0.5 \n", "5190 NaN NaN NaN NaN \n", "1892 NaN NaN NaN NaN \n", "10020 NaN NaN NaN NaN \n", "9567 NaN NaN NaN NaN \n", "12617 1.0 0.707107 0.5 0.5 \n", "\n", " OppWin min tourney OppWin max tourney OppWin std tourney \\\n", "12348 NaN NaN NaN \n", "6900 NaN NaN NaN \n", "4406 NaN NaN NaN \n", "4233 1.0 1.0 NaN \n", "3407 0.0 1.0 0.707107 \n", "5190 NaN NaN NaN \n", "1892 NaN NaN NaN \n", "10020 NaN NaN NaN \n", "9567 NaN NaN NaN \n", "12617 0.0 1.0 0.707107 \n", "\n", " OppWin median tourney OppWin mean tourney \n", "12348 NaN NaN \n", "6900 NaN NaN \n", "4406 NaN NaN \n", "4233 1.0 1.0 \n", "3407 0.5 0.5 \n", "5190 NaN NaN \n", "1892 NaN NaN \n", "10020 NaN NaN \n", "9567 NaN NaN \n", "12617 0.5 0.5 \n", "\n", "[10 rows x 453 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# merge the tournament aggregated metrics with the regular season aggregated metrics\n", "team_agg_df = pd.merge(\n", " left=team_reg_agg,\n", " right=team_tourney_agg,\n", " how=\"left\",\n", " on=[\"TeamID\", \"Season\", \"League\"],\n", " suffixes=(\" reg\", \" tourney\"),\n", " validate=\"1:1\",\n", ")\n", "\n", "team_agg_df.sample(10, random_state=1)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# merge the team_conf_seeds_df with team attributes into the aggregated data\n", "team_agg_df2 = pd.merge(\n", " left=team_agg_df,\n", " right=team_conf_seeds_df[team_conf_seeds_df[\"Season\"] >= 2003],\n", " on=[\"TeamID\", \"Season\", \"League\"],\n", " how=\"outer\",\n", " validate=\"1:1\",\n", ")\n", "\n", "team_agg_df2 = team_agg_df2[team_agg_df2[\"Season\"] >= 2003]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 13305 entries, 0 to 13304\n", "Columns: 459 entries, TeamID to ChalkSeed\n", "dtypes: float64(453), int64(2), object(4)\n", "memory usage: 46.7+ MB\n" ] } ], "source": [ "team_agg_df2.info()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 377608 entries, 0 to 377607\n", "Columns: 508 entries, Season to ChalkSeed\n", "dtypes: float64(453), int64(48), object(7)\n", "memory usage: 1.4+ GB\n" ] } ], "source": [ "# re merge the aggregated team stats to the games dataset\n", "super_detailed_games_df = pd.merge(\n", " left=all_detailed_games_df[all_detailed_games_df[\"Season\"] >= 2003],\n", " right=team_agg_df2,\n", " on=[\"TeamID\", \"Season\", \"League\"],\n", " how=\"left\",\n", " validate=\"m:1\",\n", ")\n", "\n", "super_detailed_games_df.info()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1328\n", "1 1393\n", "2 1437\n", "3 1457\n", "4 1208\n", " ... \n", "377603 3376\n", "377604 3439\n", "377605 3234\n", "377606 3261\n", "377607 3261\n", "Name: OppTeamID, Length: 377608, dtype: int64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "super_detailed_games_df[\"OppTeamID\"]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 377608 entries, 0 to 377607\n", "Columns: 509 entries, Season to OppChalkSeed\n", "dtypes: float64(454), int64(48), object(7)\n", "memory usage: 1.4+ GB\n" ] } ], "source": [ "opp_chalk_seed_map = team_conf_seeds_df.groupby(\"TeamID\")[\"ChalkSeed\"].last()\n", "\n", "super_detailed_games_df[\"OppChalkSeed\"] = super_detailed_games_df[\"OppTeamID\"].map(\n", " opp_chalk_seed_map\n", ")\n", "\n", "super_detailed_games_df.info()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2.0\n", "1 -4.0\n", "2 1.0\n", "3 NaN\n", "4 -9.0\n", " ... \n", "377603 1.0\n", "377604 2.0\n", "377605 -1.0\n", "377606 -2.0\n", "377607 -1.0\n", "Name: ChalkSeedDiff, Length: 377608, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "super_detailed_games_df[\"ChalkSeedDiff\"] = (\n", " super_detailed_games_df[\"OppChalkSeed\"] - super_detailed_games_df[\"ChalkSeed\"]\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save New Data" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# save all aggregated teams dataframe as well as the super deatiled games dataframe\n", "team_agg_df.to_csv(os.path.join(DATA_DIR, \"AllTeamsAgg.csv\"))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "super_detailed_games_df.to_csv(os.path.join(DATA_DIR, \"AllSuperDetailedGames.csv\"))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }