{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "\n", "DATA_DIR = os.path.join(\"..\", \"data\") " ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SeasonDayNumWTeamIDWScoreLTeamIDLScoreWLocNumOTWFGMWFGA...LFTMLFTALORLDRLAstLTOLStlLBlkLPFLeague
8082015137132071146154N02350...3941814122317M
6822021146325760333242N02663...4514237145610W
11142019154143885140377N12759...1315923986318M
7182022138326183323877H03074...1929122811155423W
12172022138111653130848N01451...6104328174415M
\n", "

5 rows × 35 columns

\n", "
" ], "text/plain": [ " Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM \\\n", "808 2015 137 1320 71 1461 54 N 0 23 \n", "682 2021 146 3257 60 3332 42 N 0 26 \n", "1114 2019 154 1438 85 1403 77 N 1 27 \n", "718 2022 138 3261 83 3238 77 H 0 30 \n", "1217 2022 138 1116 53 1308 48 N 0 14 \n", "\n", " WFGA ... LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF League \n", "808 50 ... 3 9 4 18 14 12 2 3 17 M \n", "682 63 ... 4 5 14 23 7 14 5 6 10 W \n", "1114 59 ... 13 15 9 23 9 8 6 3 18 M \n", "718 74 ... 19 29 12 28 11 15 5 4 23 W \n", "1217 51 ... 6 10 4 32 8 17 4 4 15 M \n", "\n", "[5 rows x 35 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "detailed_tourney_games_df = pd.concat([\n", " pd.read_csv(os.path.join(DATA_DIR, \"MNCAATourneyDetailedResults.csv\")).assign(League=\"M\"),\n", " pd.read_csv(os.path.join(DATA_DIR, \"WNCAATourneyDetailedResults.csv\")).assign(League=\"W\"),\n", "])\n", "\n", "detailed_tourney_games_df.sample(5, random_state=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SeasonDayNumWTeamIDWScoreLTeamIDLScoreWLocNumOTWFGMWFGA...LFTMLFTALORLDRLAstLTOLStlLBlkLPFLeague
279482008110119360118051A02252...712142013104316M
88315202019134581124049H03156...122113211084019M
1536200359127272111667A02350...142051781310225M
104465202366118692134080H03051...10131217151111220M
85458201986129271141265H02557...152252771111214M
\n", "

5 rows × 35 columns

\n", "
" ], "text/plain": [ " Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM \\\n", "27948 2008 110 1193 60 1180 51 A 0 22 \n", "88315 2020 19 1345 81 1240 49 H 0 31 \n", "1536 2003 59 1272 72 1116 67 A 0 23 \n", "104465 2023 66 1186 92 1340 80 H 0 30 \n", "85458 2019 86 1292 71 1412 65 H 0 25 \n", "\n", " WFGA ... LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF League \n", "27948 52 ... 7 12 14 20 13 10 4 3 16 M \n", "88315 56 ... 12 21 13 21 10 8 4 0 19 M \n", "1536 50 ... 14 20 5 17 8 13 10 2 25 M \n", "104465 51 ... 10 13 12 17 15 11 11 2 20 M \n", "85458 57 ... 15 22 5 27 7 11 11 2 14 M \n", "\n", "[5 rows x 35 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "detailed_reg_games_df = pd.concat([\n", " pd.read_csv(os.path.join(DATA_DIR, \"MRegularSeasonDetailedResults.csv\")).assign(League=\"M\"), \n", " pd.read_csv(os.path.join(DATA_DIR, \"WRegularSeasonDetailedResults.csv\")).assign(League=\"W\"),\n", "])\n", "\n", "detailed_reg_games_df.sample(5, random_state=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Concat" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# here we are making it such that each game has two rows, where each one is a team view of the game with\n", "# opposing metrics.\n", "\n", "detailed_metrics = {\n", " \"Score\",\n", " \"Loc\", \n", " \"FGM\",\n", " \"FGA\",\n", " \"FGM3\",\n", " \"FTM\",\n", " \"FTA\",\n", " \"OR\",\n", " \"DR\",\n", " \"Ast\",\n", " \"Blk\",\n", " \"TO\",\n", " \"Stl\",\n", " \"PF\",\n", "}\n", "\n", "w_renamed_cols = {f\"W{col}\": f\"Team{col}\" for col in detailed_metrics} | {f\"L{col}\": f\"Opp{col}\" for col in detailed_metrics}\n", "l_renamed_cols = {f\"L{col}\": f\"Team{col}\" for col in detailed_metrics} | {f\"W{col}\": f\"Opp{col}\" for col in detailed_metrics}" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 373324 entries, 0 to 373323\n", "Data columns (total 36 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Season 373324 non-null int64 \n", " 1 DayNum 373324 non-null int64 \n", " 2 TeamID 373324 non-null int64 \n", " 3 TeamScore 373324 non-null int64 \n", " 4 OppScore 373324 non-null int64 \n", " 5 TeamLoc 186662 non-null object\n", " 6 NumOT 373324 non-null int64 \n", " 7 TeamFGM 373324 non-null int64 \n", " 8 TeamFGA 373324 non-null int64 \n", " 9 TeamFGM3 373324 non-null int64 \n", " 10 WFGA3 373324 non-null int64 \n", " 11 TeamFTM 373324 non-null int64 \n", " 12 TeamFTA 373324 non-null int64 \n", " 13 TeamOR 373324 non-null int64 \n", " 14 TeamDR 373324 non-null int64 \n", " 15 TeamAst 373324 non-null int64 \n", " 16 TeamTO 373324 non-null int64 \n", " 17 TeamStl 373324 non-null int64 \n", " 18 TeamBlk 373324 non-null int64 \n", " 19 TeamPF 373324 non-null int64 \n", " 20 OppFGM 373324 non-null int64 \n", " 21 OppFGA 373324 non-null int64 \n", " 22 OppFGM3 373324 non-null int64 \n", " 23 LFGA3 373324 non-null int64 \n", " 24 OppFTM 373324 non-null int64 \n", " 25 OppFTA 373324 non-null int64 \n", " 26 OppOR 373324 non-null int64 \n", " 27 OppDR 373324 non-null int64 \n", " 28 OppAst 373324 non-null int64 \n", " 29 OppTO 373324 non-null int64 \n", " 30 OppStl 373324 non-null int64 \n", " 31 OppBlk 373324 non-null int64 \n", " 32 OppPF 373324 non-null int64 \n", " 33 League 373324 non-null object\n", " 34 GameResult 373324 non-null object\n", " 35 OppLoc 186662 non-null object\n", "dtypes: int64(32), object(4)\n", "memory usage: 102.5+ MB\n" ] } ], "source": [ "\n", "detailed_reg_games_df = pd.concat([\n", " (\n", " detailed_reg_games_df[[col for col in detailed_reg_games_df.columns if col != \"LTeamID\"]]\n", " .assign(GameResult=\"W\")\n", " .rename(columns=w_renamed_cols | {\"WTeamID\": \"TeamID\"})\n", " ),\n", " (\n", " detailed_reg_games_df[[col for col in detailed_reg_games_df.columns if col != \"WTeamID\"]]\n", " .assign(GameResult=\"L\")\n", " .rename(columns=l_renamed_cols | {\"LTeamID\": \"TeamID\"})\n", " )\n", "\n", "]).reset_index(drop=True)\n", "\n", "detailed_reg_games_df.info()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 4284 entries, 0 to 4283\n", "Data columns (total 36 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Season 4284 non-null int64 \n", " 1 DayNum 4284 non-null int64 \n", " 2 TeamID 4284 non-null int64 \n", " 3 TeamScore 4284 non-null int64 \n", " 4 OppScore 4284 non-null int64 \n", " 5 TeamLoc 2142 non-null object\n", " 6 NumOT 4284 non-null int64 \n", " 7 TeamFGM 4284 non-null int64 \n", " 8 TeamFGA 4284 non-null int64 \n", " 9 TeamFGM3 4284 non-null int64 \n", " 10 WFGA3 4284 non-null int64 \n", " 11 TeamFTM 4284 non-null int64 \n", " 12 TeamFTA 4284 non-null int64 \n", " 13 TeamOR 4284 non-null int64 \n", " 14 TeamDR 4284 non-null int64 \n", " 15 TeamAst 4284 non-null int64 \n", " 16 TeamTO 4284 non-null int64 \n", " 17 TeamStl 4284 non-null int64 \n", " 18 TeamBlk 4284 non-null int64 \n", " 19 TeamPF 4284 non-null int64 \n", " 20 OppFGM 4284 non-null int64 \n", " 21 OppFGA 4284 non-null int64 \n", " 22 OppFGM3 4284 non-null int64 \n", " 23 LFGA3 4284 non-null int64 \n", " 24 OppFTM 4284 non-null int64 \n", " 25 OppFTA 4284 non-null int64 \n", " 26 OppOR 4284 non-null int64 \n", " 27 OppDR 4284 non-null int64 \n", " 28 OppAst 4284 non-null int64 \n", " 29 OppTO 4284 non-null int64 \n", " 30 OppStl 4284 non-null int64 \n", " 31 OppBlk 4284 non-null int64 \n", " 32 OppPF 4284 non-null int64 \n", " 33 League 4284 non-null object\n", " 34 GameResult 4284 non-null object\n", " 35 OppLoc 2142 non-null object\n", "dtypes: int64(32), object(4)\n", "memory usage: 1.2+ MB\n" ] } ], "source": [ "# do the same thing for the tournament games\n", "detailed_tourney_games_df = pd.concat([\n", " (\n", " detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns if col != \"LTeamID\"]]\n", " .assign(GameResult=\"W\")\n", " .rename(columns=w_renamed_cols | {\"WTeamID\": \"TeamID\"})\n", " ),\n", " (\n", " detailed_tourney_games_df[[col for col in detailed_tourney_games_df.columns if col != \"WTeamID\"]]\n", " .assign(GameResult=\"L\")\n", " .rename(columns=l_renamed_cols | {\"LTeamID\": \"TeamID\"})\n", " )\n", "]).reset_index(drop=True)\n", "\n", "detailed_tourney_games_df.info()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "detailed_reg_games_df[\"ScoreDiff\"] = detailed_reg_games_df.apply(\n", " lambda row: row[\"TeamScore\"] - row[\"OppScore\"],\n", " axis=1,\n", ")\n", "\n", "detailed_tourney_games_df[\"ScoreDiff\"] = detailed_tourney_games_df.apply(\n", " lambda row: row[\"TeamScore\"] - row[\"OppScore\"],\n", " axis=1,\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "detailed_reg_games_df[\"Win\"] = detailed_reg_games_df.apply(\n", " lambda row: 0 if row[\"GameResult\"] == \"L\" else 1,\n", " axis=1,\n", ")\n", "\n", "detailed_tourney_games_df[\"Win\"] = detailed_tourney_games_df.apply(\n", " lambda row: 0 if row[\"GameResult\"] == \"L\" else 1,\n", " axis=1,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Aggregation" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "exclude_agg_cols = {\n", " \"TeamID\",\n", " \"Season\",\n", " \"League\",\n", " \"GameResult\",\n", " \"OppLoc\",\n", " \"TeamLoc\",\n", " \"Season\",\n", " \"DayNum\",\n", "}\n", "\n", "agg_funcs = [\n", " np.min,\n", " np.max,\n", " np.std,\n", " np.median,\n", " np.mean,\n", "]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamIDSeasonLeagueTeamScore minTeamScore maxTeamScore stdTeamScore medianTeamScore meanOppScore minOppScore max...ScoreDiff minScoreDiff maxScoreDiff stdScoreDiff medianScoreDiff meanWin minWin maxWin stdWin medianWin mean
1234834302012W417810.80833961.058.9655173685...-323516.997102-1.0-2.517241010.5085480.00.482759
690014312018M338812.28324767.066.4666674497...-492914.772645-5.0-5.100000010.4794630.00.333333
440613152014M439510.01998072.073.00000061103...-271812.316786-2.0-2.645161010.5080010.00.483871
423313072005M5310112.91186077.075.8709684781...-173413.02289111.010.935484010.4016101.00.806452
340712662008M5110011.84131575.575.9062503789...-204717.82868210.011.593750010.4568031.00.718750
519013522016M448910.29856767.065.06250045106...-621814.365582-7.0-5.781250010.4709290.00.312500
189211942005M4510414.19461876.076.77777859107...-452714.449736-3.0-1.888889010.4921030.00.370370
1002032702021W248013.38513753.055.47619041117...-932427.245445-15.0-14.285714010.4629100.00.285714
956732402014W438411.31900962.563.59375045100...-421713.277095-2.0-4.093750010.5040160.00.437500
1261734522011W399012.51837465.065.7500002179...-235718.77713113.513.500000010.4568031.00.718750
\n", "

10 rows × 158 columns

\n", "
" ], "text/plain": [ " TeamID Season League TeamScore min TeamScore max TeamScore std \\\n", "12348 3430 2012 W 41 78 10.808339 \n", "6900 1431 2018 M 33 88 12.283247 \n", "4406 1315 2014 M 43 95 10.019980 \n", "4233 1307 2005 M 53 101 12.911860 \n", "3407 1266 2008 M 51 100 11.841315 \n", "5190 1352 2016 M 44 89 10.298567 \n", "1892 1194 2005 M 45 104 14.194618 \n", "10020 3270 2021 W 24 80 13.385137 \n", "9567 3240 2014 W 43 84 11.319009 \n", "12617 3452 2011 W 39 90 12.518374 \n", "\n", " TeamScore median TeamScore mean OppScore min OppScore max ... \\\n", "12348 61.0 58.965517 36 85 ... \n", "6900 67.0 66.466667 44 97 ... \n", "4406 72.0 73.000000 61 103 ... \n", "4233 77.0 75.870968 47 81 ... \n", "3407 75.5 75.906250 37 89 ... \n", "5190 67.0 65.062500 45 106 ... \n", "1892 76.0 76.777778 59 107 ... \n", "10020 53.0 55.476190 41 117 ... \n", "9567 62.5 63.593750 45 100 ... \n", "12617 65.0 65.750000 21 79 ... \n", "\n", " ScoreDiff min ScoreDiff max ScoreDiff std ScoreDiff median \\\n", "12348 -32 35 16.997102 -1.0 \n", "6900 -49 29 14.772645 -5.0 \n", "4406 -27 18 12.316786 -2.0 \n", "4233 -17 34 13.022891 11.0 \n", "3407 -20 47 17.828682 10.0 \n", "5190 -62 18 14.365582 -7.0 \n", "1892 -45 27 14.449736 -3.0 \n", "10020 -93 24 27.245445 -15.0 \n", "9567 -42 17 13.277095 -2.0 \n", "12617 -23 57 18.777131 13.5 \n", "\n", " ScoreDiff mean Win min Win max Win std Win median Win mean \n", "12348 -2.517241 0 1 0.508548 0.0 0.482759 \n", "6900 -5.100000 0 1 0.479463 0.0 0.333333 \n", "4406 -2.645161 0 1 0.508001 0.0 0.483871 \n", "4233 10.935484 0 1 0.401610 1.0 0.806452 \n", "3407 11.593750 0 1 0.456803 1.0 0.718750 \n", "5190 -5.781250 0 1 0.470929 0.0 0.312500 \n", "1892 -1.888889 0 1 0.492103 0.0 0.370370 \n", "10020 -14.285714 0 1 0.462910 0.0 0.285714 \n", "9567 -4.093750 0 1 0.504016 0.0 0.437500 \n", "12617 13.500000 0 1 0.456803 1.0 0.718750 \n", "\n", "[10 rows x 158 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "team_reg_agg = (\n", " detailed_reg_games_df.groupby([\"TeamID\", \"Season\", \"League\"])\n", " .agg({col: agg_funcs for col in detailed_reg_games_df.select_dtypes(\"number\").columns if col not in exclude_agg_cols})\n", " .reset_index()\n", ")\n", "\n", "team_reg_agg.columns = [\" \".join(col).strip() for col in team_reg_agg.columns.values]\n", "\n", "team_reg_agg.sample(10, random_state=1)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamIDSeasonLeagueTeamScore minTeamScore maxTeamScore stdTeamScore medianTeamScore meanOppScore minOppScore max...ScoreDiff minScoreDiff maxScoreDiff stdScoreDiff medianScoreDiff meanWin minWin maxWin stdWin medianWin mean
99513902008M628210.40833077.073.6666675382...-202422.0075741.01.666667010.5773501.00.666667
160132262021W6363NaN63.063.0000007171...-8-8NaN-8.0-8.00000000NaN0.00.000000
180533012023W6363NaN63.063.0000006464...-1-1NaN-1.0-1.00000000NaN0.00.000000
95213732009M72741.41421473.073.0000007279...-726.363961-2.5-2.500000010.7071070.50.500000
92413612012M6565NaN65.065.0000007979...-14-14NaN-14.0-14.00000000NaN0.00.000000
138131242014W69909.91211481.080.2500005688...-191918.02544515.57.750000010.5000001.00.750000
126614522021M72848.48528178.078.0000006775...-31714.1421367.07.000000010.7071070.50.500000
181033042015W6969NaN69.069.0000007272...-3-3NaN-3.0-3.00000000NaN0.00.000000
69713012023M6363NaN63.063.0000007272...-9-9NaN-9.0-9.00000000NaN0.00.000000
76313232003M68711.52752570.069.6666676088...-17812.8970281.0-2.666667010.5773501.00.666667
\n", "

10 rows × 158 columns

\n", "
" ], "text/plain": [ " TeamID Season League TeamScore min TeamScore max TeamScore std \\\n", "995 1390 2008 M 62 82 10.408330 \n", "1601 3226 2021 W 63 63 NaN \n", "1805 3301 2023 W 63 63 NaN \n", "952 1373 2009 M 72 74 1.414214 \n", "924 1361 2012 M 65 65 NaN \n", "1381 3124 2014 W 69 90 9.912114 \n", "1266 1452 2021 M 72 84 8.485281 \n", "1810 3304 2015 W 69 69 NaN \n", "697 1301 2023 M 63 63 NaN \n", "763 1323 2003 M 68 71 1.527525 \n", "\n", " TeamScore median TeamScore mean OppScore min OppScore max ... \\\n", "995 77.0 73.666667 53 82 ... \n", "1601 63.0 63.000000 71 71 ... \n", "1805 63.0 63.000000 64 64 ... \n", "952 73.0 73.000000 72 79 ... \n", "924 65.0 65.000000 79 79 ... \n", "1381 81.0 80.250000 56 88 ... \n", "1266 78.0 78.000000 67 75 ... \n", "1810 69.0 69.000000 72 72 ... \n", "697 63.0 63.000000 72 72 ... \n", "763 70.0 69.666667 60 88 ... \n", "\n", " ScoreDiff min ScoreDiff max ScoreDiff std ScoreDiff median \\\n", "995 -20 24 22.007574 1.0 \n", "1601 -8 -8 NaN -8.0 \n", "1805 -1 -1 NaN -1.0 \n", "952 -7 2 6.363961 -2.5 \n", "924 -14 -14 NaN -14.0 \n", "1381 -19 19 18.025445 15.5 \n", "1266 -3 17 14.142136 7.0 \n", "1810 -3 -3 NaN -3.0 \n", "697 -9 -9 NaN -9.0 \n", "763 -17 8 12.897028 1.0 \n", "\n", " ScoreDiff mean Win min Win max Win std Win median Win mean \n", "995 1.666667 0 1 0.577350 1.0 0.666667 \n", "1601 -8.000000 0 0 NaN 0.0 0.000000 \n", "1805 -1.000000 0 0 NaN 0.0 0.000000 \n", "952 -2.500000 0 1 0.707107 0.5 0.500000 \n", "924 -14.000000 0 0 NaN 0.0 0.000000 \n", "1381 7.750000 0 1 0.500000 1.0 0.750000 \n", "1266 7.000000 0 1 0.707107 0.5 0.500000 \n", "1810 -3.000000 0 0 NaN 0.0 0.000000 \n", "697 -9.000000 0 0 NaN 0.0 0.000000 \n", "763 -2.666667 0 1 0.577350 1.0 0.666667 \n", "\n", "[10 rows x 158 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# aggregate the same metrics for the tournament dataset \n", "\n", "team_tourney_agg = (\n", " detailed_tourney_games_df.groupby([\"TeamID\", \"Season\", \"League\"])\n", " .agg({col: agg_funcs for col in detailed_tourney_games_df.select_dtypes(\"number\").columns if col not in exclude_agg_cols})\n", " .reset_index()\n", ")\n", "\n", "team_tourney_agg.columns = [\" \".join(col).strip() for col in team_tourney_agg.columns.values]\n", "\n", "team_tourney_agg.sample(10, random_state=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Join Aggregated w/ Attributes" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SeasonTeamIDConfAbbrevLeagueSeedTeamNameFirstD1SeasonLastD1Season
50720161292cusaMX15MTSU19852024
117419941438accMZ07Virginia19852024
115819981428wacMX03Utah19852024
63819881328big_eightMY01Oklahoma19852024
40419991268accMY02Maryland19852024
179720071308wacMW13New Mexico St19852024
147119881185macMX15E Michigan19852024
131320211196secMZ07Florida19852024
88119871424pcaaMZ01UNLV19852024
199820061203a_tenMW08G Washington19852024
\n", "
" ], "text/plain": [ " Season TeamID ConfAbbrev League Seed TeamName FirstD1Season \\\n", "507 2016 1292 cusa M X15 MTSU 1985 \n", "1174 1994 1438 acc M Z07 Virginia 1985 \n", "1158 1998 1428 wac M X03 Utah 1985 \n", "638 1988 1328 big_eight M Y01 Oklahoma 1985 \n", "404 1999 1268 acc M Y02 Maryland 1985 \n", "1797 2007 1308 wac M W13 New Mexico St 1985 \n", "1471 1988 1185 mac M X15 E Michigan 1985 \n", "1313 2021 1196 sec M Z07 Florida 1985 \n", "881 1987 1424 pcaa M Z01 UNLV 1985 \n", "1998 2006 1203 a_ten M W08 G Washington 1985 \n", "\n", " LastD1Season \n", "507 2024 \n", "1174 2024 \n", "1158 2024 \n", "638 2024 \n", "404 2024 \n", "1797 2024 \n", "1471 2024 \n", "1313 2024 \n", "881 2024 \n", "1998 2024 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "conference_df = pd.concat([\n", " pd.read_csv(os.path.join(DATA_DIR, \"MTeamConferences.csv\")).assign(League=\"M\"),\n", " pd.read_csv(os.path.join(DATA_DIR, \"WTeamConferences.csv\")).assign(League=\"W\"),\n", "])\n", "\n", "team_conf_seeds_df = (\n", " conference_df.merge(\n", " right=(pd.concat([\n", " pd.read_csv(os.path.join(DATA_DIR, \"MNCAATourneySeeds.csv\")).assign(League=\"M\"),\n", " pd.read_csv(os.path.join(DATA_DIR, \"WNCAATourneySeeds.csv\")).assign(League=\"W\"),\n", " ])),\n", " on=[\"League\", \"Season\", \"TeamID\"],\n", " )\n", " .merge(right=pd.read_csv(os.path.join(DATA_DIR, \"MTeams.csv\")), on=\"TeamID\")\n", ")\n", "\n", "team_conf_seeds_df.sample(10, random_state=10)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# merge the tournament aggregated metrics with the regular season aggregated metrics\n", "team_agg_df = (\n", " pd.merge(\n", " left=team_reg_agg, \n", " right=team_tourney_agg, \n", " on=[\"TeamID\", \"Season\", \"League\"], \n", " suffixes=(\" reg\", \" tourney\"),\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamIDSeasonLeagueTeamScore min regTeamScore max regTeamScore std regTeamScore median regTeamScore mean regOppScore min regOppScore max reg...Win min tourneyWin max tourneyWin std tourneyWin median tourneyWin mean tourneyConfAbbrevSeedTeamNameFirstD1SeasonLastD1Season
120214372011M50889.45015673.072.7187503693...00NaN0.00.000000big_eastW09Villanova19852024
10911372013M56887.12107366.067.2500004279...00NaN0.00.000000patriotW11Bucknell19852024
110514172015M3911316.59442772.072.0000004587...010.5773501.00.666667pac_twelveX11UCLA19852024
70013052005M52848.72019972.069.5517244685...010.7071070.50.500000wacX09Nevada19852024
83413322019M478411.18657772.070.4857144690...010.5773501.00.666667pac_twelveZ12Oregon19852024
100913932018M449011.51925370.067.54545545101...010.5000001.00.750000accX11bSyracuse19852024
95313732010M539910.16455576.075.4545455187...00NaN0.00.000000maacX13Siena19852024
114114252022M589810.50604170.072.5757584391...00NaN0.00.000000pac_twelveY07USC19852024
41912342015M449010.21655670.569.4062504490...010.7071070.50.500000big_tenX07Iowa19852024
97313862008M559810.80123472.073.33333342102...00NaN0.00.000000a_tenW11St Joseph's PA19852024
\n", "

10 rows × 318 columns

\n", "
" ], "text/plain": [ " TeamID Season League TeamScore min reg TeamScore max reg \\\n", "1202 1437 2011 M 50 88 \n", "109 1137 2013 M 56 88 \n", "1105 1417 2015 M 39 113 \n", "700 1305 2005 M 52 84 \n", "834 1332 2019 M 47 84 \n", "1009 1393 2018 M 44 90 \n", "953 1373 2010 M 53 99 \n", "1141 1425 2022 M 58 98 \n", "419 1234 2015 M 44 90 \n", "973 1386 2008 M 55 98 \n", "\n", " TeamScore std reg TeamScore median reg TeamScore mean reg \\\n", "1202 9.450156 73.0 72.718750 \n", "109 7.121073 66.0 67.250000 \n", "1105 16.594427 72.0 72.000000 \n", "700 8.720199 72.0 69.551724 \n", "834 11.186577 72.0 70.485714 \n", "1009 11.519253 70.0 67.545455 \n", "953 10.164555 76.0 75.454545 \n", "1141 10.506041 70.0 72.575758 \n", "419 10.216556 70.5 69.406250 \n", "973 10.801234 72.0 73.333333 \n", "\n", " OppScore min reg OppScore max reg ... Win min tourney \\\n", "1202 36 93 ... 0 \n", "109 42 79 ... 0 \n", "1105 45 87 ... 0 \n", "700 46 85 ... 0 \n", "834 46 90 ... 0 \n", "1009 45 101 ... 0 \n", "953 51 87 ... 0 \n", "1141 43 91 ... 0 \n", "419 44 90 ... 0 \n", "973 42 102 ... 0 \n", "\n", " Win max tourney Win std tourney Win median tourney Win mean tourney \\\n", "1202 0 NaN 0.0 0.000000 \n", "109 0 NaN 0.0 0.000000 \n", "1105 1 0.577350 1.0 0.666667 \n", "700 1 0.707107 0.5 0.500000 \n", "834 1 0.577350 1.0 0.666667 \n", "1009 1 0.500000 1.0 0.750000 \n", "953 0 NaN 0.0 0.000000 \n", "1141 0 NaN 0.0 0.000000 \n", "419 1 0.707107 0.5 0.500000 \n", "973 0 NaN 0.0 0.000000 \n", "\n", " ConfAbbrev Seed TeamName FirstD1Season LastD1Season \n", "1202 big_east W09 Villanova 1985 2024 \n", "109 patriot W11 Bucknell 1985 2024 \n", "1105 pac_twelve X11 UCLA 1985 2024 \n", "700 wac X09 Nevada 1985 2024 \n", "834 pac_twelve Z12 Oregon 1985 2024 \n", "1009 acc X11b Syracuse 1985 2024 \n", "953 maac X13 Siena 1985 2024 \n", "1141 pac_twelve Y07 USC 1985 2024 \n", "419 big_ten X07 Iowa 1985 2024 \n", "973 a_ten W11 St Joseph's PA 1985 2024 \n", "\n", "[10 rows x 318 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# merge the team_conf_seeds_df with team attributes into the aggregated data\n", "\n", "# team_reg_agg_df = team_reg_agg.merge(right=team_conf_seeds_df, on=[\"TeamID\", \"Season\", \"League\"])\n", "# team_tourney_agg_df = team_tourney_agg.merge(right=team_conf_seeds_df, on=[\"TeamID\", \"Season\", \"League\"])\n", "\n", "team_agg_df = pd.merge(\n", " left=team_agg_df,\n", " right=team_conf_seeds_df,\n", " on=[\"TeamID\", \"Season\", \"League\"],\n", ")\n", "\n", "team_agg_df.sample(10, random_state=10)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamIDSeasonLeagueTeamScore min regTeamScore max regTeamScore std regTeamScore median regTeamScore mean regOppScore min regOppScore max reg...Win min tourneyWin max tourneyWin std tourneyWin median tourneyWin mean tourneyConfAbbrevSeedTeamNameFirstD1SeasonLastD1Season
40912332013M6010411.90190381.080.69697057105...00NaN0.00.000000maacZ15Iona19852024
118914362005M55908.57261173.572.6000004493...010.7071070.50.500000aecY13Vermont19852024
127514552013M52949.40725469.069.4411763991...010.4472141.00.800000mvcZ09Wichita St19852024
48712462017M6611512.68724387.085.94117648100...010.5000001.00.750000secZ02Kentucky19852024
113514252008M469512.62597869.569.0625004496...00NaN0.00.000000pac_tenX06USC19852024
55612682007M5810211.97308879.579.50000050103...010.7071070.50.500000accY04Maryland19852024
36812162021M498310.11885966.065.8695655087...00NaN0.00.000000aecZ16Hartford19852023
103713972023M469411.18804571.071.7878794086...010.5773501.00.666667secW04Tennessee19852024
89113452023M54899.01301374.573.0882353979...00NaN0.00.000000big_tenW01Purdue19852024
55912682015M55959.36456568.069.4848484889...010.7071070.50.500000big_tenY04Maryland19852024
\n", "

10 rows × 318 columns

\n", "
" ], "text/plain": [ " TeamID Season League TeamScore min reg TeamScore max reg \\\n", "409 1233 2013 M 60 104 \n", "1189 1436 2005 M 55 90 \n", "1275 1455 2013 M 52 94 \n", "487 1246 2017 M 66 115 \n", "1135 1425 2008 M 46 95 \n", "556 1268 2007 M 58 102 \n", "368 1216 2021 M 49 83 \n", "1037 1397 2023 M 46 94 \n", "891 1345 2023 M 54 89 \n", "559 1268 2015 M 55 95 \n", "\n", " TeamScore std reg TeamScore median reg TeamScore mean reg \\\n", "409 11.901903 81.0 80.696970 \n", "1189 8.572611 73.5 72.600000 \n", "1275 9.407254 69.0 69.441176 \n", "487 12.687243 87.0 85.941176 \n", "1135 12.625978 69.5 69.062500 \n", "556 11.973088 79.5 79.500000 \n", "368 10.118859 66.0 65.869565 \n", "1037 11.188045 71.0 71.787879 \n", "891 9.013013 74.5 73.088235 \n", "559 9.364565 68.0 69.484848 \n", "\n", " OppScore min reg OppScore max reg ... Win min tourney \\\n", "409 57 105 ... 0 \n", "1189 44 93 ... 0 \n", "1275 39 91 ... 0 \n", "487 48 100 ... 0 \n", "1135 44 96 ... 0 \n", "556 50 103 ... 0 \n", "368 50 87 ... 0 \n", "1037 40 86 ... 0 \n", "891 39 79 ... 0 \n", "559 48 89 ... 0 \n", "\n", " Win max tourney Win std tourney Win median tourney Win mean tourney \\\n", "409 0 NaN 0.0 0.000000 \n", "1189 1 0.707107 0.5 0.500000 \n", "1275 1 0.447214 1.0 0.800000 \n", "487 1 0.500000 1.0 0.750000 \n", "1135 0 NaN 0.0 0.000000 \n", "556 1 0.707107 0.5 0.500000 \n", "368 0 NaN 0.0 0.000000 \n", "1037 1 0.577350 1.0 0.666667 \n", "891 0 NaN 0.0 0.000000 \n", "559 1 0.707107 0.5 0.500000 \n", "\n", " ConfAbbrev Seed TeamName FirstD1Season LastD1Season \n", "409 maac Z15 Iona 1985 2024 \n", "1189 aec Y13 Vermont 1985 2024 \n", "1275 mvc Z09 Wichita St 1985 2024 \n", "487 sec Z02 Kentucky 1985 2024 \n", "1135 pac_ten X06 USC 1985 2024 \n", "556 acc Y04 Maryland 1985 2024 \n", "368 aec Z16 Hartford 1985 2023 \n", "1037 sec W04 Tennessee 1985 2024 \n", "891 big_ten W01 Purdue 1985 2024 \n", "559 big_ten Y04 Maryland 1985 2024 \n", "\n", "[10 rows x 318 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "team_agg_df.sample(10, random_state=1)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# save it \n", "team_agg_df.to_csv(os.path.join(DATA_DIR, \"AllTeamsAgg.csv\"))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }