Jensen-holm committed on
Commit
39ca15e
1 Parent(s): 47d59ff

removing modeling and processing that used to be specific to mens bball,

Browse files
Files changed (2) hide show
  1. src/mens_nn.ipynb +0 -0
  2. src/mens_pre_processing.ipynb +0 -1706
src/mens_nn.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
src/mens_pre_processing.ipynb DELETED
@@ -1,1706 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import pandas as pd\n",
10
- "import numpy as np\n",
11
- "import os\n",
12
- "from typing import Callable\n",
13
- "from itertools import product\n",
14
- "\n",
15
- "DATA_DIR = os.path.join(\"..\", \"data\")"
16
- ]
17
- },
18
- {
19
- "cell_type": "code",
20
- "execution_count": 2,
21
- "metadata": {},
22
- "outputs": [
23
- {
24
- "name": "stdout",
25
- "output_type": "stream",
26
- "text": [
27
- "<class 'pandas.core.frame.DataFrame'>\n",
28
- "RangeIndex: 1315 entries, 0 to 1314\n",
29
- "Data columns (total 38 columns):\n",
30
- " # Column Non-Null Count Dtype \n",
31
- "--- ------ -------------- ----- \n",
32
- " 0 Season 1315 non-null int64 \n",
33
- " 1 DayNum 1315 non-null int64 \n",
34
- " 2 WTeamID 1315 non-null int64 \n",
35
- " 3 WScore 1315 non-null int64 \n",
36
- " 4 LTeamID 1315 non-null int64 \n",
37
- " 5 LScore 1315 non-null int64 \n",
38
- " 6 WLoc 1315 non-null int64 \n",
39
- " 7 NumOT 1315 non-null int64 \n",
40
- " 8 WFGM 1315 non-null int64 \n",
41
- " 9 WFGA 1315 non-null int64 \n",
42
- " 10 WFGM3 1315 non-null int64 \n",
43
- " 11 WFGA3 1315 non-null int64 \n",
44
- " 12 WFTM 1315 non-null int64 \n",
45
- " 13 WFTA 1315 non-null int64 \n",
46
- " 14 WOR 1315 non-null int64 \n",
47
- " 15 WDR 1315 non-null int64 \n",
48
- " 16 WAst 1315 non-null int64 \n",
49
- " 17 WTO 1315 non-null int64 \n",
50
- " 18 WStl 1315 non-null int64 \n",
51
- " 19 WBlk 1315 non-null int64 \n",
52
- " 20 WPF 1315 non-null int64 \n",
53
- " 21 LFGM 1315 non-null int64 \n",
54
- " 22 LFGA 1315 non-null int64 \n",
55
- " 23 LFGM3 1315 non-null int64 \n",
56
- " 24 LFGA3 1315 non-null int64 \n",
57
- " 25 LFTM 1315 non-null int64 \n",
58
- " 26 LFTA 1315 non-null int64 \n",
59
- " 27 LOR 1315 non-null int64 \n",
60
- " 28 LDR 1315 non-null int64 \n",
61
- " 29 LAst 1315 non-null int64 \n",
62
- " 30 LTO 1315 non-null int64 \n",
63
- " 31 LStl 1315 non-null int64 \n",
64
- " 32 LBlk 1315 non-null int64 \n",
65
- " 33 LPF 1315 non-null int64 \n",
66
- " 34 GameType 1315 non-null object\n",
67
- " 35 WPA 1315 non-null int64 \n",
68
- " 36 LPA 1315 non-null int64 \n",
69
- " 37 LLoc 1315 non-null int64 \n",
70
- "dtypes: int64(37), object(1)\n",
71
- "memory usage: 390.5+ KB\n"
72
- ]
73
- }
74
- ],
75
- "source": [
# Load the detailed box scores of the NCAA tournament games.
tourney_games_df = pd.read_csv(
    os.path.join(DATA_DIR, "MNCAATourneyDetailedResults.csv")
)

tourney_games_df["GameType"] = "tourney"

# Points allowed: each side allowed exactly what the opponent scored.
tourney_games_df["WPA"] = tourney_games_df["LScore"]
tourney_games_df["LPA"] = tourney_games_df["WScore"]

# Location flags. Both are derived from the RAW WLoc letters: LLoc is computed
# first and WLoc is only overwritten by the second statement, whose right-hand
# side still reads the raw letters. (Deriving WLoc from the already-encoded
# LLoc, as before, compared integers against "A" and made WLoc constantly 1.)
# Encoding: 0 = the team played at home, 1 = away or neutral site. LLoc keeps
# exactly the values it had before (0 when the winner was away, i.e. the loser
# was at home).
# NOTE(review): assumes WLoc holds "H"/"A"/"N" letters - confirm against the
# data description, and confirm that 0 = home is the intended encoding.
tourney_games_df["LLoc"] = (tourney_games_df["WLoc"] != "A").astype(int)
tourney_games_df["WLoc"] = (tourney_games_df["WLoc"] != "H").astype(int)

tourney_games_df.info()
90
- ]
91
- },
92
- {
93
- "cell_type": "code",
94
- "execution_count": 3,
95
- "metadata": {},
96
- "outputs": [
97
- {
98
- "name": "stdout",
99
- "output_type": "stream",
100
- "text": [
101
- "<class 'pandas.core.frame.DataFrame'>\n",
102
- "RangeIndex: 111817 entries, 0 to 111816\n",
103
- "Data columns (total 38 columns):\n",
104
- " # Column Non-Null Count Dtype \n",
105
- "--- ------ -------------- ----- \n",
106
- " 0 Season 111817 non-null int64 \n",
107
- " 1 DayNum 111817 non-null int64 \n",
108
- " 2 WTeamID 111817 non-null int64 \n",
109
- " 3 WScore 111817 non-null int64 \n",
110
- " 4 LTeamID 111817 non-null int64 \n",
111
- " 5 LScore 111817 non-null int64 \n",
112
- " 6 WLoc 111817 non-null int64 \n",
113
- " 7 NumOT 111817 non-null int64 \n",
114
- " 8 WFGM 111817 non-null int64 \n",
115
- " 9 WFGA 111817 non-null int64 \n",
116
- " 10 WFGM3 111817 non-null int64 \n",
117
- " 11 WFGA3 111817 non-null int64 \n",
118
- " 12 WFTM 111817 non-null int64 \n",
119
- " 13 WFTA 111817 non-null int64 \n",
120
- " 14 WOR 111817 non-null int64 \n",
121
- " 15 WDR 111817 non-null int64 \n",
122
- " 16 WAst 111817 non-null int64 \n",
123
- " 17 WTO 111817 non-null int64 \n",
124
- " 18 WStl 111817 non-null int64 \n",
125
- " 19 WBlk 111817 non-null int64 \n",
126
- " 20 WPF 111817 non-null int64 \n",
127
- " 21 LFGM 111817 non-null int64 \n",
128
- " 22 LFGA 111817 non-null int64 \n",
129
- " 23 LFGM3 111817 non-null int64 \n",
130
- " 24 LFGA3 111817 non-null int64 \n",
131
- " 25 LFTM 111817 non-null int64 \n",
132
- " 26 LFTA 111817 non-null int64 \n",
133
- " 27 LOR 111817 non-null int64 \n",
134
- " 28 LDR 111817 non-null int64 \n",
135
- " 29 LAst 111817 non-null int64 \n",
136
- " 30 LTO 111817 non-null int64 \n",
137
- " 31 LStl 111817 non-null int64 \n",
138
- " 32 LBlk 111817 non-null int64 \n",
139
- " 33 LPF 111817 non-null int64 \n",
140
- " 34 GameType 111817 non-null object\n",
141
- " 35 WPA 111817 non-null int64 \n",
142
- " 36 LPA 111817 non-null int64 \n",
143
- " 37 LLoc 111817 non-null int64 \n",
144
- "dtypes: int64(37), object(1)\n",
145
- "memory usage: 32.4+ MB\n"
146
- ]
147
- }
148
- ],
149
- "source": [
# Load the detailed box scores of the regular-season games.
reg_games_df = pd.read_csv(
    os.path.join(DATA_DIR, "MRegularSeasonDetailedResults.csv")
)

reg_games_df["GameType"] = "reg"

# Points allowed: each side allowed exactly what the opponent scored.
reg_games_df["WPA"] = reg_games_df["LScore"]
reg_games_df["LPA"] = reg_games_df["WScore"]

# Location flags. Both are derived from the RAW WLoc letters: LLoc is computed
# first and WLoc is only overwritten by the second statement, whose right-hand
# side still reads the raw letters. (Deriving WLoc from the already-encoded
# LLoc, as before, compared integers against "A" and made WLoc constantly 1.)
# Encoding: 0 = the team played at home, 1 = away or neutral site. LLoc keeps
# exactly the values it had before (0 when the winner was away, i.e. the loser
# was at home).
# NOTE(review): assumes WLoc holds "H"/"A"/"N" letters - confirm against the
# data description, and confirm that 0 = home is the intended encoding.
reg_games_df["LLoc"] = (reg_games_df["WLoc"] != "A").astype(int)
reg_games_df["WLoc"] = (reg_games_df["WLoc"] != "H").astype(int)

reg_games_df.info()
166
- ]
167
- },
168
- {
169
- "cell_type": "code",
170
- "execution_count": 4,
171
- "metadata": {},
172
- "outputs": [],
173
- "source": [
# Group-bys with several aggregations leave the frame with MultiIndex columns.
# To keep things readable, call this function to collapse them back into
# single-level names.
def flatten_multi_idx(df: pd.DataFrame) -> list[str]:
    """Return one flat name per column of ``df``.

    Each MultiIndex column label (a tuple of level values) is joined with
    ``"_"``; empty-string and ``None`` levels are skipped, so ``("TeamID", "")``
    becomes ``"TeamID"`` and ``("WScore", "sum")`` becomes ``"WScore_sum"``.

    Columns that are already single-level are returned unchanged (as strings)
    instead of being split into individual characters, and non-string level
    values are converted with ``str`` rather than making ``join`` fail.

    Parameters:
        df: frame whose column labels should be flattened. It is not modified.

    Returns:
        The flattened names, in column order.
    """
    flat_names = []
    for label in df.columns.to_flat_index():
        # A single-level label is a scalar, not a tuple; wrap it so that a
        # plain string is treated as one level rather than iterated by char.
        levels = label if isinstance(label, tuple) else (label,)
        kept = [
            str(level)
            for level in levels
            if level is not None and not (isinstance(level, str) and level == "")
        ]
        flat_names.append("_".join(kept))
    return flat_names
178
- ]
179
- },
180
- {
181
- "cell_type": "code",
182
- "execution_count": 5,
183
- "metadata": {},
184
- "outputs": [],
185
- "source": [
# The metrics below (practically every box-score column) are the raw material
# for the model features. They are listed by hand, without the W/L prefix, so
# that statistics can be built per outcome as well as combined across wins and
# losses.

outcomes = ["W", "L"]

metrics = [
    "Score",
    "FGM",   # field goals made
    "FGA",   # field goals attempted
    "FGM3",  # three pointers made
    "FGA3",  # three pointers attempted
    "FTM",   # free throws made
    "FTA",   # free throws attempted
    "OR",    # offensive rebounds
    "DR",    # defensive rebounds
    "Ast",   # assists
    "TO",    # turnovers
    "Stl",   # steals
    "Blk",   # blocks
    "PF",    # personal fouls
    "PA",    # points allowed
]

agg_funcs = [
    np.sum,
    np.min,
    np.max,
    np.median,
    np.std,
    np.mean,
]


def aggregate_teams(szn_df: pd.DataFrame, outcome: str, szn_prefix: str) -> pd.DataFrame:
    """Aggregate every metric per team and season, grouping on one outcome's team id.

    The games are grouped by ``{outcome}TeamID`` and ``Season``. Every function
    in ``agg_funcs`` is applied to every ``W<metric>`` and ``L<metric>`` column;
    columns of both outcomes are aggregated regardless of ``outcome``, which
    only selects the grouping key.

    Parameters:
        szn_df: one row per game, with ``WTeamID``/``LTeamID``, ``Season`` and
            the ``W``/``L``-prefixed metric columns.
        outcome: ``"W"`` or ``"L"``; picks the team-id column to group on.
        szn_prefix: text prepended (followed by ``_``) to every aggregated
            column name.

    Returns:
        A frame with the two key columns (``{outcome}TeamID``, ``Season``) under
        their original names and one ``{szn_prefix}_{column}_{function}``
        column per aggregated statistic.
    """
    group_keys = [f"{outcome}TeamID", "Season"]

    # Same aggregation list for every W*/L* metric column.
    agg_spec = {
        f"{side}{metric}": agg_funcs
        for side in outcomes
        for metric in metrics
    }

    summary = szn_df.groupby(group_keys).agg(agg_spec).reset_index()
    summary.columns = flatten_multi_idx(summary)

    # Prefix everything except the grouping keys.
    prefixed_names = {
        col: f"{szn_prefix}_{col}"
        for col in summary.columns
        if col not in group_keys
    }
    return summary.rename(columns=prefixed_names)
229
- ]
230
- },
231
- {
232
- "cell_type": "code",
233
- "execution_count": 6,
234
- "metadata": {},
235
- "outputs": [],
236
- "source": [
# Summarize each team's season: statistics from its wins, statistics from its
# losses, game counts, and statistics over all of its games combined.

def summarize_teams(szn_df: pd.DataFrame, szn_prefix: str) -> pd.DataFrame:
    """Build one row of summary statistics per (TeamID, Season).

    The result contains, in this order:

    * ``TeamID`` and ``Season``;
    * ``{szn_prefix}_W<metric>_<func>`` - the team's own stats in games it won;
    * ``{szn_prefix}_L<metric>_<func>`` - the team's own stats in games it lost;
    * ``tot_W_{szn_prefix}``, ``tot_L_{szn_prefix}``, ``tot_games_{szn_prefix}``
      - number of wins, losses and games in that season;
    * ``{szn_prefix}_<metric>_<func.__name__>`` - the metric aggregated over
      all of the team's games (wins and losses together).

    Everything is attached by merging on (TeamID, Season). Group-by results are
    indexed by team and season, not by row position, so assigning them straight
    into the frame would misalign them.

    Parameters:
        szn_df: one row per game, with ``WTeamID``/``LTeamID``, ``Season`` and
            the ``W``/``L``-prefixed metric columns.
        szn_prefix: label used in the generated column names (e.g. ``"reg"``).

    Returns:
        The summary frame described above.
    """
    keys = ["TeamID", "Season"]

    w_team_sum_df = aggregate_teams(szn_df, outcome="W", szn_prefix=szn_prefix)
    l_team_sum_df = aggregate_teams(szn_df, outcome="L", szn_prefix=szn_prefix)

    # aggregate_teams also aggregates the opposite outcome's columns; keep only
    # the columns that describe the team itself. Matching on the full
    # "<prefix>_<outcome>" start avoids dropping columns by accident when the
    # prefix itself contains a "W" or an "L".
    w_team_sum_df = w_team_sum_df.drop(
        columns=[col for col in w_team_sum_df.columns if col.startswith(f"{szn_prefix}_L")]
    ).rename(columns={"WTeamID": "TeamID"})
    l_team_sum_df = l_team_sum_df.drop(
        columns=[col for col in l_team_sum_df.columns if col.startswith(f"{szn_prefix}_W")]
    ).rename(columns={"LTeamID": "TeamID"})

    # NOTE(review): this is an inner join, so a team-season with no wins or no
    # losses in szn_df is dropped from the summary - confirm that is intended.
    ovr_team_df = pd.merge(
        left=w_team_sum_df,
        right=l_team_sum_df,
        on=keys,
    )

    def _count_games(outcome: str, col_name: str) -> pd.DataFrame:
        """Number of games per (TeamID, Season) in which the team had `outcome`."""
        return (
            szn_df
            .groupby([f"{outcome}TeamID", "Season"])
            .size()
            .rename(col_name)
            .reset_index()
            .rename(columns={f"{outcome}TeamID": "TeamID"})
        )

    tot_w_col = f"tot_W_{szn_prefix}"
    tot_l_col = f"tot_L_{szn_prefix}"
    ovr_team_df = (
        ovr_team_df
        .merge(_count_games("W", tot_w_col), on=keys, how="left")
        .merge(_count_games("L", tot_l_col), on=keys, how="left")
    )
    ovr_team_df[f"tot_games_{szn_prefix}"] = ovr_team_df[tot_w_col] + ovr_team_df[tot_l_col]

    # Stack the winner's and the loser's view of every game into one long frame
    # (one row per team per game, metric columns without the W/L prefix) so the
    # combined statistics cover all of a team's games at once.
    per_team_views = []
    for outcome in outcomes:
        renames = {f"{outcome}TeamID": "TeamID"}
        renames.update({f"{outcome}{metric}": metric for metric in metrics})
        per_team_views.append(szn_df[["Season", *renames]].rename(columns=renames))
    team_games_df = pd.concat(per_team_views, ignore_index=True)

    games_by_team = team_games_df.groupby(keys)
    combined_df = pd.DataFrame(
        {
            f"{szn_prefix}_{metric}_{agg_func.__name__}": games_by_team[metric].agg(agg_func)
            for metric, agg_func in product(metrics, agg_funcs)
        }
    ).reset_index()

    return ovr_team_df.merge(combined_df, on=keys, how="left")
281
- ]
282
- },
283
- {
284
- "cell_type": "code",
285
- "execution_count": 7,
286
- "metadata": {},
287
- "outputs": [
288
- {
289
- "name": "stdout",
290
- "output_type": "stream",
291
- "text": [
292
- "<class 'pandas.core.frame.DataFrame'>\n",
293
- "Int64Index: 7605 entries, 0 to 7604\n",
294
- "Columns: 275 entries, TeamID to reg_PA_mean\n",
295
- "dtypes: float64(138), int64(137)\n",
296
- "memory usage: 16.0 MB\n"
297
- ]
298
- }
299
- ],
300
- "source": [
# One summary row per team and season, built from the regular-season games.
reg_summary_df = summarize_teams(szn_df=reg_games_df, szn_prefix="reg")

reg_summary_df.info()
304
- ]
305
- },
306
- {
307
- "cell_type": "code",
308
- "execution_count": 8,
309
- "metadata": {},
310
- "outputs": [
311
- {
312
- "name": "stdout",
313
- "output_type": "stream",
314
- "text": [
315
- "<class 'pandas.core.frame.DataFrame'>\n",
316
- "Int64Index: 664 entries, 0 to 663\n",
317
- "Columns: 275 entries, TeamID to tourney_PA_mean\n",
318
- "dtypes: float64(138), int64(137)\n",
319
- "memory usage: 1.4 MB\n"
320
- ]
321
- }
322
- ],
323
- "source": [
# One summary row per team and season, built from the tournament games.
tourney_summary_df = summarize_teams(szn_df=tourney_games_df, szn_prefix="tourney")

tourney_summary_df.info()
327
- ]
328
- },
329
- {
330
- "cell_type": "code",
331
- "execution_count": 9,
332
- "metadata": {},
333
- "outputs": [
334
- {
335
- "data": {
336
- "text/html": [
337
- "<div>\n",
338
- "<style scoped>\n",
339
- " .dataframe tbody tr th:only-of-type {\n",
340
- " vertical-align: middle;\n",
341
- " }\n",
342
- "\n",
343
- " .dataframe tbody tr th {\n",
344
- " vertical-align: top;\n",
345
- " }\n",
346
- "\n",
347
- " .dataframe thead th {\n",
348
- " text-align: right;\n",
349
- " }\n",
350
- "</style>\n",
351
- "<table border=\"1\" class=\"dataframe\">\n",
352
- " <thead>\n",
353
- " <tr style=\"text-align: right;\">\n",
354
- " <th></th>\n",
355
- " <th>TeamID</th>\n",
356
- " <th>Season</th>\n",
357
- " <th>reg_WScore_sum</th>\n",
358
- " <th>reg_WScore_min</th>\n",
359
- " <th>reg_WScore_max</th>\n",
360
- " <th>reg_WScore_median</th>\n",
361
- " <th>reg_WScore_std</th>\n",
362
- " <th>reg_WScore_mean</th>\n",
363
- " <th>reg_WFGM_sum</th>\n",
364
- " <th>reg_WFGM_min</th>\n",
365
- " <th>...</th>\n",
366
- " <th>tourney_PF_max</th>\n",
367
- " <th>tourney_PF_median</th>\n",
368
- " <th>tourney_PF_std</th>\n",
369
- " <th>tourney_PF_mean</th>\n",
370
- " <th>tourney_PA_sum</th>\n",
371
- " <th>tourney_PA_min</th>\n",
372
- " <th>tourney_PA_max</th>\n",
373
- " <th>tourney_PA_median</th>\n",
374
- " <th>tourney_PA_std</th>\n",
375
- " <th>tourney_PA_mean</th>\n",
376
- " </tr>\n",
377
- " </thead>\n",
378
- " <tbody>\n",
379
- " <tr>\n",
380
- " <th>0</th>\n",
381
- " <td>1101</td>\n",
382
- " <td>2021</td>\n",
383
- " <td>1500</td>\n",
384
- " <td>63</td>\n",
385
- " <td>93</td>\n",
386
- " <td>80.0</td>\n",
387
- " <td>8.579521</td>\n",
388
- " <td>78.947368</td>\n",
389
- " <td>529</td>\n",
390
- " <td>20</td>\n",
391
- " <td>...</td>\n",
392
- " <td>16</td>\n",
393
- " <td>16.0</td>\n",
394
- " <td>0.0</td>\n",
395
- " <td>16.0</td>\n",
396
- " <td>79</td>\n",
397
- " <td>79</td>\n",
398
- " <td>79</td>\n",
399
- " <td>79.0</td>\n",
400
- " <td>0.0</td>\n",
401
- " <td>79.0</td>\n",
402
- " </tr>\n",
403
- " <tr>\n",
404
- " <th>1</th>\n",
405
- " <td>1104</td>\n",
406
- " <td>2004</td>\n",
407
- " <td>1298</td>\n",
408
- " <td>45</td>\n",
409
- " <td>101</td>\n",
410
- " <td>77.0</td>\n",
411
- " <td>11.837130</td>\n",
412
- " <td>76.352941</td>\n",
413
- " <td>440</td>\n",
414
- " <td>16</td>\n",
415
- " <td>...</td>\n",
416
- " <td>15</td>\n",
417
- " <td>15.0</td>\n",
418
- " <td>0.0</td>\n",
419
- " <td>15.0</td>\n",
420
- " <td>67</td>\n",
421
- " <td>67</td>\n",
422
- " <td>67</td>\n",
423
- " <td>67.0</td>\n",
424
- " <td>0.0</td>\n",
425
- " <td>67.0</td>\n",
426
- " </tr>\n",
427
- " <tr>\n",
428
- " <th>2</th>\n",
429
- " <td>1104</td>\n",
430
- " <td>2006</td>\n",
431
- " <td>1269</td>\n",
432
- " <td>56</td>\n",
433
- " <td>105</td>\n",
434
- " <td>71.0</td>\n",
435
- " <td>14.321929</td>\n",
436
- " <td>74.647059</td>\n",
437
- " <td>439</td>\n",
438
- " <td>18</td>\n",
439
- " <td>...</td>\n",
440
- " <td>15</td>\n",
441
- " <td>15.0</td>\n",
442
- " <td>0.0</td>\n",
443
- " <td>15.0</td>\n",
444
- " <td>63</td>\n",
445
- " <td>63</td>\n",
446
- " <td>63</td>\n",
447
- " <td>63.0</td>\n",
448
- " <td>0.0</td>\n",
449
- " <td>63.0</td>\n",
450
- " </tr>\n",
451
- " <tr>\n",
452
- " <th>3</th>\n",
453
- " <td>1104</td>\n",
454
- " <td>2018</td>\n",
455
- " <td>1476</td>\n",
456
- " <td>68</td>\n",
457
- " <td>104</td>\n",
458
- " <td>77.0</td>\n",
459
- " <td>8.165324</td>\n",
460
- " <td>77.684211</td>\n",
461
- " <td>530</td>\n",
462
- " <td>21</td>\n",
463
- " <td>...</td>\n",
464
- " <td>13</td>\n",
465
- " <td>13.0</td>\n",
466
- " <td>0.0</td>\n",
467
- " <td>13.0</td>\n",
468
- " <td>78</td>\n",
469
- " <td>78</td>\n",
470
- " <td>78</td>\n",
471
- " <td>78.0</td>\n",
472
- " <td>0.0</td>\n",
473
- " <td>78.0</td>\n",
474
- " </tr>\n",
475
- " <tr>\n",
476
- " <th>4</th>\n",
477
- " <td>1104</td>\n",
478
- " <td>2021</td>\n",
479
- " <td>2004</td>\n",
480
- " <td>64</td>\n",
481
- " <td>115</td>\n",
482
- " <td>82.5</td>\n",
483
- " <td>10.942538</td>\n",
484
- " <td>83.500000</td>\n",
485
- " <td>702</td>\n",
486
- " <td>20</td>\n",
487
- " <td>...</td>\n",
488
- " <td>23</td>\n",
489
- " <td>23.0</td>\n",
490
- " <td>0.0</td>\n",
491
- " <td>23.0</td>\n",
492
- " <td>77</td>\n",
493
- " <td>77</td>\n",
494
- " <td>77</td>\n",
495
- " <td>77.0</td>\n",
496
- " <td>0.0</td>\n",
497
- " <td>77.0</td>\n",
498
- " </tr>\n",
499
- " </tbody>\n",
500
- "</table>\n",
501
- "<p>5 rows × 548 columns</p>\n",
502
- "</div>"
503
- ],
504
- "text/plain": [
505
- " TeamID Season reg_WScore_sum reg_WScore_min reg_WScore_max \\\n",
506
- "0 1101 2021 1500 63 93 \n",
507
- "1 1104 2004 1298 45 101 \n",
508
- "2 1104 2006 1269 56 105 \n",
509
- "3 1104 2018 1476 68 104 \n",
510
- "4 1104 2021 2004 64 115 \n",
511
- "\n",
512
- " reg_WScore_median reg_WScore_std reg_WScore_mean reg_WFGM_sum \\\n",
513
- "0 80.0 8.579521 78.947368 529 \n",
514
- "1 77.0 11.837130 76.352941 440 \n",
515
- "2 71.0 14.321929 74.647059 439 \n",
516
- "3 77.0 8.165324 77.684211 530 \n",
517
- "4 82.5 10.942538 83.500000 702 \n",
518
- "\n",
519
- " reg_WFGM_min ... tourney_PF_max tourney_PF_median tourney_PF_std \\\n",
520
- "0 20 ... 16 16.0 0.0 \n",
521
- "1 16 ... 15 15.0 0.0 \n",
522
- "2 18 ... 15 15.0 0.0 \n",
523
- "3 21 ... 13 13.0 0.0 \n",
524
- "4 20 ... 23 23.0 0.0 \n",
525
- "\n",
526
- " tourney_PF_mean tourney_PA_sum tourney_PA_min tourney_PA_max \\\n",
527
- "0 16.0 79 79 79 \n",
528
- "1 15.0 67 67 67 \n",
529
- "2 15.0 63 63 63 \n",
530
- "3 13.0 78 78 78 \n",
531
- "4 23.0 77 77 77 \n",
532
- "\n",
533
- " tourney_PA_median tourney_PA_std tourney_PA_mean \n",
534
- "0 79.0 0.0 79.0 \n",
535
- "1 67.0 0.0 67.0 \n",
536
- "2 63.0 0.0 63.0 \n",
537
- "3 78.0 0.0 78.0 \n",
538
- "4 77.0 0.0 77.0 \n",
539
- "\n",
540
- "[5 rows x 548 columns]"
541
- ]
542
- },
543
- "execution_count": 9,
544
- "metadata": {},
545
- "output_type": "execute_result"
546
- }
547
- ],
548
- "source": [
# Put the regular-season and tournament summaries side by side. The default
# inner join keeps only team-seasons present in both; any missing statistic
# left in the result is replaced by 0.
all_teams_summary_df = (
    reg_summary_df
    .merge(tourney_summary_df, on=["TeamID", "Season"])
    .fillna(0)
)

all_teams_summary_df.head()
556
- ]
557
- },
558
- {
559
- "cell_type": "code",
560
- "execution_count": 10,
561
- "metadata": {},
562
- "outputs": [],
563
- "source": [
# Load the auxiliary tables that get merged onto the team summaries next:
# tournament seeds, conference membership and the team directory.
seeds_history_df, conferences_df, teams_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in (
        "MNCAATourneySeeds.csv",
        "MTeamConferences.csv",
        "MTeams.csv",
    )
)
577
- ]
578
- },
579
- {
580
- "cell_type": "code",
581
- "execution_count": 11,
582
- "metadata": {},
583
- "outputs": [],
584
- "source": [
# Enrich the summaries step by step; each step is an inner join.

# 1. team directory columns, matched on the team id alone
tms = all_teams_summary_df.merge(teams_df, on="TeamID")

# 2. conference membership per season
# (only seasons from 2003 on are considered - presumably the first season
#  covered by the detailed results; TODO confirm)
recent_conferences = conferences_df["Season"] >= 2003
tms_conf = tms.merge(
    conferences_df[recent_conferences],
    on=["TeamID", "Season"],
)

# 3. tournament seed per season, with the same season cut-off
recent_seeds = seeds_history_df["Season"] >= 2003
tms_conf_seeds = tms_conf.merge(
    seeds_history_df[recent_seeds],
    on=["TeamID", "Season"],
)
602
- ]
603
- },
604
- {
605
- "cell_type": "code",
606
- "execution_count": 12,
607
- "metadata": {},
608
- "outputs": [
609
- {
610
- "name": "stdout",
611
- "output_type": "stream",
612
- "text": [
613
- "<class 'pandas.core.frame.DataFrame'>\n",
614
- "Int64Index: 661 entries, 0 to 660\n",
615
- "Columns: 553 entries, TeamID to Seed\n",
616
- "dtypes: float64(276), int64(274), object(3)\n",
617
- "memory usage: 2.8+ MB\n"
618
- ]
619
- }
620
- ],
621
- "source": [
622
- "tms_conf_seeds.info()"
623
- ]
624
- },
625
- {
626
- "cell_type": "code",
627
- "execution_count": 13,
628
- "metadata": {},
629
- "outputs": [
630
- {
631
- "data": {
632
- "text/html": [
633
- "<div>\n",
634
- "<style scoped>\n",
635
- " .dataframe tbody tr th:only-of-type {\n",
636
- " vertical-align: middle;\n",
637
- " }\n",
638
- "\n",
639
- " .dataframe tbody tr th {\n",
640
- " vertical-align: top;\n",
641
- " }\n",
642
- "\n",
643
- " .dataframe thead th {\n",
644
- " text-align: right;\n",
645
- " }\n",
646
- "</style>\n",
647
- "<table border=\"1\" class=\"dataframe\">\n",
648
- " <thead>\n",
649
- " <tr style=\"text-align: right;\">\n",
650
- " <th></th>\n",
651
- " <th>TeamID</th>\n",
652
- " <th>Season</th>\n",
653
- " <th>reg_WScore_sum</th>\n",
654
- " <th>reg_WScore_min</th>\n",
655
- " <th>reg_WScore_max</th>\n",
656
- " <th>reg_WScore_median</th>\n",
657
- " <th>reg_WScore_std</th>\n",
658
- " <th>reg_WScore_mean</th>\n",
659
- " <th>reg_WFGM_sum</th>\n",
660
- " <th>reg_WFGM_min</th>\n",
661
- " <th>...</th>\n",
662
- " <th>tourney_PA_min</th>\n",
663
- " <th>tourney_PA_max</th>\n",
664
- " <th>tourney_PA_median</th>\n",
665
- " <th>tourney_PA_std</th>\n",
666
- " <th>tourney_PA_mean</th>\n",
667
- " <th>TeamName</th>\n",
668
- " <th>FirstD1Season</th>\n",
669
- " <th>LastD1Season</th>\n",
670
- " <th>ConfAbbrev</th>\n",
671
- " <th>Seed</th>\n",
672
- " </tr>\n",
673
- " </thead>\n",
674
- " <tbody>\n",
675
- " <tr>\n",
676
- " <th>0</th>\n",
677
- " <td>1101</td>\n",
678
- " <td>2021</td>\n",
679
- " <td>1500</td>\n",
680
- " <td>63</td>\n",
681
- " <td>93</td>\n",
682
- " <td>80.0</td>\n",
683
- " <td>8.579521</td>\n",
684
- " <td>78.947368</td>\n",
685
- " <td>529</td>\n",
686
- " <td>20</td>\n",
687
- " <td>...</td>\n",
688
- " <td>79</td>\n",
689
- " <td>79</td>\n",
690
- " <td>79.0</td>\n",
691
- " <td>0.0</td>\n",
692
- " <td>79.0</td>\n",
693
- " <td>Abilene Chr</td>\n",
694
- " <td>2014</td>\n",
695
- " <td>2024</td>\n",
696
- " <td>southland</td>\n",
697
- " <td>W14</td>\n",
698
- " </tr>\n",
699
- " <tr>\n",
700
- " <th>1</th>\n",
701
- " <td>1104</td>\n",
702
- " <td>2004</td>\n",
703
- " <td>1298</td>\n",
704
- " <td>45</td>\n",
705
- " <td>101</td>\n",
706
- " <td>77.0</td>\n",
707
- " <td>11.837130</td>\n",
708
- " <td>76.352941</td>\n",
709
- " <td>440</td>\n",
710
- " <td>16</td>\n",
711
- " <td>...</td>\n",
712
- " <td>67</td>\n",
713
- " <td>67</td>\n",
714
- " <td>67.0</td>\n",
715
- " <td>0.0</td>\n",
716
- " <td>67.0</td>\n",
717
- " <td>Alabama</td>\n",
718
- " <td>1985</td>\n",
719
- " <td>2024</td>\n",
720
- " <td>sec</td>\n",
721
- " <td>X08</td>\n",
722
- " </tr>\n",
723
- " <tr>\n",
724
- " <th>2</th>\n",
725
- " <td>1104</td>\n",
726
- " <td>2006</td>\n",
727
- " <td>1269</td>\n",
728
- " <td>56</td>\n",
729
- " <td>105</td>\n",
730
- " <td>71.0</td>\n",
731
- " <td>14.321929</td>\n",
732
- " <td>74.647059</td>\n",
733
- " <td>439</td>\n",
734
- " <td>18</td>\n",
735
- " <td>...</td>\n",
736
- " <td>63</td>\n",
737
- " <td>63</td>\n",
738
- " <td>63.0</td>\n",
739
- " <td>0.0</td>\n",
740
- " <td>63.0</td>\n",
741
- " <td>Alabama</td>\n",
742
- " <td>1985</td>\n",
743
- " <td>2024</td>\n",
744
- " <td>sec</td>\n",
745
- " <td>X10</td>\n",
746
- " </tr>\n",
747
- " <tr>\n",
748
- " <th>3</th>\n",
749
- " <td>1104</td>\n",
750
- " <td>2018</td>\n",
751
- " <td>1476</td>\n",
752
- " <td>68</td>\n",
753
- " <td>104</td>\n",
754
- " <td>77.0</td>\n",
755
- " <td>8.165324</td>\n",
756
- " <td>77.684211</td>\n",
757
- " <td>530</td>\n",
758
- " <td>21</td>\n",
759
- " <td>...</td>\n",
760
- " <td>78</td>\n",
761
- " <td>78</td>\n",
762
- " <td>78.0</td>\n",
763
- " <td>0.0</td>\n",
764
- " <td>78.0</td>\n",
765
- " <td>Alabama</td>\n",
766
- " <td>1985</td>\n",
767
- " <td>2024</td>\n",
768
- " <td>sec</td>\n",
769
- " <td>W09</td>\n",
770
- " </tr>\n",
771
- " <tr>\n",
772
- " <th>4</th>\n",
773
- " <td>1104</td>\n",
774
- " <td>2021</td>\n",
775
- " <td>2004</td>\n",
776
- " <td>64</td>\n",
777
- " <td>115</td>\n",
778
- " <td>82.5</td>\n",
779
- " <td>10.942538</td>\n",
780
- " <td>83.500000</td>\n",
781
- " <td>702</td>\n",
782
- " <td>20</td>\n",
783
- " <td>...</td>\n",
784
- " <td>77</td>\n",
785
- " <td>77</td>\n",
786
- " <td>77.0</td>\n",
787
- " <td>0.0</td>\n",
788
- " <td>77.0</td>\n",
789
- " <td>Alabama</td>\n",
790
- " <td>1985</td>\n",
791
- " <td>2024</td>\n",
792
- " <td>sec</td>\n",
793
- " <td>W02</td>\n",
794
- " </tr>\n",
795
- " </tbody>\n",
796
- "</table>\n",
797
- "<p>5 rows × 553 columns</p>\n",
798
- "</div>"
799
- ],
800
- "text/plain": [
801
- " TeamID Season reg_WScore_sum reg_WScore_min reg_WScore_max \\\n",
802
- "0 1101 2021 1500 63 93 \n",
803
- "1 1104 2004 1298 45 101 \n",
804
- "2 1104 2006 1269 56 105 \n",
805
- "3 1104 2018 1476 68 104 \n",
806
- "4 1104 2021 2004 64 115 \n",
807
- "\n",
808
- " reg_WScore_median reg_WScore_std reg_WScore_mean reg_WFGM_sum \\\n",
809
- "0 80.0 8.579521 78.947368 529 \n",
810
- "1 77.0 11.837130 76.352941 440 \n",
811
- "2 71.0 14.321929 74.647059 439 \n",
812
- "3 77.0 8.165324 77.684211 530 \n",
813
- "4 82.5 10.942538 83.500000 702 \n",
814
- "\n",
815
- " reg_WFGM_min ... tourney_PA_min tourney_PA_max tourney_PA_median \\\n",
816
- "0 20 ... 79 79 79.0 \n",
817
- "1 16 ... 67 67 67.0 \n",
818
- "2 18 ... 63 63 63.0 \n",
819
- "3 21 ... 78 78 78.0 \n",
820
- "4 20 ... 77 77 77.0 \n",
821
- "\n",
822
- " tourney_PA_std tourney_PA_mean TeamName FirstD1Season LastD1Season \\\n",
823
- "0 0.0 79.0 Abilene Chr 2014 2024 \n",
824
- "1 0.0 67.0 Alabama 1985 2024 \n",
825
- "2 0.0 63.0 Alabama 1985 2024 \n",
826
- "3 0.0 78.0 Alabama 1985 2024 \n",
827
- "4 0.0 77.0 Alabama 1985 2024 \n",
828
- "\n",
829
- " ConfAbbrev Seed \n",
830
- "0 southland W14 \n",
831
- "1 sec X08 \n",
832
- "2 sec X10 \n",
833
- "3 sec W09 \n",
834
- "4 sec W02 \n",
835
- "\n",
836
- "[5 rows x 553 columns]"
837
- ]
838
- },
839
- "execution_count": 13,
840
- "metadata": {},
841
- "output_type": "execute_result"
842
- }
843
- ],
844
- "source": [
845
- "tms_conf_seeds.head()"
846
- ]
847
- },
848
- {
849
- "cell_type": "code",
850
- "execution_count": 22,
851
- "metadata": {},
852
- "outputs": [],
853
- "source": [
def seed_to_num(seed: str) -> int:
    """Extract the numeric seed from a seed code such as ``"W14"`` or ``"X16a"``.

    The first character (the region letter) is dropped. If the remaining text
    ends in a non-digit (a trailing suffix letter, as in ``"X16a"``), that last
    character is dropped as well. What is left is converted to ``int``, so
    ``"X08"`` gives ``8``.

    The check for a trailing letter looks only at the LAST character. Testing
    the whole seed for digits, as before, was always false because of the
    leading region letter; the final digit was then cut off every time
    (``"W14"`` became ``1`` and ``"X08"`` became ``0``).

    Parameters:
        seed: seed code; region letter followed by the seed number and an
            optional single trailing letter.

    Returns:
        The seed number.

    Raises:
        ValueError: if no valid integer remains after stripping.
    """
    number_part = seed[1:]
    if number_part and not number_part[-1].isdigit():
        number_part = number_part[:-1]
    return int(number_part)
860
- "\n",
# Split the seed code into two features: the seed number and the region letter
# (the first character of the code).
tms_conf_seeds["Seed_Num"] = tms_conf_seeds["Seed"].map(seed_to_num)

tms_conf_seeds["Seed_Region"] = tms_conf_seeds["Seed"].map(lambda seed: seed[0])
870
- ]
871
- },
872
- {
873
- "cell_type": "code",
874
- "execution_count": 23,
875
- "metadata": {},
876
- "outputs": [
877
- {
878
- "name": "stdout",
879
- "output_type": "stream",
880
- "text": [
881
- "<class 'pandas.core.series.Series'>\n",
882
- "Int64Index: 661 entries, 0 to 660\n",
883
- "Series name: Seed_Region\n",
884
- "Non-Null Count Dtype \n",
885
- "-------------- ----- \n",
886
- "661 non-null object\n",
887
- "dtypes: object(1)\n",
888
- "memory usage: 10.3+ KB\n"
889
- ]
890
- }
891
- ],
892
- "source": [
893
- "tms_conf_seeds[\"Seed_Region\"].info()"
894
- ]
895
- },
896
- {
897
- "cell_type": "code",
898
- "execution_count": 16,
899
- "metadata": {},
900
- "outputs": [
901
- {
902
- "name": "stdout",
903
- "output_type": "stream",
904
- "text": [
905
- "<class 'pandas.core.series.Series'>\n",
906
- "Int64Index: 661 entries, 0 to 660\n",
907
- "Series name: Seed_Num\n",
908
- "Non-Null Count Dtype\n",
909
- "-------------- -----\n",
910
- "661 non-null int64\n",
911
- "dtypes: int64(1)\n",
912
- "memory usage: 10.3 KB\n"
913
- ]
914
- }
915
- ],
916
- "source": [
917
- "tms_conf_seeds[\"Seed_Num\"].info()"
918
- ]
919
- },
920
- {
921
- "cell_type": "code",
922
- "execution_count": 17,
923
- "metadata": {},
924
- "outputs": [
925
- {
926
- "data": {
927
- "text/html": [
928
- "<div>\n",
929
- "<style scoped>\n",
930
- " .dataframe tbody tr th:only-of-type {\n",
931
- " vertical-align: middle;\n",
932
- " }\n",
933
- "\n",
934
- " .dataframe tbody tr th {\n",
935
- " vertical-align: top;\n",
936
- " }\n",
937
- "\n",
938
- " .dataframe thead th {\n",
939
- " text-align: right;\n",
940
- " }\n",
941
- "</style>\n",
942
- "<table border=\"1\" class=\"dataframe\">\n",
943
- " <thead>\n",
944
- " <tr style=\"text-align: right;\">\n",
945
- " <th></th>\n",
946
- " <th>TeamID</th>\n",
947
- " <th>Season</th>\n",
948
- " <th>reg_WScore_sum</th>\n",
949
- " <th>reg_WScore_min</th>\n",
950
- " <th>reg_WScore_max</th>\n",
951
- " <th>reg_WScore_median</th>\n",
952
- " <th>reg_WScore_std</th>\n",
953
- " <th>reg_WScore_mean</th>\n",
954
- " <th>reg_WFGM_sum</th>\n",
955
- " <th>reg_WFGM_min</th>\n",
956
- " <th>...</th>\n",
957
- " <th>tourney_PF_mean</th>\n",
958
- " <th>tourney_PA_sum</th>\n",
959
- " <th>tourney_PA_min</th>\n",
960
- " <th>tourney_PA_max</th>\n",
961
- " <th>tourney_PA_median</th>\n",
962
- " <th>tourney_PA_std</th>\n",
963
- " <th>tourney_PA_mean</th>\n",
964
- " <th>FirstD1Season</th>\n",
965
- " <th>LastD1Season</th>\n",
966
- " <th>Seed_Num</th>\n",
967
- " </tr>\n",
968
- " </thead>\n",
969
- " <tbody>\n",
970
- " <tr>\n",
971
- " <th>count</th>\n",
972
- " <td>661.000000</td>\n",
973
- " <td>661.000000</td>\n",
974
- " <td>661.000000</td>\n",
975
- " <td>661.000000</td>\n",
976
- " <td>661.000000</td>\n",
977
- " <td>661.000000</td>\n",
978
- " <td>661.000000</td>\n",
979
- " <td>661.000000</td>\n",
980
- " <td>661.00000</td>\n",
981
- " <td>661.000000</td>\n",
982
- " <td>...</td>\n",
983
- " <td>661.000000</td>\n",
984
- " <td>661.000000</td>\n",
985
- " <td>661.000000</td>\n",
986
- " <td>661.000000</td>\n",
987
- " <td>661.000000</td>\n",
988
- " <td>661.0</td>\n",
989
- " <td>661.000000</td>\n",
990
- " <td>661.000000</td>\n",
991
- " <td>661.0</td>\n",
992
- " <td>661.000000</td>\n",
993
- " </tr>\n",
994
- " <tr>\n",
995
- " <th>mean</th>\n",
996
- " <td>1295.801815</td>\n",
997
- " <td>2012.774584</td>\n",
998
- " <td>1831.642965</td>\n",
999
- " <td>58.370651</td>\n",
1000
- " <td>99.411498</td>\n",
1001
- " <td>76.929652</td>\n",
1002
- " <td>10.357126</td>\n",
1003
- " <td>77.391245</td>\n",
1004
- " <td>644.32829</td>\n",
1005
- " <td>19.069592</td>\n",
1006
- " <td>...</td>\n",
1007
- " <td>18.833585</td>\n",
1008
- " <td>76.003026</td>\n",
1009
- " <td>76.003026</td>\n",
1010
- " <td>76.003026</td>\n",
1011
- " <td>76.003026</td>\n",
1012
- " <td>0.0</td>\n",
1013
- " <td>76.003026</td>\n",
1014
- " <td>1985.512859</td>\n",
1015
- " <td>2024.0</td>\n",
1016
- " <td>1.366112</td>\n",
1017
- " </tr>\n",
1018
- " <tr>\n",
1019
- " <th>std</th>\n",
1020
- " <td>103.450611</td>\n",
1021
- " <td>5.978169</td>\n",
1022
- " <td>326.691393</td>\n",
1023
- " <td>5.972435</td>\n",
1024
- " <td>9.382937</td>\n",
1025
- " <td>5.002799</td>\n",
1026
- " <td>1.844741</td>\n",
1027
- " <td>4.865862</td>\n",
1028
- " <td>120.26041</td>\n",
1029
- " <td>2.439200</td>\n",
1030
- " <td>...</td>\n",
1031
- " <td>4.233291</td>\n",
1032
- " <td>10.964128</td>\n",
1033
- " <td>10.964128</td>\n",
1034
- " <td>10.964128</td>\n",
1035
- " <td>10.964128</td>\n",
1036
- " <td>0.0</td>\n",
1037
- " <td>10.964128</td>\n",
1038
- " <td>2.812079</td>\n",
1039
- " <td>0.0</td>\n",
1040
- " <td>3.930341</td>\n",
1041
- " </tr>\n",
1042
- " <tr>\n",
1043
- " <th>min</th>\n",
1044
- " <td>1101.000000</td>\n",
1045
- " <td>2003.000000</td>\n",
1046
- " <td>725.000000</td>\n",
1047
- " <td>38.000000</td>\n",
1048
- " <td>76.000000</td>\n",
1049
- " <td>62.000000</td>\n",
1050
- " <td>5.821974</td>\n",
1051
- " <td>64.280000</td>\n",
1052
- " <td>248.00000</td>\n",
1053
- " <td>12.000000</td>\n",
1054
- " <td>...</td>\n",
1055
- " <td>7.000000</td>\n",
1056
- " <td>47.000000</td>\n",
1057
- " <td>47.000000</td>\n",
1058
- " <td>47.000000</td>\n",
1059
- " <td>47.000000</td>\n",
1060
- " <td>0.0</td>\n",
1061
- " <td>47.000000</td>\n",
1062
- " <td>1985.000000</td>\n",
1063
- " <td>2024.0</td>\n",
1064
- " <td>0.000000</td>\n",
1065
- " </tr>\n",
1066
- " <tr>\n",
1067
- " <th>25%</th>\n",
1068
- " <td>1211.000000</td>\n",
1069
- " <td>2008.000000</td>\n",
1070
- " <td>1610.000000</td>\n",
1071
- " <td>55.000000</td>\n",
1072
- " <td>93.000000</td>\n",
1073
- " <td>73.500000</td>\n",
1074
- " <td>8.985621</td>\n",
1075
- " <td>74.142857</td>\n",
1076
- " <td>569.00000</td>\n",
1077
- " <td>17.000000</td>\n",
1078
- " <td>...</td>\n",
1079
- " <td>16.000000</td>\n",
1080
- " <td>69.000000</td>\n",
1081
- " <td>69.000000</td>\n",
1082
- " <td>69.000000</td>\n",
1083
- " <td>69.000000</td>\n",
1084
- " <td>0.0</td>\n",
1085
- " <td>69.000000</td>\n",
1086
- " <td>1985.000000</td>\n",
1087
- " <td>2024.0</td>\n",
1088
- " <td>0.000000</td>\n",
1089
- " </tr>\n",
1090
- " <tr>\n",
1091
- " <th>50%</th>\n",
1092
- " <td>1287.000000</td>\n",
1093
- " <td>2013.000000</td>\n",
1094
- " <td>1834.000000</td>\n",
1095
- " <td>59.000000</td>\n",
1096
- " <td>99.000000</td>\n",
1097
- " <td>77.000000</td>\n",
1098
- " <td>10.284318</td>\n",
1099
- " <td>77.350000</td>\n",
1100
- " <td>639.00000</td>\n",
1101
- " <td>19.000000</td>\n",
1102
- " <td>...</td>\n",
1103
- " <td>19.000000</td>\n",
1104
- " <td>76.000000</td>\n",
1105
- " <td>76.000000</td>\n",
1106
- " <td>76.000000</td>\n",
1107
- " <td>76.000000</td>\n",
1108
- " <td>0.0</td>\n",
1109
- " <td>76.000000</td>\n",
1110
- " <td>1985.000000</td>\n",
1111
- " <td>2024.0</td>\n",
1112
- " <td>0.000000</td>\n",
1113
- " </tr>\n",
1114
- " <tr>\n",
1115
- " <th>75%</th>\n",
1116
- " <td>1393.000000</td>\n",
1117
- " <td>2018.000000</td>\n",
1118
- " <td>2036.000000</td>\n",
1119
- " <td>62.000000</td>\n",
1120
- " <td>105.000000</td>\n",
1121
- " <td>80.000000</td>\n",
1122
- " <td>11.584882</td>\n",
1123
- " <td>80.541667</td>\n",
1124
- " <td>721.00000</td>\n",
1125
- " <td>21.000000</td>\n",
1126
- " <td>...</td>\n",
1127
- " <td>22.000000</td>\n",
1128
- " <td>83.000000</td>\n",
1129
- " <td>83.000000</td>\n",
1130
- " <td>83.000000</td>\n",
1131
- " <td>83.000000</td>\n",
1132
- " <td>0.0</td>\n",
1133
- " <td>83.000000</td>\n",
1134
- " <td>1985.000000</td>\n",
1135
- " <td>2024.0</td>\n",
1136
- " <td>1.000000</td>\n",
1137
- " </tr>\n",
1138
- " <tr>\n",
1139
- " <th>max</th>\n",
1140
- " <td>1463.000000</td>\n",
1141
- " <td>2023.000000</td>\n",
1142
- " <td>2858.000000</td>\n",
1143
- " <td>76.000000</td>\n",
1144
- " <td>144.000000</td>\n",
1145
- " <td>91.500000</td>\n",
1146
- " <td>16.534890</td>\n",
1147
- " <td>91.689655</td>\n",
1148
- " <td>1025.00000</td>\n",
1149
- " <td>27.000000</td>\n",
1150
- " <td>...</td>\n",
1151
- " <td>33.000000</td>\n",
1152
- " <td>121.000000</td>\n",
1153
- " <td>121.000000</td>\n",
1154
- " <td>121.000000</td>\n",
1155
- " <td>121.000000</td>\n",
1156
- " <td>0.0</td>\n",
1157
- " <td>121.000000</td>\n",
1158
- " <td>2014.000000</td>\n",
1159
- " <td>2024.0</td>\n",
1160
- " <td>16.000000</td>\n",
1161
- " </tr>\n",
1162
- " </tbody>\n",
1163
- "</table>\n",
1164
- "<p>8 rows × 551 columns</p>\n",
1165
- "</div>"
1166
- ],
1167
- "text/plain": [
1168
- " TeamID Season reg_WScore_sum reg_WScore_min \\\n",
1169
- "count 661.000000 661.000000 661.000000 661.000000 \n",
1170
- "mean 1295.801815 2012.774584 1831.642965 58.370651 \n",
1171
- "std 103.450611 5.978169 326.691393 5.972435 \n",
1172
- "min 1101.000000 2003.000000 725.000000 38.000000 \n",
1173
- "25% 1211.000000 2008.000000 1610.000000 55.000000 \n",
1174
- "50% 1287.000000 2013.000000 1834.000000 59.000000 \n",
1175
- "75% 1393.000000 2018.000000 2036.000000 62.000000 \n",
1176
- "max 1463.000000 2023.000000 2858.000000 76.000000 \n",
1177
- "\n",
1178
- " reg_WScore_max reg_WScore_median reg_WScore_std reg_WScore_mean \\\n",
1179
- "count 661.000000 661.000000 661.000000 661.000000 \n",
1180
- "mean 99.411498 76.929652 10.357126 77.391245 \n",
1181
- "std 9.382937 5.002799 1.844741 4.865862 \n",
1182
- "min 76.000000 62.000000 5.821974 64.280000 \n",
1183
- "25% 93.000000 73.500000 8.985621 74.142857 \n",
1184
- "50% 99.000000 77.000000 10.284318 77.350000 \n",
1185
- "75% 105.000000 80.000000 11.584882 80.541667 \n",
1186
- "max 144.000000 91.500000 16.534890 91.689655 \n",
1187
- "\n",
1188
- " reg_WFGM_sum reg_WFGM_min ... tourney_PF_mean tourney_PA_sum \\\n",
1189
- "count 661.00000 661.000000 ... 661.000000 661.000000 \n",
1190
- "mean 644.32829 19.069592 ... 18.833585 76.003026 \n",
1191
- "std 120.26041 2.439200 ... 4.233291 10.964128 \n",
1192
- "min 248.00000 12.000000 ... 7.000000 47.000000 \n",
1193
- "25% 569.00000 17.000000 ... 16.000000 69.000000 \n",
1194
- "50% 639.00000 19.000000 ... 19.000000 76.000000 \n",
1195
- "75% 721.00000 21.000000 ... 22.000000 83.000000 \n",
1196
- "max 1025.00000 27.000000 ... 33.000000 121.000000 \n",
1197
- "\n",
1198
- " tourney_PA_min tourney_PA_max tourney_PA_median tourney_PA_std \\\n",
1199
- "count 661.000000 661.000000 661.000000 661.0 \n",
1200
- "mean 76.003026 76.003026 76.003026 0.0 \n",
1201
- "std 10.964128 10.964128 10.964128 0.0 \n",
1202
- "min 47.000000 47.000000 47.000000 0.0 \n",
1203
- "25% 69.000000 69.000000 69.000000 0.0 \n",
1204
- "50% 76.000000 76.000000 76.000000 0.0 \n",
1205
- "75% 83.000000 83.000000 83.000000 0.0 \n",
1206
- "max 121.000000 121.000000 121.000000 0.0 \n",
1207
- "\n",
1208
- " tourney_PA_mean FirstD1Season LastD1Season Seed_Num \n",
1209
- "count 661.000000 661.000000 661.0 661.000000 \n",
1210
- "mean 76.003026 1985.512859 2024.0 1.366112 \n",
1211
- "std 10.964128 2.812079 0.0 3.930341 \n",
1212
- "min 47.000000 1985.000000 2024.0 0.000000 \n",
1213
- "25% 69.000000 1985.000000 2024.0 0.000000 \n",
1214
- "50% 76.000000 1985.000000 2024.0 0.000000 \n",
1215
- "75% 83.000000 1985.000000 2024.0 1.000000 \n",
1216
- "max 121.000000 2014.000000 2024.0 16.000000 \n",
1217
- "\n",
1218
- "[8 rows x 551 columns]"
1219
- ]
1220
- },
1221
- "execution_count": 17,
1222
- "metadata": {},
1223
- "output_type": "execute_result"
1224
- }
1225
- ],
1226
- "source": [
1227
- "tms_conf_seeds.to_csv(os.path.join(DATA_DIR, \"MTeamsAgg.csv\"))\n",
1228
- "tms_conf_seeds.describe()"
1229
- ]
1230
- },
1231
- {
1232
- "cell_type": "markdown",
1233
- "metadata": {},
1234
- "source": [
1235
- "# Extra Dataset with Games and Team Data mapped together"
1236
- ]
1237
- },
1238
- {
1239
- "cell_type": "code",
1240
- "execution_count": 18,
1241
- "metadata": {},
1242
- "outputs": [
1243
- {
1244
- "data": {
1245
- "text/html": [
1246
- "<div>\n",
1247
- "<style scoped>\n",
1248
- " .dataframe tbody tr th:only-of-type {\n",
1249
- " vertical-align: middle;\n",
1250
- " }\n",
1251
- "\n",
1252
- " .dataframe tbody tr th {\n",
1253
- " vertical-align: top;\n",
1254
- " }\n",
1255
- "\n",
1256
- " .dataframe thead th {\n",
1257
- " text-align: right;\n",
1258
- " }\n",
1259
- "</style>\n",
1260
- "<table border=\"1\" class=\"dataframe\">\n",
1261
- " <thead>\n",
1262
- " <tr style=\"text-align: right;\">\n",
1263
- " <th></th>\n",
1264
- " <th>Season</th>\n",
1265
- " <th>DayNum</th>\n",
1266
- " <th>WTeamID</th>\n",
1267
- " <th>WScore</th>\n",
1268
- " <th>LTeamID</th>\n",
1269
- " <th>LScore</th>\n",
1270
- " <th>WLoc</th>\n",
1271
- " <th>NumOT</th>\n",
1272
- " <th>WFGM</th>\n",
1273
- " <th>WFGA</th>\n",
1274
- " <th>...</th>\n",
1275
- " <th>LDR</th>\n",
1276
- " <th>LAst</th>\n",
1277
- " <th>LTO</th>\n",
1278
- " <th>LStl</th>\n",
1279
- " <th>LBlk</th>\n",
1280
- " <th>LPF</th>\n",
1281
- " <th>GameType</th>\n",
1282
- " <th>WPA</th>\n",
1283
- " <th>LPA</th>\n",
1284
- " <th>LLoc</th>\n",
1285
- " </tr>\n",
1286
- " </thead>\n",
1287
- " <tbody>\n",
1288
- " <tr>\n",
1289
- " <th>0</th>\n",
1290
- " <td>2003</td>\n",
1291
- " <td>10</td>\n",
1292
- " <td>1104</td>\n",
1293
- " <td>68</td>\n",
1294
- " <td>1328</td>\n",
1295
- " <td>62</td>\n",
1296
- " <td>1</td>\n",
1297
- " <td>0</td>\n",
1298
- " <td>27</td>\n",
1299
- " <td>58</td>\n",
1300
- " <td>...</td>\n",
1301
- " <td>22</td>\n",
1302
- " <td>8</td>\n",
1303
- " <td>18</td>\n",
1304
- " <td>9</td>\n",
1305
- " <td>2</td>\n",
1306
- " <td>20</td>\n",
1307
- " <td>reg</td>\n",
1308
- " <td>62</td>\n",
1309
- " <td>68</td>\n",
1310
- " <td>1</td>\n",
1311
- " </tr>\n",
1312
- " <tr>\n",
1313
- " <th>1</th>\n",
1314
- " <td>2003</td>\n",
1315
- " <td>10</td>\n",
1316
- " <td>1272</td>\n",
1317
- " <td>70</td>\n",
1318
- " <td>1393</td>\n",
1319
- " <td>63</td>\n",
1320
- " <td>1</td>\n",
1321
- " <td>0</td>\n",
1322
- " <td>26</td>\n",
1323
- " <td>62</td>\n",
1324
- " <td>...</td>\n",
1325
- " <td>25</td>\n",
1326
- " <td>7</td>\n",
1327
- " <td>12</td>\n",
1328
- " <td>8</td>\n",
1329
- " <td>6</td>\n",
1330
- " <td>16</td>\n",
1331
- " <td>reg</td>\n",
1332
- " <td>63</td>\n",
1333
- " <td>70</td>\n",
1334
- " <td>1</td>\n",
1335
- " </tr>\n",
1336
- " <tr>\n",
1337
- " <th>2</th>\n",
1338
- " <td>2003</td>\n",
1339
- " <td>11</td>\n",
1340
- " <td>1266</td>\n",
1341
- " <td>73</td>\n",
1342
- " <td>1437</td>\n",
1343
- " <td>61</td>\n",
1344
- " <td>1</td>\n",
1345
- " <td>0</td>\n",
1346
- " <td>24</td>\n",
1347
- " <td>58</td>\n",
1348
- " <td>...</td>\n",
1349
- " <td>22</td>\n",
1350
- " <td>9</td>\n",
1351
- " <td>12</td>\n",
1352
- " <td>2</td>\n",
1353
- " <td>5</td>\n",
1354
- " <td>23</td>\n",
1355
- " <td>reg</td>\n",
1356
- " <td>61</td>\n",
1357
- " <td>73</td>\n",
1358
- " <td>1</td>\n",
1359
- " </tr>\n",
1360
- " <tr>\n",
1361
- " <th>3</th>\n",
1362
- " <td>2003</td>\n",
1363
- " <td>11</td>\n",
1364
- " <td>1296</td>\n",
1365
- " <td>56</td>\n",
1366
- " <td>1457</td>\n",
1367
- " <td>50</td>\n",
1368
- " <td>1</td>\n",
1369
- " <td>0</td>\n",
1370
- " <td>18</td>\n",
1371
- " <td>38</td>\n",
1372
- " <td>...</td>\n",
1373
- " <td>20</td>\n",
1374
- " <td>9</td>\n",
1375
- " <td>19</td>\n",
1376
- " <td>4</td>\n",
1377
- " <td>3</td>\n",
1378
- " <td>23</td>\n",
1379
- " <td>reg</td>\n",
1380
- " <td>50</td>\n",
1381
- " <td>56</td>\n",
1382
- " <td>1</td>\n",
1383
- " </tr>\n",
1384
- " <tr>\n",
1385
- " <th>4</th>\n",
1386
- " <td>2003</td>\n",
1387
- " <td>11</td>\n",
1388
- " <td>1400</td>\n",
1389
- " <td>77</td>\n",
1390
- " <td>1208</td>\n",
1391
- " <td>71</td>\n",
1392
- " <td>1</td>\n",
1393
- " <td>0</td>\n",
1394
- " <td>30</td>\n",
1395
- " <td>61</td>\n",
1396
- " <td>...</td>\n",
1397
- " <td>15</td>\n",
1398
- " <td>12</td>\n",
1399
- " <td>10</td>\n",
1400
- " <td>7</td>\n",
1401
- " <td>1</td>\n",
1402
- " <td>14</td>\n",
1403
- " <td>reg</td>\n",
1404
- " <td>71</td>\n",
1405
- " <td>77</td>\n",
1406
- " <td>1</td>\n",
1407
- " </tr>\n",
1408
- " </tbody>\n",
1409
- "</table>\n",
1410
- "<p>5 rows × 38 columns</p>\n",
1411
- "</div>"
1412
- ],
1413
- "text/plain": [
1414
- " Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA \\\n",
1415
- "0 2003 10 1104 68 1328 62 1 0 27 58 \n",
1416
- "1 2003 10 1272 70 1393 63 1 0 26 62 \n",
1417
- "2 2003 11 1266 73 1437 61 1 0 24 58 \n",
1418
- "3 2003 11 1296 56 1457 50 1 0 18 38 \n",
1419
- "4 2003 11 1400 77 1208 71 1 0 30 61 \n",
1420
- "\n",
1421
- " ... LDR LAst LTO LStl LBlk LPF GameType WPA LPA LLoc \n",
1422
- "0 ... 22 8 18 9 2 20 reg 62 68 1 \n",
1423
- "1 ... 25 7 12 8 6 16 reg 63 70 1 \n",
1424
- "2 ... 22 9 12 2 5 23 reg 61 73 1 \n",
1425
- "3 ... 20 9 19 4 3 23 reg 50 56 1 \n",
1426
- "4 ... 15 12 10 7 1 14 reg 71 77 1 \n",
1427
- "\n",
1428
- "[5 rows x 38 columns]"
1429
- ]
1430
- },
1431
- "execution_count": 18,
1432
- "metadata": {},
1433
- "output_type": "execute_result"
1434
- }
1435
- ],
1436
- "source": [
1437
- "# get a dataframe of all games, regardless of whether or not each one was a tournament game\n",
1438
- "all_games_df = pd.concat([reg_games_df, tourney_games_df])\n",
1439
- "all_games_df.head()"
1440
- ]
1441
- },
1442
- {
1443
- "cell_type": "code",
1444
- "execution_count": 19,
1445
- "metadata": {},
1446
- "outputs": [
1447
- {
1448
- "data": {
1449
- "text/html": [
1450
- "<div>\n",
1451
- "<style scoped>\n",
1452
- " .dataframe tbody tr th:only-of-type {\n",
1453
- " vertical-align: middle;\n",
1454
- " }\n",
1455
- "\n",
1456
- " .dataframe tbody tr th {\n",
1457
- " vertical-align: top;\n",
1458
- " }\n",
1459
- "\n",
1460
- " .dataframe thead th {\n",
1461
- " text-align: right;\n",
1462
- " }\n",
1463
- "</style>\n",
1464
- "<table border=\"1\" class=\"dataframe\">\n",
1465
- " <thead>\n",
1466
- " <tr style=\"text-align: right;\">\n",
1467
- " <th></th>\n",
1468
- " <th>Season</th>\n",
1469
- " <th>DayNum</th>\n",
1470
- " <th>WTeamID</th>\n",
1471
- " <th>WScore</th>\n",
1472
- " <th>LTeamID</th>\n",
1473
- " <th>LScore</th>\n",
1474
- " <th>WLoc</th>\n",
1475
- " <th>NumOT</th>\n",
1476
- " <th>WFGM</th>\n",
1477
- " <th>WFGA</th>\n",
1478
- " <th>...</th>\n",
1479
- " <th>tourney_PA_median_L</th>\n",
1480
- " <th>tourney_PA_std_L</th>\n",
1481
- " <th>tourney_PA_mean_L</th>\n",
1482
- " <th>TeamName_L</th>\n",
1483
- " <th>FirstD1Season_L</th>\n",
1484
- " <th>LastD1Season_L</th>\n",
1485
- " <th>ConfAbbrev_L</th>\n",
1486
- " <th>Seed_L</th>\n",
1487
- " <th>Seed_Num_L</th>\n",
1488
- " <th>Seed_Region_L</th>\n",
1489
- " </tr>\n",
1490
- " </thead>\n",
1491
- " <tbody>\n",
1492
- " <tr>\n",
1493
- " <th>0</th>\n",
1494
- " <td>2003</td>\n",
1495
- " <td>40</td>\n",
1496
- " <td>1266</td>\n",
1497
- " <td>63</td>\n",
1498
- " <td>1458</td>\n",
1499
- " <td>54</td>\n",
1500
- " <td>1</td>\n",
1501
- " <td>0</td>\n",
1502
- " <td>24</td>\n",
1503
- " <td>46</td>\n",
1504
- " <td>...</td>\n",
1505
- " <td>62.0</td>\n",
1506
- " <td>0.0</td>\n",
1507
- " <td>62.0</td>\n",
1508
- " <td>Wisconsin</td>\n",
1509
- " <td>1985</td>\n",
1510
- " <td>2024</td>\n",
1511
- " <td>big_ten</td>\n",
1512
- " <td>Y05</td>\n",
1513
- " <td>0</td>\n",
1514
- " <td>Y</td>\n",
1515
- " </tr>\n",
1516
- " <tr>\n",
1517
- " <th>1</th>\n",
1518
- " <td>2003</td>\n",
1519
- " <td>93</td>\n",
1520
- " <td>1345</td>\n",
1521
- " <td>78</td>\n",
1522
- " <td>1458</td>\n",
1523
- " <td>60</td>\n",
1524
- " <td>1</td>\n",
1525
- " <td>0</td>\n",
1526
- " <td>23</td>\n",
1527
- " <td>51</td>\n",
1528
- " <td>...</td>\n",
1529
- " <td>62.0</td>\n",
1530
- " <td>0.0</td>\n",
1531
- " <td>62.0</td>\n",
1532
- " <td>Wisconsin</td>\n",
1533
- " <td>1985</td>\n",
1534
- " <td>2024</td>\n",
1535
- " <td>big_ten</td>\n",
1536
- " <td>Y05</td>\n",
1537
- " <td>0</td>\n",
1538
- " <td>Y</td>\n",
1539
- " </tr>\n",
1540
- " <tr>\n",
1541
- " <th>2</th>\n",
1542
- " <td>2003</td>\n",
1543
- " <td>68</td>\n",
1544
- " <td>1228</td>\n",
1545
- " <td>69</td>\n",
1546
- " <td>1458</td>\n",
1547
- " <td>63</td>\n",
1548
- " <td>1</td>\n",
1549
- " <td>0</td>\n",
1550
- " <td>25</td>\n",
1551
- " <td>50</td>\n",
1552
- " <td>...</td>\n",
1553
- " <td>62.0</td>\n",
1554
- " <td>0.0</td>\n",
1555
- " <td>62.0</td>\n",
1556
- " <td>Wisconsin</td>\n",
1557
- " <td>1985</td>\n",
1558
- " <td>2024</td>\n",
1559
- " <td>big_ten</td>\n",
1560
- " <td>Y05</td>\n",
1561
- " <td>0</td>\n",
1562
- " <td>Y</td>\n",
1563
- " </tr>\n",
1564
- " <tr>\n",
1565
- " <th>3</th>\n",
1566
- " <td>2003</td>\n",
1567
- " <td>143</td>\n",
1568
- " <td>1246</td>\n",
1569
- " <td>63</td>\n",
1570
- " <td>1458</td>\n",
1571
- " <td>57</td>\n",
1572
- " <td>1</td>\n",
1573
- " <td>0</td>\n",
1574
- " <td>24</td>\n",
1575
- " <td>49</td>\n",
1576
- " <td>...</td>\n",
1577
- " <td>62.0</td>\n",
1578
- " <td>0.0</td>\n",
1579
- " <td>62.0</td>\n",
1580
- " <td>Wisconsin</td>\n",
1581
- " <td>1985</td>\n",
1582
- " <td>2024</td>\n",
1583
- " <td>big_ten</td>\n",
1584
- " <td>Y05</td>\n",
1585
- " <td>0</td>\n",
1586
- " <td>Y</td>\n",
1587
- " </tr>\n",
1588
- " <tr>\n",
1589
- " <th>4</th>\n",
1590
- " <td>2003</td>\n",
1591
- " <td>30</td>\n",
1592
- " <td>1448</td>\n",
1593
- " <td>90</td>\n",
1594
- " <td>1458</td>\n",
1595
- " <td>80</td>\n",
1596
- " <td>1</td>\n",
1597
- " <td>0</td>\n",
1598
- " <td>33</td>\n",
1599
- " <td>61</td>\n",
1600
- " <td>...</td>\n",
1601
- " <td>62.0</td>\n",
1602
- " <td>0.0</td>\n",
1603
- " <td>62.0</td>\n",
1604
- " <td>Wisconsin</td>\n",
1605
- " <td>1985</td>\n",
1606
- " <td>2024</td>\n",
1607
- " <td>big_ten</td>\n",
1608
- " <td>Y05</td>\n",
1609
- " <td>0</td>\n",
1610
- " <td>Y</td>\n",
1611
- " </tr>\n",
1612
- " </tbody>\n",
1613
- "</table>\n",
1614
- "<p>5 rows × 1146 columns</p>\n",
1615
- "</div>"
1616
- ],
1617
- "text/plain": [
1618
- " Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA \\\n",
1619
- "0 2003 40 1266 63 1458 54 1 0 24 46 \n",
1620
- "1 2003 93 1345 78 1458 60 1 0 23 51 \n",
1621
- "2 2003 68 1228 69 1458 63 1 0 25 50 \n",
1622
- "3 2003 143 1246 63 1458 57 1 0 24 49 \n",
1623
- "4 2003 30 1448 90 1458 80 1 0 33 61 \n",
1624
- "\n",
1625
- " ... tourney_PA_median_L tourney_PA_std_L tourney_PA_mean_L TeamName_L \\\n",
1626
- "0 ... 62.0 0.0 62.0 Wisconsin \n",
1627
- "1 ... 62.0 0.0 62.0 Wisconsin \n",
1628
- "2 ... 62.0 0.0 62.0 Wisconsin \n",
1629
- "3 ... 62.0 0.0 62.0 Wisconsin \n",
1630
- "4 ... 62.0 0.0 62.0 Wisconsin \n",
1631
- "\n",
1632
- " FirstD1Season_L LastD1Season_L ConfAbbrev_L Seed_L Seed_Num_L \\\n",
1633
- "0 1985 2024 big_ten Y05 0 \n",
1634
- "1 1985 2024 big_ten Y05 0 \n",
1635
- "2 1985 2024 big_ten Y05 0 \n",
1636
- "3 1985 2024 big_ten Y05 0 \n",
1637
- "4 1985 2024 big_ten Y05 0 \n",
1638
- "\n",
1639
- " Seed_Region_L \n",
1640
- "0 Y \n",
1641
- "1 Y \n",
1642
- "2 Y \n",
1643
- "3 Y \n",
1644
- "4 Y \n",
1645
- "\n",
1646
- "[5 rows x 1146 columns]"
1647
- ]
1648
- },
1649
- "execution_count": 19,
1650
- "metadata": {},
1651
- "output_type": "execute_result"
1652
- }
1653
- ],
1654
- "source": [
1655
- "# detailed game result dataframe that has all of the information about winning and losing teams that we just aggregated together in the \n",
1656
- "# tms_conf_seeds dataframe\n",
1657
- "\n",
1658
- "# match each game's winning and losing team IDs with that team's summary data for the same season\n",
1659
- "dgt_df = pd.merge(\n",
1660
- " left=all_games_df,\n",
1661
- " right=tms_conf_seeds,\n",
1662
- " left_on=[\"WTeamID\", \"Season\"],\n",
1663
- " right_on=[\"TeamID\", \"Season\"],\n",
1664
- ").merge(\n",
1665
- " right=tms_conf_seeds,\n",
1666
- " left_on=[\"LTeamID\", \"Season\"],\n",
1667
- " right_on=[\"TeamID\", \"Season\"],\n",
1668
- " suffixes=(\"_W\", \"_L\"),\n",
1669
- ")\n",
1670
- "\n",
1671
- "dgt_df.head()"
1672
- ]
1673
- },
1674
- {
1675
- "cell_type": "code",
1676
- "execution_count": 20,
1677
- "metadata": {},
1678
- "outputs": [],
1679
- "source": [
1680
- "# save the newly organized dataset\n",
1681
- "dgt_df.to_csv(os.path.join(DATA_DIR, \"MDetailedGamesOvrStats.csv\"))"
1682
- ]
1683
- }
1684
- ],
1685
- "metadata": {
1686
- "kernelspec": {
1687
- "display_name": "Python 3 (ipykernel)",
1688
- "language": "python",
1689
- "name": "python3"
1690
- },
1691
- "language_info": {
1692
- "codemirror_mode": {
1693
- "name": "ipython",
1694
- "version": 3
1695
- },
1696
- "file_extension": ".py",
1697
- "mimetype": "text/x-python",
1698
- "name": "python",
1699
- "nbconvert_exporter": "python",
1700
- "pygments_lexer": "ipython3",
1701
- "version": "3.11.7"
1702
- }
1703
- },
1704
- "nbformat": 4,
1705
- "nbformat_minor": 2
1706
- }