Jensen-holm committed on
Commit
7aa5cea
1 Parent(s): 9cabe38

adding gitignore

Browse files
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ .ipynb_checkpoints
3
+ .vscode
4
+ .DS_Store
5
+
src/.ipynb_checkpoints/m_pp-checkpoint.ipynb DELETED
@@ -1,1093 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": [
9
- "import pandas as pd\n",
10
- "import numpy as np\n",
11
- "import os\n",
12
- "\n",
13
- "DATA_DIR = os.path.join(\"..\", \"data\")"
14
- ]
15
- },
16
- {
17
- "cell_type": "code",
18
- "execution_count": 2,
19
- "metadata": {},
20
- "outputs": [
21
- {
22
- "name": "stdout",
23
- "output_type": "stream",
24
- "text": [
25
- "<class 'pandas.core.frame.DataFrame'>\n",
26
- "RangeIndex: 1315 entries, 0 to 1314\n",
27
- "Data columns (total 38 columns):\n",
28
- " # Column Non-Null Count Dtype \n",
29
- "--- ------ -------------- ----- \n",
30
- " 0 Season 1315 non-null int64 \n",
31
- " 1 DayNum 1315 non-null int64 \n",
32
- " 2 WTeamID 1315 non-null int64 \n",
33
- " 3 WScore 1315 non-null int64 \n",
34
- " 4 LTeamID 1315 non-null int64 \n",
35
- " 5 LScore 1315 non-null int64 \n",
36
- " 6 WLoc 1315 non-null int64 \n",
37
- " 7 NumOT 1315 non-null int64 \n",
38
- " 8 WFGM 1315 non-null int64 \n",
39
- " 9 WFGA 1315 non-null int64 \n",
40
- " 10 WFGM3 1315 non-null int64 \n",
41
- " 11 WFGA3 1315 non-null int64 \n",
42
- " 12 WFTM 1315 non-null int64 \n",
43
- " 13 WFTA 1315 non-null int64 \n",
44
- " 14 WOR 1315 non-null int64 \n",
45
- " 15 WDR 1315 non-null int64 \n",
46
- " 16 WAst 1315 non-null int64 \n",
47
- " 17 WTO 1315 non-null int64 \n",
48
- " 18 WStl 1315 non-null int64 \n",
49
- " 19 WBlk 1315 non-null int64 \n",
50
- " 20 WPF 1315 non-null int64 \n",
51
- " 21 LFGM 1315 non-null int64 \n",
52
- " 22 LFGA 1315 non-null int64 \n",
53
- " 23 LFGM3 1315 non-null int64 \n",
54
- " 24 LFGA3 1315 non-null int64 \n",
55
- " 25 LFTM 1315 non-null int64 \n",
56
- " 26 LFTA 1315 non-null int64 \n",
57
- " 27 LOR 1315 non-null int64 \n",
58
- " 28 LDR 1315 non-null int64 \n",
59
- " 29 LAst 1315 non-null int64 \n",
60
- " 30 LTO 1315 non-null int64 \n",
61
- " 31 LStl 1315 non-null int64 \n",
62
- " 32 LBlk 1315 non-null int64 \n",
63
- " 33 LPF 1315 non-null int64 \n",
64
- " 34 GameType 1315 non-null object\n",
65
- " 35 WPA 1315 non-null int64 \n",
66
- " 36 LPA 1315 non-null int64 \n",
67
- " 37 LLoc 1315 non-null int64 \n",
68
- "dtypes: int64(37), object(1)\n",
69
- "memory usage: 390.5+ KB\n"
70
- ]
71
- }
72
- ],
73
- "source": [
74
# Load the detailed box scores for every NCAA tournament game.
tourney_games_df = pd.read_csv(
    os.path.join(DATA_DIR, "MNCAATourneyDetailedResults.csv")
)

tourney_games_df["GameType"] = "tourney"

# points allowed column: each side allowed exactly what its opponent scored
tourney_games_df["WPA"] = tourney_games_df["LScore"]
tourney_games_df["LPA"] = tourney_games_df["WScore"]

# Location flags: 0 where the raw WLoc code is "A", 1 otherwise.
# Both columns must be computed from the raw string WLoc column. WLoc used to
# be recomputed from the already-numeric LLoc column, so the comparison with
# "A" never matched and WLoc came out as 1 on every row.
# NOTE(review): LLoc applies the same test as WLoc, so the two columns are now
# identical. If LLoc is meant to describe the losing side, the test probably
# needs to be against "H" instead -- confirm the intended encoding.
tourney_games_df["LLoc"] = tourney_games_df["WLoc"].apply(lambda x: 0 if x == "A" else 1)
tourney_games_df["WLoc"] = tourney_games_df["WLoc"].apply(lambda x: 0 if x == "A" else 1)

tourney_games_df.info()
87
- ]
88
- },
89
- {
90
- "cell_type": "code",
91
- "execution_count": 3,
92
- "metadata": {},
93
- "outputs": [
94
- {
95
- "name": "stdout",
96
- "output_type": "stream",
97
- "text": [
98
- "<class 'pandas.core.frame.DataFrame'>\n",
99
- "RangeIndex: 111817 entries, 0 to 111816\n",
100
- "Data columns (total 38 columns):\n",
101
- " # Column Non-Null Count Dtype \n",
102
- "--- ------ -------------- ----- \n",
103
- " 0 Season 111817 non-null int64 \n",
104
- " 1 DayNum 111817 non-null int64 \n",
105
- " 2 WTeamID 111817 non-null int64 \n",
106
- " 3 WScore 111817 non-null int64 \n",
107
- " 4 LTeamID 111817 non-null int64 \n",
108
- " 5 LScore 111817 non-null int64 \n",
109
- " 6 WLoc 111817 non-null int64 \n",
110
- " 7 NumOT 111817 non-null int64 \n",
111
- " 8 WFGM 111817 non-null int64 \n",
112
- " 9 WFGA 111817 non-null int64 \n",
113
- " 10 WFGM3 111817 non-null int64 \n",
114
- " 11 WFGA3 111817 non-null int64 \n",
115
- " 12 WFTM 111817 non-null int64 \n",
116
- " 13 WFTA 111817 non-null int64 \n",
117
- " 14 WOR 111817 non-null int64 \n",
118
- " 15 WDR 111817 non-null int64 \n",
119
- " 16 WAst 111817 non-null int64 \n",
120
- " 17 WTO 111817 non-null int64 \n",
121
- " 18 WStl 111817 non-null int64 \n",
122
- " 19 WBlk 111817 non-null int64 \n",
123
- " 20 WPF 111817 non-null int64 \n",
124
- " 21 LFGM 111817 non-null int64 \n",
125
- " 22 LFGA 111817 non-null int64 \n",
126
- " 23 LFGM3 111817 non-null int64 \n",
127
- " 24 LFGA3 111817 non-null int64 \n",
128
- " 25 LFTM 111817 non-null int64 \n",
129
- " 26 LFTA 111817 non-null int64 \n",
130
- " 27 LOR 111817 non-null int64 \n",
131
- " 28 LDR 111817 non-null int64 \n",
132
- " 29 LAst 111817 non-null int64 \n",
133
- " 30 LTO 111817 non-null int64 \n",
134
- " 31 LStl 111817 non-null int64 \n",
135
- " 32 LBlk 111817 non-null int64 \n",
136
- " 33 LPF 111817 non-null int64 \n",
137
- " 34 GameType 111817 non-null object\n",
138
- " 35 WPA 111817 non-null int64 \n",
139
- " 36 LPA 111817 non-null int64 \n",
140
- " 37 LLoc 111817 non-null int64 \n",
141
- "dtypes: int64(37), object(1)\n",
142
- "memory usage: 32.4+ MB\n"
143
- ]
144
- }
145
- ],
146
- "source": [
147
# Load the detailed box scores for every regular-season game.
reg_games_df = pd.read_csv(
    os.path.join(DATA_DIR, "MRegularSeasonDetailedResults.csv")
)

reg_games_df["GameType"] = "reg"

# points allowed column: each side allowed exactly what its opponent scored
reg_games_df["WPA"] = reg_games_df["LScore"]
reg_games_df["LPA"] = reg_games_df["WScore"]

# loser location column, plus the numeric winner location.
# Both flags are 0 where the raw WLoc code is "A" and 1 otherwise, and both
# must be computed from the raw string WLoc column. WLoc used to be recomputed
# from the already-numeric LLoc column, so the comparison with "A" never
# matched and WLoc came out as 1 on every row.
# NOTE(review): LLoc applies the same test as WLoc, so the two columns are now
# identical. If LLoc is meant to describe the losing side, the test probably
# needs to be against "H" instead -- confirm the intended encoding.
reg_games_df["LLoc"] = reg_games_df["WLoc"].apply(lambda x: 0 if x == "A" else 1)
reg_games_df["WLoc"] = reg_games_df["WLoc"].apply(lambda x: 0 if x == "A" else 1)

reg_games_df.info()
162
- ]
163
- },
164
- {
165
- "cell_type": "code",
166
- "execution_count": 4,
167
- "metadata": {},
168
- "outputs": [],
169
- "source": [
170
- "# here we are defining the metrics that we want to look at (practically all of them) as features\n",
171
- "# for building models. I want to do so with metrics regardless of winning and losing metrics, or at least\n",
172
- "# make extra features with combined stats from wins and losses. Because of that, here I am defining them manually\n",
173
- "\n",
174
- "outcomes = [\"W\", \"L\"]\n",
175
- "\n",
176
- "metrics = [\n",
177
- " \"FGM\", # field goals made\n",
178
- " \"FGA\", # field goals attempted\n",
179
- " \"FGM3\", # three pointers made\n",
180
- " \"FGA3\", # three pointers attempted\n",
181
- " \"FTM\", # free throws made\n",
182
- " \"FTA\", # free throws attempted\n",
183
- " \"OR\", # Offensive rebounds\n",
184
- " \"DR\", # Defensive rebounds\n",
185
- " \"Ast\", # assists\n",
186
- " \"TO\", # turnovers\n",
187
- " \"Stl\", # steals\n",
188
- " \"Blk\", # blocks\n",
189
- " \"PF\", # personal fouls\n",
190
- "]\n"
191
- ]
192
- },
193
- {
194
- "cell_type": "code",
195
- "execution_count": 5,
196
- "metadata": {},
197
- "outputs": [],
198
- "source": [
199
# Groupbys and multi-function aggregations leave the frame with MultiIndex
# columns, which are awkward to read and to select from. Call this function
# to flatten them back into plain string labels.
def flatten_multi_idx(df: pd.DataFrame) -> None:
    """Collapse the MultiIndex column labels of ``df`` into strings, in place.

    The levels of each column are joined with "_" after empty levels are
    dropped, so ("WFGM", "mean") becomes "WFGM_mean" and ("Season", "") stays
    "Season".

    A frame whose columns are already flat is left untouched. Without this
    guard each plain string label would be iterated character by character
    and "Season" would turn into "S_e_a_s_o_n".

    Args:
        df: frame whose ``columns`` attribute is rewritten.

    Returns:
        None; ``df`` is modified in place.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        return
    df.columns = ["_".join(filter(None, col)) for col in df.columns.to_flat_index()]
203
- ]
204
- },
205
- {
206
- "cell_type": "code",
207
- "execution_count": 39,
208
- "metadata": {},
209
- "outputs": [],
210
- "source": [
211
# Summarize each team's season: every metric is aggregated separately over the
# games the team won (W* columns) and the games it lost (L* columns), and a
# combined per-metric total over all of its games is added on top.

def summarize_teams(szn_df: pd.DataFrame) -> pd.DataFrame:
    """Build one row of aggregated box-score stats per (TeamID, Season).

    Relies on the notebook-level ``metrics`` list and on ``flatten_multi_idx``.

    Args:
        szn_df: one row per game, with ``Season``, ``WTeamID``, ``LTeamID``
            and a ``W{metric}`` / ``L{metric}`` column for every metric.
            The frame is not modified.

    Returns:
        A new frame with ``Season``, ``TeamID``, the columns
        ``W{metric}_{agg}`` (games the team won) and ``L{metric}_{agg}``
        (games the team lost) for agg in mean, sum, std, median, min, max,
        and a float ``tot_{metric}`` column holding the sum over all games.
        A team-season with no wins or no losses is kept; the aggregates of
        the side it never appeared on are NaN.
    """
    # String names select pandas' own reducers. Passing the NumPy callables
    # (np.mean, np.std, ...) to .agg is deprecated; the std values shown in the
    # saved output are the sample std (ddof=1), which is what "std" computes.
    agg_funcs = ["mean", "sum", "std", "median", "min", "max"]

    def _side_summary(prefix: str) -> pd.DataFrame:
        # Aggregate only the stats belonging to the given side ("W" or "L"),
        # grouped by the team that was on that side of the game.
        team_col = f"{prefix}TeamID"
        side_df = (
            szn_df.groupby([team_col, "Season"])
            .agg({f"{prefix}{metric}": agg_funcs for metric in metrics})
            .reset_index()
        )
        flatten_multi_idx(side_df)
        # pop + assign renames the key to TeamID and moves it to the last
        # column, which keeps the column order callers have seen so far.
        side_df["TeamID"] = side_df.pop(team_col)
        return side_df

    # An outer merge keeps undefeated and winless team-seasons. An inner merge
    # silently drops every team that is missing from either side.
    ovr_team_df = pd.merge(
        left=_side_summary("W"),
        right=_side_summary("L"),
        on=["TeamID", "Season"],
        how="outer",
    )

    # calculate the total of all metrics; a missing side contributes 0
    for metric in metrics:
        ovr_team_df[f"tot_{metric}"] = (
            ovr_team_df[f"W{metric}_sum"].fillna(0)
            + ovr_team_df[f"L{metric}_sum"].fillna(0)
        ).astype(float)

    return ovr_team_df
248
- ]
249
- },
250
- {
251
- "cell_type": "code",
252
- "execution_count": 40,
253
- "metadata": {},
254
- "outputs": [
255
- {
256
- "data": {
257
- "text/html": [
258
- "<div>\n",
259
- "<style scoped>\n",
260
- " .dataframe tbody tr th:only-of-type {\n",
261
- " vertical-align: middle;\n",
262
- " }\n",
263
- "\n",
264
- " .dataframe tbody tr th {\n",
265
- " vertical-align: top;\n",
266
- " }\n",
267
- "\n",
268
- " .dataframe thead th {\n",
269
- " text-align: right;\n",
270
- " }\n",
271
- "</style>\n",
272
- "<table border=\"1\" class=\"dataframe\">\n",
273
- " <thead>\n",
274
- " <tr style=\"text-align: right;\">\n",
275
- " <th></th>\n",
276
- " <th>Season</th>\n",
277
- " <th>WFGM_mean</th>\n",
278
- " <th>WFGM_sum</th>\n",
279
- " <th>WFGM_std</th>\n",
280
- " <th>WFGM_median</th>\n",
281
- " <th>WFGM_min</th>\n",
282
- " <th>WFGM_max</th>\n",
283
- " <th>WFGA_mean</th>\n",
284
- " <th>WFGA_sum</th>\n",
285
- " <th>WFGA_std</th>\n",
286
- " <th>...</th>\n",
287
- " <th>tot_FGA3</th>\n",
288
- " <th>tot_FTM</th>\n",
289
- " <th>tot_FTA</th>\n",
290
- " <th>tot_OR</th>\n",
291
- " <th>tot_DR</th>\n",
292
- " <th>tot_Ast</th>\n",
293
- " <th>tot_TO</th>\n",
294
- " <th>tot_Stl</th>\n",
295
- " <th>tot_Blk</th>\n",
296
- " <th>tot_PF</th>\n",
297
- " </tr>\n",
298
- " </thead>\n",
299
- " <tbody>\n",
300
- " <tr>\n",
301
- " <th>0</th>\n",
302
- " <td>2014</td>\n",
303
- " <td>26.000000</td>\n",
304
- " <td>52</td>\n",
305
- " <td>1.414214</td>\n",
306
- " <td>26.0</td>\n",
307
- " <td>25</td>\n",
308
- " <td>27</td>\n",
309
- " <td>48.500000</td>\n",
310
- " <td>97</td>\n",
311
- " <td>6.363961</td>\n",
312
- " <td>...</td>\n",
313
- " <td>375.0</td>\n",
314
- " <td>332.0</td>\n",
315
- " <td>445.0</td>\n",
316
- " <td>168.0</td>\n",
317
- " <td>427.0</td>\n",
318
- " <td>210.0</td>\n",
319
- " <td>315.0</td>\n",
320
- " <td>121.0</td>\n",
321
- " <td>31.0</td>\n",
322
- " <td>453.0</td>\n",
323
- " </tr>\n",
324
- " <tr>\n",
325
- " <th>1</th>\n",
326
- " <td>2015</td>\n",
327
- " <td>27.000000</td>\n",
328
- " <td>189</td>\n",
329
- " <td>5.291503</td>\n",
330
- " <td>24.0</td>\n",
331
- " <td>22</td>\n",
332
- " <td>34</td>\n",
333
- " <td>53.000000</td>\n",
334
- " <td>371</td>\n",
335
- " <td>5.773503</td>\n",
336
- " <td>...</td>\n",
337
- " <td>537.0</td>\n",
338
- " <td>305.0</td>\n",
339
- " <td>419.0</td>\n",
340
- " <td>231.0</td>\n",
341
- " <td>550.0</td>\n",
342
- " <td>332.0</td>\n",
343
- " <td>359.0</td>\n",
344
- " <td>166.0</td>\n",
345
- " <td>33.0</td>\n",
346
- " <td>577.0</td>\n",
347
- " </tr>\n",
348
- " <tr>\n",
349
- " <th>2</th>\n",
350
- " <td>2016</td>\n",
351
- " <td>25.666667</td>\n",
352
- " <td>231</td>\n",
353
- " <td>2.872281</td>\n",
354
- " <td>27.0</td>\n",
355
- " <td>21</td>\n",
356
- " <td>28</td>\n",
357
- " <td>54.000000</td>\n",
358
- " <td>486</td>\n",
359
- " <td>4.555217</td>\n",
360
- " <td>...</td>\n",
361
- " <td>509.0</td>\n",
362
- " <td>415.0</td>\n",
363
- " <td>587.0</td>\n",
364
- " <td>221.0</td>\n",
365
- " <td>608.0</td>\n",
366
- " <td>348.0</td>\n",
367
- " <td>362.0</td>\n",
368
- " <td>182.0</td>\n",
369
- " <td>66.0</td>\n",
370
- " <td>604.0</td>\n",
371
- " </tr>\n",
372
- " <tr>\n",
373
- " <th>3</th>\n",
374
- " <td>2017</td>\n",
375
- " <td>24.000000</td>\n",
376
- " <td>216</td>\n",
377
- " <td>3.162278</td>\n",
378
- " <td>25.0</td>\n",
379
- " <td>19</td>\n",
380
- " <td>28</td>\n",
381
- " <td>49.555556</td>\n",
382
- " <td>446</td>\n",
383
- " <td>5.981453</td>\n",
384
- " <td>...</td>\n",
385
- " <td>477.0</td>\n",
386
- " <td>298.0</td>\n",
387
- " <td>464.0</td>\n",
388
- " <td>189.0</td>\n",
389
- " <td>572.0</td>\n",
390
- " <td>340.0</td>\n",
391
- " <td>362.0</td>\n",
392
- " <td>175.0</td>\n",
393
- " <td>69.0</td>\n",
394
- " <td>554.0</td>\n",
395
- " </tr>\n",
396
- " <tr>\n",
397
- " <th>4</th>\n",
398
- " <td>2018</td>\n",
399
- " <td>27.416667</td>\n",
400
- " <td>329</td>\n",
401
- " <td>3.964807</td>\n",
402
- " <td>27.0</td>\n",
403
- " <td>22</td>\n",
404
- " <td>34</td>\n",
405
- " <td>57.250000</td>\n",
406
- " <td>687</td>\n",
407
- " <td>4.731423</td>\n",
408
- " <td>...</td>\n",
409
- " <td>539.0</td>\n",
410
- " <td>355.0</td>\n",
411
- " <td>504.0</td>\n",
412
- " <td>244.0</td>\n",
413
- " <td>627.0</td>\n",
414
- " <td>375.0</td>\n",
415
- " <td>389.0</td>\n",
416
- " <td>193.0</td>\n",
417
- " <td>98.0</td>\n",
418
- " <td>568.0</td>\n",
419
- " </tr>\n",
420
- " <tr>\n",
421
- " <th>...</th>\n",
422
- " <td>...</td>\n",
423
- " <td>...</td>\n",
424
- " <td>...</td>\n",
425
- " <td>...</td>\n",
426
- " <td>...</td>\n",
427
- " <td>...</td>\n",
428
- " <td>...</td>\n",
429
- " <td>...</td>\n",
430
- " <td>...</td>\n",
431
- " <td>...</td>\n",
432
- " <td>...</td>\n",
433
- " <td>...</td>\n",
434
- " <td>...</td>\n",
435
- " <td>...</td>\n",
436
- " <td>...</td>\n",
437
- " <td>...</td>\n",
438
- " <td>...</td>\n",
439
- " <td>...</td>\n",
440
- " <td>...</td>\n",
441
- " <td>...</td>\n",
442
- " <td>...</td>\n",
443
- " </tr>\n",
444
- " <tr>\n",
445
- " <th>7600</th>\n",
446
- " <td>2023</td>\n",
447
- " <td>24.153846</td>\n",
448
- " <td>314</td>\n",
449
- " <td>5.063697</td>\n",
450
- " <td>25.0</td>\n",
451
- " <td>16</td>\n",
452
- " <td>31</td>\n",
453
- " <td>51.461538</td>\n",
454
- " <td>669</td>\n",
455
- " <td>6.118488</td>\n",
456
- " <td>...</td>\n",
457
- " <td>649.0</td>\n",
458
- " <td>384.0</td>\n",
459
- " <td>506.0</td>\n",
460
- " <td>149.0</td>\n",
461
- " <td>676.0</td>\n",
462
- " <td>357.0</td>\n",
463
- " <td>384.0</td>\n",
464
- " <td>209.0</td>\n",
465
- " <td>85.0</td>\n",
466
- " <td>454.0</td>\n",
467
- " </tr>\n",
468
- " <tr>\n",
469
- " <th>7601</th>\n",
470
- " <td>2024</td>\n",
471
- " <td>23.000000</td>\n",
472
- " <td>46</td>\n",
473
- " <td>2.828427</td>\n",
474
- " <td>23.0</td>\n",
475
- " <td>21</td>\n",
476
- " <td>25</td>\n",
477
- " <td>45.500000</td>\n",
478
- " <td>91</td>\n",
479
- " <td>4.949747</td>\n",
480
- " <td>...</td>\n",
481
- " <td>684.0</td>\n",
482
- " <td>233.0</td>\n",
483
- " <td>330.0</td>\n",
484
- " <td>168.0</td>\n",
485
- " <td>565.0</td>\n",
486
- " <td>287.0</td>\n",
487
- " <td>336.0</td>\n",
488
- " <td>171.0</td>\n",
489
- " <td>57.0</td>\n",
490
- " <td>395.0</td>\n",
491
- " </tr>\n",
492
- " <tr>\n",
493
- " <th>7602</th>\n",
494
- " <td>2023</td>\n",
495
- " <td>25.583333</td>\n",
496
- " <td>307</td>\n",
497
- " <td>3.800917</td>\n",
498
- " <td>26.0</td>\n",
499
- " <td>19</td>\n",
500
- " <td>31</td>\n",
501
- " <td>57.000000</td>\n",
502
- " <td>684</td>\n",
503
- " <td>6.208499</td>\n",
504
- " <td>...</td>\n",
505
- " <td>827.0</td>\n",
506
- " <td>359.0</td>\n",
507
- " <td>513.0</td>\n",
508
- " <td>240.0</td>\n",
509
- " <td>675.0</td>\n",
510
- " <td>443.0</td>\n",
511
- " <td>398.0</td>\n",
512
- " <td>178.0</td>\n",
513
- " <td>92.0</td>\n",
514
- " <td>600.0</td>\n",
515
- " </tr>\n",
516
- " <tr>\n",
517
- " <th>7603</th>\n",
518
- " <td>2024</td>\n",
519
- " <td>27.166667</td>\n",
520
- " <td>163</td>\n",
521
- " <td>4.875107</td>\n",
522
- " <td>28.5</td>\n",
523
- " <td>21</td>\n",
524
- " <td>32</td>\n",
525
- " <td>60.166667</td>\n",
526
- " <td>361</td>\n",
527
- " <td>6.823977</td>\n",
528
- " <td>...</td>\n",
529
- " <td>626.0</td>\n",
530
- " <td>250.0</td>\n",
531
- " <td>363.0</td>\n",
532
- " <td>164.0</td>\n",
533
- " <td>448.0</td>\n",
534
- " <td>289.0</td>\n",
535
- " <td>253.0</td>\n",
536
- " <td>163.0</td>\n",
537
- " <td>105.0</td>\n",
538
- " <td>403.0</td>\n",
539
- " </tr>\n",
540
- " <tr>\n",
541
- " <th>7604</th>\n",
542
- " <td>2024</td>\n",
543
- " <td>28.285714</td>\n",
544
- " <td>198</td>\n",
545
- " <td>5.154748</td>\n",
546
- " <td>31.0</td>\n",
547
- " <td>19</td>\n",
548
- " <td>34</td>\n",
549
- " <td>57.142857</td>\n",
550
- " <td>400</td>\n",
551
- " <td>3.976119</td>\n",
552
- " <td>...</td>\n",
553
- " <td>576.0</td>\n",
554
- " <td>226.0</td>\n",
555
- " <td>292.0</td>\n",
556
- " <td>155.0</td>\n",
557
- " <td>459.0</td>\n",
558
- " <td>318.0</td>\n",
559
- " <td>231.0</td>\n",
560
- " <td>155.0</td>\n",
561
- " <td>61.0</td>\n",
562
- " <td>332.0</td>\n",
563
- " </tr>\n",
564
- " </tbody>\n",
565
- "</table>\n",
566
- "<p>7605 rows × 171 columns</p>\n",
567
- "</div>"
568
- ],
569
- "text/plain": [
570
- " Season WFGM_mean WFGM_sum WFGM_std WFGM_median WFGM_min WFGM_max \\\n",
571
- "0 2014 26.000000 52 1.414214 26.0 25 27 \n",
572
- "1 2015 27.000000 189 5.291503 24.0 22 34 \n",
573
- "2 2016 25.666667 231 2.872281 27.0 21 28 \n",
574
- "3 2017 24.000000 216 3.162278 25.0 19 28 \n",
575
- "4 2018 27.416667 329 3.964807 27.0 22 34 \n",
576
- "... ... ... ... ... ... ... ... \n",
577
- "7600 2023 24.153846 314 5.063697 25.0 16 31 \n",
578
- "7601 2024 23.000000 46 2.828427 23.0 21 25 \n",
579
- "7602 2023 25.583333 307 3.800917 26.0 19 31 \n",
580
- "7603 2024 27.166667 163 4.875107 28.5 21 32 \n",
581
- "7604 2024 28.285714 198 5.154748 31.0 19 34 \n",
582
- "\n",
583
- " WFGA_mean WFGA_sum WFGA_std ... tot_FGA3 tot_FTM tot_FTA tot_OR \\\n",
584
- "0 48.500000 97 6.363961 ... 375.0 332.0 445.0 168.0 \n",
585
- "1 53.000000 371 5.773503 ... 537.0 305.0 419.0 231.0 \n",
586
- "2 54.000000 486 4.555217 ... 509.0 415.0 587.0 221.0 \n",
587
- "3 49.555556 446 5.981453 ... 477.0 298.0 464.0 189.0 \n",
588
- "4 57.250000 687 4.731423 ... 539.0 355.0 504.0 244.0 \n",
589
- "... ... ... ... ... ... ... ... ... \n",
590
- "7600 51.461538 669 6.118488 ... 649.0 384.0 506.0 149.0 \n",
591
- "7601 45.500000 91 4.949747 ... 684.0 233.0 330.0 168.0 \n",
592
- "7602 57.000000 684 6.208499 ... 827.0 359.0 513.0 240.0 \n",
593
- "7603 60.166667 361 6.823977 ... 626.0 250.0 363.0 164.0 \n",
594
- "7604 57.142857 400 3.976119 ... 576.0 226.0 292.0 155.0 \n",
595
- "\n",
596
- " tot_DR tot_Ast tot_TO tot_Stl tot_Blk tot_PF \n",
597
- "0 427.0 210.0 315.0 121.0 31.0 453.0 \n",
598
- "1 550.0 332.0 359.0 166.0 33.0 577.0 \n",
599
- "2 608.0 348.0 362.0 182.0 66.0 604.0 \n",
600
- "3 572.0 340.0 362.0 175.0 69.0 554.0 \n",
601
- "4 627.0 375.0 389.0 193.0 98.0 568.0 \n",
602
- "... ... ... ... ... ... ... \n",
603
- "7600 676.0 357.0 384.0 209.0 85.0 454.0 \n",
604
- "7601 565.0 287.0 336.0 171.0 57.0 395.0 \n",
605
- "7602 675.0 443.0 398.0 178.0 92.0 600.0 \n",
606
- "7603 448.0 289.0 253.0 163.0 105.0 403.0 \n",
607
- "7604 459.0 318.0 231.0 155.0 61.0 332.0 \n",
608
- "\n",
609
- "[7605 rows x 171 columns]"
610
- ]
611
- },
612
- "execution_count": 40,
613
- "metadata": {},
614
- "output_type": "execute_result"
615
- }
616
- ],
617
- "source": [
618
- "summarize_teams(reg_games_df)"
619
- ]
620
- },
621
- {
622
- "cell_type": "code",
623
- "execution_count": 20,
624
- "metadata": {},
625
- "outputs": [],
626
- "source": [
627
- "# def summarize_teams(df: pd.DataFrame) -> pd.DataFrame:\n",
628
- "# other_cols = {\"TeamID\", \"WTeamID\", \"LTeamID\", \"DayNum\", \"Season\", \"GameType\", \"total_games\"}\n",
629
- "# agg_funcs = [np.sum, np.mean, np.median, np.std, np.min, np.max]\n",
630
- "# dfs = {}\n",
631
- "# subsets = [\"W\", \"L\"]\n",
632
- "# for subset in subsets:\n",
633
- "# sub = df[[col for col in df.columns if subset in col or col in other_cols]]\n",
634
- "# agg_df = sub \\\n",
635
- "# .groupby([f\"{subset}TeamID\", \"Season\"]) \\\n",
636
- "# .agg({col: agg_funcs for col in sub.columns if col not in other_cols}) \\\n",
637
- "# .reset_index()\n",
638
- " \n",
639
- "# flatten_multi_idx(agg_df)\n",
640
- "# agg_df[f\"total{subset}\"] = df \\\n",
641
- "# .groupby([f\"{subset}TeamID\", \"Season\"])[f\"{subset}TeamID\"] \\\n",
642
- "# .transform(\"count\")\n",
643
- "# dfs[subset] = agg_df\n",
644
- "\n",
645
- "# merged = pd.merge(\n",
646
- "# left=dfs[\"W\"],\n",
647
- "# right=dfs[\"L\"],\n",
648
- "# left_on=[\"WTeamID\", \"Season\"],\n",
649
- "# right_on=[\"LTeamID\", \"Season\"],\n",
650
- "# )\n",
651
- "\n",
652
- "# merged[\"total_games\"] = merged[\"totalW\"] + merged[\"totalL\"]\n",
653
- "# merged[\"TeamID\"] = merged[\"WTeamID\"]\n",
654
- "# merged.drop([\"WTeamID\", \"LTeamID\"], axis=1, inplace=True)\n",
655
- "# return merged\n",
656
- "\n",
657
- "# # overall_stats_df = merged[[\"TeamID\", \"Season\", \"total_games\", \"WPA_sum\", \"LPA_sum\", \"total_games\"]]\n",
658
- "# # # Combine stats from games won and games lost\n",
659
- "# # overall_stats_df[\"TotalPA\"] = overall_stats_df[\"WPA_sum\"] + overall_stats_df[\"LPA_sum\"]\n",
660
- "# return merged"
661
- ]
662
- },
663
- {
664
- "cell_type": "code",
665
- "execution_count": null,
666
- "metadata": {},
667
- "outputs": [],
668
- "source": []
669
- },
670
- {
671
- "cell_type": "code",
672
- "execution_count": 18,
673
- "metadata": {},
674
- "outputs": [],
675
- "source": [
676
- "reg_agg_df = summarize_teams(reg_games_df)"
677
- ]
678
- },
679
- {
680
- "cell_type": "code",
681
- "execution_count": 19,
682
- "metadata": {},
683
- "outputs": [
684
- {
685
- "data": {
686
- "text/html": [
687
- "<div>\n",
688
- "<style scoped>\n",
689
- " .dataframe tbody tr th:only-of-type {\n",
690
- " vertical-align: middle;\n",
691
- " }\n",
692
- "\n",
693
- " .dataframe tbody tr th {\n",
694
- " vertical-align: top;\n",
695
- " }\n",
696
- "\n",
697
- " .dataframe thead th {\n",
698
- " text-align: right;\n",
699
- " }\n",
700
- "</style>\n",
701
- "<table border=\"1\" class=\"dataframe\">\n",
702
- " <thead>\n",
703
- " <tr style=\"text-align: right;\">\n",
704
- " <th></th>\n",
705
- " <th>Season</th>\n",
706
- " <th>WScore_sum</th>\n",
707
- " <th>WScore_mean</th>\n",
708
- " <th>WScore_median</th>\n",
709
- " <th>WScore_std</th>\n",
710
- " <th>WScore_min</th>\n",
711
- " <th>WScore_max</th>\n",
712
- " <th>WLoc_sum_x</th>\n",
713
- " <th>WLoc_mean_x</th>\n",
714
- " <th>WLoc_median_x</th>\n",
715
- " <th>...</th>\n",
716
- " <th>LPA_max</th>\n",
717
- " <th>LLoc_sum</th>\n",
718
- " <th>LLoc_mean</th>\n",
719
- " <th>LLoc_median</th>\n",
720
- " <th>LLoc_std</th>\n",
721
- " <th>LLoc_min</th>\n",
722
- " <th>LLoc_max</th>\n",
723
- " <th>totalL</th>\n",
724
- " <th>total_games</th>\n",
725
- " <th>TeamID</th>\n",
726
- " </tr>\n",
727
- " </thead>\n",
728
- " <tbody>\n",
729
- " <tr>\n",
730
- " <th>0</th>\n",
731
- " <td>2014</td>\n",
732
- " <td>160</td>\n",
733
- " <td>80.000000</td>\n",
734
- " <td>80.0</td>\n",
735
- " <td>9.899495</td>\n",
736
- " <td>73</td>\n",
737
- " <td>87</td>\n",
738
- " <td>2</td>\n",
739
- " <td>1.0</td>\n",
740
- " <td>1.0</td>\n",
741
- " <td>...</td>\n",
742
- " <td>103</td>\n",
743
- " <td>14</td>\n",
744
- " <td>0.736842</td>\n",
745
- " <td>1.0</td>\n",
746
- " <td>0.452414</td>\n",
747
- " <td>0</td>\n",
748
- " <td>1</td>\n",
749
- " <td>6</td>\n",
750
- " <td>23</td>\n",
751
- " <td>1101</td>\n",
752
- " </tr>\n",
753
- " <tr>\n",
754
- " <th>1</th>\n",
755
- " <td>2015</td>\n",
756
- " <td>542</td>\n",
757
- " <td>77.428571</td>\n",
758
- " <td>72.0</td>\n",
759
- " <td>11.012979</td>\n",
760
- " <td>65</td>\n",
761
- " <td>95</td>\n",
762
- " <td>7</td>\n",
763
- " <td>1.0</td>\n",
764
- " <td>1.0</td>\n",
765
- " <td>...</td>\n",
766
- " <td>102</td>\n",
767
- " <td>15</td>\n",
768
- " <td>0.714286</td>\n",
769
- " <td>1.0</td>\n",
770
- " <td>0.462910</td>\n",
771
- " <td>0</td>\n",
772
- " <td>1</td>\n",
773
- " <td>5</td>\n",
774
- " <td>28</td>\n",
775
- " <td>1101</td>\n",
776
- " </tr>\n",
777
- " <tr>\n",
778
- " <th>2</th>\n",
779
- " <td>2016</td>\n",
780
- " <td>704</td>\n",
781
- " <td>78.222222</td>\n",
782
- " <td>79.0</td>\n",
783
- " <td>9.257129</td>\n",
784
- " <td>62</td>\n",
785
- " <td>91</td>\n",
786
- " <td>9</td>\n",
787
- " <td>1.0</td>\n",
788
- " <td>1.0</td>\n",
789
- " <td>...</td>\n",
790
- " <td>108</td>\n",
791
- " <td>13</td>\n",
792
- " <td>0.722222</td>\n",
793
- " <td>1.0</td>\n",
794
- " <td>0.460889</td>\n",
795
- " <td>0</td>\n",
796
- " <td>1</td>\n",
797
- " <td>15</td>\n",
798
- " <td>38</td>\n",
799
- " <td>1101</td>\n",
800
- " </tr>\n",
801
- " <tr>\n",
802
- " <th>3</th>\n",
803
- " <td>2017</td>\n",
804
- " <td>669</td>\n",
805
- " <td>74.333333</td>\n",
806
- " <td>71.0</td>\n",
807
- " <td>7.648529</td>\n",
808
- " <td>65</td>\n",
809
- " <td>85</td>\n",
810
- " <td>9</td>\n",
811
- " <td>1.0</td>\n",
812
- " <td>1.0</td>\n",
813
- " <td>...</td>\n",
814
- " <td>89</td>\n",
815
- " <td>11</td>\n",
816
- " <td>0.687500</td>\n",
817
- " <td>1.0</td>\n",
818
- " <td>0.478714</td>\n",
819
- " <td>0</td>\n",
820
- " <td>1</td>\n",
821
- " <td>10</td>\n",
822
- " <td>27</td>\n",
823
- " <td>1101</td>\n",
824
- " </tr>\n",
825
- " <tr>\n",
826
- " <th>4</th>\n",
827
- " <td>2018</td>\n",
828
- " <td>915</td>\n",
829
- " <td>76.250000</td>\n",
830
- " <td>77.0</td>\n",
831
- " <td>7.484833</td>\n",
832
- " <td>62</td>\n",
833
- " <td>88</td>\n",
834
- " <td>12</td>\n",
835
- " <td>1.0</td>\n",
836
- " <td>1.0</td>\n",
837
- " <td>...</td>\n",
838
- " <td>88</td>\n",
839
- " <td>9</td>\n",
840
- " <td>0.600000</td>\n",
841
- " <td>1.0</td>\n",
842
- " <td>0.507093</td>\n",
843
- " <td>0</td>\n",
844
- " <td>1</td>\n",
845
- " <td>8</td>\n",
846
- " <td>30</td>\n",
847
- " <td>1101</td>\n",
848
- " </tr>\n",
849
- " <tr>\n",
850
- " <th>...</th>\n",
851
- " <td>...</td>\n",
852
- " <td>...</td>\n",
853
- " <td>...</td>\n",
854
- " <td>...</td>\n",
855
- " <td>...</td>\n",
856
- " <td>...</td>\n",
857
- " <td>...</td>\n",
858
- " <td>...</td>\n",
859
- " <td>...</td>\n",
860
- " <td>...</td>\n",
861
- " <td>...</td>\n",
862
- " <td>...</td>\n",
863
- " <td>...</td>\n",
864
- " <td>...</td>\n",
865
- " <td>...</td>\n",
866
- " <td>...</td>\n",
867
- " <td>...</td>\n",
868
- " <td>...</td>\n",
869
- " <td>...</td>\n",
870
- " <td>...</td>\n",
871
- " <td>...</td>\n",
872
- " </tr>\n",
873
- " <tr>\n",
874
- " <th>7600</th>\n",
875
- " <td>2023</td>\n",
876
- " <td>920</td>\n",
877
- " <td>70.769231</td>\n",
878
- " <td>73.0</td>\n",
879
- " <td>9.047595</td>\n",
880
- " <td>51</td>\n",
881
- " <td>82</td>\n",
882
- " <td>13</td>\n",
883
- " <td>1.0</td>\n",
884
- " <td>1.0</td>\n",
885
- " <td>...</td>\n",
886
- " <td>102</td>\n",
887
- " <td>13</td>\n",
888
- " <td>0.764706</td>\n",
889
- " <td>1.0</td>\n",
890
- " <td>0.437237</td>\n",
891
- " <td>0</td>\n",
892
- " <td>1</td>\n",
893
- " <td>14</td>\n",
894
- " <td>29</td>\n",
895
- " <td>1476</td>\n",
896
- " </tr>\n",
897
- " <tr>\n",
898
- " <th>7601</th>\n",
899
- " <td>2024</td>\n",
900
- " <td>128</td>\n",
901
- " <td>64.000000</td>\n",
902
- " <td>64.0</td>\n",
903
- " <td>9.899495</td>\n",
904
- " <td>57</td>\n",
905
- " <td>71</td>\n",
906
- " <td>2</td>\n",
907
- " <td>1.0</td>\n",
908
- " <td>1.0</td>\n",
909
- " <td>...</td>\n",
910
- " <td>107</td>\n",
911
- " <td>17</td>\n",
912
- " <td>0.739130</td>\n",
913
- " <td>1.0</td>\n",
914
- " <td>0.448978</td>\n",
915
- " <td>0</td>\n",
916
- " <td>1</td>\n",
917
- " <td>5</td>\n",
918
- " <td>25</td>\n",
919
- " <td>1476</td>\n",
920
- " </tr>\n",
921
- " <tr>\n",
922
- " <th>7602</th>\n",
923
- " <td>2023</td>\n",
924
- " <td>864</td>\n",
925
- " <td>72.000000</td>\n",
926
- " <td>74.0</td>\n",
927
- " <td>10.206950</td>\n",
928
- " <td>53</td>\n",
929
- " <td>84</td>\n",
930
- " <td>12</td>\n",
931
- " <td>1.0</td>\n",
932
- " <td>1.0</td>\n",
933
- " <td>...</td>\n",
934
- " <td>97</td>\n",
935
- " <td>15</td>\n",
936
- " <td>0.750000</td>\n",
937
- " <td>1.0</td>\n",
938
- " <td>0.444262</td>\n",
939
- " <td>0</td>\n",
940
- " <td>1</td>\n",
941
- " <td>20</td>\n",
942
- " <td>34</td>\n",
943
- " <td>1477</td>\n",
944
- " </tr>\n",
945
- " <tr>\n",
946
- " <th>7603</th>\n",
947
- " <td>2024</td>\n",
948
- " <td>483</td>\n",
949
- " <td>80.500000</td>\n",
950
- " <td>80.0</td>\n",
951
- " <td>17.683325</td>\n",
952
- " <td>57</td>\n",
953
- " <td>101</td>\n",
954
- " <td>6</td>\n",
955
- " <td>1.0</td>\n",
956
- " <td>1.0</td>\n",
957
- " <td>...</td>\n",
958
- " <td>90</td>\n",
959
- " <td>10</td>\n",
960
- " <td>0.625000</td>\n",
961
- " <td>1.0</td>\n",
962
- " <td>0.500000</td>\n",
963
- " <td>0</td>\n",
964
- " <td>1</td>\n",
965
- " <td>9</td>\n",
966
- " <td>33</td>\n",
967
- " <td>1477</td>\n",
968
- " </tr>\n",
969
- " <tr>\n",
970
- " <th>7604</th>\n",
971
- " <td>2024</td>\n",
972
- " <td>578</td>\n",
973
- " <td>82.571429</td>\n",
974
- " <td>80.0</td>\n",
975
- " <td>7.345228</td>\n",
976
- " <td>74</td>\n",
977
- " <td>94</td>\n",
978
- " <td>7</td>\n",
979
- " <td>1.0</td>\n",
980
- " <td>1.0</td>\n",
981
- " <td>...</td>\n",
982
- " <td>96</td>\n",
983
- " <td>12</td>\n",
984
- " <td>0.857143</td>\n",
985
- " <td>1.0</td>\n",
986
- " <td>0.363137</td>\n",
987
- " <td>0</td>\n",
988
- " <td>1</td>\n",
989
- " <td>12</td>\n",
990
- " <td>26</td>\n",
991
- " <td>1478</td>\n",
992
- " </tr>\n",
993
- " </tbody>\n",
994
- "</table>\n",
995
- "<p>7605 rows × 203 columns</p>\n",
996
- "</div>"
997
- ],
998
- "text/plain": [
999
- " Season WScore_sum WScore_mean WScore_median WScore_std WScore_min \\\n",
1000
- "0 2014 160 80.000000 80.0 9.899495 73 \n",
1001
- "1 2015 542 77.428571 72.0 11.012979 65 \n",
1002
- "2 2016 704 78.222222 79.0 9.257129 62 \n",
1003
- "3 2017 669 74.333333 71.0 7.648529 65 \n",
1004
- "4 2018 915 76.250000 77.0 7.484833 62 \n",
1005
- "... ... ... ... ... ... ... \n",
1006
- "7600 2023 920 70.769231 73.0 9.047595 51 \n",
1007
- "7601 2024 128 64.000000 64.0 9.899495 57 \n",
1008
- "7602 2023 864 72.000000 74.0 10.206950 53 \n",
1009
- "7603 2024 483 80.500000 80.0 17.683325 57 \n",
1010
- "7604 2024 578 82.571429 80.0 7.345228 74 \n",
1011
- "\n",
1012
- " WScore_max WLoc_sum_x WLoc_mean_x WLoc_median_x ... LPA_max \\\n",
1013
- "0 87 2 1.0 1.0 ... 103 \n",
1014
- "1 95 7 1.0 1.0 ... 102 \n",
1015
- "2 91 9 1.0 1.0 ... 108 \n",
1016
- "3 85 9 1.0 1.0 ... 89 \n",
1017
- "4 88 12 1.0 1.0 ... 88 \n",
1018
- "... ... ... ... ... ... ... \n",
1019
- "7600 82 13 1.0 1.0 ... 102 \n",
1020
- "7601 71 2 1.0 1.0 ... 107 \n",
1021
- "7602 84 12 1.0 1.0 ... 97 \n",
1022
- "7603 101 6 1.0 1.0 ... 90 \n",
1023
- "7604 94 7 1.0 1.0 ... 96 \n",
1024
- "\n",
1025
- " LLoc_sum LLoc_mean LLoc_median LLoc_std LLoc_min LLoc_max totalL \\\n",
1026
- "0 14 0.736842 1.0 0.452414 0 1 6 \n",
1027
- "1 15 0.714286 1.0 0.462910 0 1 5 \n",
1028
- "2 13 0.722222 1.0 0.460889 0 1 15 \n",
1029
- "3 11 0.687500 1.0 0.478714 0 1 10 \n",
1030
- "4 9 0.600000 1.0 0.507093 0 1 8 \n",
1031
- "... ... ... ... ... ... ... ... \n",
1032
- "7600 13 0.764706 1.0 0.437237 0 1 14 \n",
1033
- "7601 17 0.739130 1.0 0.448978 0 1 5 \n",
1034
- "7602 15 0.750000 1.0 0.444262 0 1 20 \n",
1035
- "7603 10 0.625000 1.0 0.500000 0 1 9 \n",
1036
- "7604 12 0.857143 1.0 0.363137 0 1 12 \n",
1037
- "\n",
1038
- " total_games TeamID \n",
1039
- "0 23 1101 \n",
1040
- "1 28 1101 \n",
1041
- "2 38 1101 \n",
1042
- "3 27 1101 \n",
1043
- "4 30 1101 \n",
1044
- "... ... ... \n",
1045
- "7600 29 1476 \n",
1046
- "7601 25 1476 \n",
1047
- "7602 34 1477 \n",
1048
- "7603 33 1477 \n",
1049
- "7604 26 1478 \n",
1050
- "\n",
1051
- "[7605 rows x 203 columns]"
1052
- ]
1053
- },
1054
- "execution_count": 19,
1055
- "metadata": {},
1056
- "output_type": "execute_result"
1057
- }
1058
- ],
1059
- "source": [
1060
- "# combine the winning and losing stats so that we have overall game stats\n",
1061
- "reg_agg_df\n"
1062
- ]
1063
- },
1064
- {
1065
- "cell_type": "code",
1066
- "execution_count": null,
1067
- "metadata": {},
1068
- "outputs": [],
1069
- "source": []
1070
- }
1071
- ],
1072
- "metadata": {
1073
- "kernelspec": {
1074
- "display_name": "Python 3 (ipykernel)",
1075
- "language": "python",
1076
- "name": "python3"
1077
- },
1078
- "language_info": {
1079
- "codemirror_mode": {
1080
- "name": "ipython",
1081
- "version": 3
1082
- },
1083
- "file_extension": ".py",
1084
- "mimetype": "text/x-python",
1085
- "name": "python",
1086
- "nbconvert_exporter": "python",
1087
- "pygments_lexer": "ipython3",
1088
- "version": "3.11.7"
1089
- }
1090
- },
1091
- "nbformat": 4,
1092
- "nbformat_minor": 2
1093
- }