nesticot committed on
Commit
474d8d9
·
verified ·
1 Parent(s): efe72aa

Upload 3 files

Browse files
Files changed (3) hide show
  1. api_scraper.py +747 -747
  2. batting_update.py +632 -0
  3. pitcher_update.py +573 -0
api_scraper.py CHANGED
@@ -1,747 +1,747 @@
1
- import requests
2
- import pandas as pd
3
- import numpy as np
4
- from datetime import datetime
5
- from tqdm import tqdm
6
- import time
7
- from pytz import timezone
8
-
9
-
10
- class MLB_Scrape:
11
-
12
- # def __init__(self):
13
- # # Initialize your class here if needed
14
- # pass
15
-
16
def get_sport_id(self):
    """Return the Stats API sports listing as a DataFrame indexed by ``id``.

    Performs one GET against ``/api/v1/sports`` and builds the frame from
    the ``sports`` array of the JSON response.
    """
    response = requests.get(url='https://statsapi.mlb.com/api/v1/sports')
    sports = response.json()['sports']
    return pd.DataFrame(sports).set_index('id')
19
-
20
def get_sport_id_check(self, sport_id):
    """Report whether ``sport_id`` appears in the index of ``get_sport_id()``.

    Returns ``True`` when the id is present. Otherwise prints a prompt
    followed by the full sports table and returns ``False``.
    """
    known_sports = self.get_sport_id()
    if sport_id in known_sports.index:
        return True

    # Unknown id: show the caller the valid choices before failing.
    print('Please Select a New Sport ID from the following')
    print(known_sports)
    return False
27
-
28
def get_schedule(self, year_input=2023,
                 sport_id=1,
                 start_date='YYYY-MM-DD',
                 end_date='YYYY-MM-DD',
                 final=True,
                 regular=True,
                 spring=False):
    """Fetch a season schedule and return it as one row per game.

    Parameters
    ----------
    year_input : season passed to the schedule endpoint.
    sport_id : sport to query; validated through ``get_sport_id_check``.
    start_date, end_date : inclusive ``YYYY-MM-DD`` bounds. Filtering is
        attempted as soon as either differs from the ``'YYYY-MM-DD'``
        placeholder, and both are then parsed.
    final : when truthy keep only games whose coded game state is ``'F'``.
    regular : request ``gameTypes=R``.
    spring : request ``gameTypes=S``; only consulted when ``regular`` is
        not true. With both false no ``gameTypes`` filter is sent.

    Returns
    -------
    A DataFrame with columns ``game_id, time, date, away, home, state,
    venue_id, venue_name``; ``None`` when the sport id check fails; or a
    message string when the schedule is empty or a date fails to parse.
    """
    if not self.get_sport_id_check(sport_id=sport_id):
        return

    if regular == True:
        schedule_url = f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id}&gameTypes=R&season={year_input}&hydrate=lineup,players'
        game_call = requests.get(url=schedule_url).json()
        print(schedule_url)
    elif spring == True:
        print('spring')
        schedule_url = f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id}&gameTypes=S&season={year_input}&hydrate=lineup,players'
        game_call = requests.get(url=schedule_url).json()
        print(schedule_url)
    else:
        schedule_url = f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id}&season={year_input}&hydrate=lineup,players'
        game_call = requests.get(url=schedule_url).json()

    # Flatten the per-date game lists once, then pull each column from it.
    games = [game for day in game_call['dates'] for game in day['games']]
    game_df = pd.DataFrame(data={
        'game_id': [game['gamePk'] for game in games],
        'time': [game['gameDate'] for game in games],
        'date': [game['officialDate'] for game in games],
        'away': [game['teams']['away']['team']['name'] for game in games],
        'home': [game['teams']['home']['team']['name'] for game in games],
        'state': [game['status']['codedGameState'] for game in games],
        'venue_id': [game['venue']['id'] for game in games],
        'venue_name': [game['venue']['name'] for game in games],
    })

    empty_message = 'Schedule Length of 0, please select different parameters.'
    if len(game_df) == 0:
        return empty_message

    game_df['date'] = pd.to_datetime(game_df['date']).dt.date
    # The timestamps are converted to US/Eastern; the "EST" suffix is a
    # literal in the format string and is emitted regardless of the date.
    eastern_time = pd.to_datetime(game_df['time']).dt.tz_convert(timezone('US/Eastern'))
    game_df['time'] = eastern_time.dt.strftime("%I:%M %p EST")

    placeholder = 'YYYY-MM-DD'
    if start_date != placeholder or end_date != placeholder:
        try:
            first_day = datetime.strptime(start_date, "%Y-%m-%d").date()
            last_day = datetime.strptime(end_date, "%Y-%m-%d").date()
            in_window = (game_df['date'] >= first_day) & (game_df['date'] <= last_day)
            game_df = game_df[in_window]
        except ValueError:
            return 'Please use YYYY-MM-DD Format for Start and End Dates'

    if final:
        game_df = game_df[game_df['state'] == 'F']

    game_df = game_df.drop_duplicates(subset='game_id').reset_index(drop=True)

    if len(game_df) == 0:
        return empty_message

    return game_df
101
-
102
def get_data(self, game_list_input=None):
    """Download the live-feed JSON for each game id.

    Parameters
    ----------
    game_list_input : iterable of game ids (``gamePk`` values). When
        omitted, the single id ``748540`` is used, matching the previous
        default.

    Returns
    -------
    list of the decoded JSON payloads, in the order of the input ids.
    """
    # A ``None`` sentinel avoids sharing one mutable default list across calls.
    if game_list_input is None:
        game_list_input = [748540]

    data_total = []
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')
    # Iterate the ids directly: positional ``input[i]`` lookups fail on a
    # pandas Series whose index is not 0..n-1 (e.g. after filtering).
    for game_pk in tqdm(game_list_input, desc="Processing", unit="iteration"):
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_pk}/feed/live')
        data_total.append(r.json())
    return data_total
114
-
115
def get_data_df(self, data_list):
    """Flatten live-feed game payloads into a DataFrame of play events.

    Parameters
    ----------
    data_list : iterable of decoded live-feed dicts, as returned by
        ``get_data``.

    Returns
    -------
    DataFrame with one row per selected play event. An event is selected
    when it is a pitch or its ``details`` carry a ``call``; otherwise it
    is still selected when its ``count`` shows four balls, in which case
    only identity, count, timing and ``event``/``event_type`` are filled.
    Optional keys that are absent become ``NaN``.

    Raises
    ------
    KeyError
        If a structural key is missing (``gamePk``, ``gameData``,
        ``liveData``, ``about.isTopInning``, ``playEvents``, or
        ``pitchNumber``/previous ``count`` on a pitch-like event).

    Notes
    -----
    Fixes relative to the earlier implementation:
    * ``pitcher_team_id`` is the fielding team's id (it used the batting
      team's id).
    * ``is_ball`` is read from ``details['isBall']`` (it read ``isOut``).
    * ``strikes``/``balls`` are no longer swapped for four-ball events.
    * Missing ``matchup``, ``pitchData['breaks']`` or ``coordinates`` yield
      ``NaN`` instead of unequal column lengths or a ``KeyError``.
    """
    swing_codes = ('X', 'F', 'S', 'D', 'E', 'T', 'W')
    whiff_codes = ('S', 'T', 'W')

    # Output column order; must stay in sync with downstream consumers.
    columns = (
        'game_id', 'game_date',
        'batter_id', 'batter_name', 'batter_hand', 'batter_team', 'batter_team_id',
        'pitcher_id', 'pitcher_name', 'pitcher_hand', 'pitcher_team', 'pitcher_team_id',
        'play_description', 'play_code', 'in_play', 'is_strike', 'is_swing', 'is_whiff',
        'is_out', 'is_ball', 'is_review', 'pitch_type', 'pitch_description',
        'strikes', 'balls', 'outs',
        'start_speed', 'end_speed', 'sz_top', 'sz_bot', 'x', 'y',
        'ax', 'ay', 'az', 'pfxx', 'pfxz', 'px', 'pz',
        'vx0', 'vy0', 'vz0', 'x0', 'y0', 'z0',
        'zone', 'type_confidence', 'plate_time', 'extension',
        'spin_rate', 'spin_direction', 'ivb', 'hb',
        'launch_speed', 'launch_angle', 'launch_distance', 'launch_location',
        'trajectory', 'hardness', 'hit_x', 'hit_y',
        'index_play', 'play_id', 'start_time', 'end_time', 'is_pitch', 'type_type',
        'type_ab', 'event', 'event_type', 'rbi', 'away_score', 'home_score',
    )

    # (output column, source key) pairs for each nested section of an event.
    detail_fields = (
        ('play_description', 'description'), ('play_code', 'code'),
        ('in_play', 'isInPlay'), ('is_strike', 'isStrike'),
        ('is_ball', 'isBall'), ('is_review', 'hasReview'),
    )
    count_fields = (('strikes', 'strikes'), ('balls', 'balls'), ('outs', 'outs'))
    pitch_fields = (
        ('start_speed', 'startSpeed'), ('end_speed', 'endSpeed'),
        ('sz_top', 'strikeZoneTop'), ('sz_bot', 'strikeZoneBottom'),
        ('zone', 'zone'), ('type_confidence', 'typeConfidence'),
        ('plate_time', 'plateTime'), ('extension', 'extension'),
    )
    pitch_coordinate_fields = (
        ('x', 'x'), ('y', 'y'),
        ('ax', 'aX'), ('ay', 'aY'), ('az', 'aZ'),
        ('pfxx', 'pfxX'), ('pfxz', 'pfxZ'), ('px', 'pX'), ('pz', 'pZ'),
        ('vx0', 'vX0'), ('vy0', 'vY0'), ('vz0', 'vZ0'),
        ('x0', 'x0'), ('y0', 'y0'), ('z0', 'z0'),
    )
    break_fields = (
        ('spin_rate', 'spinRate'), ('spin_direction', 'spinDirection'),
        ('ivb', 'breakVerticalInduced'), ('hb', 'breakHorizontal'),
    )
    hit_fields = (
        ('launch_speed', 'launchSpeed'), ('launch_angle', 'launchAngle'),
        ('launch_distance', 'totalDistance'), ('launch_location', 'location'),
        ('trajectory', 'trajectory'), ('hardness', 'hardness'),
    )
    hit_coordinate_fields = (('hit_x', 'coordX'), ('hit_y', 'coordY'))
    event_fields = (
        ('index_play', 'index'), ('play_id', 'playId'),
        ('start_time', 'startTime'), ('end_time', 'endTime'),
        ('is_pitch', 'isPitch'), ('type_type', 'type'),
    )
    result_fields = (
        ('type_ab', 'type'), ('event', 'event'), ('event_type', 'eventType'),
        ('rbi', 'rbi'), ('away_score', 'awayScore'), ('home_score', 'homeScore'),
        ('is_out', 'isOut'),
    )

    def copy_fields(row, source, fields):
        # Copy each source key into its column, NaN when the key is absent.
        for column, key in fields:
            row[column] = source.get(key, np.nan)

    def new_row(game, play, event):
        # Start from an all-NaN row and fill the fields shared by every event.
        row = dict.fromkeys(columns, np.nan)
        row['game_id'] = game['gamePk']
        row['game_date'] = game['gameData']['datetime']['officialDate']

        matchup = play.get('matchup', {})
        batter = matchup.get('batter', {})
        pitcher = matchup.get('pitcher', {})
        row['batter_id'] = batter.get('id', np.nan)
        row['batter_name'] = batter.get('fullName', np.nan)
        row['batter_hand'] = matchup.get('batSide', {}).get('code', np.nan)
        row['pitcher_id'] = pitcher.get('id', np.nan)
        row['pitcher_name'] = pitcher.get('fullName', np.nan)
        row['pitcher_hand'] = matchup.get('pitchHand', {}).get('code', np.nan)

        # Top of the inning: away team bats, home team pitches.
        if play['about']['isTopInning']:
            batting_side, fielding_side = 'away', 'home'
        else:
            batting_side, fielding_side = 'home', 'away'
        teams = game['gameData']['teams']
        batting_team = teams.get(batting_side, {})
        fielding_team = teams.get(fielding_side, {})
        row['batter_team'] = batting_team.get('abbreviation', np.nan)
        row['batter_team_id'] = batting_team.get('id', np.nan)
        row['pitcher_team'] = fielding_team.get('abbreviation', np.nan)
        row['pitcher_team_id'] = fielding_team.get('id', np.nan)

        copy_fields(row, event, event_fields)
        return row

    print('Converting Data to Dataframe.')
    rows = []
    for game in data_list:
        for play in game['liveData']['plays']['allPlays']:
            play_events = play['playEvents']
            result = play.get('result', {})
            last_index = len(play_events) - 1

            for n, event in enumerate(play_events):
                details = event.get('details', {})
                count = event.get('count', {})

                if event.get('isPitch') or 'call' in details:
                    row = new_row(game, play, event)
                    copy_fields(row, details, detail_fields)

                    code = details.get('code')
                    if code in swing_codes:
                        row['is_swing'] = True
                    if code in whiff_codes:
                        row['is_whiff'] = True

                    pitch_kind = details.get('type', {})
                    row['pitch_type'] = pitch_kind.get('code', np.nan)
                    row['pitch_description'] = pitch_kind.get('description', np.nan)

                    # The count recorded is the one *before* this event.
                    if event['pitchNumber'] == 1:
                        # NOTE(review): outs is set to 0 on the first pitch,
                        # as before — confirm that is the intended value.
                        row['strikes'] = 0
                        row['balls'] = 0
                        row['outs'] = 0
                    else:
                        # n - 1 wraps to the last event when n == 0,
                        # unchanged from the earlier implementation.
                        copy_fields(row, play_events[n - 1]['count'], count_fields)

                    if 'pitchData' in event:
                        pitch_data = event['pitchData']
                        copy_fields(row, pitch_data, pitch_fields)
                        copy_fields(row, pitch_data.get('coordinates', {}),
                                    pitch_coordinate_fields)
                        copy_fields(row, pitch_data.get('breaks', {}), break_fields)

                    if 'hitData' in event:
                        hit_data = event['hitData']
                        copy_fields(row, hit_data, hit_fields)
                        copy_fields(row, hit_data.get('coordinates', {}),
                                    hit_coordinate_fields)

                    # The play result is attached only to the final event.
                    if n == last_index:
                        copy_fields(row, result, result_fields)

                    rows.append(row)

                elif count.get('balls') == 4:
                    # Non-pitch event recorded with four balls.
                    row = new_row(game, play, event)
                    row['event'] = result.get('event', np.nan)
                    row['event_type'] = result.get('eventType', np.nan)
                    copy_fields(row, count, count_fields)
                    rows.append(row)

    data = {column: [row[column] for row in rows] for column in columns}
    # Diagnostic kept from the earlier implementation: per-column lengths.
    print({column: len(values) for column, values in data.items()})
    return pd.DataFrame(data=data)
638
-
639
def get_players(self, sport_id=1):
    """Return the players listed for ``sport_id`` as a DataFrame.

    Reads the ``people`` array from ``/api/v1/sports/{sport_id}/players``
    and keeps the fields used to tell players apart. Columns are
    ``player_id, name, position, team, age``, where ``position`` is the
    primary-position abbreviation and ``team`` is the current team id.
    """
    roster_url = f'https://statsapi.mlb.com/api/v1/sports/{sport_id}/players'
    people = requests.get(url=roster_url).json()['people']

    names = [person['fullName'] for person in people]
    player_ids = [person['id'] for person in people]
    positions = [person['primaryPosition']['abbreviation'] for person in people]
    team_ids = [person['currentTeam']['id'] for person in people]
    ages = [person['currentAge'] for person in people]

    return pd.DataFrame(data={
        'player_id': player_ids,
        'name': names,
        'position': positions,
        'team': team_ids,
        'age': ages,
    })
655
-
656
def get_teams(self):
    """Return a DataFrame describing the teams returned by the Stats API.

    Requests ``/api/v1/teams/`` and builds one row per team entry. Entries
    that have no ``franchiseName`` key get ``None`` for their id, name,
    franchise and abbreviation, and are then removed because rows with a
    null ``team_id`` are dropped.

    Returns:
        pandas.DataFrame sorted by ``team_id`` with the columns
        ``team_id``, ``city`` (the API ``franchiseName``), ``name``
        (``teamName``), ``franchise`` (``name``), ``abbreviation``,
        ``parent_org_id``, ``parent_org``, ``league_id``, ``league_name``
        and ``parent_org_abbreviation``. Teams without a parent
        organisation are treated as their own parent.
    """
    teams = requests.get(url='https://statsapi.mlb.com/api/v1/teams/').json()
    entries = teams['teams']

    def franchise_field(entry, key):
        # Only entries carrying a franchiseName contribute identifying
        # fields; the rest become None and are dropped further down.
        return entry.get(key) if 'franchiseName' in entry else None

    def league_field(entry, key):
        # Tolerates entries that have no 'league' object at all.
        return (entry.get('league') or {}).get(key)

    #Create a dataframe of all the teams
    mlb_teams_df = pd.DataFrame(data={
        'team_id': [franchise_field(x, 'id') for x in entries],
        # 'city' now carries franchiseName; it previously duplicated the
        # 'franchise' column because the wrong list was passed in.
        'city': [x.get('franchiseName') for x in entries],
        'name': [franchise_field(x, 'teamName') for x in entries],
        'franchise': [franchise_field(x, 'name') for x in entries],
        'abbreviation': [franchise_field(x, 'abbreviation') for x in entries],
        'parent_org_id': [x.get('parentOrgId') for x in entries],
        'parent_org': [x.get('parentOrgName') for x in entries],
        'league_id': [league_field(x, 'id') for x in entries],
        'league_name': [league_field(x, 'name') for x in entries],
    }).drop_duplicates().dropna(subset=['team_id']).reset_index(drop=True).sort_values('team_id')

    # A team with no parent organisation is its own parent.
    no_parent_id = mlb_teams_df['parent_org_id'].isnull()
    mlb_teams_df.loc[no_parent_id, 'parent_org_id'] = mlb_teams_df.loc[no_parent_id, 'team_id']
    no_parent_name = mlb_teams_df['parent_org'].isnull()
    mlb_teams_df.loc[no_parent_name, 'parent_org'] = mlb_teams_df.loc[no_parent_name, 'franchise']

    # Look up the parent's abbreviation through its team_id.
    abbreviation_by_id = mlb_teams_df.set_index('team_id')['abbreviation'].to_dict()
    mlb_teams_df['parent_org_abbreviation'] = mlb_teams_df['parent_org_id'].map(abbreviation_by_id)

    return mlb_teams_df
700
-
701
def get_leagues(self):
    """Return a DataFrame of the leagues listed by the Stats API.

    Requests ``/api/v1/leagues/`` and records, for every league entry, its
    id, name, abbreviation and the id of the sport it belongs to. A key
    that is absent from an entry is stored as ``None``.

    Returns:
        pandas.DataFrame with the columns ``league_id``, ``league_name``,
        ``league_abbreviation`` and ``sport_id``.
    """
    payload = requests.get(url='https://statsapi.mlb.com/api/v1/leagues/').json()

    columns = {
        'league_id': [],
        'league_name': [],
        'league_abbreviation': [],
        'sport_id': [],
    }
    for entry in payload['leagues']:
        columns['league_id'].append(entry.get('id'))
        columns['league_name'].append(entry.get('name'))
        columns['league_abbreviation'].append(entry.get('abbreviation'))
        # The sport id is nested one level down, under the 'sport' object.
        columns['sport_id'].append(entry['sport']['id'] if 'sport' in entry else None)

    return pd.DataFrame(data=columns)
719
-
720
def get_player_games_list(self, player_id=691587, season=2023):
    """Return the game ids found in a player's game log for one season.

    Requests ``/api/v1/people/{player_id}`` with the ``gameLog`` stats
    hydration and collects ``gamePk`` from every split in the first stats
    group of the response.

    Args:
        player_id: Stats API person id inserted into the request URL.
        season: Season placed in the ``stats(...)`` hydration. Defaults to
            2023, the value that used to be hard-coded in the URL.

    Returns:
        list of ``gamePk`` values; an empty list when the response holds no
        people, no stats group or no splits.
    """
    # https, like every other Stats API call in this class.
    url = f'https://statsapi.mlb.com/api/v1/people/{player_id}?hydrate=stats(type=gameLog,season={season}),hydrations'
    people = requests.get(url=url).json().get('people') or []
    if not people:
        return []
    stats = people[0].get('stats') or []
    if not stats:
        return []
    player_game_list = [split['game']['gamePk'] for split in stats[0].get('splits') or []]
    return player_game_list
723
-
724
def get_team_schedule(self, year=2023, sport_id=1, mlb_team='Toronto Blue Jays'):
    """Return the completed games of one organisation's team at a level.

    Args:
        year: Season passed to ``get_schedule`` as ``year_input``.
        sport_id: Sport identifier; validated with ``get_sport_id_check``
            and used to filter the team table.
        mlb_team: Value matched against the ``parent_org`` column of the
            team table.

    Returns:
        ``(schedule_df, teams_df)`` where ``schedule_df`` holds the games
        with state ``'F'`` in which the selected team is home or away, and
        ``teams_df`` is the team table merged with league and sport data
        and restricted to ``sport_id``.
        ``(False, False)`` when the sport id is rejected, when
        ``get_schedule`` does not return a DataFrame, or when no team of
        ``mlb_team`` exists for ``sport_id``.
    """
    # get_sport_id_check prints its own explanation on failure, so nothing
    # more is printed (or fetched) here.
    if not self.get_sport_id_check(sport_id=sport_id):
        return False, False

    schedule_df = self.get_schedule(year_input=year, sport_id=sport_id)
    if not isinstance(schedule_df, pd.DataFrame):
        # get_schedule reports problems by returning a message string (or
        # None) instead of a DataFrame; surface it and stop.
        if schedule_df is not None:
            print(schedule_df)
        return False, False

    teams_df = self.get_teams().merge(self.get_leagues()).merge(
        self.get_sport_id(), left_on=['sport_id'], right_index=True, suffixes=('', '_sport'))
    teams_df = teams_df[teams_df['sport_id'] == sport_id]

    selected = teams_df[teams_df['parent_org'] == mlb_team]
    if selected.empty:
        print(f'No team found for {mlb_team!r} with sport_id {sport_id}')
        return False, False

    team_name_select = selected['franchise'].values[0]
    plays_in_game = (schedule_df.away == team_name_select) | (schedule_df.home == team_name_select)
    schedule_df = schedule_df[plays_in_game & (schedule_df.state == 'F')].reset_index(drop=True)
    return schedule_df, teams_df
737
-
738
def get_team_game_data(self, year=2023, sport_id=1, mlb_team='Toronto Blue Jays'):
    """Return pitch-level data for every completed game of one team.

    Looks up the team's schedule with ``get_team_schedule``, downloads each
    game with ``get_data``, converts the result with ``get_data_df`` and
    tags every row with the organisation and level abbreviations.

    Args:
        year: Season forwarded to ``get_team_schedule``.
        sport_id: Sport identifier forwarded to ``get_team_schedule``.
        mlb_team: Parent organisation name forwarded to
            ``get_team_schedule`` and matched against ``parent_org``.

    Returns:
        The DataFrame produced by ``get_data_df`` with two extra columns,
        ``mlb_team`` and ``level``; ``None`` when ``get_team_schedule``
        does not hand back a schedule DataFrame.
    """
    schedule_df, teams_df = self.get_team_schedule(year=year, sport_id=sport_id, mlb_team=mlb_team)
    # get_team_schedule signals failure with (False, False). Testing the
    # type avoids ``not DataFrame``, which raises ValueError because the
    # truth value of a DataFrame is ambiguous.
    if not isinstance(schedule_df, pd.DataFrame):
        return

    data = self.get_data(schedule_df['game_id'].tolist())
    df = self.get_data_df(data_list=data)

    team_row = teams_df[teams_df['parent_org'] == mlb_team]
    df['mlb_team'] = team_row['parent_org_abbreviation'].values[0]
    df['level'] = team_row['abbreviation_sport'].values[0]

    return df
 
1
+ import requests
2
+ import pandas as pd
3
+ import numpy as np
4
+ from datetime import datetime
5
+ from tqdm import tqdm
6
+ import time
7
+ from pytz import timezone
8
+
9
+
10
+ class MLB_Scrape:
11
+
12
+ # def __init__(self):
13
+ # # Initialize your class here if needed
14
+ # pass
15
+
16
def get_sport_id(self):
    """Return the Stats API sports table indexed by sport id.

    Requests ``/api/v1/sports`` and turns the ``sports`` list of the
    response into a DataFrame whose index is the ``id`` field.
    """
    response = requests.get(url='https://statsapi.mlb.com/api/v1/sports')
    sports = response.json()['sports']
    return pd.DataFrame(sports).set_index('id')
19
+
20
def get_sport_id_check(self, sport_id):
    """Tell whether ``sport_id`` appears in the sports table.

    Args:
        sport_id: Value looked up in the index of ``get_sport_id()``.

    Returns:
        ``True`` when the id is present. Otherwise prints a prompt followed
        by the full sports table and returns ``False``.
    """
    known_sports = self.get_sport_id()
    if sport_id in known_sports.index:
        return True

    # Unknown id: show the caller which ids are available.
    print('Please Select a New Sport ID from the following')
    print(known_sports)
    return False
27
+
28
def get_schedule(self, year_input=2023,
                 sport_id=1,
                 start_date='YYYY-MM-DD',
                 end_date='YYYY-MM-DD',
                 final=True,
                 regular=True,
                 spring=False):
    """Return the schedule of a season as a DataFrame.

    Args:
        year_input: Season placed in the request URL.
        sport_id: Sport identifier; checked with ``get_sport_id_check``.
        start_date: Earliest game date to keep, as ``YYYY-MM-DD``. The
            literal placeholder ``'YYYY-MM-DD'`` (or ``None``) means "no
            lower bound".
        end_date: Latest game date to keep, same conventions as
            ``start_date``. Each bound is applied independently, so either
            one may be given alone.
        final: When true, keep only games whose coded state is ``'F'``.
        regular: When true, request game type ``R``.
        spring: When ``regular`` is false and this is true, request game
            type ``S``. With both false, no game-type filter is sent.

    Returns:
        pandas.DataFrame with the columns ``game_id``, ``time``, ``date``,
        ``away``, ``home``, ``state``, ``venue_id`` and ``venue_name``, one
        row per game id.
        ``None`` when the sport id is rejected.
        A message string when the schedule is empty or a supplied date does
        not match ``YYYY-MM-DD``.
    """
    # Get MLB Schedule
    if not self.get_sport_id_check(sport_id=sport_id):
        return

    if regular:
        url = f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id}&gameTypes=R&season={year_input}&hydrate=lineup,players'
        game_call = requests.get(url=url).json()
        print(url)
    elif spring:
        print('spring')
        url = f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id}&gameTypes=S&season={year_input}&hydrate=lineup,players'
        game_call = requests.get(url=url).json()
        print(url)
    else:
        game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id}&season={year_input}&hydrate=lineup,players').json()

    # Grab data from MLB Schedule (game id, away, home, state).
    # The response groups games by date; flatten it once and reuse it.
    games = [game for day in game_call['dates'] for game in day['games']]

    game_df = pd.DataFrame(data={'game_id': [g['gamePk'] for g in games],
                                 'time': [g['gameDate'] for g in games],
                                 'date': [g['officialDate'] for g in games],
                                 'away': [g['teams']['away']['team']['name'] for g in games],
                                 'home': [g['teams']['home']['team']['name'] for g in games],
                                 'state': [g['status']['codedGameState'] for g in games],
                                 'venue_id': [g['venue']['id'] for g in games],
                                 'venue_name': [g['venue']['name'] for g in games]})

    if len(game_df) == 0:
        return 'Schedule Length of 0, please select different parameters.'

    game_df['date'] = pd.to_datetime(game_df['date']).dt.date
    game_df['time'] = pd.to_datetime(game_df['time'])
    eastern = timezone('US/Eastern')
    game_df['time'] = game_df['time'].dt.tz_convert(eastern)
    # NOTE: the label is the literal text "EST" whatever offset the
    # conversion to US/Eastern produced.
    game_df['time'] = game_df['time'].dt.strftime("%I:%M %p EST")

    # Apply each date bound only when it was actually supplied; parsing the
    # untouched placeholder would always fail with ValueError.
    not_supplied = (None, 'YYYY-MM-DD')
    try:
        if start_date not in not_supplied:
            first_day = datetime.strptime(start_date, "%Y-%m-%d").date()
            game_df = game_df[game_df['date'] >= first_day]
        if end_date not in not_supplied:
            last_day = datetime.strptime(end_date, "%Y-%m-%d").date()
            game_df = game_df[game_df['date'] <= last_day]
    except ValueError:
        return 'Please use YYYY-MM-DD Format for Start and End Dates'

    if final:
        game_df = game_df[game_df['state'] == 'F']

    game_df = game_df.drop_duplicates(subset='game_id').reset_index(drop=True)

    if len(game_df) == 0:
        return 'Schedule Length of 0, please select different parameters.'

    return game_df
101
+
102
def get_data(self, game_list_input=(748540,)):
    """Download the live feed of every game in ``game_list_input``.

    Args:
        game_list_input: Iterable of game ids (list, tuple, pandas Series,
            ...). The ids are iterated directly, so a Series whose index is
            not ``0..n-1`` works as well.

    Returns:
        list with the decoded JSON of ``/api/v1.1/game/{id}/feed/live`` for
        each id, in input order.
    """
    data_total = []
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')
    # tqdm wraps the iterable itself; indexing by position would do a label
    # lookup on a pandas Series and fail for a non-default index.
    for game_pk in tqdm(game_list_input, desc="Processing", unit="iteration"):
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_pk}/feed/live')
        data_total.append(r.json())
    return data_total
114
+
115
+ def get_data_df(self,data_list):
116
+
117
+ swing_list = ['X','F','S','D','E','T','W']
118
+ whiff_list = ['S','T','W']
119
+ print('Converting Data to Dataframe.')
120
+ game_id = []
121
+ game_date = []
122
+ batter_id = []
123
+ batter_name = []
124
+ batter_hand = []
125
+ batter_team = []
126
+ batter_team_id = []
127
+ pitcher_id = []
128
+ pitcher_name = []
129
+ pitcher_hand = []
130
+ pitcher_team = []
131
+ pitcher_team_id = []
132
+
133
+ play_description = []
134
+ play_code = []
135
+ in_play = []
136
+ is_strike = []
137
+ is_swing = []
138
+ is_whiff = []
139
+ is_out = []
140
+ is_ball = []
141
+ is_review = []
142
+ pitch_type = []
143
+ pitch_description = []
144
+ strikes = []
145
+ balls = []
146
+ outs = []
147
+
148
+ start_speed = []
149
+ end_speed = []
150
+ sz_top = []
151
+ sz_bot = []
152
+ x = []
153
+ y = []
154
+ ax = []
155
+ ay = []
156
+ az = []
157
+ pfxx = []
158
+ pfxz = []
159
+ px = []
160
+ pz = []
161
+ vx0 = []
162
+ vy0 = []
163
+ vz0 = []
164
+ x0 = []
165
+ y0 = []
166
+ z0 = []
167
+ zone = []
168
+ type_confidence = []
169
+ plate_time = []
170
+ extension = []
171
+ spin_rate = []
172
+ spin_direction = []
173
+ ivb = []
174
+ hb = []
175
+
176
+ launch_speed = []
177
+ launch_angle = []
178
+ launch_distance = []
179
+ launch_location = []
180
+ trajectory = []
181
+ hardness = []
182
+ hit_x = []
183
+ hit_y = []
184
+
185
+ index_play = []
186
+ play_id = []
187
+ start_time = []
188
+ end_time = []
189
+ is_pitch = []
190
+ type_type = []
191
+
192
+
193
+ type_ab = []
194
+ ab_number = []
195
+ event = []
196
+ event_type = []
197
+ rbi = []
198
+ away_score = []
199
+ home_score = []
200
+
201
+ #data[0]['liveData']['plays']['allPlays'][32]['playEvents'][-1]['details']['call']['code'] in ['VP']
202
+
203
+ for data in data_list:
204
+ for ab_id in range(len(data['liveData']['plays']['allPlays'])):
205
+ ab_list = data['liveData']['plays']['allPlays'][ab_id]
206
+ for n in range(len(ab_list['playEvents'])):
207
+ if ab_list['playEvents'][n]['isPitch'] == True or 'call' in ab_list['playEvents'][n]['details']:
208
+
209
+ game_id.append(data['gamePk'])
210
+ game_date.append(data['gameData']['datetime']['officialDate'])
211
+ if 'matchup' in ab_list:
212
+ batter_id.append(ab_list['matchup']['batter']['id'] if 'batter' in ab_list['matchup'] else np.nan)
213
+ if 'batter' in ab_list['matchup']:
214
+ batter_name.append(ab_list['matchup']['batter']['fullName'] if 'fullName' in ab_list['matchup']['batter'] else np.nan)
215
+ else:
216
+ batter_name.append(np.nan)
217
+
218
+ batter_hand.append(ab_list['matchup']['batSide']['code'] if 'batSide' in ab_list['matchup'] else np.nan)
219
+ pitcher_id.append(ab_list['matchup']['pitcher']['id'] if 'pitcher' in ab_list['matchup'] else np.nan)
220
+ if 'pitcher' in ab_list['matchup']:
221
+ pitcher_name.append(ab_list['matchup']['pitcher']['fullName'] if 'fullName' in ab_list['matchup']['pitcher'] else np.nan)
222
+ else:
223
+ pitcher_name.append(np.nan)
224
+ #pitcher_name.append(ab_list['matchup']['pitcher']['fullName'] if 'pitcher' in ab_list['matchup'] else np.nan)
225
+ pitcher_hand.append(ab_list['matchup']['pitchHand']['code'] if 'pitchHand' in ab_list['matchup'] else np.nan)
226
+
227
+
228
+ # batter_id.append(ab_list['matchup']['batter']['id'] if 'batter' in ab_list['matchup'] else np.nan)
229
+ # batter_name.append(ab_list['matchup']['batter']['fullName'] if 'batter' in ab_list['matchup'] else np.nan)
230
+ # batter_hand.append(ab_list['matchup']['batSide']['code'] if 'batSide' in ab_list['matchup'] else np.nan)
231
+ # pitcher_id.append(ab_list['matchup']['pitcher']['id'] if 'pitcher' in ab_list['matchup'] else np.nan)
232
+ # pitcher_name.append(ab_list['matchup']['pitcher']['fullName'] if 'pitcher' in ab_list['matchup'] else np.nan)
233
+ # pitcher_hand.append(ab_list['matchup']['pitchHand']['code'] if 'pitchHand' in ab_list['matchup'] else np.nan)
234
+
235
+ if ab_list['about']['isTopInning']:
236
+ batter_team.append(data['gameData']['teams']['away']['abbreviation'] if 'away' in data['gameData']['teams'] else np.nan)
237
+ batter_team_id.append(data['gameData']['teams']['away']['id'] if 'away' in data['gameData']['teams'] else np.nan)
238
+ pitcher_team.append(data['gameData']['teams']['home']['abbreviation'] if 'home' in data['gameData']['teams'] else np.nan)
239
+ pitcher_team_id.append(data['gameData']['teams']['away']['id'] if 'away' in data['gameData']['teams'] else np.nan)
240
+
241
+ else:
242
+ batter_team.append(data['gameData']['teams']['home']['abbreviation'] if 'home' in data['gameData']['teams'] else np.nan)
243
+ batter_team_id.append(data['gameData']['teams']['home']['id'] if 'home' in data['gameData']['teams'] else np.nan)
244
+ pitcher_team.append(data['gameData']['teams']['away']['abbreviation'] if 'away' in data['gameData']['teams'] else np.nan)
245
+ pitcher_team_id.append(data['gameData']['teams']['home']['id'] if 'home' in data['gameData']['teams'] else np.nan)
246
+
247
+ play_description.append(ab_list['playEvents'][n]['details']['description'] if 'description' in ab_list['playEvents'][n]['details'] else np.nan)
248
+ play_code.append(ab_list['playEvents'][n]['details']['code'] if 'code' in ab_list['playEvents'][n]['details'] else np.nan)
249
+ in_play.append(ab_list['playEvents'][n]['details']['isInPlay'] if 'isInPlay' in ab_list['playEvents'][n]['details'] else np.nan)
250
+ is_strike.append(ab_list['playEvents'][n]['details']['isStrike'] if 'isStrike' in ab_list['playEvents'][n]['details'] else np.nan)
251
+
252
+ if 'details' in ab_list['playEvents'][n]:
253
+ is_swing.append(True if ab_list['playEvents'][n]['details']['code'] in swing_list else np.nan)
254
+ is_whiff.append(True if ab_list['playEvents'][n]['details']['code'] in whiff_list else np.nan)
255
+ else:
256
+ is_swing.append(np.nan)
257
+ is_whiff.append(np.nan)
258
+
259
+ #is_out.append(ab_list['playEvents'][n]['details']['isBall'] if 'isBall' in ab_list['playEvents'][n]['details'] else np.nan)
260
+ is_ball.append(ab_list['playEvents'][n]['details']['isOut'] if 'isOut' in ab_list['playEvents'][n]['details'] else np.nan)
261
+ is_review.append(ab_list['playEvents'][n]['details']['hasReview'] if 'hasReview' in ab_list['playEvents'][n]['details'] else np.nan)
262
+ pitch_type.append(ab_list['playEvents'][n]['details']['type']['code'] if 'type' in ab_list['playEvents'][n]['details'] else np.nan)
263
+ pitch_description.append(ab_list['playEvents'][n]['details']['type']['description'] if 'type' in ab_list['playEvents'][n]['details'] else np.nan)
264
+
265
+ #if ab_list['playEvents'][n]['isPitch'] == True:
266
+ if ab_list['playEvents'][n]['pitchNumber'] == 1:
267
+ ab_number.append(ab_list['playEvents'][n]['atBatIndex'] if 'atBatIndex' in ab_list['playEvents'][n] else np.nan)
268
+ strikes.append(0)
269
+ balls.append(0)
270
+ outs.append(0)
271
+ else:
272
+ ab_number.append(ab_list['playEvents'][n]['atBatIndex'] if 'atBatIndex' in ab_list['playEvents'][n] else np.nan)
273
+ strikes.append(ab_list['playEvents'][n-1]['count']['strikes'] if 'strikes' in ab_list['playEvents'][n-1]['count'] else np.nan)
274
+ balls.append(ab_list['playEvents'][n-1]['count']['balls'] if 'balls' in ab_list['playEvents'][n-1]['count'] else np.nan)
275
+ outs.append(ab_list['playEvents'][n-1]['count']['outs'] if 'outs' in ab_list['playEvents'][n-1]['count'] else np.nan)
276
+
277
+ if 'pitchData' in ab_list['playEvents'][n]:
278
+
279
+ start_speed.append(ab_list['playEvents'][n]['pitchData']['startSpeed'] if 'startSpeed' in ab_list['playEvents'][n]['pitchData'] else np.nan)
280
+ end_speed.append(ab_list['playEvents'][n]['pitchData']['endSpeed'] if 'endSpeed' in ab_list['playEvents'][n]['pitchData'] else np.nan)
281
+
282
+ sz_top.append(ab_list['playEvents'][n]['pitchData']['strikeZoneTop'] if 'strikeZoneTop' in ab_list['playEvents'][n]['pitchData'] else np.nan)
283
+ sz_bot.append(ab_list['playEvents'][n]['pitchData']['strikeZoneBottom'] if 'strikeZoneBottom' in ab_list['playEvents'][n]['pitchData'] else np.nan)
284
+ x.append(ab_list['playEvents'][n]['pitchData']['coordinates']['x'] if 'x' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
285
+ y.append(ab_list['playEvents'][n]['pitchData']['coordinates']['y'] if 'y' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
286
+
287
+ ax.append(ab_list['playEvents'][n]['pitchData']['coordinates']['aX'] if 'aX' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
288
+ ay.append(ab_list['playEvents'][n]['pitchData']['coordinates']['aY'] if 'aY' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
289
+ az.append(ab_list['playEvents'][n]['pitchData']['coordinates']['aZ'] if 'aZ' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
290
+ pfxx.append(ab_list['playEvents'][n]['pitchData']['coordinates']['pfxX'] if 'pfxX' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
291
+ pfxz.append(ab_list['playEvents'][n]['pitchData']['coordinates']['pfxZ'] if 'pfxZ' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
292
+ px.append(ab_list['playEvents'][n]['pitchData']['coordinates']['pX'] if 'pX' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
293
+ pz.append(ab_list['playEvents'][n]['pitchData']['coordinates']['pZ'] if 'pZ' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
294
+ vx0.append(ab_list['playEvents'][n]['pitchData']['coordinates']['vX0'] if 'vX0' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
295
+ vy0.append(ab_list['playEvents'][n]['pitchData']['coordinates']['vY0'] if 'vY0' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
296
+ vz0.append(ab_list['playEvents'][n]['pitchData']['coordinates']['vZ0'] if 'vZ0' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
297
+ x0.append(ab_list['playEvents'][n]['pitchData']['coordinates']['x0'] if 'x0' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
298
+ y0.append(ab_list['playEvents'][n]['pitchData']['coordinates']['y0'] if 'y0' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
299
+ z0.append(ab_list['playEvents'][n]['pitchData']['coordinates']['z0'] if 'z0' in ab_list['playEvents'][n]['pitchData']['coordinates'] else np.nan)
300
+
301
+ zone.append(ab_list['playEvents'][n]['pitchData']['zone'] if 'zone' in ab_list['playEvents'][n]['pitchData'] else np.nan)
302
+ type_confidence.append(ab_list['playEvents'][n]['pitchData']['typeConfidence'] if 'typeConfidence' in ab_list['playEvents'][n]['pitchData'] else np.nan)
303
+ plate_time.append(ab_list['playEvents'][n]['pitchData']['plateTime'] if 'plateTime' in ab_list['playEvents'][n]['pitchData'] else np.nan)
304
+ extension.append(ab_list['playEvents'][n]['pitchData']['extension'] if 'extension' in ab_list['playEvents'][n]['pitchData'] else np.nan)
305
+
306
+ if 'breaks' in ab_list['playEvents'][n]['pitchData']:
307
+ spin_rate.append(ab_list['playEvents'][n]['pitchData']['breaks']['spinRate'] if 'spinRate' in ab_list['playEvents'][n]['pitchData']['breaks'] else np.nan)
308
+ spin_direction.append(ab_list['playEvents'][n]['pitchData']['breaks']['spinDirection'] if 'spinDirection' in ab_list['playEvents'][n]['pitchData']['breaks'] else np.nan)
309
+ ivb.append(ab_list['playEvents'][n]['pitchData']['breaks']['breakVerticalInduced'] if 'breakVerticalInduced' in ab_list['playEvents'][n]['pitchData']['breaks'] else np.nan)
310
+ hb.append(ab_list['playEvents'][n]['pitchData']['breaks']['breakHorizontal'] if 'breakHorizontal' in ab_list['playEvents'][n]['pitchData']['breaks'] else np.nan)
311
+
312
+ else:
313
+ start_speed.append(np.nan)
314
+ end_speed.append(np.nan)
315
+
316
+ sz_top.append(np.nan)
317
+ sz_bot.append(np.nan)
318
+ x.append(np.nan)
319
+ y.append(np.nan)
320
+
321
+ ax.append(np.nan)
322
+ ay.append(np.nan)
323
+ az.append(np.nan)
324
+ pfxx.append(np.nan)
325
+ pfxz.append(np.nan)
326
+ px.append(np.nan)
327
+ pz.append(np.nan)
328
+ vx0.append(np.nan)
329
+ vy0.append(np.nan)
330
+ vz0.append(np.nan)
331
+ x0.append(np.nan)
332
+ y0.append(np.nan)
333
+ z0.append(np.nan)
334
+
335
+ zone.append(np.nan)
336
+ type_confidence.append(np.nan)
337
+ plate_time.append(np.nan)
338
+ extension.append(np.nan)
339
+ spin_rate.append(np.nan)
340
+ spin_direction.append(np.nan)
341
+ ivb.append(np.nan)
342
+ hb.append(np.nan)
343
+
344
+ if 'hitData' in ab_list['playEvents'][n]:
345
+ launch_speed.append(ab_list['playEvents'][n]['hitData']['launchSpeed'] if 'launchSpeed' in ab_list['playEvents'][n]['hitData'] else np.nan)
346
+ launch_angle.append(ab_list['playEvents'][n]['hitData']['launchAngle'] if 'launchAngle' in ab_list['playEvents'][n]['hitData'] else np.nan)
347
+ launch_distance.append(ab_list['playEvents'][n]['hitData']['totalDistance'] if 'totalDistance' in ab_list['playEvents'][n]['hitData'] else np.nan)
348
+ launch_location.append(ab_list['playEvents'][n]['hitData']['location'] if 'location' in ab_list['playEvents'][n]['hitData'] else np.nan)
349
+
350
+ trajectory.append(ab_list['playEvents'][n]['hitData']['trajectory'] if 'trajectory' in ab_list['playEvents'][n]['hitData'] else np.nan)
351
+ hardness.append(ab_list['playEvents'][n]['hitData']['hardness'] if 'hardness' in ab_list['playEvents'][n]['hitData'] else np.nan)
352
+ hit_x.append(ab_list['playEvents'][n]['hitData']['coordinates']['coordX'] if 'coordX' in ab_list['playEvents'][n]['hitData']['coordinates'] else np.nan)
353
+ hit_y.append(ab_list['playEvents'][n]['hitData']['coordinates']['coordY'] if 'coordY' in ab_list['playEvents'][n]['hitData']['coordinates'] else np.nan)
354
+ else:
355
+ launch_speed.append(np.nan)
356
+ launch_angle.append(np.nan)
357
+ launch_distance.append(np.nan)
358
+ launch_location.append(np.nan)
359
+ trajectory.append(np.nan)
360
+ hardness.append(np.nan)
361
+ hit_x.append(np.nan)
362
+ hit_y.append(np.nan)
363
+
364
+ index_play.append(ab_list['playEvents'][n]['index'] if 'index' in ab_list['playEvents'][n] else np.nan)
365
+ play_id.append(ab_list['playEvents'][n]['playId'] if 'playId' in ab_list['playEvents'][n] else np.nan)
366
+ start_time.append(ab_list['playEvents'][n]['startTime'] if 'startTime' in ab_list['playEvents'][n] else np.nan)
367
+ end_time.append(ab_list['playEvents'][n]['endTime'] if 'endTime' in ab_list['playEvents'][n] else np.nan)
368
+ is_pitch.append(ab_list['playEvents'][n]['isPitch'] if 'isPitch' in ab_list['playEvents'][n] else np.nan)
369
+ type_type.append(ab_list['playEvents'][n]['type'] if 'type' in ab_list['playEvents'][n] else np.nan)
370
+
371
+
372
+
373
+ if n == len(ab_list['playEvents']) - 1 :
374
+
375
+ type_ab.append(data['liveData']['plays']['allPlays'][ab_id]['result']['type'] if 'type' in data['liveData']['plays']['allPlays'][ab_id]['result'] else np.nan)
376
+ event.append(data['liveData']['plays']['allPlays'][ab_id]['result']['event'] if 'event' in data['liveData']['plays']['allPlays'][ab_id]['result'] else np.nan)
377
+ event_type.append(data['liveData']['plays']['allPlays'][ab_id]['result']['eventType'] if 'eventType' in data['liveData']['plays']['allPlays'][ab_id]['result'] else np.nan)
378
+ rbi.append(data['liveData']['plays']['allPlays'][ab_id]['result']['rbi'] if 'rbi' in data['liveData']['plays']['allPlays'][ab_id]['result'] else np.nan)
379
+ away_score.append(data['liveData']['plays']['allPlays'][ab_id]['result']['awayScore'] if 'awayScore' in data['liveData']['plays']['allPlays'][ab_id]['result'] else np.nan)
380
+ home_score.append(data['liveData']['plays']['allPlays'][ab_id]['result']['homeScore'] if 'homeScore' in data['liveData']['plays']['allPlays'][ab_id]['result'] else np.nan)
381
+ is_out.append(data['liveData']['plays']['allPlays'][ab_id]['result']['isOut'] if 'isOut' in data['liveData']['plays']['allPlays'][ab_id]['result'] else np.nan)
382
+
383
+ else:
384
+
385
+ type_ab.append(np.nan)
386
+ event.append(np.nan)
387
+ event_type.append(np.nan)
388
+ rbi.append(np.nan)
389
+ away_score.append(np.nan)
390
+ home_score.append(np.nan)
391
+ is_out.append(np.nan)
392
+
393
+ elif ab_list['playEvents'][n]['count']['balls'] == 4:
394
+
395
+ event.append(data['liveData']['plays']['allPlays'][ab_id]['result']['event'])
396
+ event_type.append(data['liveData']['plays']['allPlays'][ab_id]['result']['eventType'])
397
+
398
+
399
+ game_id.append(data['gamePk'])
400
+ game_date.append(data['gameData']['datetime']['officialDate'])
401
+ batter_id.append(ab_list['matchup']['batter']['id'] if 'batter' in ab_list['matchup'] else np.nan)
402
+ batter_name.append(ab_list['matchup']['batter']['fullName'] if 'batter' in ab_list['matchup'] else np.nan)
403
+ batter_hand.append(ab_list['matchup']['batSide']['code'] if 'batSide' in ab_list['matchup'] else np.nan)
404
+ pitcher_id.append(ab_list['matchup']['pitcher']['id'] if 'pitcher' in ab_list['matchup'] else np.nan)
405
+ pitcher_name.append(ab_list['matchup']['pitcher']['fullName'] if 'pitcher' in ab_list['matchup'] else np.nan)
406
+ pitcher_hand.append(ab_list['matchup']['pitchHand']['code'] if 'pitchHand' in ab_list['matchup'] else np.nan)
407
+ if ab_list['about']['isTopInning']:
408
+ batter_team.append(data['gameData']['teams']['away']['abbreviation'] if 'away' in data['gameData']['teams'] else np.nan)
409
+ batter_team_id.append(data['gameData']['teams']['away']['id'] if 'away' in data['gameData']['teams'] else np.nan)
410
+ pitcher_team.append(data['gameData']['teams']['home']['abbreviation'] if 'home' in data['gameData']['teams'] else np.nan)
411
+ pitcher_team_id.append(data['gameData']['teams']['away']['id'] if 'away' in data['gameData']['teams'] else np.nan)
412
+ else:
413
+ batter_team.append(data['gameData']['teams']['home']['abbreviation'] if 'home' in data['gameData']['teams'] else np.nan)
414
+ batter_team_id.append(data['gameData']['teams']['home']['id'] if 'home' in data['gameData']['teams'] else np.nan)
415
+ pitcher_team.append(data['gameData']['teams']['away']['abbreviation'] if 'away' in data['gameData']['teams'] else np.nan)
416
+ pitcher_team_id.append(data['gameData']['teams']['home']['id'] if 'home' in data['gameData']['teams'] else np.nan)
417
+
418
+ play_description.append(np.nan)
419
+ play_code.append(np.nan)
420
+ in_play.append(np.nan)
421
+ is_strike.append(np.nan)
422
+ is_ball.append(np.nan)
423
+ is_review.append(np.nan)
424
+ pitch_type.append(np.nan)
425
+ pitch_description.append(np.nan)
426
+ strikes.append(ab_list['playEvents'][n]['count']['balls'] if 'balls' in ab_list['playEvents'][n]['count'] else np.nan)
427
+ balls.append(ab_list['playEvents'][n]['count']['strikes'] if 'strikes' in ab_list['playEvents'][n]['count'] else np.nan)
428
+ outs.append(ab_list['playEvents'][n]['count']['outs'] if 'outs' in ab_list['playEvents'][n]['count'] else np.nan)
429
+ index_play.append(ab_list['playEvents'][n]['index'] if 'index' in ab_list['playEvents'][n] else np.nan)
430
+ play_id.append(ab_list['playEvents'][n]['playId'] if 'playId' in ab_list['playEvents'][n] else np.nan)
431
+ start_time.append(ab_list['playEvents'][n]['startTime'] if 'startTime' in ab_list['playEvents'][n] else np.nan)
432
+ end_time.append(ab_list['playEvents'][n]['endTime'] if 'endTime' in ab_list['playEvents'][n] else np.nan)
433
+ is_pitch.append(ab_list['playEvents'][n]['isPitch'] if 'isPitch' in ab_list['playEvents'][n] else np.nan)
434
+ type_type.append(ab_list['playEvents'][n]['type'] if 'type' in ab_list['playEvents'][n] else np.nan)
435
+
436
+
437
+
438
+ is_swing.append(np.nan)
439
+ is_whiff.append(np.nan)
440
+ start_speed.append(np.nan)
441
+ end_speed.append(np.nan)
442
+ sz_top.append(np.nan)
443
+ sz_bot.append(np.nan)
444
+ x.append(np.nan)
445
+ y.append(np.nan)
446
+ ax.append(np.nan)
447
+ ay.append(np.nan)
448
+ az.append(np.nan)
449
+ pfxx.append(np.nan)
450
+ pfxz.append(np.nan)
451
+ px.append(np.nan)
452
+ pz.append(np.nan)
453
+ vx0.append(np.nan)
454
+ vy0.append(np.nan)
455
+ vz0.append(np.nan)
456
+ x0.append(np.nan)
457
+ y0.append(np.nan)
458
+ z0.append(np.nan)
459
+ zone.append(np.nan)
460
+ type_confidence.append(np.nan)
461
+ plate_time.append(np.nan)
462
+ extension.append(np.nan)
463
+ spin_rate.append(np.nan)
464
+ spin_direction.append(np.nan)
465
+ ivb.append(np.nan)
466
+ hb.append(np.nan)
467
+ launch_speed.append(np.nan)
468
+ launch_angle.append(np.nan)
469
+ launch_distance.append(np.nan)
470
+ launch_location.append(np.nan)
471
+ trajectory.append(np.nan)
472
+ hardness.append(np.nan)
473
+ hit_x.append(np.nan)
474
+ hit_y.append(np.nan)
475
+ type_ab.append(np.nan)
476
+ ab_number.append(np.nan)
477
+
478
+ rbi.append(np.nan)
479
+ away_score.append(np.nan)
480
+ home_score.append(np.nan)
481
+ is_out.append(np.nan)
482
+ print({
483
+ 'game_id':len(game_id),
484
+ 'game_date':len(game_date),
485
+ 'batter_id':len(batter_id),
486
+ 'batter_name':len(batter_name),
487
+ 'batter_hand':len(batter_hand),
488
+ 'batter_team':len(batter_team),
489
+ 'batter_team_id':len(batter_team_id),
490
+ 'pitcher_id':len(pitcher_id),
491
+ 'pitcher_name':len(pitcher_name),
492
+ 'pitcher_hand':len(pitcher_hand),
493
+ 'pitcher_team':len(pitcher_team),
494
+ 'pitcher_team_id':len(pitcher_team_id),
495
+ 'play_description':len(play_description),
496
+ 'play_code':len(play_code),
497
+ 'in_play':len(in_play),
498
+ 'is_strike':len(is_strike),
499
+ 'is_swing':len(is_swing),
500
+ 'is_whiff':len(is_whiff),
501
+ 'is_out':len(is_out),
502
+ 'is_ball':len(is_ball),
503
+ 'is_review':len(is_review),
504
+ 'pitch_type':len(pitch_type),
505
+ 'pitch_description':len(pitch_description),
506
+ 'strikes':len(strikes),
507
+ 'balls':len(balls),
508
+ 'outs':len(outs),
509
+ 'start_speed':len(start_speed),
510
+ 'end_speed':len(end_speed),
511
+ 'sz_top':len(sz_top),
512
+ 'sz_bot':len(sz_bot),
513
+ 'x':len(x),
514
+ 'y':len(y),
515
+ 'ax':len(ax),
516
+ 'ay':len(ay),
517
+ 'az':len(az),
518
+ 'pfxx':len(pfxx),
519
+ 'pfxz':len(pfxz),
520
+ 'px':len(px),
521
+ 'pz':len(pz),
522
+ 'vx0':len(vx0),
523
+ 'vy0':len(vy0),
524
+ 'vz0':len(vz0),
525
+ 'x0':len(x0),
526
+ 'y0':len(y0),
527
+ 'z0':len(z0),
528
+ 'zone':len(zone),
529
+ 'type_confidence':len(type_confidence),
530
+ 'plate_time':len(plate_time),
531
+ 'extension':len(extension),
532
+ 'spin_rate':len(spin_rate),
533
+ 'spin_direction':len(spin_direction),
534
+ 'ivb':len(ivb),
535
+ 'hb':len(hb),
536
+ 'launch_speed':len(launch_speed),
537
+ 'launch_angle':len(launch_angle),
538
+ 'launch_distance':len(launch_distance),
539
+ 'launch_location':len(launch_location),
540
+ 'trajectory':len(trajectory),
541
+ 'hardness':len(hardness),
542
+ 'hit_x':len(hit_x),
543
+ 'hit_y':len(hit_y),
544
+ 'index_play':len(index_play),
545
+ 'play_id':len(play_id),
546
+ 'start_time':len(start_time),
547
+ 'end_time':len(end_time),
548
+ 'is_pitch':len(is_pitch),
549
+ 'type_type':len(type_type),
550
+ 'type_ab':len(type_ab),
551
+ 'event':len(event),
552
+ 'event_type':len(event_type),
553
+ 'rbi':len(rbi),
554
+ 'away_score':len(away_score),
555
+ 'home_score':len(home_score),
556
+ }
557
+
558
+
559
+ )
560
+ df = pd.DataFrame(data={
561
+ 'game_id':game_id,
562
+ 'game_date':game_date,
563
+ 'batter_id':batter_id,
564
+ 'batter_name':batter_name,
565
+ 'batter_hand':batter_hand,
566
+ 'batter_team':batter_team,
567
+ 'batter_team_id':batter_team_id,
568
+ 'pitcher_id':pitcher_id,
569
+ 'pitcher_name':pitcher_name,
570
+ 'pitcher_hand':pitcher_hand,
571
+ 'pitcher_team':pitcher_team,
572
+ 'pitcher_team_id':pitcher_team_id,
573
+ 'play_description':play_description,
574
+ 'play_code':play_code,
575
+ 'in_play':in_play,
576
+ 'is_strike':is_strike,
577
+ 'is_swing':is_swing,
578
+ 'is_whiff':is_whiff,
579
+ 'is_out':is_out,
580
+ 'is_ball':is_ball,
581
+ 'is_review':is_review,
582
+ 'pitch_type':pitch_type,
583
+ 'pitch_description':pitch_description,
584
+ 'strikes':strikes,
585
+ 'balls':balls,
586
+ 'outs':outs,
587
+ 'start_speed':start_speed,
588
+ 'end_speed':end_speed,
589
+ 'sz_top':sz_top,
590
+ 'sz_bot':sz_bot,
591
+ 'x':x,
592
+ 'y':y,
593
+ 'ax':ax,
594
+ 'ay':ay,
595
+ 'az':az,
596
+ 'pfxx':pfxx,
597
+ 'pfxz':pfxz,
598
+ 'px':px,
599
+ 'pz':pz,
600
+ 'vx0':vx0,
601
+ 'vy0':vy0,
602
+ 'vz0':vz0,
603
+ 'x0':x0,
604
+ 'y0':y0,
605
+ 'z0':z0,
606
+ 'zone':zone,
607
+ 'type_confidence':type_confidence,
608
+ 'plate_time':plate_time,
609
+ 'extension':extension,
610
+ 'spin_rate':spin_rate,
611
+ 'spin_direction':spin_direction,
612
+ 'ivb':ivb,
613
+ 'hb':hb,
614
+ 'launch_speed':launch_speed,
615
+ 'launch_angle':launch_angle,
616
+ 'launch_distance':launch_distance,
617
+ 'launch_location':launch_location,
618
+ 'trajectory':trajectory,
619
+ 'hardness':hardness,
620
+ 'hit_x':hit_x,
621
+ 'hit_y':hit_y,
622
+ 'index_play':index_play,
623
+ 'play_id':play_id,
624
+ 'start_time':start_time,
625
+ 'end_time':end_time,
626
+ 'is_pitch':is_pitch,
627
+ 'type_type':type_type,
628
+ 'type_ab':type_ab,
629
+ 'event':event,
630
+ 'event_type':event_type,
631
+ 'rbi':rbi,
632
+ 'away_score':away_score,
633
+ 'home_score':home_score,
634
+
635
+ }
636
+ )
637
+ return df
638
+
639
def get_players(self,sport_id=1):
    """Return one row per player listed by the Stats API for a sport.

    Args:
        sport_id: Sport identifier interpolated into the
            ``/api/v1/sports/{sport_id}/players`` endpoint.

    Returns:
        pandas.DataFrame with columns ``player_id``, ``name``, ``position``
        (primary position abbreviation), ``team`` (current team id) and
        ``age``. A field missing from a player's record is stored as None
        instead of aborting the whole download with a KeyError.
    """
    player_data = requests.get(url=f'https://statsapi.mlb.com/api/v1/sports/{sport_id}/players').json()
    # A response without 'people' is still allowed to raise: that signals a
    # bad request rather than an incomplete player record.
    people = player_data['people']

    # Select relevant data that will help distinguish players from one another
    player_df = pd.DataFrame(data={
        'player_id': [person.get('id') for person in people],
        'name': [person.get('fullName') for person in people],
        'position': [person.get('primaryPosition', {}).get('abbreviation') for person in people],
        'team': [person.get('currentTeam', {}).get('id') for person in people],
        'age': [person.get('currentAge') for person in people],
    })
    return player_df
655
+
656
def get_teams(self):
    """Return a DataFrame describing every team returned by ``/api/v1/teams/``.

    Records that have no ``franchiseName`` get a null ``team_id`` and are
    removed by the ``dropna`` step. Teams with no parent organisation are
    treated as their own parent, and ``parent_org_abbreviation`` is looked up
    from the parent's own row.

    Returns:
        pandas.DataFrame with columns ``team_id``, ``city``, ``name``,
        ``franchise``, ``abbreviation``, ``parent_org_id``, ``parent_org``,
        ``league_id``, ``league_name`` and ``parent_org_abbreviation``,
        sorted by ``team_id``.
    """
    teams = requests.get(url='https://statsapi.mlb.com/api/v1/teams/').json()['teams']

    def franchise_field(team, key):
        # Only records carrying a 'franchiseName' contribute identifying
        # fields; the rest become None and are dropped further down.
        return team.get(key) if 'franchiseName' in team else None

    # Create a dataframe of all the teams
    mlb_teams_df = pd.DataFrame(data={
        'team_id': [franchise_field(team, 'id') for team in teams],
        # 'city' previously duplicated the full club name; it now holds the
        # franchiseName value that was being computed but never used.
        'city': [team.get('franchiseName') for team in teams],
        'name': [franchise_field(team, 'teamName') for team in teams],
        'franchise': [franchise_field(team, 'name') for team in teams],
        'abbreviation': [franchise_field(team, 'abbreviation') for team in teams],
        'parent_org_id': [team.get('parentOrgId') for team in teams],
        'parent_org': [team.get('parentOrgName') for team in teams],
        'league_id': [team.get('league', {}).get('id') for team in teams],
        'league_name': [team.get('league', {}).get('name') for team in teams],
    }).drop_duplicates().dropna(subset=['team_id']).reset_index(drop=True).sort_values('team_id')

    # A team without a parent organisation is its own parent.
    no_parent_id = mlb_teams_df['parent_org_id'].isnull()
    mlb_teams_df.loc[no_parent_id,'parent_org_id'] = mlb_teams_df.loc[no_parent_id,'team_id']
    no_parent_name = mlb_teams_df['parent_org'].isnull()
    mlb_teams_df.loc[no_parent_name,'parent_org'] = mlb_teams_df.loc[no_parent_name,'franchise']

    abbreviation_by_id = mlb_teams_df.set_index('team_id')['abbreviation'].to_dict()
    mlb_teams_df['parent_org_abbreviation'] = mlb_teams_df['parent_org_id'].map(abbreviation_by_id)

    return mlb_teams_df
700
+
701
def get_leagues(self):
    """Download the league list and return it as a DataFrame.

    Returns:
        pandas.DataFrame with columns ``league_id``, ``league_name``,
        ``league_abbreviation`` and ``sport_id``; a key absent from a league
        record is stored as None.
    """
    response = requests.get(url='https://statsapi.mlb.com/api/v1/leagues/')
    league_records = response.json()['leagues']

    def optional(record, key):
        # Top-level fields are optional in the payload.
        if key in record:
            return record[key]
        return None

    columns = {
        'league_id': [optional(record, 'id') for record in league_records],
        'league_name': [optional(record, 'name') for record in league_records],
        'league_abbreviation': [optional(record, 'abbreviation') for record in league_records],
        # The sport id is nested one level down, under 'sport'.
        'sport_id': [record['sport']['id'] if 'sport' in record else None for record in league_records],
    }
    return pd.DataFrame(data=columns)
719
+
720
def get_player_games_list(self,player_id=691587,season=2023):
    """Return the gamePk of every game in a player's game log for a season.

    Args:
        player_id: Stats API person id.
        season: Season requested in the ``gameLog`` hydration. Defaults to
            2023, the value that used to be hard-coded in the URL.

    Returns:
        list of ``gamePk`` values taken from the first stats entry's splits.
    """
    # https, like every other endpoint this class calls.
    url = (f'https://statsapi.mlb.com/api/v1/people/{player_id}'
           f'?hydrate=stats(type=gameLog,season={season}),hydrations')
    person = requests.get(url=url).json()['people'][0]
    splits = person['stats'][0]['splits']
    player_game_list = [split['game']['gamePk'] for split in splits]
    return player_game_list
723
+
724
def get_team_schedule(self,year=2023,sport_id=1,mlb_team='Toronto Blue Jays'):
    """Return the finished games of one organisation's team at a given level.

    Args:
        year: Season passed to ``get_schedule`` as ``year_input``.
        sport_id: Sport (level) identifier; validated with
            ``get_sport_id_check``.
        mlb_team: Value matched against the ``parent_org`` column of the
            teams table.

    Returns:
        ``(schedule_df, teams_df)``: the schedule rows where the selected
        team is home or away and ``state == 'F'``, and the teams table
        restricted to ``sport_id``. Returns ``(False, False)`` when the sport
        id is invalid or no team matches ``mlb_team`` at that level.
    """
    # get_sport_id_check already prints the list of valid ids on failure, so
    # nothing is printed (or re-downloaded) here.
    if not self.get_sport_id_check(sport_id=sport_id):
        return False, False

    schedule_df = self.get_schedule(year_input=year,sport_id=sport_id)
    teams_df = self.get_teams().merge(self.get_leagues()).merge(self.get_sport_id(),left_on=['sport_id'],right_index=True,suffixes=['','_sport'])
    teams_df = teams_df[teams_df['sport_id'] == sport_id]

    org_rows = teams_df[teams_df['parent_org'] == mlb_team]
    if org_rows.empty:
        # Previously this surfaced as an IndexError from .values[0].
        print(f'No team found for organisation {mlb_team!r} with sport id {sport_id}')
        return False, False

    team_name_select = org_rows['franchise'].values[0]
    is_team_game = (schedule_df.away == team_name_select) | (schedule_df.home == team_name_select)
    schedule_df = schedule_df[is_team_game & (schedule_df.state == 'F')].reset_index(drop=True)
    return schedule_df,teams_df
737
+
738
def get_team_game_data(self,year=2023,sport_id=1,mlb_team='Toronto Blue Jays'):
    """Download and tabulate the game data for one team's finished games.

    Args:
        year: Season forwarded to ``get_team_schedule``.
        sport_id: Sport (level) identifier forwarded to ``get_team_schedule``.
        mlb_team: Organisation name matched against ``parent_org``.

    Returns:
        The DataFrame produced by ``get_data_df`` with two extra columns,
        ``mlb_team`` (the organisation's abbreviation) and ``level`` (the
        sport abbreviation), or None when ``get_team_schedule`` reports a
        failure.
    """
    schedule_df,teams_df = self.get_team_schedule(year=year,sport_id=sport_id,mlb_team=mlb_team)
    # Failure is signalled with the sentinel False. A DataFrame has no
    # unambiguous truth value ("not df" raises ValueError), so test the type.
    if not isinstance(schedule_df, pd.DataFrame):
        return
    data = self.get_data(schedule_df['game_id'][:])
    df = self.get_data_df(data_list = data)
    org_rows = teams_df[teams_df['parent_org'] == mlb_team]
    df['mlb_team'] = org_rows['parent_org_abbreviation'].values[0]
    df['level'] = org_rows['abbreviation_sport'].values[0]

    return df
batting_update.py ADDED
@@ -0,0 +1,632 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+ import math
5
+ import pickle
6
+
7
# Pre-trained estimators, deserialized once at import time. The paths are
# relative, so they resolve against the process working directory.
barrel_model = joblib.load('joblib_model/barrel_model.joblib')
# Alias kept for existing importers; this name used to trigger a second read
# of the same barrel_model.joblib file.
loaded_model = barrel_model
in_zone_model = joblib.load('joblib_model/in_zone_model_knn_20240410.joblib')
attack_zone_model = joblib.load('joblib_model/model_attack_zone.joblib')
xwoba_model = joblib.load('joblib_model/xwoba_model.joblib')
px_model = joblib.load('joblib_model/linear_reg_model_x.joblib')
pz_model = joblib.load('joblib_model/linear_reg_model_z.joblib')
14
+
15
+
16
def percentile(n):
    """Build an aggregation callable that returns the n-th percentile.

    Args:
        n: Percentile passed to ``np.nanpercentile``.

    Returns:
        A one-argument function whose ``__name__`` is ``percentile_<n>``.
    """
    def percentile_(values):
        # nanpercentile ignores NaN entries in the values.
        return np.nanpercentile(values, n)

    # Give each generated aggregator a distinct, readable name.
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_
21
+
22
+
23
def df_update(df=pd.DataFrame()):
    """Add batting flags, rate inputs and model-derived columns to pitch rows.

    The frame is modified in place and also returned. It must contain the
    columns ``sz_top``, ``sz_bot``, ``zone``, ``x``, ``y``, ``px``, ``pz``,
    ``event_type``, ``play_description``, ``play_code``, ``is_swing``,
    ``is_pitch``, ``pitch_type``, ``trajectory``, ``launch_speed`` and
    ``launch_angle``.

    Every model prediction is guarded by the exact mask that receives the
    result, so an empty subset is never passed to an estimator. The fixed
    walk / hit-by-pitch / strikeout xwOBA values are applied whether or not
    the frame contains batted balls.

    Args:
        df: Pitch-level DataFrame.

    Returns:
        The same DataFrame with the derived columns added.
    """
    # A strike-zone edge of exactly 0 is treated as missing.
    df.loc[df['sz_top']==0,'sz_top'] = np.nan
    df.loc[df['sz_bot']==0,'sz_bot'] = np.nan

    # Zones 1-9 count as in-zone; a zone that is not > 0 (including NaN) is unknown.
    df['in_zone'] = [x < 10 if x > 0 else np.nan for x in df['zone']]

    # Back-fill plate coordinates from x / y where px / pz are absent.
    missing_px = (~df['x'].isnull())&(df['px'].isnull())
    if missing_px.any():
        df.loc[missing_px,'px'] = px_model.predict(df.loc[missing_px][['x']])
    # NOTE(review): the vertical fill uses px_model plus a 3.2 offset although
    # a separate pz_model is loaded at module level -- confirm this is intended.
    missing_pz = (~df['y'].isnull())&(df['pz'].isnull())
    if missing_pz.any():
        df.loc[missing_pz,'pz'] = px_model.predict(df.loc[missing_pz][['y']]) + 3.2

    # Predict in_zone where the zone is unknown but location and strike-zone
    # bounds are all available.
    zone_inputs = ['px','pz','sz_top','sz_bot']
    needs_zone = df['in_zone'].isna() & df[zone_inputs].notna().all(axis=1)
    if needs_zone.any():
        print('We found missing data')
        df.loc[needs_zone,'in_zone'] = in_zone_model.predict(df.loc[needs_zone][zone_inputs].values)

    hit_codes = ['single',
                 'double','home_run', 'triple']

    ab_codes = ['single', 'strikeout', 'field_out',
                'grounded_into_double_play', 'fielders_choice', 'force_out',
                'double', 'field_error', 'home_run', 'triple',
                'double_play',
                'fielders_choice_out', 'strikeout_double_play',
                'other_out','triple_play']

    obp_true_codes = ['single', 'walk',
                      'double','home_run', 'triple',
                      'hit_by_pitch', 'intent_walk']

    obp_codes = ['single', 'strikeout', 'walk', 'field_out',
                 'grounded_into_double_play', 'fielders_choice', 'force_out',
                 'double', 'sac_fly', 'field_error', 'home_run', 'triple',
                 'hit_by_pitch', 'double_play', 'intent_walk',
                 'fielders_choice_out', 'strikeout_double_play',
                 'sac_fly_double_play',
                 'other_out','triple_play']

    bip_codes = ['In play, no out', 'In play, run(s)','In play, out(s)']

    # Boolean event flags.
    df['hits'] = df.event_type.isin(hit_codes)
    df['ab'] = df.event_type.isin(ab_codes)
    df['on_base'] = df.event_type.isin(obp_true_codes)
    df['obp'] = df.event_type.isin(obp_codes)
    df['bip'] = df.play_description.isin(bip_codes)

    # Rows with a measured exit velocity (denominator for contact-quality rates).
    df['bip_div'] = ~df.launch_speed.isna()

    # Barrel classification only where both launch measurements exist.
    batted = df[['launch_speed','launch_angle']].notna().all(axis=1)
    df['barrel'] = np.nan
    if batted.any():
        df.loc[batted,'barrel'] = barrel_model.predict(df.loc[batted][['launch_speed','launch_angle']])

    # np.select order matters: a missing angle gives 0, an angle in [8, 32]
    # gives 1, and any other angle falls through to the NaN default.
    conditions_ss = [
        (df['launch_angle'].isna()),
        (df['launch_angle'] >= 8 ) & (df['launch_angle'] <= 32 )
    ]
    df['sweet_spot'] = np.select(conditions_ss, [False,True], default=np.nan)

    # Same pattern for hard-hit balls (exit velocity >= 94.5).
    conditions_hh = [
        (df['launch_speed'].isna()),
        (df['launch_speed'] >= 94.5 )
    ]
    df['hard_hit'] = np.select(conditions_hh, [False,True], default=np.nan)

    # Total bases per hit type.
    conditions_tb = [
        (df['event_type']=='single'),
        (df['event_type']=='double'),
        (df['event_type']=='triple'),
        (df['event_type']=='home_run'),
    ]
    df['tb'] = np.select(conditions_tb, [1,2,3,4], default=np.nan)

    # wOBA weight per event type; events not listed stay NaN.
    conditions_woba = [
        (df['event_type'].isin(['strikeout', 'field_out', 'sac_fly', 'force_out',
                                'grounded_into_double_play', 'fielders_choice', 'field_error',
                                'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play',
                                'sac_fly_double_play', 'other_out'])),
        (df['event_type']=='walk'),
        (df['event_type']=='hit_by_pitch'),
        (df['event_type']=='single'),
        (df['event_type']=='double'),
        (df['event_type']=='triple'),
        (df['event_type']=='home_run'),
    ]
    choices_woba = [0,
                    0.696,
                    0.726,
                    0.883,
                    1.244,
                    1.569,
                    2.004]
    df['woba'] = np.select(conditions_woba, choices_woba, default=np.nan)

    woba_codes = ['strikeout', 'field_out', 'single', 'walk', 'hit_by_pitch',
                  'double', 'sac_fly', 'force_out', 'home_run',
                  'grounded_into_double_play', 'fielders_choice', 'field_error',
                  'triple', 'sac_bunt', 'double_play',
                  'fielders_choice_out', 'strikeout_double_play',
                  'sac_fly_double_play', 'other_out']
    # 1 for events that count in the wOBA denominator, NaN otherwise.
    df['woba_codes'] = np.where(df['event_type'].isin(woba_codes), 1, np.nan)

    # wOBA restricted to balls in play.
    df['woba_contact'] = df['woba'].where(df['bip'])

    # Swing / whiff counters derived from the pitch result code.
    df['whiffs'] = df.play_code.isin(['S','W','T']).astype(int)
    df['csw'] = df.play_code.isin(['S','W','T','C']).astype(int)
    df['swings'] = [1 if x == True else 0 for x in df.is_swing]

    df['out_zone'] = df.in_zone == False
    df['zone_swing'] = (df.in_zone == True)&(df.swings == 1)
    df['zone_contact'] = (df.in_zone == True)&(df.swings == 1)&(df.whiffs == 0)
    df['ozone_swing'] = (df.in_zone==False)&(df.swings == 1)
    df['ozone_contact'] = (df.in_zone==False)&(df.swings == 1)&(df.whiffs == 0)

    # Any event type whose name contains 'strikeout' counts as a strikeout.
    strikeout_events = [event for event in df.event_type.dropna().unique() if 'strikeout' in event]
    df['k'] = df.event_type.isin(strikeout_events)
    df['bb'] = df.event_type.isin(['walk','intent_walk'])

    df['k_minus_bb'] = df['k'].astype(np.float32)-df['bb'].astype(np.float32)
    df['bb_minus_k'] = df['bb'].astype(np.float32)-df['k'].astype(np.float32)

    # A row carrying a string event_type ends a plate appearance.
    df['pa'] = [1 if isinstance(x, str) else 0 for x in df.event_type]
    df['pitches'] = [1 if x else 0 for x in df.is_pitch]

    pitch_cat = {'FA':'Fastball',
                 'FF':'Fastball',
                 'FT':'Fastball',
                 'FC':'Fastball',
                 'FS':'Off-Speed',
                 'FO':'Off-Speed',
                 'SI':'Fastball',
                 'ST':'Breaking',
                 'SL':'Breaking',
                 'CU':'Breaking',
                 'KC':'Breaking',
                 'SC':'Off-Speed',
                 'GY':'Off-Speed',
                 'SV':'Breaking',
                 'CS':'Breaking',
                 'CH':'Off-Speed',
                 'KN':'Off-Speed',
                 'EP':'Breaking',
                 'UN':np.nan,
                 'IN':np.nan,
                 'PO':np.nan,
                 'AB':np.nan,
                 'AS':np.nan,
                 'NP':np.nan}
    # Unmapped codes and codes mapped to NaN both end up as 'Unknown'.
    df['pitch_category'] = df['pitch_type'].map(pitch_cat).fillna('Unknown')
    # Constant key so the whole frame can be grouped as a single bucket.
    df['average'] = 'average'

    # Fold bunt trajectories into the four standard batted-ball types.
    df.loc[df['trajectory'] == 'bunt_popup','trajectory'] = 'popup'
    df.loc[df['trajectory'] == 'bunt_grounder','trajectory'] = 'ground_ball'
    df.loc[df['trajectory'] == '','trajectory'] = np.nan
    df.loc[df['trajectory'] == 'bunt_line_drive','trajectory'] = 'line_drive'
    # Reindex to a fixed column set: assigning raw get_dummies output to four
    # names fails whenever a category is absent or an unexpected one appears.
    trajectory_cols = ['trajectory_fly_ball','trajectory_ground_ball','trajectory_line_drive','trajectory_popup']
    trajectory_dummies = pd.get_dummies(df['trajectory'], prefix='trajectory').reindex(columns=trajectory_cols, fill_value=0)
    df[trajectory_cols] = trajectory_dummies

    # Attack-zone class (0 heart, 1 shadow, 2 chase, 3 waste per the flags below).
    df['attack_zone'] = np.nan
    has_location = df[zone_inputs].notna().all(axis=1)
    if has_location.any():
        df.loc[has_location,'attack_zone'] = attack_zone_model.predict(df.loc[has_location][zone_inputs])

    df['heart'] = df['attack_zone'] == 0
    df['shadow'] = df['attack_zone'] == 1
    df['chase'] = df['attack_zone'] == 2
    df['waste'] = df['attack_zone'] == 3

    df['heart_swing'] = (df['attack_zone'] == 0)&(df['swings']==1)
    df['shadow_swing'] = (df['attack_zone'] == 1)&(df['swings']==1)
    df['chase_swing'] = (df['attack_zone'] == 2)&(df['swings']==1)
    df['waste_swing'] = (df['attack_zone'] == 3)&(df['swings']==1)

    # Expected wOBA on contact: class probabilities weighted by the wOBA
    # value of each outcome, computed once and reused for both columns.
    df['xwoba'] = np.nan
    df['xwoba_contact'] = np.nan
    if batted.any():
        outcome_weights = np.array([0, 0.883,1.244,1.569,2.004])
        probabilities = xwoba_model.predict_proba(df.loc[batted][['launch_angle','launch_speed']])
        expected = (probabilities * outcome_weights).sum(axis=1)
        df.loc[batted,'xwoba'] = expected
        df.loc[batted,'xwoba_contact'] = expected

    ## Assign a value of 0.696 to every walk in the dataset
    df.loc[df['event_type'].isin(['walk']),'xwoba'] = 0.696

    ## Assign a value of 0.726 to every hit by pitch in the dataset
    df.loc[df['event_type'].isin(['hit_by_pitch']),'xwoba'] = 0.726

    ## Assign a value of 0 to every Strikeout in the dataset
    df.loc[df['event_type'].isin(['strikeout','strikeout_double_play']),'xwoba'] = 0

    # Denominator marker: every row that received an xwoba value counts once.
    df['xwoba_codes'] = np.nan
    df.loc[batted,'xwoba_codes'] = 1
    df.loc[df['event_type'].isin(['walk','hit_by_pitch','strikeout','strikeout_double_play']),'xwoba_codes'] = 1
    return df
304
+
305
def df_update_summ(df=pd.DataFrame()):
    """Aggregate pitch-level rows into one summary row per batter.

    Args:
        df: Pitch-level frame carrying the columns aggregated below.

    Returns:
        DataFrame grouped by ``batter_id`` / ``batter_name`` (restored as
        columns) with the counting totals and launch statistics.
    """
    # Output columns appear in insertion order, so the spec is built in the
    # same order the summary is expected to expose them.
    agg_spec = {name: (name, 'sum') for name in ('pa', 'ab')}
    agg_spec['obp_pa'] = ('obp', 'sum')
    agg_spec.update((name, (name, 'sum')) for name in (
        'hits', 'on_base', 'k', 'bb', 'bb_minus_k', 'csw', 'bip', 'bip_div',
        'tb', 'woba', 'woba_contact', 'xwoba', 'xwoba_contact', 'woba_codes',
        'xwoba_codes', 'hard_hit', 'barrel', 'sweet_spot'))
    agg_spec.update(
        max_launch_speed=('launch_speed', 'max'),
        launch_speed_90=('launch_speed', percentile(90)),
        launch_speed=('launch_speed', 'mean'),
        launch_angle=('launch_angle', 'mean'),
        pitches=('is_pitch', 'sum'),
    )
    agg_spec.update((name, (name, 'sum')) for name in (
        'swings', 'in_zone', 'out_zone', 'whiffs', 'zone_swing',
        'zone_contact', 'ozone_swing', 'ozone_contact'))
    agg_spec.update(
        ground_ball=('trajectory_ground_ball', 'sum'),
        line_drive=('trajectory_line_drive', 'sum'),
        fly_ball=('trajectory_fly_ball', 'sum'),
        pop_up=('trajectory_popup', 'sum'),
        # count (not sum): number of rows with an attack-zone value.
        attack_zone=('attack_zone', 'count'),
    )
    agg_spec.update((name, (name, 'sum')) for name in (
        'heart', 'shadow', 'chase', 'waste', 'heart_swing', 'shadow_swing',
        'chase_swing', 'waste_swing'))

    grouped = df.groupby(['batter_id','batter_name'])
    df_summ = grouped.agg(**agg_spec).reset_index()
    return df_summ
356
+
357
def df_update_summ_avg(df=pd.DataFrame()):
    """Aggregate pitch-level rows grouped by the ``average`` column.

    Args:
        df: Pitch-level frame carrying an ``average`` column and the columns
            aggregated below.

    Returns:
        DataFrame with one row per distinct ``average`` value (restored as a
        column) holding the counting totals and launch statistics.
    """
    # Output columns appear in insertion order, so the spec is built in the
    # same order the summary is expected to expose them.
    agg_spec = {name: (name, 'sum') for name in ('pa', 'ab')}
    agg_spec['obp_pa'] = ('obp', 'sum')
    agg_spec.update((name, (name, 'sum')) for name in (
        'hits', 'on_base', 'k', 'bb', 'bb_minus_k', 'csw', 'bip', 'bip_div',
        'tb', 'woba', 'woba_contact', 'xwoba', 'xwoba_contact', 'woba_codes',
        'xwoba_codes', 'hard_hit', 'barrel', 'sweet_spot'))
    agg_spec.update(
        max_launch_speed=('launch_speed', 'max'),
        launch_speed_90=('launch_speed', percentile(90)),
        launch_speed=('launch_speed', 'mean'),
        launch_angle=('launch_angle', 'mean'),
        pitches=('is_pitch', 'sum'),
    )
    agg_spec.update((name, (name, 'sum')) for name in (
        'swings', 'in_zone', 'out_zone', 'whiffs', 'zone_swing',
        'zone_contact', 'ozone_swing', 'ozone_contact'))
    agg_spec.update(
        ground_ball=('trajectory_ground_ball', 'sum'),
        line_drive=('trajectory_line_drive', 'sum'),
        fly_ball=('trajectory_fly_ball', 'sum'),
        pop_up=('trajectory_popup', 'sum'),
        # count (not sum): number of rows with an attack-zone value.
        attack_zone=('attack_zone', 'count'),
    )
    agg_spec.update((name, (name, 'sum')) for name in (
        'heart', 'shadow', 'chase', 'waste', 'heart_swing', 'shadow_swing',
        'chase_swing', 'waste_swing'))

    grouped = df.groupby(['average'])
    df_summ_avg = grouped.agg(**agg_spec).reset_index()
    return df_summ_avg
412
+
413
def df_summ_changes(df_summ=pd.DataFrame()):
    """Add rate statistics to a summary frame built from counting totals.

    Each rate is ``numerator / denominator`` and is NaN wherever the
    denominator is unusable (zero, or not positive for the pitch-based
    rates). The ratios are computed column-wise, so the frame may carry any
    index; the previous row-by-row ``column[x]`` lookups only worked with a
    default 0..n-1 index.

    Args:
        df_summ: Summary frame with the counting columns referenced below.
            The rate columns are added to it in place.

    Returns:
        The frame with the rate columns added, minus rows whose ``bip`` is
        null.
    """
    def safe_ratio(numerator, denominator, positive_only=False):
        # Cast to float so object-dtype totals still divide numerically.
        num = df_summ[numerator].astype(float)
        den = denominator if isinstance(denominator, pd.Series) else df_summ[denominator]
        den = den.astype(float)
        usable = (den > 0) if positive_only else (den != 0)
        # Masked denominators become NaN, which propagates through the division.
        return num / den.where(usable)

    df_summ['avg'] = safe_ratio('hits', 'ab')
    df_summ['obp'] = safe_ratio('on_base', 'obp_pa')
    df_summ['slg'] = safe_ratio('tb', 'ab')

    df_summ['ops'] = df_summ['obp']+df_summ['slg']

    # Pitches seen outside the zone: denominator for chase_percent.
    out_of_zone_pitches = df_summ['pitches'].astype(float) - df_summ['in_zone'].astype(float)

    # (output column, numerator, denominator, require denominator > 0)
    rate_specs = [
        ('k_percent', 'k', 'pa', False),
        ('bb_percent', 'bb', 'pa', False),
        ('bb_minus_k_percent', 'bb_minus_k', 'pa', False),
        ('bb_over_k_percent', 'bb', 'k', False),
        ('csw_percent', 'csw', 'pitches', False),
        ('sweet_spot_percent', 'sweet_spot', 'bip_div', False),
        ('woba_percent', 'woba', 'woba_codes', False),
        ('woba_percent_contact', 'woba_contact', 'bip', False),
        ('hard_hit_percent', 'hard_hit', 'bip_div', False),
        ('barrel_percent', 'barrel', 'bip_div', False),
        ('zone_contact_percent', 'zone_contact', 'zone_swing', False),
        ('zone_swing_percent', 'zone_swing', 'in_zone', False),
        ('zone_percent', 'in_zone', 'pitches', True),
        ('chase_percent', 'ozone_swing', out_of_zone_pitches, False),
        ('chase_contact', 'ozone_contact', 'ozone_swing', False),
        ('swing_percent', 'swings', 'pitches', True),
        ('whiff_rate', 'whiffs', 'swings', False),
        ('swstr_rate', 'whiffs', 'pitches', True),
        ('ground_ball_percent', 'ground_ball', 'bip', False),
        ('line_drive_percent', 'line_drive', 'bip', False),
        ('fly_ball_percent', 'fly_ball', 'bip', False),
        ('pop_up_percent', 'pop_up', 'bip', False),
        ('heart_zone_percent', 'heart', 'attack_zone', False),
        ('shadow_zone_percent', 'shadow', 'attack_zone', False),
        ('chase_zone_percent', 'chase', 'attack_zone', False),
        ('waste_zone_percent', 'waste', 'attack_zone', False),
        ('heart_zone_swing_percent', 'heart_swing', 'heart', False),
        ('shadow_zone_swing_percent', 'shadow_swing', 'shadow', False),
        ('chase_zone_swing_percent', 'chase_swing', 'chase', False),
        ('waste_zone_swing_percent', 'waste_swing', 'waste', False),
        ('xwoba_percent', 'xwoba', 'xwoba_codes', False),
        ('xwoba_percent_contact', 'xwoba_contact', 'bip', False),
    ]
    for out_col, numerator, denominator, positive_only in rate_specs:
        df_summ[out_col] = safe_ratio(numerator, denominator, positive_only)

    df_summ = df_summ.dropna(subset=['bip'])
    return df_summ
491
+
492
def df_summ_filter_out(df_summ=pd.DataFrame(),batter_select = 0,date_min=0):
    """Build the comparison pool and percentile ranks for one batter.

    The pool keeps every row whose ``pa`` is at least the smallest of:
    the selected batter's ``pa`` rounded down to a multiple of 10, 500,
    and (when ``date_min`` is a date) 20 per whole week elapsed since
    ``date_min``.

    Args:
        df_summ: Summary frame with a MultiIndex whose first level holds the
            batter id, and a ``pa`` column.
        batter_select: Batter id to look up on index level 0.
        date_min: Start date of the sample (``datetime.date`` or
            ``datetime.datetime``). Any non-date value, including the
            default ``0``, disables the weeks-based cap.

    Returns:
        Tuple ``(pool, pool_percentiles, player_rows, player_percentiles)``.
    """
    import datetime

    def weeks_after(day):
        """Return the number of whole weeks between *day* and today."""
        # date - datetime is not supported, so reduce datetimes to dates.
        if isinstance(day, datetime.datetime):
            day = day.date()
        return (datetime.date.today() - day).days // 7

    df_summ_player = df_summ.xs(batter_select, level=0)

    # Pull out a scalar explicitly; math.floor() on a one-element Series
    # relies on an implicit float conversion that pandas has deprecated.
    player_pa = df_summ_player['pa'].iloc[0]
    thresholds = [math.floor(player_pa / 10) * 10, 500]
    if isinstance(date_min, datetime.date):
        thresholds.append(weeks_after(date_min) * 20)
    min_pa = min(thresholds)

    df_summ_filter = df_summ[df_summ['pa'] >= min_pa]
    df_summ_filter_pct = df_summ_filter.rank(pct=True, ascending=True)
    df_summ_player_pct = df_summ_filter_pct.xs(batter_select, level=0)
    return df_summ_filter, df_summ_filter_pct, df_summ_player, df_summ_player_pct
506
+
507
+
508
def df_summ_batter_pitch_up(df=pd.DataFrame()):
    """Summarise pitch-level rows per batter and pitch category.

    Rows without a ``pitch_category`` are dropped, the remaining rows are
    grouped by ``batter_id``, ``batter_name`` and ``pitch_category``, counting
    stats are aggregated, and rate stats are derived from them. A rate is NaN
    wherever its denominator is 0.

    Args:
        df: Pitch-level frame carrying the indicator/value columns named in
            the aggregation below.

    Returns:
        One row per (batter, pitch category) with totals and rate columns.
    """
    df_summ_batter_pitch = df.dropna(subset=['pitch_category']).groupby(['batter_id','batter_name','pitch_category']).agg(
        pa = ('pa','sum'),
        ab = ('ab','sum'),
        obp_pa = ('obp','sum'),
        hits = ('hits','sum'),
        on_base = ('on_base','sum'),
        k = ('k','sum'),
        bb = ('bb','sum'),
        bb_minus_k = ('bb_minus_k','sum'),
        csw = ('csw','sum'),
        bip = ('bip','sum'),
        bip_div = ('bip_div','sum'),
        tb = ('tb','sum'),
        woba = ('woba','sum'),
        # Was sourced from 'xwoba_contact', which made woba_percent_contact a
        # copy of the expected value.
        # NOTE(review): assumes the pitch-level frame has a 'woba_contact'
        # column (the pitcher module's df_update creates one) - confirm.
        woba_contact = ('woba_contact','sum'),
        xwoba = ('xwoba','sum'),
        # Was sourced from 'xwoba', which also counts walks, hit-by-pitches
        # and strikeouts if the frame is built like the pitcher version.
        xwoba_contact = ('xwoba_contact','sum'),
        woba_codes = ('woba_codes','sum'),
        xwoba_codes = ('xwoba_codes','sum'),
        hard_hit = ('hard_hit','sum'),
        barrel = ('barrel','sum'),
        sweet_spot = ('sweet_spot','sum'),
        max_launch_speed = ('launch_speed','max'),
        launch_speed_90 = ('launch_speed',percentile(90)),
        launch_speed = ('launch_speed','mean'),
        launch_angle = ('launch_angle','mean'),
        pitches = ('is_pitch','sum'),
        swings = ('swings','sum'),
        in_zone = ('in_zone','sum'),
        out_zone = ('out_zone','sum'),
        whiffs = ('whiffs','sum'),
        zone_swing = ('zone_swing','sum'),
        zone_contact = ('zone_contact','sum'),
        ozone_swing = ('ozone_swing','sum'),
        ozone_contact = ('ozone_contact','sum'),
        ground_ball = ('trajectory_ground_ball','sum'),
        line_drive = ('trajectory_line_drive','sum'),
        fly_ball = ('trajectory_fly_ball','sum'),
        pop_up = ('trajectory_popup','sum'),
        # 'count' = number of rows with a non-null attack zone.
        attack_zone = ('attack_zone','count'),
        heart = ('heart','sum'),
        shadow = ('shadow','sum'),
        chase = ('chase','sum'),
        waste = ('waste','sum'),
        heart_swing = ('heart_swing','sum'),
        shadow_swing = ('shadow_swing','sum'),
        chase_swing = ('chase_swing','sum'),
        waste_swing = ('waste_swing','sum'),
    ).reset_index()

    def ratio(numerator, denominator):
        """Divide two columns, giving NaN where the denominator is 0."""
        return (numerator / denominator.where(denominator != 0)).astype('float64')

    summ = df_summ_batter_pitch

    for name, num, den in (
        ('avg', 'hits', 'ab'),
        ('obp', 'on_base', 'obp_pa'),
        ('slg', 'tb', 'ab'),
    ):
        summ[name] = ratio(summ[num], summ[den])

    summ['ops'] = summ['obp'] + summ['slg']

    for name, num, den in (
        ('k_percent', 'k', 'pa'),
        ('bb_percent', 'bb', 'pa'),
        ('bb_minus_k_percent', 'bb_minus_k', 'pa'),
        ('bb_over_k_percent', 'bb', 'k'),
        ('csw_percent', 'csw', 'pitches'),
        # Batted-ball quality rates use bip_div (rows with a launch speed).
        ('sweet_spot_percent', 'sweet_spot', 'bip_div'),
        ('woba_percent', 'woba', 'woba_codes'),
        ('woba_percent_contact', 'woba_contact', 'bip'),
        ('hard_hit_percent', 'hard_hit', 'bip_div'),
        ('barrel_percent', 'barrel', 'bip_div'),
        ('zone_contact_percent', 'zone_contact', 'zone_swing'),
        ('zone_swing_percent', 'zone_swing', 'in_zone'),
        ('zone_percent', 'in_zone', 'pitches'),
    ):
        summ[name] = ratio(summ[num], summ[den])

    # Chase rate is measured against pitches that were not in the zone.
    summ['chase_percent'] = ratio(summ['ozone_swing'], summ['pitches'] - summ['in_zone'])

    for name, num, den in (
        ('chase_contact', 'ozone_contact', 'ozone_swing'),
        ('swing_percent', 'swings', 'pitches'),
        ('whiff_rate', 'whiffs', 'swings'),
        ('swstr_rate', 'whiffs', 'pitches'),
        ('heart_zone_percent', 'heart', 'attack_zone'),
        ('shadow_zone_percent', 'shadow', 'attack_zone'),
        ('chase_zone_percent', 'chase', 'attack_zone'),
        ('waste_zone_percent', 'waste', 'attack_zone'),
        ('heart_zone_swing_percent', 'heart_swing', 'heart'),
        ('shadow_zone_swing_percent', 'shadow_swing', 'shadow'),
        ('chase_zone_swing_percent', 'chase_swing', 'chase'),
        ('waste_zone_swing_percent', 'waste_swing', 'waste'),
        ('xwoba_percent', 'xwoba', 'xwoba_codes'),
        ('xwoba_percent_contact', 'xwoba_contact', 'bip'),
    ):
        summ[name] = ratio(summ[num], summ[den])

    summ['bip'] = summ['bip'].fillna(0)

    return summ
pitcher_update.py ADDED
@@ -0,0 +1,573 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+ import math
5
+ import pickle
6
+
7
+ loaded_model = joblib.load('joblib_model/barrel_model.joblib')
8
+ in_zone_model = joblib.load('joblib_model/in_zone_model_knn_20240410.joblib')
9
+ attack_zone_model = joblib.load('joblib_model/model_attack_zone.joblib')
10
+ xwoba_model = joblib.load('joblib_model/xwoba_model.joblib')
11
+ px_model = joblib.load('joblib_model/linear_reg_model_x.joblib')
12
+ pz_model = joblib.load('joblib_model/linear_reg_model_z.joblib')
13
+
14
+
15
def percentile(n):
    """Return an aggregation function computing the ``n``-th percentile.

    The returned callable wraps ``np.nanpercentile``, so NaN entries are
    ignored, and is named ``percentile_<n>``.
    """
    def percentile_(values):
        return np.nanpercentile(values, n)

    # Carry the requested percentile in the function's name.
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_
20
+
21
+
22
def df_update(df=pd.DataFrame()):
    """Add derived per-pitch indicator and value columns to a pitch frame.

    The frame is modified in place and also returned. Columns read:
    ``zone``, ``x``, ``y``, ``px``, ``pz``, ``sz_top``, ``sz_bot``,
    ``event_type``, ``play_description``, ``play_code``, ``is_swing``,
    ``is_pitch``, ``launch_speed``, ``launch_angle`` and ``trajectory``.
    Predictions come from the module-level joblib models.

    Args:
        df: Pitch-level DataFrame holding the columns listed above.

    Returns:
        The same DataFrame with the derived columns added.
    """
    zone_features = ['px', 'pz', 'sz_top', 'sz_bot']

    # Treat a strike-zone edge of exactly 0 as missing.
    df.loc[df['sz_top'] == 0, 'sz_top'] = np.nan
    df.loc[df['sz_bot'] == 0, 'sz_bot'] = np.nan

    # A positive zone below 10 is in-zone; a zone that is not > 0
    # (including NaN) is left unknown.
    df['in_zone'] = [x < 10 if x > 0 else np.nan for x in df['zone']]

    # Fill missing plate locations from the x / y coordinates.
    missing_px = df['x'].notna() & df['px'].isna()
    if missing_px.any():
        df.loc[missing_px, 'px'] = px_model.predict(df.loc[missing_px, ['x']])
        # Checked separately so the model is never asked to predict on an
        # empty selection.
        missing_pz = df['y'].notna() & df['pz'].isna()
        if missing_pz.any():
            # NOTE(review): pz is predicted with px_model plus a 3.2 offset
            # while pz_model is loaded but unused here - confirm this is
            # intentional.
            df.loc[missing_pz, 'pz'] = px_model.predict(df.loc[missing_pz, ['y']]) + 3.2

    # Predict in_zone where the zone is unknown but the location is known.
    zone_unknown = df['px'].notna() & df['in_zone'].isna() & df['sz_top'].notna()
    if zone_unknown.any():
        print('We found missing data')
        # The model needs all four features; skip the call when no row
        # qualifies instead of predicting on zero rows.
        predictable = zone_unknown & df['pz'].notna() & df['sz_bot'].notna()
        if predictable.any():
            df.loc[predictable, 'in_zone'] = in_zone_model.predict(
                df.loc[predictable, zone_features].values)

    hit_codes = ['single',
                 'double','home_run', 'triple']

    ab_codes = ['single', 'strikeout', 'field_out',
                'grounded_into_double_play', 'fielders_choice', 'force_out',
                'double', 'field_error', 'home_run', 'triple',
                'double_play',
                'fielders_choice_out', 'strikeout_double_play',
                'other_out','triple_play']

    obp_true_codes = ['single', 'walk',
                      'double','home_run', 'triple',
                      'hit_by_pitch', 'intent_walk']

    obp_codes = ['single', 'strikeout', 'walk', 'field_out',
                 'grounded_into_double_play', 'fielders_choice', 'force_out',
                 'double', 'sac_fly', 'field_error', 'home_run', 'triple',
                 'hit_by_pitch', 'double_play', 'intent_walk',
                 'fielders_choice_out', 'strikeout_double_play',
                 'sac_fly_double_play',
                 'other_out','triple_play']

    bip_codes = ['In play, no out', 'In play, run(s)','In play, out(s)']

    # Plain membership flags.
    df['hits'] = df.event_type.isin(hit_codes)
    df['ab'] = df.event_type.isin(ab_codes)
    df['on_base'] = df.event_type.isin(obp_true_codes)
    df['obp'] = df.event_type.isin(obp_codes)
    df['bip'] = df.play_description.isin(bip_codes)

    # Rows with a launch speed are the denominator for batted-ball rates.
    df['bip_div'] = ~df.launch_speed.isna()

    # Barrels come from the classifier; rows without a launch speed are
    # reset to NaN afterwards.
    df['barrel'] = loaded_model.predict(df[['launch_speed','launch_angle']].fillna(0).values)
    df.loc[df['launch_speed'].isna(), 'barrel'] = np.nan

    # np.select with a NaN default: a missing measurement gives 0, a hit in
    # range gives 1, and everything else stays NaN.
    conditions_ss = [
        (df['launch_angle'].isna()),
        (df['launch_angle'] >= 8) & (df['launch_angle'] <= 32)
    ]
    df['sweet_spot'] = np.select(conditions_ss, [False, True], default=np.nan)

    conditions_hh = [
        (df['launch_speed'].isna()),
        (df['launch_speed'] >= 94.5)
    ]
    df['hard_hit'] = np.select(conditions_hh, [False, True], default=np.nan)

    # Total bases per hit type.
    conditions_tb = [
        (df['event_type']=='single'),
        (df['event_type']=='double'),
        (df['event_type']=='triple'),
        (df['event_type']=='home_run'),
    ]
    df['tb'] = np.select(conditions_tb, [1, 2, 3, 4], default=np.nan)

    # wOBA weight per event: outs 0, walk 0.696, hit-by-pitch 0.726,
    # single 0.883, double 1.244, triple 1.569, home run 2.004.
    conditions_woba = [
        (df['event_type'].isin(['strikeout', 'field_out', 'sac_fly', 'force_out',
                                'grounded_into_double_play', 'fielders_choice', 'field_error',
                                'sac_bunt', 'double_play', 'fielders_choice_out', 'strikeout_double_play',
                                'sac_fly_double_play', 'other_out'])),
        (df['event_type']=='walk'),
        (df['event_type']=='hit_by_pitch'),
        (df['event_type']=='single'),
        (df['event_type']=='double'),
        (df['event_type']=='triple'),
        (df['event_type']=='home_run'),
    ]
    choices_woba = [0,
                    0.696,
                    0.726,
                    0.883,
                    1.244,
                    1.569,
                    2.004]
    df['woba'] = np.select(conditions_woba, choices_woba, default=np.nan)

    woba_codes = ['strikeout', 'field_out', 'single', 'walk', 'hit_by_pitch',
                  'double', 'sac_fly', 'force_out', 'home_run',
                  'grounded_into_double_play', 'fielders_choice', 'field_error',
                  'triple', 'sac_bunt', 'double_play',
                  'fielders_choice_out', 'strikeout_double_play',
                  'sac_fly_double_play', 'other_out']

    # 1 for events that count in the wOBA denominator, NaN otherwise.
    df['woba_codes'] = np.select([df['event_type'].isin(woba_codes)], [1], default=np.nan)

    # wOBA value restricted to balls in play.
    df['woba_contact'] = np.where(df['bip'].values == 1, df['woba'].values, np.nan)

    df['whiffs'] = df.play_code.isin(['S', 'W', 'T']).astype(int)
    df['csw'] = df.play_code.isin(['S', 'W', 'T', 'C']).astype(int)
    df['swings'] = [1 if x == True else 0 for x in df.is_swing]

    df['out_zone'] = df.in_zone == False
    df['zone_swing'] = (df.in_zone == True)&(df.swings == 1)
    df['zone_contact'] = (df.in_zone == True)&(df.swings == 1)&(df.whiffs == 0)
    df['ozone_swing'] = (df.in_zone==False)&(df.swings == 1)
    df['ozone_contact'] = (df.in_zone==False)&(df.swings == 1)&(df.whiffs == 0)

    # Any event type containing 'strikeout' counts as a strikeout.
    strikeout_events = [e for e in df.event_type.dropna().unique() if 'strikeout' in e]
    df['k'] = df.event_type.isin(strikeout_events)
    df['bb'] = df.event_type.isin(['walk','intent_walk'])

    df['k_minus_bb'] = df['k'].astype(np.float32)-df['bb'].astype(np.float32)
    df['bb_minus_k'] = df['bb'].astype(np.float32)-df['k'].astype(np.float32)

    # A row counts as a plate appearance when event_type holds a string.
    df['pa'] = [1 if isinstance(x, str) else 0 for x in df.event_type]
    df['pitches'] = [1 if x else 0 for x in df.is_pitch]

    # Constant key for grouping the whole frame into one bucket.
    df['average'] = 'average'

    # Fold bunt trajectories into the four main categories.
    df.loc[df['trajectory'] == 'bunt_popup','trajectory'] = 'popup'
    df.loc[df['trajectory'] == 'bunt_grounder','trajectory'] = 'ground_ball'
    df.loc[df['trajectory'] == '','trajectory'] = np.nan
    df.loc[df['trajectory'] == 'bunt_line_drive','trajectory'] = 'line_drive'

    # Reindex to the four expected columns so a category that does not
    # occur in this frame yields a zero column instead of a length mismatch.
    trajectory_cols = ['trajectory_fly_ball','trajectory_ground_ball',
                       'trajectory_line_drive','trajectory_popup']
    trajectory_dummies = pd.get_dummies(df['trajectory'], prefix='trajectory')
    df[trajectory_cols] = trajectory_dummies.reindex(columns=trajectory_cols, fill_value=0)

    # Attack zone is predicted only for rows with all four location features.
    df['attack_zone'] = np.nan
    has_location = df[zone_features].notna().all(axis=1)
    if has_location.any():
        df.loc[has_location, 'attack_zone'] = attack_zone_model.predict(
            df.loc[has_location, zone_features])

    df['heart'] = df['attack_zone'] == 0
    df['shadow'] = df['attack_zone'] == 1
    df['chase'] = df['attack_zone'] == 2
    df['waste'] = df['attack_zone'] == 3

    df['heart_swing'] = (df['attack_zone'] == 0)&(df['swings']==1)
    df['shadow_swing'] = (df['attack_zone'] == 1)&(df['swings']==1)
    df['chase_swing'] = (df['attack_zone'] == 2)&(df['swings']==1)
    df['waste_swing'] = (df['attack_zone'] == 3)&(df['swings']==1)

    df['heart_whiff'] = (df['attack_zone'] == 0)&(df['whiffs']==1)
    df['shadow_whiff'] = (df['attack_zone'] == 1)&(df['whiffs']==1)
    df['chase_whiff'] = (df['attack_zone'] == 2)&(df['whiffs']==1)
    df['waste_whiff'] = (df['attack_zone'] == 3)&(df['whiffs']==1)

    # Expected wOBA: class probabilities weighted by the out / single /
    # double / triple / home-run values used for 'woba' above.
    df['woba_pred'] = np.nan
    df['woba_pred_contact'] = np.nan
    xwoba_weights = [0, 0.883, 1.244, 1.569, 2.004]
    has_batted_ball = df[['launch_angle','launch_speed']].notna().all(axis=1)
    if has_batted_ball.any():
        outcome_proba = xwoba_model.predict_proba(
            df.loc[has_batted_ball, ['launch_angle','launch_speed']])
        expected = (outcome_proba * xwoba_weights).sum(axis=1)
        df.loc[has_batted_ball, 'woba_pred'] = expected
        df.loc[has_batted_ball, 'woba_pred_contact'] = expected

    # Events without a batted ball get their fixed value; these override any
    # model value on the same row. Applied whether or not the frame contains
    # a batted ball.
    df.loc[df['event_type'].isin(['walk']),'woba_pred'] = 0.696
    df.loc[df['event_type'].isin(['hit_by_pitch']),'woba_pred'] = 0.726
    df.loc[df['event_type'].isin(['strikeout','strikeout_double_play']),'woba_pred'] = 0

    # 1 for every row that received a woba_pred value above, NaN otherwise.
    df['xwoba_codes'] = np.nan
    df.loc[has_batted_ball, 'xwoba_codes'] = 1
    df.loc[df['event_type'].isin(['walk']),'xwoba_codes'] = 1
    df.loc[df['event_type'].isin(['hit_by_pitch']),'xwoba_codes'] = 1
    df.loc[df['event_type'].isin(['strikeout','strikeout_double_play']),'xwoba_codes'] = 1
    return df
302
+
303
def df_update_summ(df=pd.DataFrame()):
    """Aggregate pitch-level rows into one summary row per pitcher.

    Rows are grouped by ``pitcher_id`` and ``pitcher_name``; the mapping
    below lists each output column with its source column and aggregation.
    The group keys are returned as ordinary columns.
    """
    # output column -> (source column, aggregation); dict order fixes the
    # output column order.
    aggregations = {
        'pa': ('pa', 'sum'),
        'ab': ('ab', 'sum'),
        'obp_pa': ('obp', 'sum'),
        'hits': ('hits', 'sum'),
        'on_base': ('on_base', 'sum'),
        'k': ('k', 'sum'),
        'bb': ('bb', 'sum'),
        'bb_minus_k': ('bb_minus_k', 'sum'),
        'csw': ('csw', 'sum'),
        'bip': ('bip', 'sum'),
        'bip_div': ('bip_div', 'sum'),
        'tb': ('tb', 'sum'),
        'woba': ('woba', 'sum'),
        'woba_contact': ('woba_contact', 'sum'),
        'xwoba': ('woba_pred', 'sum'),
        'xwoba_contact': ('woba_pred_contact', 'sum'),
        'woba_codes': ('woba_codes', 'sum'),
        'xwoba_codes': ('xwoba_codes', 'sum'),
        'hard_hit': ('hard_hit', 'sum'),
        'barrel': ('barrel', 'sum'),
        'sweet_spot': ('sweet_spot', 'sum'),
        'max_launch_speed': ('launch_speed', 'max'),
        'launch_speed_90': ('launch_speed', percentile(90)),
        'launch_speed': ('launch_speed', 'mean'),
        'launch_angle': ('launch_angle', 'mean'),
        'pitches': ('is_pitch', 'sum'),
        'swings': ('swings', 'sum'),
        'in_zone': ('in_zone', 'sum'),
        'out_zone': ('out_zone', 'sum'),
        'whiffs': ('whiffs', 'sum'),
        'zone_swing': ('zone_swing', 'sum'),
        'zone_contact': ('zone_contact', 'sum'),
        'ozone_swing': ('ozone_swing', 'sum'),
        'ozone_contact': ('ozone_contact', 'sum'),
        'ground_ball': ('trajectory_ground_ball', 'sum'),
        'line_drive': ('trajectory_line_drive', 'sum'),
        'fly_ball': ('trajectory_fly_ball', 'sum'),
        'pop_up': ('trajectory_popup', 'sum'),
        # 'count' = number of rows with a non-null attack zone.
        'attack_zone': ('attack_zone', 'count'),
        'heart': ('heart', 'sum'),
        'shadow': ('shadow', 'sum'),
        'chase': ('chase', 'sum'),
        'waste': ('waste', 'sum'),
        'heart_swing': ('heart_swing', 'sum'),
        'shadow_swing': ('shadow_swing', 'sum'),
        'chase_swing': ('chase_swing', 'sum'),
        'waste_swing': ('waste_swing', 'sum'),
    }
    per_pitcher = df.groupby(['pitcher_id', 'pitcher_name'])
    return per_pitcher.agg(**aggregations).reset_index()
354
+
355
def df_update_summ_avg(df=pd.DataFrame()):
    # Groups the rows by the 'average' column, aggregates them, and returns
    # the result with the group key restored as a regular column.
    # NOTE(review): agg() is called without any aggregation spec, which
    # pandas appears to reject with a TypeError - confirm, then either fill
    # in the intended aggregations (presumably the same set df_update_summ
    # uses) or remove this function.
    df_summ_avg = df.groupby(['average']).agg(

    ).reset_index()
    return df_summ_avg
360
+
361
def df_summ_changes(df_summ=pd.DataFrame()):
    """Add rate statistics to an aggregated summary frame.

    Each rate is ``numerator / denominator`` computed column-wise, and is NaN
    wherever the denominator is 0 (or, for the per-pitch rates
    ``zone_percent``, ``swing_percent`` and ``swstr_rate``, not positive).
    The division is aligned on the frame's own index, so it does not require
    a default 0..n-1 index.

    Args:
        df_summ: Summary frame holding the counting columns referenced
            below. The new columns are added to this frame in place.

    Returns:
        The frame with the rate columns added and rows whose ``bip`` is NaN
        removed.
    """
    def ratio(numerator, denominator, positive_only=False):
        """Divide two columns, giving NaN where the denominator is unusable."""
        usable = denominator > 0 if positive_only else denominator != 0
        return (numerator / denominator.where(usable)).astype('float64')

    for name, num, den in (
        ('avg', 'hits', 'ab'),
        ('obp', 'on_base', 'obp_pa'),
        ('slg', 'tb', 'ab'),
    ):
        df_summ[name] = ratio(df_summ[num], df_summ[den])

    df_summ['ops'] = df_summ['obp'] + df_summ['slg']

    for name, num, den in (
        ('k_percent', 'k', 'pa'),
        ('bb_percent', 'bb', 'pa'),
        ('bb_minus_k_percent', 'bb_minus_k', 'pa'),
        ('bb_over_k_percent', 'bb', 'k'),
        ('csw_percent', 'csw', 'pitches'),
        # Batted-ball quality rates use bip_div as the denominator.
        ('sweet_spot_percent', 'sweet_spot', 'bip_div'),
        ('woba_percent', 'woba', 'woba_codes'),
        ('woba_percent_contact', 'woba_contact', 'bip'),
        ('hard_hit_percent', 'hard_hit', 'bip_div'),
        ('barrel_percent', 'barrel', 'bip_div'),
        ('zone_contact_percent', 'zone_contact', 'zone_swing'),
        ('zone_swing_percent', 'zone_swing', 'in_zone'),
    ):
        df_summ[name] = ratio(df_summ[num], df_summ[den])

    df_summ['zone_percent'] = ratio(df_summ['in_zone'], df_summ['pitches'], positive_only=True)

    # Chase rate is measured against pitches that were not in the zone.
    df_summ['chase_percent'] = ratio(df_summ['ozone_swing'], df_summ['pitches'] - df_summ['in_zone'])
    df_summ['chase_contact'] = ratio(df_summ['ozone_contact'], df_summ['ozone_swing'])

    df_summ['swing_percent'] = ratio(df_summ['swings'], df_summ['pitches'], positive_only=True)
    df_summ['whiff_rate'] = ratio(df_summ['whiffs'], df_summ['swings'])
    df_summ['swstr_rate'] = ratio(df_summ['whiffs'], df_summ['pitches'], positive_only=True)

    for name, num, den in (
        ('ground_ball_percent', 'ground_ball', 'bip'),
        ('line_drive_percent', 'line_drive', 'bip'),
        ('fly_ball_percent', 'fly_ball', 'bip'),
        ('pop_up_percent', 'pop_up', 'bip'),
        ('heart_zone_percent', 'heart', 'attack_zone'),
        ('shadow_zone_percent', 'shadow', 'attack_zone'),
        ('chase_zone_percent', 'chase', 'attack_zone'),
        ('waste_zone_percent', 'waste', 'attack_zone'),
        ('heart_zone_swing_percent', 'heart_swing', 'heart'),
        ('shadow_zone_swing_percent', 'shadow_swing', 'shadow'),
        ('chase_zone_swing_percent', 'chase_swing', 'chase'),
        ('waste_zone_swing_percent', 'waste_swing', 'waste'),
        ('xwoba_percent', 'xwoba', 'xwoba_codes'),
        ('xwoba_percent_contact', 'xwoba_contact', 'bip'),
    ):
        df_summ[name] = ratio(df_summ[num], df_summ[den])

    df_summ = df_summ.dropna(subset=['bip'])
    return df_summ
441
+
442
def df_summ_filter_out(df_summ=pd.DataFrame(),batter_select = 0):
    """Build the comparison pool and percentile ranks for one player.

    The pool keeps every row whose ``pa`` is at least the smaller of 500 and
    the selected player's ``pa`` rounded down to a multiple of 10.

    Args:
        df_summ: Summary frame with a MultiIndex whose first level holds the
            player id, and a ``pa`` column.
        batter_select: Player id to look up on index level 0.

    Returns:
        Tuple ``(pool, pool_percentiles, player_rows, player_percentiles)``.
    """
    df_summ_player = df_summ.xs(batter_select, level=0)

    # Pull out a scalar explicitly; math.floor() on a one-element Series
    # relies on an implicit float conversion that pandas has deprecated.
    player_pa = df_summ_player['pa'].iloc[0]
    min_pa = min(math.floor(player_pa / 10) * 10, 500)

    df_summ_filter = df_summ[df_summ['pa'] >= min_pa]
    df_summ_filter_pct = df_summ_filter.rank(pct=True, ascending=True)
    df_summ_player_pct = df_summ_filter_pct.xs(batter_select, level=0)
    return df_summ_filter, df_summ_filter_pct, df_summ_player, df_summ_player_pct
448
+
449
def df_summ_batter_pitch_up(df=pd.DataFrame()):
    """Aggregate pitch-level rows into one summary row per pitch type.

    Rows with a missing ``pitch_type`` are dropped, the remainder is grouped
    by ``pitcher_id``, ``pitcher_name`` and ``pitch_type``, counting columns
    are aggregated, and rate columns are derived from those totals.  A rate
    is NaN whenever its denominator is zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Pitch-level data containing every source column named in the
        aggregation below.

    Returns
    -------
    pandas.DataFrame
        One row per (pitcher_id, pitcher_name, pitch_type) with a default
        integer index, the aggregated totals and the derived rate columns.
    """
    # NOTE(review): the function name says "batter" but the grouping keys are
    # pitcher_id / pitcher_name - confirm this is intended.
    # NOTE(review): `woba_contact` is sourced from 'woba_pred_contact' and
    # `xwoba_contact` from 'woba_pred' (same source as `xwoba`).  This looks
    # like a copy/paste slip; left unchanged pending confirmation of the
    # correct source columns.
    df_summ_batter_pitch = df.dropna(subset=['pitch_type']).groupby(
        ['pitcher_id', 'pitcher_name', 'pitch_type']
    ).agg(
        pa = ('pa','sum'),
        ab = ('ab','sum'),
        obp_pa = ('obp','sum'),
        hits = ('hits','sum'),
        on_base = ('on_base','sum'),
        k = ('k','sum'),
        bb = ('bb','sum'),
        bb_minus_k = ('bb_minus_k','sum'),
        csw = ('csw','sum'),
        bip = ('bip','sum'),
        bip_div = ('bip_div','sum'),
        tb = ('tb','sum'),
        woba = ('woba','sum'),
        woba_contact = ('woba_pred_contact','sum'),
        xwoba = ('woba_pred','sum'),
        xwoba_contact = ('woba_pred','sum'),
        woba_codes = ('woba_codes','sum'),
        xwoba_codes = ('xwoba_codes','sum'),
        hard_hit = ('hard_hit','sum'),
        barrel = ('barrel','sum'),
        sweet_spot = ('sweet_spot','sum'),
        max_launch_speed = ('launch_speed','max'),
        launch_speed_90 = ('launch_speed',percentile(90)),
        launch_speed = ('launch_speed','mean'),
        launch_angle = ('launch_angle','mean'),
        pitches = ('is_pitch','sum'),
        swings = ('swings','sum'),
        in_zone = ('in_zone','sum'),
        out_zone = ('out_zone','sum'),
        whiffs = ('whiffs','sum'),
        zone_swing = ('zone_swing','sum'),
        zone_contact = ('zone_contact','sum'),
        ozone_swing = ('ozone_swing','sum'),
        ozone_contact = ('ozone_contact','sum'),
        ground_ball = ('trajectory_ground_ball','sum'),
        line_drive = ('trajectory_line_drive','sum'),
        fly_ball = ('trajectory_fly_ball','sum'),
        pop_up = ('trajectory_popup','sum'),
        attack_zone = ('attack_zone','count'),
        heart = ('heart','sum'),
        shadow = ('shadow','sum'),
        chase = ('chase','sum'),
        waste = ('waste','sum'),
        heart_swing = ('heart_swing','sum'),
        shadow_swing = ('shadow_swing','sum'),
        chase_swing = ('chase_swing','sum'),
        waste_swing = ('waste_swing','sum'),
    ).reset_index()

    summ = df_summ_batter_pitch

    def ratio(numerator, denominator):
        # Zero denominators are masked to NaN first, so the quotient is NaN
        # there instead of inf or a division warning.
        return numerator / denominator.where(denominator != 0)

    # Slash line.
    summ['avg'] = ratio(summ['hits'], summ['ab'])
    summ['obp'] = ratio(summ['on_base'], summ['obp_pa'])
    summ['slg'] = ratio(summ['tb'], summ['ab'])
    summ['ops'] = summ['obp'] + summ['slg']

    # Strikeout / walk rates.
    summ['k_percent'] = ratio(summ['k'], summ['pa'])
    summ['bb_percent'] = ratio(summ['bb'], summ['pa'])
    summ['bb_minus_k_percent'] = ratio(summ['bb_minus_k'], summ['pa'])
    summ['bb_over_k_percent'] = ratio(summ['bb'], summ['k'])

    summ['csw_percent'] = ratio(summ['csw'], summ['pitches'])

    # Contact quality.
    summ['sweet_spot_percent'] = ratio(summ['sweet_spot'], summ['bip_div'])
    summ['woba_percent'] = ratio(summ['woba'], summ['woba_codes'])
    summ['woba_percent_contact'] = ratio(summ['woba_contact'], summ['bip'])
    summ['hard_hit_percent'] = ratio(summ['hard_hit'], summ['bip_div'])
    summ['barrel_percent'] = ratio(summ['barrel'], summ['bip_div'])

    # Plate discipline.
    summ['zone_contact_percent'] = ratio(summ['zone_contact'], summ['zone_swing'])
    summ['zone_swing_percent'] = ratio(summ['zone_swing'], summ['in_zone'])
    summ['zone_percent'] = ratio(summ['in_zone'], summ['pitches'])
    # Out-of-zone pitches are derived as pitches - in_zone rather than taken
    # from the aggregated `out_zone` column.
    summ['chase_percent'] = ratio(summ['ozone_swing'], summ['pitches'] - summ['in_zone'])
    summ['chase_contact'] = ratio(summ['ozone_contact'], summ['ozone_swing'])
    summ['swing_percent'] = ratio(summ['swings'], summ['pitches'])
    summ['whiff_rate'] = ratio(summ['whiffs'], summ['swings'])
    summ['swstr_rate'] = ratio(summ['whiffs'], summ['pitches'])

    # Share of pitches in each attack zone.
    summ['heart_zone_percent'] = ratio(summ['heart'], summ['attack_zone'])
    summ['shadow_zone_percent'] = ratio(summ['shadow'], summ['attack_zone'])
    summ['chase_zone_percent'] = ratio(summ['chase'], summ['attack_zone'])
    summ['waste_zone_percent'] = ratio(summ['waste'], summ['attack_zone'])

    # Swing rate within each attack zone.
    summ['heart_zone_swing_percent'] = ratio(summ['heart_swing'], summ['heart'])
    summ['shadow_zone_swing_percent'] = ratio(summ['shadow_swing'], summ['shadow'])
    summ['chase_zone_swing_percent'] = ratio(summ['chase_swing'], summ['chase'])
    summ['waste_zone_swing_percent'] = ratio(summ['waste_swing'], summ['waste'])

    # Expected wOBA.
    summ['xwoba_percent'] = ratio(summ['xwoba'], summ['xwoba_codes'])
    summ['xwoba_percent_contact'] = ratio(summ['xwoba_contact'], summ['bip'])

    summ['bip'] = summ['bip'].fillna(0)

    return summ