nesticot committed on
Commit
82c2db4
·
verified ·
1 Parent(s): c944979

Upload api_scraper.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. api_scraper.py +36 -42
api_scraper.py CHANGED
@@ -100,20 +100,25 @@ class MLB_Scrape:
100
  # Make API call to retrieve game schedule
101
  game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()
102
  try:
103
- # Extract relevant data from the API response
104
- game_list = [item for sublist in [[y['gamePk'] for y in x['games']] for x in game_call['dates']] for item in sublist]
105
- time_list = [item for sublist in [[y['gameDate'] for y in x['games']] for x in game_call['dates']] for item in sublist]
106
- date_list = [item for sublist in [[y['officialDate'] for y in x['games']] for x in game_call['dates']] for item in sublist]
107
- away_team_list = [item for sublist in [[y['teams']['away']['team']['name'] for y in x['games']] for x in game_call['dates']] for item in sublist]
108
- away_team_id_list = [item for sublist in [[y['teams']['away']['team']['id'] for y in x['games']] for x in game_call['dates']] for item in sublist]
109
- home_team_list = [item for sublist in [[y['teams']['home']['team']['name'] for y in x['games']] for x in game_call['dates']] for item in sublist]
110
- home_team_id_list = [item for sublist in [[y['teams']['home']['team']['id'] for y in x['games']] for x in game_call['dates']] for item in sublist]
111
- state_list = [item for sublist in [[y['status']['codedGameState'] for y in x['games']] for x in game_call['dates']] for item in sublist]
112
- venue_id = [item for sublist in [[y['venue']['id'] for y in x['games']] for x in game_call['dates']] for item in sublist]
113
- venue_name = [item for sublist in [[y['venue']['name'] for y in x['games']] for x in game_call['dates']] for item in sublist]
114
- gameday_type = [item for sublist in [[y['gamedayType'] for y in x['games']] for x in game_call['dates']] for item in sublist]
115
- # Create a Polars DataFrame with the extracted data
116
-
 
 
 
 
 
117
 
118
  # Create a Polars DataFrame with the extracted data
119
  game_df = pl.DataFrame(data={'game_id': game_list,
@@ -154,41 +159,30 @@ class MLB_Scrape:
154
  return game_df
155
 
156
 
157
- # def get_data(self, game_list_input: list):
158
- # """
159
- # Retrieves live game data for a list of game IDs in parallel.
160
 
161
- # Parameters:
162
- # - game_list_input (list): A list of game IDs for which to retrieve live data.
163
 
164
- # Returns:
165
- # - data_total (list): A list of JSON responses containing live game data for each game ID.
166
- # """
167
- # data_total = []
168
- # print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')
169
 
170
- # def fetch_data(game_id):
171
- # r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live')
172
- # return r.json()
173
 
174
- # with ThreadPoolExecutor() as executor:
175
- # futures = {executor.submit(fetch_data, game_id): game_id for game_id in game_list_input}
176
- # for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="iteration"):
177
- # data_total.append(future.result())
178
 
179
- # return data_total
180
-
181
-
182
- def get_data(self,game_list_input = [748540]):
183
- data_total = []
184
- #n_count = 0
185
- print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')
186
- for i in tqdm(range(len(game_list_input)), desc="Processing", unit="iteration"):
187
- r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_list_input[i]}/feed/live')
188
- data_total.append(r.json())
189
  return data_total
190
 
191
-
192
  def get_data_df(self, data_list):
193
  """
194
  Converts a list of game data JSON objects into a Polars DataFrame.
 
100
  # Make API call to retrieve game schedule
101
  game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()
102
  try:
103
+ def safe_get(d, keys, default=np.nan):
104
+ """Safely retrieve nested dictionary values."""
105
+ for key in keys:
106
+ d = d.get(key, {})
107
+ if not isinstance(d, dict):
108
+ return d # Return value if it's not a dict
109
+ return default # Return default if keys don't exist
110
+
111
+ game_list = [item for sublist in [[y.get('gamePk', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
112
+ time_list = [item for sublist in [[y.get('gameDate', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
113
+ date_list = [item for sublist in [[y.get('officialDate', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
114
+ away_team_list = [item for sublist in [[safe_get(y, ['teams', 'away', 'team', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
115
+ away_team_id_list = [item for sublist in [[safe_get(y, ['teams', 'away', 'team', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
116
+ home_team_list = [item for sublist in [[safe_get(y, ['teams', 'home', 'team', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
117
+ home_team_id_list = [item for sublist in [[safe_get(y, ['teams', 'home', 'team', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
118
+ state_list = [item for sublist in [[safe_get(y, ['status', 'codedGameState'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
119
+ venue_id = [item for sublist in [[safe_get(y, ['venue', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
120
+ venue_name = [item for sublist in [[safe_get(y, ['venue', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
121
+ gameday_type = [item for sublist in [[safe_get(y, ['gamedayType'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
122
 
123
  # Create a Polars DataFrame with the extracted data
124
  game_df = pl.DataFrame(data={'game_id': game_list,
 
159
  return game_df
160
 
161
 
162
+ def get_data(self, game_list_input: list):
163
+ """
164
+ Retrieves live game data for a list of game IDs in parallel.
165
 
166
+ Parameters:
167
+ - game_list_input (list): A list of game IDs for which to retrieve live data.
168
 
169
+ Returns:
170
+ - data_total (list): A list of JSON responses containing live game data for each game ID.
171
+ """
172
+ data_total = []
173
+ print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')
174
 
175
+ def fetch_data(game_id):
176
+ r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live')
177
+ return r.json()
178
 
179
+ with ThreadPoolExecutor() as executor:
180
+ futures = {executor.submit(fetch_data, game_id): game_id for game_id in game_list_input}
181
+ for future in tqdm(as_completed(futures), total=len(futures), desc="Processing", unit="iteration"):
182
+ data_total.append(future.result())
183
 
 
 
 
 
 
 
 
 
 
 
184
  return data_total
185
 
 
186
  def get_data_df(self, data_list):
187
  """
188
  Converts a list of game data JSON objects into a Polars DataFrame.