Upload api_scraper.py with huggingface_hub
Browse files- api_scraper.py +36 -42
api_scraper.py
CHANGED
@@ -100,20 +100,25 @@ class MLB_Scrape:
|
|
100 |
# Make API call to retrieve game schedule
|
101 |
game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()
|
102 |
try:
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
# Create a Polars DataFrame with the extracted data
|
119 |
game_df = pl.DataFrame(data={'game_id': game_list,
|
@@ -154,41 +159,30 @@ class MLB_Scrape:
|
|
154 |
return game_df
|
155 |
|
156 |
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
|
161 |
-
|
162 |
-
|
163 |
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
|
179 |
-
# return data_total
|
180 |
-
|
181 |
-
|
182 |
-
def get_data(self, game_list_input=None):
    """
    Retrieves live game data for a list of game IDs, one request at a time.

    Parameters:
    - game_list_input (list, optional): Game IDs for which to retrieve live data.
      When omitted, the single game ID 748540 is fetched.

    Returns:
    - data_total (list): The parsed JSON response for each game ID, in the same
      order as the input.
    """
    # A None sentinel replaces the former mutable list default; the effective
    # default value is unchanged.
    game_ids = [748540] if game_list_input is None else list(game_list_input)
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')
    if not game_ids:
        # Nothing to fetch.
        return []

    data_total = []
    for game_id in tqdm(game_ids, desc="Processing", unit="iteration"):
        # Without a timeout a stalled connection would block this loop forever.
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live', timeout=30)
        data_total.append(r.json())
    return data_total
|
190 |
|
191 |
-
|
192 |
def get_data_df(self, data_list):
|
193 |
"""
|
194 |
Converts a list of game data JSON objects into a Polars DataFrame.
|
|
|
100 |
# Make API call to retrieve game schedule
|
101 |
game_call = requests.get(url=f'https://statsapi.mlb.com/api/v1/schedule/?sportId={sport_id_str}&gameTypes={game_type_str}&season={year_input_str}&hydrate=lineup,players').json()
|
102 |
try:
|
103 |
+
def safe_get(d, keys, default=np.nan):
    """
    Safely retrieve a value from nested dictionaries.

    Parameters:
    - d: The root object to search; anything that is not a dict yields `default`
      as soon as a key has to be looked up in it.
    - keys: The sequence of keys to follow, outermost first.
    - default: The value returned when the path cannot be followed to the end.

    Returns:
    - The value stored at the end of the path (whatever its type), or `default`
      when a key is missing or an intermediate value is not a dict. An empty
      `keys` sequence returns `d` itself.
    """
    current = d
    for key in keys:
        # Stop as soon as the path breaks: a non-dict cannot be descended into,
        # and it must not be mistaken for the requested leaf value.
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current
|
110 |
+
|
111 |
+
game_list = [item for sublist in [[y.get('gamePk', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
112 |
+
time_list = [item for sublist in [[y.get('gameDate', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
113 |
+
date_list = [item for sublist in [[y.get('officialDate', np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
114 |
+
away_team_list = [item for sublist in [[safe_get(y, ['teams', 'away', 'team', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
115 |
+
away_team_id_list = [item for sublist in [[safe_get(y, ['teams', 'away', 'team', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
116 |
+
home_team_list = [item for sublist in [[safe_get(y, ['teams', 'home', 'team', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
117 |
+
home_team_id_list = [item for sublist in [[safe_get(y, ['teams', 'home', 'team', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
118 |
+
state_list = [item for sublist in [[safe_get(y, ['status', 'codedGameState'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
119 |
+
venue_id = [item for sublist in [[safe_get(y, ['venue', 'id'], np.nan) for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
120 |
+
venue_name = [item for sublist in [[safe_get(y, ['venue', 'name'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
121 |
+
gameday_type = [item for sublist in [[safe_get(y, ['gamedayType'], "") for y in x.get('games', [])] for x in game_call.get('dates', [])] for item in sublist]
|
122 |
|
123 |
# Create a Polars DataFrame with the extracted data
|
124 |
game_df = pl.DataFrame(data={'game_id': game_list,
|
|
|
159 |
return game_df
|
160 |
|
161 |
|
162 |
+
def get_data(self, game_list_input: list):
    """
    Retrieves live game data for a list of game IDs in parallel.

    Parameters:
    - game_list_input (list): A list of game IDs for which to retrieve live data.

    Returns:
    - data_total (list): A list of JSON responses containing live game data,
      one per game ID and in the same order as `game_list_input`.
    """
    print('This May Take a While. Progress Bar shows Completion of Data Retrieval.')

    game_ids = list(game_list_input)
    if not game_ids:
        # Nothing to fetch; skip creating the thread pool.
        return []

    def fetch_data(game_id):
        # Without a timeout a stalled connection would tie up a worker forever.
        r = requests.get(f'https://statsapi.mlb.com/api/v1.1/game/{game_id}/feed/live', timeout=30)
        return r.json()

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in submission order, so the output lines
        # up with the input IDs even though the requests run concurrently.
        data_total = list(tqdm(executor.map(fetch_data, game_ids),
                               total=len(game_ids), desc="Processing", unit="iteration"))

    return data_total
|
185 |
|
|
|
186 |
def get_data_df(self, data_list):
|
187 |
"""
|
188 |
Converts a list of game data JSON objects into a Polars DataFrame.
|