Spaces:
Sleeping
Sleeping
import datetime | |
from multiprocessing import Pool | |
import numpy as np | |
import pandas as pd | |
from pydantic import BaseModel, Field | |
from typing import Optional | |
from urllib.parse import urljoin | |
from domain.teams import ALL_TEAMS, NFLTeam | |
# When True, scrape_all_team_injury_report maps the per-team scrape over a
# multiprocessing Pool instead of running the teams one after another.
MULTIPROCESSING_ENABLED = False

# Day labels in Monday-first order; index i of one tuple pairs with index i of
# the other.
_DAY_ABBREVIATIONS = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
_DAY_FULL_NAMES = (
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
)

# Day label (abbreviated or full) -> position in the week, Monday == 0.
PRACTICE_WEEK = {
    **{abbr: position for position, abbr in enumerate(_DAY_ABBREVIATIONS)},
    **{full: position for position, full in enumerate(_DAY_FULL_NAMES)},
}

# Full day name -> three-letter abbreviation.
DAY_OF_WEEK_STRING_MAPPING = dict(zip(_DAY_FULL_NAMES, _DAY_ABBREVIATIONS))

WEEK_1_BEGIN_DATE = datetime.datetime(2023, 9, 4)
# NOTE: captured once, when the module is imported.
CURRENT_DATE = datetime.datetime.now()
# One plus the number of whole weeks elapsed since WEEK_1_BEGIN_DATE, never
# lower than 1 (dates before WEEK_1_BEGIN_DATE still count as week 1).
CURRENT_WEEK = max(1, 1 + (CURRENT_DATE - WEEK_1_BEGIN_DATE).days // 7)
CURRENT_SEASON = 2023
class PracticeReportRawRow(BaseModel):
    """One validated row of a team's scraped practice report table.

    Build instances with :meth:`from_raw`, which normalizes a raw record
    (full day names as keys, NaN cells) before validation.
    """

    Team: str
    Player: str
    Position: str
    Injury: str
    # One optional column per day of the week; a day that is absent from the
    # raw record keeps the default ``None``.
    Sun: Optional[str] = None
    Mon: Optional[str] = None
    Tue: Optional[str] = None
    Wed: Optional[str] = None
    Thu: Optional[str] = None
    Fri: Optional[str] = None
    Sat: Optional[str] = None
    # Populated from the "Game Status" key of the raw record.
    game_status: str = Field(alias="Game Status")

    @classmethod
    def replace_nan(cls, value: object) -> object:
        """Return ``""`` for a float NaN; return any other value unchanged.

        Declared as a classmethod because ``from_raw`` calls it as
        ``cls.replace_nan(v)`` without an instance.
        """
        if isinstance(value, float) and np.isnan(value):
            return ""
        return value

    @classmethod
    def from_raw(cls, input_dict) -> "PracticeReportRawRow":
        """Build a row from a raw record dict.

        Keys that are full day names ("Monday", ...) are translated to their
        abbreviated field names via ``DAY_OF_WEEK_STRING_MAPPING``; every
        other key is passed through as-is. Float NaN values are replaced
        with ``""`` before the model is constructed.

        Declared as a classmethod because callers invoke it on the class
        itself (``PracticeReportRawRow.from_raw(record)``).
        """
        normalized = {
            DAY_OF_WEEK_STRING_MAPPING.get(key, key): cls.replace_nan(value)
            for key, value in input_dict.items()
        }
        return cls(**normalized)
def scrape_team_injury_report(team: NFLTeam) -> pd.DataFrame:
    """Scrape and validate one team's practice report for ``CURRENT_WEEK``.

    The report URL is ``team.injury_report_url`` joined with
    ``week/REG-{CURRENT_WEEK}``; the first HTML table found there is used.

    Args:
        team: Team whose ``injury_report_url`` and ``team_full_name`` are read.

    Returns:
        A DataFrame with one row per validated report row. Day-of-week
        columns that survive the all-NA drop are renamed to "1", "2", ... in
        column order, and a "Last Practice Day" column holds the original
        name of the last such column (``None`` when there is none). If
        reading the table fails for any reason, a message is printed and an
        empty DataFrame is returned.
    """
    print(f"Scraping Injury Report for: {team.team_full_name}")
    report_url = urljoin(team.injury_report_url, f"week/REG-{CURRENT_WEEK}")

    try:
        raw_table = pd.read_html(report_url)[0]
    except Exception:
        # Best effort: a team whose page cannot be read contributes no rows.
        print(f"Failed to scrape practice report for: {team.team_full_name}")
        return pd.DataFrame()

    # Tag every record with the team name, then validate it through the model.
    validated_rows = [
        PracticeReportRawRow.from_raw({**record, "Team": team.team_full_name})
        for record in raw_table.to_dict("records")
    ]
    validated_df = pd.DataFrame([row.dict() for row in validated_rows])

    # Drop columns that are entirely NA.
    validated_df.dropna(axis=1, how="all", inplace=True)

    # Replace day-of-week column names with practice day numbers, counted
    # from 1 in column order.
    day_columns = [col for col in validated_df.columns if col in PRACTICE_WEEK]
    numbered = {col: str(number) for number, col in enumerate(day_columns, start=1)}
    validated_df.rename(columns=numbered, inplace=True)

    validated_df["Last Practice Day"] = day_columns[-1] if day_columns else None
    return validated_df
def scrape_all_team_injury_report() -> pd.DataFrame:
    """Scrape every team in ``ALL_TEAMS`` and concatenate the results.

    When ``MULTIPROCESSING_ENABLED`` is true the per-team scrapes are mapped
    over a ``multiprocessing.Pool``; otherwise they run one after another in
    the current process.

    Returns:
        The per-team DataFrames from ``scrape_team_injury_report`` combined
        with ``pd.concat``.
    """
    if not MULTIPROCESSING_ENABLED:
        return pd.concat(list(map(scrape_team_injury_report, ALL_TEAMS)))

    with Pool() as workers:
        per_team_frames = workers.map(scrape_team_injury_report, ALL_TEAMS)
    return pd.concat(per_team_frames)