"""Scrape per-team practice/injury report tables into pandas DataFrames."""

import datetime
from multiprocessing import Pool
from typing import Optional
from urllib.parse import urljoin

import numpy as np
import pandas as pd
from pydantic import BaseModel, Field

from domain.teams import ALL_TEAMS, NFLTeam

# When True, teams are scraped in parallel through a multiprocessing Pool.
MULTIPROCESSING_ENABLED = False

# Day label (abbreviated or full) -> index, Monday = 0 ... Sunday = 6.
# Also used as the set of column labels that count as practice-day columns.
PRACTICE_WEEK = {
    "Mon": 0,
    "Tue": 1,
    "Wed": 2,
    "Thu": 3,
    "Fri": 4,
    "Sat": 5,
    "Sun": 6,
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 3,
    "Friday": 4,
    "Saturday": 5,
    "Sunday": 6,
}

# Full day name -> abbreviated day name, i.e. the field names of
# PracticeReportRawRow.
DAY_OF_WEEK_STRING_MAPPING = {
    "Monday": "Mon",
    "Tuesday": "Tue",
    "Wednesday": "Wed",
    "Thursday": "Thu",
    "Friday": "Fri",
    "Saturday": "Sat",
    "Sunday": "Sun",
}

# Reference date from which CURRENT_WEEK is counted.
WEEK_1_BEGIN_DATE = datetime.datetime(2023, 9, 4)
# Evaluated once, at import time.
CURRENT_DATE = datetime.datetime.now()
# 1-based count of whole weeks elapsed since WEEK_1_BEGIN_DATE, never below 1.
# Integer floor division avoids the float round-trip; the max() clamp makes
# the result identical to truncation for dates before week 1.
CURRENT_WEEK = max(1, 1 + (CURRENT_DATE - WEEK_1_BEGIN_DATE).days // 7)
CURRENT_SEASON = 2023


class PracticeReportRawRow(BaseModel):
    """One validated row of a team's scraped practice report table."""

    Team: str
    Player: str
    Position: str
    Injury: str
    # Per-day practice entries; a day that is absent from the scraped table
    # stays None.
    Sun: Optional[str] = None
    Mon: Optional[str] = None
    Tue: Optional[str] = None
    Wed: Optional[str] = None
    Thu: Optional[str] = None
    Fri: Optional[str] = None
    Sat: Optional[str] = None
    # Populated from the scraped "Game Status" column via the alias.
    game_status: str = Field(alias="Game Status")

    @classmethod
    def replace_nan(cls, value: object) -> object:
        """Return "" when value is a float NaN, otherwise value unchanged."""
        if isinstance(value, float) and np.isnan(value):
            return ""
        return value

    @classmethod
    def from_raw(cls, input_dict: dict) -> "PracticeReportRawRow":
        """Build a row from one raw table record.

        Full day-name keys are shortened through DAY_OF_WEEK_STRING_MAPPING
        (other keys pass through untouched) and float NaN values are replaced
        by empty strings before validation.
        """
        normalized = {
            DAY_OF_WEEK_STRING_MAPPING.get(key, key): cls.replace_nan(value)
            for key, value in input_dict.items()
        }
        return cls(**normalized)


def scrape_team_injury_report(team: NFLTeam) -> pd.DataFrame:
    """Scrape and normalize the current week's practice report for one team.

    Args:
        team: Team whose ``injury_report_url`` and ``team_full_name`` are used.

    Returns:
        A DataFrame of validated rows in which the practice-day columns are
        renamed to "1", "2", ... in column order, plus a "Last Practice Day"
        column holding the original label of the last practice-day column
        (None when there is none). An empty DataFrame is returned when the
        page cannot be read.
    """
    print(f"Scraping Injury Report for: {team.team_full_name}")
    # NOTE(review): urljoin replaces the last path segment of the base when the
    # base has no trailing slash - confirm injury_report_url ends with "/".
    injury_report_url = urljoin(team.injury_report_url, f"week/REG-{CURRENT_WEEK}")
    try:
        team_report = pd.read_html(injury_report_url)[0]
    except Exception:
        # Best effort: a team whose page cannot be scraped contributes no rows.
        print(f"Failed to scrape practice report for: {team.team_full_name}")
        return pd.DataFrame()

    validated_row_list = [
        PracticeReportRawRow.from_raw({**record, "Team": team.team_full_name})
        for record in team_report.to_dict("records")
    ]
    validated_df = pd.DataFrame([row.dict() for row in validated_row_list])

    # Drop columns that are entirely NA (day fields left at their None default).
    validated_df = validated_df.dropna(axis=1, how="all")

    # Replace the day-of-week labels with the practice-day ordinal, applying a
    # single rename instead of one in-place rename per column.
    practice_columns = [col for col in validated_df.columns if col in PRACTICE_WEEK]
    ordinal_by_day = {
        col: str(day_idx) for day_idx, col in enumerate(practice_columns, start=1)
    }
    validated_df = validated_df.rename(columns=ordinal_by_day)

    last_practice_day = practice_columns[-1] if practice_columns else None
    validated_df["Last Practice Day"] = last_practice_day
    return validated_df


def scrape_all_team_injury_report() -> pd.DataFrame:
    """Scrape every team in ALL_TEAMS and concatenate the per-team frames.

    Uses a multiprocessing Pool when MULTIPROCESSING_ENABLED is set, otherwise
    scrapes the teams sequentially.

    Returns:
        The concatenation of all per-team DataFrames (row indices are kept as
        produced per team), or an empty DataFrame when there are no teams.
    """
    if MULTIPROCESSING_ENABLED:
        with Pool() as pool:
            team_df_list = pool.map(scrape_team_injury_report, ALL_TEAMS)
    else:
        team_df_list = [scrape_team_injury_report(team) for team in ALL_TEAMS]

    # pd.concat raises ValueError when given nothing to concatenate.
    if not team_df_list:
        return pd.DataFrame()
    return pd.concat(team_df_list)