# YFDashboard/src/queries/nfl_teams/practice_reports.py
# Last commit by Jon Solow (3bd0fa3):
#   "Remove repeated line added for testing that should only be in try-except"
# File size: 3.13 kB
import datetime
from multiprocessing import Pool
import numpy as np
import pandas as pd
from pydantic import BaseModel, Field
from typing import Optional
from urllib.parse import urljoin
from domain.teams import ALL_TEAMS, NFLTeam
# When True, teams are scraped in parallel worker processes.
MULTIPROCESSING_ENABLED = False

# Weekday names, Monday first, so a name's position is its weekday index.
_SHORT_DAY_NAMES = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
_FULL_DAY_NAMES = (
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
)

# Day name (abbreviated or full) -> weekday index, with Monday == 0.
# Abbreviations are inserted first, then the full names.
PRACTICE_WEEK = {
    **{short_name: index for index, short_name in enumerate(_SHORT_DAY_NAMES)},
    **{full_name: index for index, full_name in enumerate(_FULL_DAY_NAMES)},
}

# Full day name -> three-letter abbreviation.
DAY_OF_WEEK_STRING_MAPPING = dict(zip(_FULL_DAY_NAMES, _SHORT_DAY_NAMES))

# Start of week 1; later weeks are counted in 7-day steps from this date.
WEEK_1_BEGIN_DATE = datetime.datetime(2023, 9, 4)
# Captured once, when the module is imported.
CURRENT_DATE = datetime.datetime.now()
_DAYS_SINCE_WEEK_1 = (CURRENT_DATE - WEEK_1_BEGIN_DATE).days
# 1-based week number; any date before week 1 is clamped to week 1.
CURRENT_WEEK = max(1, 1 + _DAYS_SINCE_WEEK_1 // 7)
CURRENT_SEASON = 2023
class PracticeReportRawRow(BaseModel):
    """One validated row of a team's practice report.

    ``Team``, ``Player``, ``Position``, ``Injury`` and ``game_status`` are
    required; the per-day fields default to ``None`` when the input has no
    entry for that day. ``game_status`` is populated from the
    ``"Game Status"`` key of the input.
    """

    Team: str
    Player: str
    Position: str
    Injury: str
    # Per-day entries, keyed by three-letter day abbreviation.
    Sun: Optional[str] = None
    Mon: Optional[str] = None
    Tue: Optional[str] = None
    Wed: Optional[str] = None
    Thu: Optional[str] = None
    Fri: Optional[str] = None
    Sat: Optional[str] = None
    game_status: str = Field(alias="Game Status")

    @classmethod
    def replace_nan(cls, value: object) -> object:
        """Return ``""`` for a float NaN and any other value unchanged.

        Args:
            value: A raw cell value of any type.

        Returns:
            An empty string when ``value`` is a float NaN, otherwise
            ``value`` itself (its type is not changed).
        """
        # The isinstance guard keeps np.isnan away from non-numeric values
        # such as strings, which it cannot handle.
        if isinstance(value, float) and np.isnan(value):
            return ""
        return value

    @classmethod
    def from_raw(cls, input_dict: dict) -> "PracticeReportRawRow":
        """Build a row from a raw mapping of column name to cell value.

        Keys found in ``DAY_OF_WEEK_STRING_MAPPING`` (full day names) are
        replaced by their abbreviation so they match the per-day fields; all
        other keys are passed through as-is. Every value goes through
        ``replace_nan`` first.

        Args:
            input_dict: Mapping of column name to raw cell value.

        Returns:
            The validated ``PracticeReportRawRow``.
        """
        cleaned = {
            DAY_OF_WEEK_STRING_MAPPING.get(key, key): cls.replace_nan(value)
            for key, value in input_dict.items()
        }
        return cls(**cleaned)
def scrape_team_injury_report(team: NFLTeam) -> pd.DataFrame:
    """Fetch and normalize one team's practice report for ``CURRENT_WEEK``.

    The first HTML table found at the team's weekly injury-report URL is
    read, every row is validated through ``PracticeReportRawRow``, columns
    that are entirely NA are dropped, and the remaining day-of-week columns
    are renamed to ``"1"``, ``"2"``, ... in column order.

    Args:
        team: Team whose ``injury_report_url`` and ``team_full_name`` are used.

    Returns:
        The normalized report, with an extra ``"Last Practice Day"`` column
        holding the original name of the last day-of-week column found
        (``None`` if there was none). An empty DataFrame is returned when the
        page could not be read.
    """
    print(f"Scraping Injury Report for: {team.team_full_name}")
    weekly_url = urljoin(team.injury_report_url, f"week/REG-{CURRENT_WEEK}")
    try:
        raw_table = pd.read_html(weekly_url)[0]
    except Exception:
        # Best effort: a team whose page cannot be read contributes no rows.
        print(f"Failed to scrape practice report for: {team.team_full_name}")
        return pd.DataFrame()

    # Tag each scraped record with the team name, then validate it.
    validated_rows = [
        PracticeReportRawRow.from_raw({**record, "Team": team.team_full_name})
        for record in raw_table.to_dict("records")
    ]
    report = pd.DataFrame([row.dict() for row in validated_rows])

    # Drop columns with no data at all (e.g. days missing from the report).
    report = report.dropna(axis=1, how="all")

    # Number the surviving day-of-week columns 1, 2, ... in column order.
    practice_columns = [name for name in report.columns if name in PRACTICE_WEEK]
    numbered_names = {
        name: str(position)
        for position, name in enumerate(practice_columns, start=1)
    }
    report = report.rename(columns=numbered_names)

    report["Last Practice Day"] = practice_columns[-1] if practice_columns else None
    return report
def scrape_all_team_injury_report() -> pd.DataFrame:
    """Scrape the practice report of every team in ``ALL_TEAMS``.

    Teams are processed one after another unless ``MULTIPROCESSING_ENABLED``
    is set, in which case the work is spread over a ``multiprocessing.Pool``.

    Returns:
        All per-team frames concatenated with ``pd.concat``; the row index
        of each per-team frame is kept as-is.
    """
    if not MULTIPROCESSING_ENABLED:
        return pd.concat([scrape_team_injury_report(club) for club in ALL_TEAMS])

    with Pool() as workers:
        per_team_frames = workers.map(scrape_team_injury_report, ALL_TEAMS)
    return pd.concat(per_team_frames)