"""Script for downloading test data."""
import argparse
import logging
import os
import shutil
import ssl
import warnings
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen, urlretrieve
from zipfile import ZipFile, is_zipfile

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    tqdm = None

import socceraction.atomic.spadl as atomicspadl
import socceraction.spadl as spadl
import socceraction.spadl.statsbomb as statsbomb
import socceraction.spadl.wyscout as wyscout
from socceraction.data.statsbomb import StatsBombLoader
from socceraction.data.wyscout import PublicWyscoutLoader

warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
warnings.filterwarnings(
    action="ignore", message="credentials were not supplied. open data access only"
)

# Optional: works around an SSL CERTIFICATE_VERIFY_FAILED exception.
ssl._create_default_https_context = ssl._create_unverified_context

_data_dir = os.path.dirname(__file__)


def download_statsbomb_data() -> None:
    """Download and extract the StatsBomb open data repository."""
    logging.info("Downloading StatsBomb data")
    dataset_url = "https://github.com/statsbomb/open-data/archive/master.zip"
    tmp_datafolder = os.path.join(_data_dir, "statsbomb", "tmp")
    raw_datafolder = os.path.join(_data_dir, "statsbomb", "raw")
    for datafolder in [tmp_datafolder, raw_datafolder]:
        os.makedirs(datafolder, exist_ok=True)
    # Download the zipped repository to the temporary folder.
    statsbombzip = os.path.join(tmp_datafolder, "statsbomb-open-data.zip")
    with urlopen(dataset_url) as dl_file:
        with open(statsbombzip, "wb") as out_file:
            out_file.write(dl_file.read())
    # Extract the archive, replace the raw data folder with the repository's
    # "data" directory, and clean up the temporary folder.
    with ZipFile(statsbombzip, "r") as zipObj:
        zipObj.extractall(tmp_datafolder)
    shutil.rmtree(raw_datafolder)
    Path(f"{tmp_datafolder}/open-data-master/data").rename(raw_datafolder)
    shutil.rmtree(tmp_datafolder)
    logging.info("Done! Data was saved to %s", raw_datafolder)
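

# Note: the StatsBomb repository is assumed to keep its current layout, i.e.
# the extracted raw folder contains competitions.json plus
# matches/<competition_id>/<season_id>.json, events/<match_id>.json, and
# lineups/<match_id>.json files.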


def convert_statsbomb_data() -> None:
    """Convert StatsBomb data to SPADL."""
    logging.info("Converting StatsBomb data")
    seasons = {
        3: "2018",
    }
    leagues = {
        "FIFA World Cup": "WorldCup",
    }
    spadl_datafolder = os.path.join(_data_dir, "statsbomb")
    free_open_data_remote = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"
    SBL = StatsBombLoader(root=free_open_data_remote, getter="remote")
    # View all available competitions and select the ones to convert
    df_competitions = SBL.competitions()
    selected_competitions = df_competitions.competition_name.isin(leagues.keys())
    selected_seasons = df_competitions.season_id.isin(seasons.keys())
    df_selected_competitions = df_competitions.loc[selected_competitions & selected_seasons]
    for competition in df_selected_competitions.itertuples():
        # Get all games of the selected competition
        games = SBL.games(competition.competition_id, competition.season_id)
        if tqdm is not None:
            games_verbose = tqdm(list(games.itertuples()), desc="Loading match data")
        else:
            games_verbose = games.itertuples()
        teams, players = [], []
        competition_id = leagues[competition.competition_name]
        season_id = seasons[competition.season_id]
        spadl_h5 = os.path.join(spadl_datafolder, f"spadl-{competition_id}-{season_id}.h5")
        with pd.HDFStore(spadl_h5) as spadlstore:
            spadlstore.put("actiontypes", spadl.actiontypes_df(), format="table")
            spadlstore.put("results", spadl.results_df(), format="table")
            spadlstore.put("bodyparts", spadl.bodyparts_df(), format="table")
            for game in games_verbose:
                # load data
                teams.append(SBL.teams(game.game_id))
                players.append(SBL.players(game.game_id))
                events = SBL.events(game.game_id)
                # convert data
                spadlstore.put(
                    f"actions/game_{game.game_id}",
                    statsbomb.convert_to_actions(events, game.home_team_id),
                    format="table",
                )
            games.season_id = season_id
            games.competition_id = competition_id
            spadlstore.put("games", games)
            spadlstore.put(
                "teams",
                pd.concat(teams).drop_duplicates("team_id").reset_index(drop=True),
            )
            spadlstore.put(
                "players",
                pd.concat(players).drop_duplicates("player_id").reset_index(drop=True),
            )
    logging.info("Done! Data was saved to %s", spadl_datafolder)
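

# A converted file can be inspected with pandas; illustrative snippet (the
# exact game ids depend on the downloaded data):
#
#   with pd.HDFStore("statsbomb/spadl-WorldCup-2018.h5") as store:
#       games = store["games"]
#       actions = store[f"actions/game_{games.game_id.iloc[0]}"]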


def download_wyscout_data() -> None:
    """Download and extract the Wyscout public dataset."""
    logging.info("Downloading Wyscout data")
    # https://figshare.com/collections/Soccer_match_event_dataset/4415000/5
    dataset_urls = {
        "competitions": "https://ndownloader.figshare.com/files/15073685",
        "teams": "https://ndownloader.figshare.com/files/15073697",
        "players": "https://ndownloader.figshare.com/files/15073721",
        "games": "https://ndownloader.figshare.com/files/14464622",
        "events": "https://ndownloader.figshare.com/files/14464685",
    }
    raw_datafolder = os.path.join(_data_dir, "wyscout_public", "raw")
    os.makedirs(raw_datafolder, exist_ok=True)
    # Download and unzip the Wyscout open data
    for url in dataset_urls.values():
        # Resolve the figshare redirect to get the actual file name.
        url_obj = urlopen(url).geturl()
        path = Path(urlparse(url_obj).path)
        file_name = os.path.join(raw_datafolder, path.name)
        file_local, _ = urlretrieve(url_obj, file_name)
        if is_zipfile(file_local):
            with ZipFile(file_local) as zip_file:
                zip_file.extractall(raw_datafolder)
    logging.info("Done! Data was saved to %s", raw_datafolder)
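

# Note: the "games" and "events" downloads are zip archives that are assumed
# to bundle one JSON file per competition (e.g. matches_World_Cup.json); the
# other downloads are plain JSON files. PublicWyscoutLoader later reads these
# extracted files from the raw data folder.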


def convert_wyscout_data() -> None:
    """Convert Wyscout data to SPADL."""
    logging.info("Converting Wyscout data")
    seasons = {
        10078: "2018",
    }
    leagues = {
        28: "WorldCup",
    }
    raw_datafolder = os.path.join(_data_dir, "wyscout_public", "raw")
    spadl_datafolder = os.path.join(_data_dir, "wyscout_public")
    WYL = PublicWyscoutLoader(root=raw_datafolder)
    # View all available competitions and select the ones to convert
    df_competitions = WYL.competitions()
    selected_competitions = df_competitions.competition_id.isin(leagues.keys())
    df_selected_competitions = df_competitions.loc[selected_competitions]
    for competition in df_selected_competitions.itertuples():
        # Get all games of the selected competition
        games = WYL.games(competition.competition_id, competition.season_id)
        if tqdm is not None:
            games_verbose = tqdm(list(games.itertuples()), desc="Loading match data")
        else:
            games_verbose = games.itertuples()
        teams, players = [], []
        competition_id = leagues[competition.competition_id]
        season_id = seasons[competition.season_id]
        spadl_h5 = os.path.join(spadl_datafolder, f"spadl-{competition_id}-{season_id}.h5")
        with pd.HDFStore(spadl_h5) as spadlstore:
            spadlstore.put("actiontypes", spadl.actiontypes_df(), format="table")
            spadlstore.put("results", spadl.results_df(), format="table")
            spadlstore.put("bodyparts", spadl.bodyparts_df(), format="table")
            for game in games_verbose:
                # load data
                teams.append(WYL.teams(game.game_id))
                players.append(WYL.players(game.game_id))
                events = WYL.events(game.game_id)
                # convert data
                spadlstore.put(
                    f"actions/game_{game.game_id}",
                    wyscout.convert_to_actions(events, game.home_team_id),
                    # format='table',
                )
            games.season_id = season_id
            games.competition_id = competition_id
            spadlstore.put("games", games)
            spadlstore.put(
                "teams",
                pd.concat(teams).drop_duplicates("team_id").reset_index(drop=True),
            )
            spadlstore.put(
                "players",
                pd.concat(players).drop_duplicates("player_id").reset_index(drop=True),
            )
    logging.info("Done! Data was saved to %s", spadl_datafolder)
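

# convert_wyscout_data() writes the same HDF5 layout as convert_statsbomb_data()
# (actiontypes, results, bodyparts, games, teams, players, and one actions
# table per game), so both datasets can be read through the same code path.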


def create_spadl(game_id: int, home_team_id: int) -> None:
    """Create SPADL actions from StatsBomb data for a given game."""
    logging.info("Creating SPADL data")
    spadl_datafolder = os.path.join(_data_dir, "spadl")
    os.makedirs(spadl_datafolder, exist_ok=True)
    # load events
    free_open_data_remote = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"
    SBL = StatsBombLoader(root=free_open_data_remote, getter="remote")
    events = SBL.events(game_id)
    # convert to SPADL, keeping only the first 200 actions of each half to
    # keep the fixture small
    spadl_json = os.path.join(spadl_datafolder, "spadl.json")
    df_actions = statsbomb.convert_to_actions(events, home_team_id)
    pd.concat(
        [
            df_actions[df_actions.period_id == 1].head(n=200),
            df_actions[df_actions.period_id == 2].head(n=200),
        ]
    ).to_json(spadl_json, orient="records")
    # convert to Atomic-SPADL
    atomic_spadl_json = os.path.join(spadl_datafolder, "atomic_spadl.json")
    df_atomic_actions = atomicspadl.convert_to_atomic(df_actions)
    pd.concat(
        [
            df_atomic_actions[df_atomic_actions.period_id == 1].head(n=200),
            df_atomic_actions[df_atomic_actions.period_id == 2].head(n=200),
        ]
    ).to_json(atomic_spadl_json, orient="records")
    logging.info("Done! SPADL data was saved to %s and %s", spadl_json, atomic_spadl_json)
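

# Example invocations (illustrative):
#   python download.py                       # run the default steps (both downloads)
#   python download.py --download-statsbomb --convert-statsbomb
#   python download.py --spadl               # only create the example SPADL files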


if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    # Create the parser
    my_parser = argparse.ArgumentParser(
        prog="download",
        usage="%(prog)s [options]",
        formatter_class=argparse.RawTextHelpFormatter,
        description="""Download and prepare the data needed for running the tests.
Use the options specified below to select specific preprocessing
steps. When this script is run without any options, all preprocessing
steps required to run the default test setup will be executed.
""",
    )
    # Add the arguments
    my_parser.add_argument(
        "--download-statsbomb",
        action="store_true",
        help="Download the public StatsBomb data.",
    )
    my_parser.add_argument(
        "--convert-statsbomb",
        action="store_true",
        help="Convert the public StatsBomb data to SPADL.",
    )
    my_parser.add_argument(
        "--download-wyscout",
        action="store_true",
        help="Download the public Wyscout data.",
    )
    my_parser.add_argument(
        "--convert-wyscout",
        action="store_true",
        help="Convert the public Wyscout data to SPADL.",
    )
    my_parser.add_argument(
        "--spadl",
        action="store_true",
        help="Create JSON files with example SPADL and Atomic-SPADL data.",
    )
    # Parse the command-line arguments
    args = my_parser.parse_args()
    no_options = not any(
        [
            args.download_statsbomb,
            args.convert_statsbomb,
            args.download_wyscout,
            args.convert_wyscout,
            args.spadl,
        ]
    )
    # Run the requested steps
    if args.download_statsbomb or no_options:
        download_statsbomb_data()
    if args.convert_statsbomb:
        convert_statsbomb_data()
    if args.download_wyscout or no_options:
        download_wyscout_data()
    if args.convert_wyscout:
        convert_wyscout_data()
    if args.spadl:
        create_spadl(8657, 777)