import logging from pathlib import Path from typing import Optional, Tuple import matplotlib.pyplot as plt import pandas as pd import plotly.graph_objects as go import requests import seaborn as sns from bs4 import BeautifulSoup from wordcloud import WordCloud from util import get_max_abs_int, snake_case_to_human_readable, int_csv_to_list def _rename_columns(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame: columns = { "Rating": "rating", "Result": "result", "Scores": "scores", "Opponent": "opponent", "OpponentRating": "opponent_rating", } if is_tournament: columns.update({ "TournamentStartDate": "tournament_start_date", "TournamentEndDate": "tournament_end_date", " Touranament": "tournament", }) else: columns.update({ "EventDate": "event_date", "LeagueName": "league_name" }) return df.rename(columns=columns) def _fix_dtypes(df: pd.DataFrame, is_tournament: bool) -> pd.DataFrame: if is_tournament: df["tournament_start_date"] = pd.to_datetime(df["tournament_start_date"]) df["tournament_end_date"] = pd.to_datetime(df["tournament_end_date"]) df["tournament"] = df["tournament"].astype('category') else: df["event_date"] = pd.to_datetime(df["event_date"]) df["league_name"] = df["league_name"].astype('string') df["rating"] = df["rating"].astype('int') df["result"] = df["result"].astype('category') df["scores"] = df["scores"].astype('string') df["opponent"] = df["opponent"].astype('category') df["opponent_rating"] = df["opponent_rating"].astype('int') return df def make_df_columns_readable(df: Optional[pd.DataFrame], is_tournament: bool) -> Optional[pd.DataFrame]: """Make a data frame's columns human-readable.""" if df is None: return None nat_to_none = lambda x: None if x == "NaT" else x if is_tournament: if "tournament_start_date" in df.columns and "tournament_end_date" in df.columns: df['tournament_start_date'] = pd.to_datetime(df['tournament_start_date']) df['tournament_end_date'] = pd.to_datetime(df['tournament_end_date']) df['tournament_start_date'] = df['tournament_start_date'].dt.date.astype(str).apply(nat_to_none) df['tournament_end_date'] = df['tournament_end_date'].dt.date.astype(str).apply(nat_to_none) def create_date(tournament_start_date, tournament_end_date): missing_start_date = tournament_start_date is None missing_end_date = tournament_end_date is None if not missing_start_date and not missing_end_date: if tournament_start_date is not tournament_end_date: return ' - '.join((tournament_start_date, tournament_end_date)) else: return tournament_start_date else: return tournament_start_date if missing_end_date else tournament_end_date df["date"] = df.apply(lambda row: create_date(row['tournament_start_date'], row['tournament_end_date']), axis=1) df = df.drop(columns=["tournament_start_date", "tournament_end_date"]) # Move date to the front. columns = list(df.columns) columns.insert(0, columns.pop(columns.index("date"))) df = df.loc[:, columns] else: if "event_date" in df.columns: df['event_date'] = pd.to_datetime(df['event_date']) df['event_date'] = df['event_date'].dt.date.astype(str).apply(nat_to_none) df = df.rename(columns={"league_name": "league"}) df = df.rename(columns=lambda c: snake_case_to_human_readable(c)) return df def _check_match_type(match_type: str) -> str: allowed_match_types = {"tournament", "league"} if match_type not in allowed_match_types: raise ValueError( f"The only supported match types are {allowed_match_types}. Found match type of '{match_type}'.") return match_type def fetch_player_name(profile_id: int) -> str: """Fetch a player name from theUSATT website. note: the profile ID is NOT the USATT number. """ url = f"https://usatt.simplycompete.com/userAccount/up/{profile_id}" logging.info(f"Fetching player name from {url}") page = requests.get(url) soup = BeautifulSoup(page.content, "html.parser") profile_elt = soup.find("div", class_="profile-header") return profile_elt.find(class_="title").text.strip() def get_player_name(file_stem: str) -> str: profile_id = int(file_stem.split(" ")[0].replace("_", "").split("matches")[-1]) return fetch_player_name(profile_id) def get_num_competitions_played(df: pd.DataFrame, is_tournament: bool) -> int: key_name = "tournament_end_date" if is_tournament else "event_date" return df[key_name].nunique() def get_first_competition_year(df: pd.DataFrame, is_tournament: bool) -> int: key_name = "tournament_end_date" if is_tournament else "event_date" return df[key_name].min().year def get_num_active_years(df: pd.DataFrame, is_tournament: bool) -> int: key_name = "tournament_end_date" if is_tournament else "event_date" return df[key_name].dt.year.nunique() def get_current_rating(df: pd.DataFrame) -> int: return df.rating.iloc[0] def get_max_rating(df: pd.DataFrame) -> int: return df.rating.max() def get_matches_per_competition_fig(df: pd.DataFrame, is_tournament: bool): fig = plt.figure() plt.title('Matches per competition') sns.histplot(df.groupby('tournament' if is_tournament else "event_date", observed=False).size()) plt.xlabel('Number of matches in competition') return fig def get_competition_name_word_cloud_fig(df: pd.DataFrame, is_tournament: bool): fig = plt.figure() key_name = "tournament" if is_tournament else "league_name" wordcloud = WordCloud().generate(" ".join(df[key_name].values.tolist())) plt.imshow(wordcloud, interpolation='bilinear') plt.axis("off") return fig def get_opponent_name_word_cloud_fig(df: pd.DataFrame): fig = plt.figure() wordcloud = WordCloud().generate(" ".join(df.opponent.values.tolist())) plt.imshow(wordcloud, interpolation='bilinear') plt.axis("off") return fig def get_rating_over_time_fig(df: pd.DataFrame, is_tournament: bool, span: int = 60): df['ema'] = df['rating'].ewm(span=span, adjust=False).mean() fig = go.Figure() # Raw rating over time trace x_key_name = "tournament_end_date" if is_tournament else "event_date" fig.add_trace(go.Scatter(x=df[x_key_name], y=df["rating"], name='Rating', mode='lines+markers', line=dict(width=0.9), marker=dict(size=4))), # EMA trace fig.add_trace(go.Scatter(x=df[x_key_name], y=df["ema"], mode='lines', name='Rating EMA', visible='legendonly', line=dict(width=1.5, dash='dot'))) fig.update_layout( title='Rating over time', xaxis_title='Competition date', yaxis_title='Rating', showlegend=True, template="plotly_white", ) return fig def get_match_with_longest_game(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]: if not is_tournament: return None df_non_null = df.loc[~df.scores.isna()] return df_non_null.iloc[[df_non_null.scores.apply(get_max_abs_int).argmax()]] def get_win_loss_record_str(group_df) -> str: if len(group_df) > 0: win_loss_counts = group_df.value_counts() n_wins = win_loss_counts.Won if hasattr(win_loss_counts, "Won") else 0 n_losses = win_loss_counts.Lost if hasattr(win_loss_counts, "Lost") else 0 else: n_wins = 0 n_losses = 0 return f"{n_wins}, {n_losses}" def get_most_frequent_opponents(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame: df_with_opponents = df.loc[df.opponent != "-, -"] most_common_opponents_df = df_with_opponents.groupby('opponent', observed=False).agg( {"result": [get_win_loss_record_str, "size"]}) most_common_opponents_df.columns = most_common_opponents_df.columns.get_level_values(1) most_common_opponents_df.rename({"get_win_loss_record_str": "Win/loss record", "size": "Number of matches"}, axis=1, inplace=True) most_common_opponents_df["Opponent"] = most_common_opponents_df.index return most_common_opponents_df.sort_values("Number of matches", ascending=False)[ ["Opponent", "Number of matches", "Win/loss record"]].head(top_n) def get_best_wins(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame: """Get the top-n wins sorted by opponent rating.""" return df.loc[df.result == 'Won'].sort_values("opponent_rating", ascending=False).head(top_n) def get_biggest_upsets(df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame: """Get the top-n wins sorted by rating difference.""" df['rating_difference'] = df['opponent_rating'] - df['rating'] return df.loc[df.result == 'Won'].sort_values("rating_difference", ascending=False).head(top_n) def get_worst_recent_losses(df: pd.DataFrame, is_tournament: bool, top_k_losses: int = 5, top_n_comps: int = 5) -> pd.DataFrame: """Get the top-k most recent worst losses from the top-n most recent competitions.""" x_key_name = "tournament_end_date" if is_tournament else "event_date" most_recent_competition_dates = df.groupby(x_key_name).first().reset_index().nlargest(top_n_comps, columns=x_key_name)[ x_key_name] df_recent = df.loc[df[x_key_name].isin(most_recent_competition_dates)] return df_recent.loc[df_recent.result == 'Lost'].sort_values("opponent_rating", ascending=True).head(top_k_losses) def get_best_competitions(df: pd.DataFrame, is_tournament: bool, top_n: int = 5) -> pd.DataFrame: # First add pre-competition ratings x_key_name = "tournament_end_date" if is_tournament else "event_date" grouped = df.groupby(x_key_name) # We incorrectly fill the first pre-competition rating to the first rating so that # the top-k rating differences make sense. fill_value = df.iloc[-1].rating pre_comp_ratings_by_group = grouped['rating'].first().shift(periods=1, fill_value=fill_value) def assign_pre_comp_rating(group_df): """Assign a pre-competition rating to a given group.""" comp_end_date = group_df[x_key_name].unique()[0] group_df['pre-competition_rating'] = pre_comp_ratings_by_group.loc[comp_end_date] return group_df df = grouped.apply(lambda x: assign_pre_comp_rating(x)) df['rating_increase'] = df['rating'] - df['pre-competition_rating'] df.reset_index(drop=True, inplace=True) best_competition_dates = df.groupby(x_key_name)["rating_increase"].first().nlargest(top_n).index tournament_df = df.loc[df[x_key_name].isin(best_competition_dates)].groupby( [x_key_name]).first().sort_values(by='rating_increase', ascending=False).reset_index() cols = [] if is_tournament: cols += ['tournament_start_date', 'tournament_end_date', 'tournament'] else: cols += ["event_date", "league_name"] cols += ['rating_increase', 'pre-competition_rating', 'rating'] tournament_df = tournament_df[cols] tournament_df = tournament_df.rename(columns={"rating": "post-competition_rating"}) return tournament_df def get_highest_rated_opponent(df: pd.DataFrame) -> pd.DataFrame: return df.iloc[df.opponent_rating.idxmax()].to_frame().transpose() def get_opponent_rating_distr_fig(df: pd.DataFrame): fig = plt.figure() plt.title('Opponent rating distribution') sns.histplot(data=df, x="opponent_rating", hue='result') plt.xlabel('Opponent rating') return fig def get_opponent_rating_dist_over_time_fig(df: pd.DataFrame, is_tournament: bool): fig, ax = plt.subplots(figsize=(12, 8)) plt.title(f'Opponent rating distribution over time') x_key_name = "tournament_end_date" if is_tournament else "event_date" sns.violinplot(data=df, x=df[x_key_name].dt.year, y="opponent_rating", hue="result", split=True, inner='points', cut=1, ax=ax) plt.xticks(rotation=30) plt.xlabel('Competition year') plt.ylabel('Opponent rating') return fig def get_total_match_points(score_str: str) -> int: single_game_scores = int_csv_to_list(score_str) total_points = 0 for single_game_score in single_game_scores: abs_gscore = abs(single_game_score) if abs_gscore < 10: total_points += abs_gscore + 11 else: total_points += 2 * abs_gscore + 2 return total_points def get_longest_match(df: pd.DataFrame, is_tournament: bool) -> Optional[pd.DataFrame]: """Get the longest match, where longest is defined as the most number of points played.""" if not is_tournament: return None df_non_null = df.loc[~df.scores.isna()] df_non_null["total_points"] = df_non_null.scores.apply(get_total_match_points) return df_non_null.iloc[[df_non_null["total_points"].argmax()]] def load_match_df(file_path: Path) -> Tuple[pd.DataFrame, bool]: match_type = _check_match_type(file_path.name.split('_')[0]) is_tournament = match_type == "tournament" df = pd.read_csv(file_path) df = _rename_columns(df, is_tournament) df = _fix_dtypes(df, is_tournament) logging.info(f"Loaded match CSV {file_path}.") return df, is_tournament