import os import pickle import pandas as pd import numpy as np import gradio as gr from datetime import datetime from huggingface_hub import HfApi from apscheduler.schedulers.background import BackgroundScheduler import plotly.graph_objects as go from utils import ( KEY_TO_CATEGORY_NAME, CAT_NAME_TO_EXPLANATION, download_latest_data_from_space, get_constants, update_release_date_mapping, format_data, get_trendlines, find_crossover_point, sigmoid_transition ) ################### ### Initialize scheduler ################### def restart_space(): HfApi(token=os.getenv("HF_TOKEN", None)).restart_space( repo_id="andrewrreed/closed-vs-open-arena-elo" ) print(f"Space restarted on {datetime.now()}") # restart the space every day at 9am scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "cron", day_of_week="mon-sun", hour=7, minute=0) scheduler.start() ################### ### Load Data ################### # gather ELO data latest_elo_file_local = download_latest_data_from_space( repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl" ) with open(latest_elo_file_local, "rb") as fin: elo_results = pickle.load(fin) # TO-DO: need to also include vision elo_results = elo_results["text"] arena_dfs = {} for k in KEY_TO_CATEGORY_NAME.keys(): if k not in elo_results: continue arena_dfs[KEY_TO_CATEGORY_NAME[k]] = elo_results[k]["leaderboard_table_df"] # gather open llm leaderboard data latest_leaderboard_file_local = download_latest_data_from_space( repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv" ) leaderboard_df = pd.read_csv(latest_leaderboard_file_local) # load release date mapping data release_date_mapping = pd.read_json("release_date_mapping.json", orient="records") ################### ### Prepare Data ################### # update release date mapping with new models # check for new models in ELO data new_model_keys_to_add = [ model for model in arena_dfs["Overall"].index.to_list() if model not in release_date_mapping["key"].to_list() ] if new_model_keys_to_add: release_date_mapping = update_release_date_mapping( new_model_keys_to_add, leaderboard_df, release_date_mapping ) # merge leaderboard data with ELO data merged_dfs = {} for k, v in arena_dfs.items(): merged_dfs[k] = ( pd.merge(arena_dfs[k], leaderboard_df, left_index=True, right_on="key") .sort_values("rating", ascending=False) .reset_index(drop=True) ) # add release dates into the merged data for k, v in merged_dfs.items(): merged_dfs[k] = pd.merge( merged_dfs[k], release_date_mapping[["key", "Release Date"]], on="key" ) # format dataframes merged_dfs = {k: format_data(v) for k, v in merged_dfs.items()} # get constants min_elo_score, max_elo_score, _ = get_constants(merged_dfs) date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0] orgs = merged_dfs["Overall"].Organization.unique().tolist() ################### ### Build and Plot Data ################### df = merged_dfs["Overall"] top_orgs = df.groupby("Organization")["rating"].max().nlargest(11).index.tolist() df = df.loc[(df["Organization"].isin(top_orgs)) & (df["rating"] > 1000)] print(df) df = df.loc[~df["Release Date"].isna()] def get_data_split(dfs, set_name): df = dfs[set_name].copy(deep=True) return df.reset_index(drop=True) def clean_df_for_display(df): df = df.loc[ :, [ "Model", "rating", "MMLU", "MT-bench (score)", "Release Date", "Organization", "License", "Link", ], ].rename(columns={"rating": "ELO Score", "MT-bench (score)": "MT-Bench"}) df["Release Date"] = df["Release Date"].astype(str) df.sort_values("ELO Score", ascending=False, inplace=True) df.reset_index(drop=True, inplace=True) return df def format_data(df): """ Formats the given DataFrame by performing the following operations: - Converts the 'License' column values to 'Proprietary LLM' if they are in PROPRIETARY_LICENSES, otherwise 'Open LLM'. - Converts the 'Release Date' column to datetime format. - Adds a new 'Month-Year' column by extracting the month and year from the 'Release Date' column. - Rounds the 'rating' column to the nearest integer. - Resets the index of the DataFrame. Args: df (pandas.DataFrame): The DataFrame to be formatted. Returns: pandas.DataFrame: The formatted DataFrame. """ PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"] df["License"] = df["License"].apply( lambda x: "Proprietary LLM" if x in PROPRIETARY_LICENSES else "Open LLM" ) df["Release Date"] = pd.to_datetime(df["Release Date"]) df["Month-Year"] = df["Release Date"].dt.to_period("M") df["rating"] = df["rating"].round() return df.reset_index(drop=True) # Define organization to country mapping and colors org_info = { "OpenAI": ("#00A67E", "🇺🇸"), # Teal "Google": ("#4285F4", "🇺🇸"), # Google Blue "xAI": ("black", "🇺🇸"), # Bright Orange "Anthropic": ("#cc785c", "🇺🇸"), # Brown (as requested) "Meta": ("#0064E0", "🇺🇸"), # Facebook Blue "Alibaba": ("#6958cf", "🇨🇳"), "DeepSeek": ("#C70039", "🇨🇳"), "01 AI": ("#11871e", "🇨🇳"), # Bright Green "DeepSeek AI": ("#9900CC", "🇨🇳"), # Purple "Mistral": ("#ff7000", "🇫🇷"), # Mistral Orange (as requested) "AI21 Labs": ("#1E90FF", "🇮🇱"), # Dodger Blue, "Reka AI": ("#FFC300", "🇺🇸"), "Zhipu AI": ("#FFC300", "🇨🇳"), } def make_figure(df): fig = go.Figure() for i, org in enumerate( df.groupby("Organization")["rating"] .max() .sort_values(ascending=False) .index.tolist() ): org_data = df[df["Organization"] == org] if len(org_data) > 0: x_values = [] y_values = [] current_best = -np.inf best_models = [] # Group by date and get the best model for each date daily_best = org_data.groupby("Release Date").first().reset_index() for _, row in daily_best.iterrows(): if row["rating"] > current_best: if len(x_values) > 0: # Create smooth transition transition_days = (row["Release Date"] - x_values[-1]).days transition_points = pd.date_range( x_values[-1], row["Release Date"], periods=max(100, transition_days), ) x_values.extend(transition_points) transition_y = current_best + ( row["rating"] - current_best ) * sigmoid_transition( np.linspace(-6, 6, len(transition_points)), 0, k=1 ) y_values.extend(transition_y) x_values.append(row["Release Date"]) y_values.append(row["rating"]) current_best = row["rating"] best_models.append(row) # Extend the line to the current date if x_values[-1] < current_date: x_values.append(current_date) y_values.append(current_best) # Get org color and flag color, flag = org_info.get(org, ("#808080", "")) # Add line plot fig.add_trace( go.Scatter( x=x_values, y=y_values, mode="lines", name=f"{i+1}. {org} {flag}", line=dict(color=color, width=2), hoverinfo="skip", ) ) # Add scatter plot for best model points best_models_df = pd.DataFrame(best_models) fig.add_trace( go.Scatter( x=best_models_df["Release Date"], y=best_models_df["rating"], mode="markers", name=org, showlegend=False, marker=dict(color=color, size=8, symbol="circle"), text=best_models_df["Model"], hovertemplate="%{text}
Date: %{x}
ELO Score: %{y:.2f}", ) ) # Update layout fig.update_layout( xaxis_title="Date", title="La course au classement", yaxis_title="Score ELO", legend_title="Classement en Novembre 2024", xaxis_range=[pd.Timestamp("2024-01-01"), current_date], # Extend x-axis for labels yaxis_range=[1103, 1350], ) # apply_template(fig) fig.update_xaxes( tickformat="%m-%Y", ) return fig, df def filter_df(): return df set_dark_mode = """ function refresh() { const url = new URL(window.location); if (url.searchParams.get('__theme') !== 'dark') { url.searchParams.set('__theme', 'dark'); window.location.href = url.href; } } """ with gr.Blocks( theme=gr.themes.Soft( primary_hue=gr.themes.colors.sky, secondary_hue=gr.themes.colors.green, # spacing_size=gr.themes.sizes.spacing_sm, text_size=gr.themes.sizes.text_sm, font=[ gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif", ], ), js=set_dark_mode, ) as demo: gr.Markdown( """

🚀 The race for the best LLM 🚀

This app visualizes the progress of LLMs over time as scored by the LMSYS Chatbot Arena. The app is adapted from this app by Andew Reed, and is intended to stay up-to-date as new models are released and evaluated.

Plot info:

""" ) filtered_df = gr.State() with gr.Group(): with gr.Tab("Plot"): plot = gr.Plot(show_label=False) with gr.Tab("Raw Data"): display_df = gr.DataFrame() demo.load( fn=filter_df, inputs=[], outputs=filtered_df, ).then( fn=make_figure, inputs=[filtered_df], outputs=[plot, display_df], ) demo.launch()