import os import pickle import pandas as pd import gradio as gr import plotly.express as px from datetime import datetime from huggingface_hub import HfApi from apscheduler.schedulers.background import BackgroundScheduler from utils import ( KEY_TO_CATEGORY_NAME, CAT_NAME_TO_EXPLANATION, download_latest_data_from_space, get_constants, update_release_date_mapping, format_data, get_trendlines, find_crossover_point, ) ################### ### Initialize scheduler ################### def restart_space(): HfApi(token=os.getenv("HF_TOKEN", None)).restart_space( repo_id="andrewrreed/closed-vs-open-arena-elo" ) print(f"Space restarted on {datetime.now()}") # restart the space every day at 9am scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "cron", day_of_week="mon-sun", hour=7, minute=0) scheduler.start() ################### ### Load Data ################### # gather ELO data latest_elo_file_local = download_latest_data_from_space( repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl" ) with open(latest_elo_file_local, "rb") as fin: elo_results = pickle.load(fin) arena_dfs = {} for k in KEY_TO_CATEGORY_NAME.keys(): if k not in elo_results: continue arena_dfs[KEY_TO_CATEGORY_NAME[k]] = elo_results[k]["leaderboard_table_df"] # gather open llm leaderboard data latest_leaderboard_file_local = download_latest_data_from_space( repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv" ) leaderboard_df = pd.read_csv(latest_leaderboard_file_local) # load release date mapping data release_date_mapping = pd.read_json("release_date_mapping.json", orient="records") ################### ### Prepare Data ################### # update release date mapping with new models # check for new models in ELO data new_model_keys_to_add = [ model for model in arena_dfs["Overall"].index.to_list() if model not in release_date_mapping["key"].to_list() ] if new_model_keys_to_add: release_date_mapping = update_release_date_mapping( new_model_keys_to_add, leaderboard_df, release_date_mapping ) # merge leaderboard data with ELO data merged_dfs = {} for k, v in arena_dfs.items(): merged_dfs[k] = ( pd.merge(arena_dfs[k], leaderboard_df, left_index=True, right_on="key") .sort_values("rating", ascending=False) .reset_index(drop=True) ) # add release dates into the merged data for k, v in merged_dfs.items(): merged_dfs[k] = pd.merge( merged_dfs[k], release_date_mapping[["key", "Release Date"]], on="key" ) # format dataframes merged_dfs = {k: format_data(v) for k, v in merged_dfs.items()} # get constants min_elo_score, max_elo_score, upper_models_per_month = get_constants(merged_dfs) date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0] orgs = merged_dfs["Overall"].Organization.unique().tolist() ################### ### Build and Plot Data ################### def get_data_split(dfs, set_name): df = dfs[set_name].copy(deep=True) return df.reset_index(drop=True) def clean_df_for_display(df): df = df.loc[ :, [ "Model", "rating", "MMLU", "MT-bench (score)", "Release Date", "Organization", "License", "Link", ], ].rename(columns={"rating": "ELO Score", "MT-bench (score)": "MT-Bench"}) df["Release Date"] = df["Release Date"].astype(str) df.sort_values("ELO Score", ascending=False, inplace=True) df.reset_index(drop=True, inplace=True) return df def filter_df(min_score, max_models_per_month, set_selector, org_selector): df = get_data_split(merged_dfs, set_name=set_selector) # filter data filtered_df = df[ (df["rating"] >= min_score) & (df["Organization"].isin(org_selector)) ] filtered_df = ( filtered_df.groupby(["Month-Year", "License"], group_keys=False) .apply(lambda x: x.nlargest(max_models_per_month, "rating")) .reset_index(drop=True) ) return filtered_df def build_plot(toggle_annotations, filtered_df): # construct plot custom_colors = {"Open": "#ff7f0e", "Proprietary": "#1f77b4"} fig = px.scatter( filtered_df, x="Release Date", y="rating", color="License", hover_name="Model", hover_data=["Organization", "License", "Link"], trendline="ols", title=f"Open vs Proprietary LLMs by LMSYS Arena ELO Score
(as of {date_updated})", labels={"rating": "Arena ELO", "Release Date": "Release Date"}, height=700, template="plotly_dark", color_discrete_map=custom_colors, ) fig.update_layout( plot_bgcolor="rgba(0,0,0,0)", # Set background color to transparent paper_bgcolor="rgba(0,0,0,0)", # Set paper (plot) background color to transparent title={"x": 0.5}, ) fig.update_traces(marker=dict(size=10, opacity=0.6)) # calculate days until crossover trend1, trend2 = get_trendlines(fig) crossover = find_crossover_point( b1=trend1[0], m1=trend1[1], b2=trend2[0], m2=trend2[1] ) days_til_crossover = ( pd.to_datetime(crossover, unit="s") - pd.Timestamp.today() ).days # add annotation with number of models and days til crossover fig.add_annotation( xref="paper", yref="paper", # use paper coordinates x=-0.05, y=1.13, text=f"Number of models: {len(filtered_df)}
Days til crossover: {days_til_crossover}", showarrow=False, font=dict(size=14, color="white"), bgcolor="rgba(0,0,0,0.5)", ) if toggle_annotations: # get the points to annotate (only the highest rated model per month per license) idx_to_annotate = filtered_df.groupby(["Month-Year", "License"])[ "rating" ].idxmax() points_to_annotate_df = filtered_df.loc[idx_to_annotate] for i, row in points_to_annotate_df.iterrows(): fig.add_annotation( x=row["Release Date"], y=row["rating"], text=row["Model"], showarrow=True, arrowhead=0, ) return fig, clean_df_for_display(filtered_df) set_dark_mode = """ function refresh() { const url = new URL(window.location); if (url.searchParams.get('__theme') !== 'dark') { url.searchParams.set('__theme', 'dark'); window.location.href = url.href; } } """ with gr.Blocks( theme=gr.themes.Soft( primary_hue=gr.themes.colors.sky, secondary_hue=gr.themes.colors.green, # spacing_size=gr.themes.sizes.spacing_sm, text_size=gr.themes.sizes.text_sm, font=[ gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif", ], ), js=set_dark_mode, ) as demo: gr.Markdown( """

🔬 Progress Tracker: Open vs. Proprietary LLMs 🔬

This app visualizes the progress of proprietary and open-source LLMs over time as scored by the LMSYS Chatbot Arena. The idea is inspired by this great work from Maxime Labonne, and is intended to stay up-to-date as new models are released and evaluated.

Plot info:

""" ) with gr.Group(): with gr.Row(variant="compact"): set_selector = gr.Dropdown( choices=list(CAT_NAME_TO_EXPLANATION.keys()), label="Select Category", value="Overall", info="Select the category to visualize", ) min_score = gr.Slider( minimum=min_elo_score, maximum=max_elo_score, value=(max_elo_score - min_elo_score) * 0.3 + min_elo_score, step=50, label="Minimum ELO Score", info="Filter out low scoring models", ) max_models_per_month = gr.Slider( value=upper_models_per_month - 2, minimum=1, maximum=upper_models_per_month, step=1, label="Max Models per Month (per License)", info="Limit to N best models per month per license to reduce clutter", ) toggle_annotations = gr.Radio( choices=[True, False], label="Overlay Best Model Name", value=True, info="Toggle to overlay the name of the best model per month per license", ) with gr.Row(variant="compact"): with gr.Accordion("More options", open=False): org_selector = gr.Dropdown( choices=sorted(orgs), label="Filter by Organization", value=sorted(orgs), multiselect=True, info="Limit organizations included in plot", ) # Show plot filtered_df = gr.State() with gr.Group(): with gr.Tab("Plot"): plot = gr.Plot(show_label=False) with gr.Tab("Raw Data"): display_df = gr.DataFrame() demo.load( fn=filter_df, inputs=[min_score, max_models_per_month, set_selector, org_selector], outputs=filtered_df, ).then( fn=build_plot, inputs=[toggle_annotations, filtered_df], outputs=[plot, display_df], ) min_score.change( fn=filter_df, inputs=[min_score, max_models_per_month, set_selector, org_selector], outputs=filtered_df, ).then( fn=build_plot, inputs=[toggle_annotations, filtered_df], outputs=[plot, display_df], ) max_models_per_month.change( fn=filter_df, inputs=[min_score, max_models_per_month, set_selector, org_selector], outputs=filtered_df, ).then( fn=build_plot, inputs=[toggle_annotations, filtered_df], outputs=[plot, display_df], ) toggle_annotations.change( fn=filter_df, inputs=[min_score, max_models_per_month, set_selector, org_selector], outputs=filtered_df, ).then( fn=build_plot, inputs=[toggle_annotations, filtered_df], outputs=[plot, display_df], ) set_selector.change( fn=filter_df, inputs=[min_score, max_models_per_month, set_selector, org_selector], outputs=filtered_df, ).then( fn=build_plot, inputs=[toggle_annotations, filtered_df], outputs=[plot, display_df], ) org_selector.change( fn=filter_df, inputs=[min_score, max_models_per_month, set_selector, org_selector], outputs=filtered_df, ).then( fn=build_plot, inputs=[toggle_annotations, filtered_df], outputs=[plot, display_df], ) gr.Markdown( """

If you have any questions, feel free to open a discussion or reach out to me on social.

""" ) demo.launch()