import os
import pickle
from datetime import datetime

import pandas as pd
import gradio as gr
import plotly.express as px
from huggingface_hub import HfApi
from apscheduler.schedulers.background import BackgroundScheduler

from utils import (
    KEY_TO_CATEGORY_NAME,
    CAT_NAME_TO_EXPLANATION,
    download_latest_data_from_space,
    get_constants,
    update_release_date_mapping,
    format_data,
)

###################
### Initialize scheduler
###################


def restart_space():
    """Restart the Space so it re-downloads the latest Arena data."""
    HfApi(token=os.getenv("HF_TOKEN", None)).restart_space(
        repo_id="andrewrreed/closed-vs-open-arena-elo"
    )
    print(f"Space restarted on {datetime.now()}")


# restart the space every day at 7am (server time) to pick up fresh data
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "cron", day_of_week="mon-sun", hour=7, minute=0)
scheduler.start()

###################
### Load Data
###################

# gather ELO data
latest_elo_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)
with open(latest_elo_file_local, "rb") as fin:
    elo_results = pickle.load(fin)

# extract one leaderboard table per Arena category
arena_dfs = {}
for k in KEY_TO_CATEGORY_NAME:
    if k not in elo_results:
        continue
    arena_dfs[KEY_TO_CATEGORY_NAME[k]] = elo_results[k]["leaderboard_table_df"]

# gather the leaderboard table (model metadata) from the same Space
latest_leaderboard_file_local = download_latest_data_from_space(
    repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)

# load release date mapping data
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")

###################
### Prepare Data
###################

# update the release date mapping with any new models found in the ELO data
new_model_keys_to_add = [
    model
    for model in arena_dfs["Overall"].index.to_list()
    if model not in release_date_mapping["key"].to_list()
]
if new_model_keys_to_add:
    release_date_mapping = update_release_date_mapping(
        new_model_keys_to_add, leaderboard_df, release_date_mapping
    )

# merge leaderboard data with ELO data
merged_dfs = {}
for k, v in arena_dfs.items():
    merged_dfs[k] = (
        pd.merge(v, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )

# add release dates into the merged data
for k, v in merged_dfs.items():
    merged_dfs[k] = pd.merge(
        v, release_date_mapping[["key", "Release Date"]], on="key"
    )

# format dataframes
merged_dfs = {k: format_data(v) for k, v in merged_dfs.items()}

# get constants
min_elo_score, max_elo_score, upper_models_per_month = get_constants(merged_dfs)
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]

###################
### Plot Data
###################


def get_data_split(dfs, set_name):
    df = dfs[set_name].copy(deep=True)
    return df.reset_index(drop=True)


def build_plot(min_score, max_models_per_month, toggle_annotations, set_selector):
    df = get_data_split(merged_dfs, set_name=set_selector)

    # filter out low-scoring models, then keep at most N models per month per license
    filtered_df = df[df["rating"] >= min_score]
    filtered_df = (
        filtered_df.groupby(["Month-Year", "License"])
        # include_groups=True keeps the grouping columns, which the
        # annotation step below relies on
        .apply(
            lambda x: x.nlargest(max_models_per_month, "rating"), include_groups=True
        )
        .reset_index(drop=True)
    )

    # construct plot
    custom_colors = {"Open": "#ff7f0e", "Proprietary": "#1f77b4"}
    fig = px.scatter(
        filtered_df,
        x="Release Date",
        y="rating",
        color="License",
        hover_name="Model",
        hover_data=["Organization", "License", "Link"],
        trendline="ols",
        title=f"Open vs Proprietary LLMs by LMSYS Arena ELO Score (as of {date_updated})",
        labels={"rating": "Arena ELO", "Release Date": "Release Date"},
        height=700,
        template="plotly_dark",
        color_discrete_map=custom_colors,
    )
    fig.update_layout(
        plot_bgcolor="rgba(0,0,0,0)",  # set plot background to transparent
        paper_bgcolor="rgba(0,0,0,0)",  # set paper background to transparent
        title={"x": 0.5},
    )
    fig.update_traces(marker=dict(size=10, opacity=0.6))

    if toggle_annotations:
        # annotate only the highest-rated model per month per license
        idx_to_annotate = filtered_df.groupby(["Month-Year", "License"])[
            "rating"
        ].idxmax()
        points_to_annotate_df = filtered_df.loc[idx_to_annotate]
        for _, row in points_to_annotate_df.iterrows():
            fig.add_annotation(
                x=row["Release Date"],
                y=row["rating"],
                text=row["Model"],
                showarrow=True,
                arrowhead=0,
            )
    return fig


set_dark_mode = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""
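
# ------------------------------------------------------------------
# Optional helper: a minimal sketch for previewing the plot outside
# the Gradio UI during development. It is not called by the app
# itself, and the output filename is an illustrative assumption.
# ------------------------------------------------------------------
def preview_plot(path="arena_plot_preview.html"):
    fig = build_plot(
        min_score=min_elo_score,
        max_models_per_month=upper_models_per_month,
        toggle_annotations=True,
        set_selector="Overall",
    )
    fig.write_html(path)  # open the file in a browser to inspect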

with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.sky,
        secondary_hue=gr.themes.colors.green,
        # spacing_size=gr.themes.sizes.spacing_sm,
        text_size=gr.themes.sizes.text_sm,
        font=[
            gr.themes.GoogleFont("Open Sans"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        ],
    ),
    js=set_dark_mode,
) as demo:
    gr.Markdown(
        """
# 🔬 Progress Tracker: Open vs. Proprietary LLMs 🔬

This app visualizes the progress of proprietary and open-source LLMs over time, as scored by the LMSYS Chatbot Arena. The idea is inspired by a great visualization from Maxime Labonne, and the plot is intended to stay up-to-date as new models are released and evaluated.

**Plot info:**

- Each point is a model, placed by its release date (x-axis) and Arena ELO score (y-axis), and colored by license type (Open vs. Proprietary).
- An OLS trendline is fit per license type to show the relative pace of progress.
- Use the controls below to switch Arena categories, filter out low-scoring models, and cap the number of models shown per month.

""" ) with gr.Row(variant="compact"): set_selector = gr.Dropdown( choices=list(CAT_NAME_TO_EXPLANATION.keys()), label="Select Category", value="Overall", info="Select the category to visualize", ) min_score = gr.Slider( minimum=min_elo_score, maximum=max_elo_score, value=(max_elo_score - min_elo_score) * 0.3 + min_elo_score, step=50, label="Minimum ELO Score", info="Filter out low scoring models", ) max_models_per_month = gr.Slider( value=upper_models_per_month - 2, minimum=1, maximum=upper_models_per_month, step=1, label="Max Models per Month (per License)", info="Limit to N best models per month per license to reduce clutter", ) toggle_annotations = gr.Radio( choices=[True, False], label="Overlay Best Model Name", value=True, info="Toggle to overlay the name of the best model per month per license", ) # Show plot plot = gr.Plot() demo.load( fn=build_plot, inputs=[min_score, max_models_per_month, toggle_annotations, set_selector], outputs=plot, ) min_score.change( fn=build_plot, inputs=[min_score, max_models_per_month, toggle_annotations, set_selector], outputs=plot, ) max_models_per_month.change( fn=build_plot, inputs=[min_score, max_models_per_month, toggle_annotations, set_selector], outputs=plot, ) toggle_annotations.change( fn=build_plot, inputs=[min_score, max_models_per_month, toggle_annotations, set_selector], outputs=plot, ) set_selector.change( fn=build_plot, inputs=[min_score, max_models_per_month, toggle_annotations, set_selector], outputs=plot, ) gr.Markdown( """

    gr.Markdown(
        """
If you have any questions, feel free to open a discussion or reach out to me on social media.

""" ) demo.launch()