import os
import pickle
import pandas as pd
import gradio as gr
import plotly.express as px
from datetime import datetime
from huggingface_hub import HfApi
from apscheduler.schedulers.background import BackgroundScheduler
from utils import (
KEY_TO_CATEGORY_NAME,
CAT_NAME_TO_EXPLANATION,
download_latest_data_from_space,
get_constants,
update_release_date_mapping,
format_data,
)
###################
### Initialize scheduler
###################
def restart_space():
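    """Restart this Space via the HF Hub API so it reloads fresh leaderboard data on boot."""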
HfApi(token=os.getenv("HF_TOKEN", None)).restart_space(
repo_id="andrewrreed/closed-vs-open-arena-elo"
)
print(f"Space restarted on {datetime.now()}")
# restart the space every day at 7:00 (server time)
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "cron", day_of_week="mon-sun", hour=7, minute=0)
scheduler.start()
###################
### Load Data
###################
# gather ELO data
latest_elo_file_local = download_latest_data_from_space(
repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)
with open(latest_elo_file_local, "rb") as fin:
elo_results = pickle.load(fin)
arena_dfs = {}
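# collect one leaderboard DataFrame per arena category, keyed by display name (e.g. "Overall")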
for k in KEY_TO_CATEGORY_NAME:
if k not in elo_results:
continue
arena_dfs[KEY_TO_CATEGORY_NAME[k]] = elo_results[k]["leaderboard_table_df"]
# gather arena leaderboard CSV data (model metadata: organization, license, link)
latest_leaderboard_file_local = download_latest_data_from_space(
repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)
# load release date mapping data
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
###################
### Prepare Data
###################
# update release date mapping with new models
# check for new models in ELO data
new_model_keys_to_add = [
model
for model in arena_dfs["Overall"].index.to_list()
if model not in release_date_mapping["key"].to_list()
]
if new_model_keys_to_add:
release_date_mapping = update_release_date_mapping(
new_model_keys_to_add, leaderboard_df, release_date_mapping
)
# merge leaderboard data with ELO data
merged_dfs = {}
for k, v in arena_dfs.items():
    merged_dfs[k] = (
        pd.merge(v, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
# add release dates into the merged data
for k, v in merged_dfs.items():
    merged_dfs[k] = pd.merge(
        v, release_date_mapping[["key", "Release Date"]], on="key"
    )
# format dataframes
merged_dfs = {k: format_data(v) for k, v in merged_dfs.items()}
# get constants
min_elo_score, max_elo_score, upper_models_per_month = get_constants(merged_dfs)
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
###################
### Plot Data
###################
def get_data_split(dfs, set_name):
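    """Return a defensive copy of the merged DataFrame for the selected category."""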
df = dfs[set_name].copy(deep=True)
return df.reset_index(drop=True)
def build_plot(min_score, max_models_per_month, toggle_annotations, set_selector):
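    """Build the scatter plot of ELO rating vs. release date for the chosen category,
    filtered by a minimum score and a cap on models shown per month."""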
df = get_data_split(merged_dfs, set_name=set_selector)
# filter data
filtered_df = df[(df["rating"] >= min_score)]
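    # keep at most `max_models_per_month` models per (Month-Year, License) group;
    # include_groups=True retains the grouping columns, which the annotation step below needs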
filtered_df = (
filtered_df.groupby(["Month-Year", "License"])
.apply(
lambda x: x.nlargest(max_models_per_month, "rating"), include_groups=True
)
.reset_index(drop=True)
)
# construct plot
custom_colors = {"Open": "#ff7f0e", "Proprietary": "#1f77b4"}
fig = px.scatter(
filtered_df,
x="Release Date",
y="rating",
color="License",
hover_name="Model",
hover_data=["Organization", "License", "Link"],
trendline="ols",
title=f"Open vs Proprietary LLMs by LMSYS Arena ELO Score
(as of {date_updated})",
labels={"rating": "Arena ELO", "Release Date": "Release Date"},
height=700,
template="plotly_dark",
color_discrete_map=custom_colors,
)
fig.update_layout(
plot_bgcolor="rgba(0,0,0,0)", # Set background color to transparent
paper_bgcolor="rgba(0,0,0,0)", # Set paper (plot) background color to transparent
title={"x": 0.5},
)
fig.update_traces(marker=dict(size=10, opacity=0.6))
if toggle_annotations:
# get the points to annotate (only the highest rated model per month per license)
idx_to_annotate = filtered_df.groupby(["Month-Year", "License"])[
"rating"
].idxmax()
points_to_annotate_df = filtered_df.loc[idx_to_annotate]
        for _, row in points_to_annotate_df.iterrows():
fig.add_annotation(
x=row["Release Date"],
y=row["rating"],
text=row["Model"],
showarrow=True,
arrowhead=0,
)
return fig
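# JavaScript injected on page load to force Gradio's dark theme via the __theme query param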
set_dark_mode = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
with gr.Blocks(
theme=gr.themes.Soft(
primary_hue=gr.themes.colors.sky,
secondary_hue=gr.themes.colors.green,
# spacing_size=gr.themes.sizes.spacing_sm,
text_size=gr.themes.sizes.text_sm,
font=[
gr.themes.GoogleFont("Open Sans"),
"ui-sans-serif",
"system-ui",
"sans-serif",
],
),
js=set_dark_mode,
) as demo:
gr.Markdown(
"""
This app visualizes the progress of proprietary and open-source LLMs over time as scored by the LMSYS Chatbot Arena. The idea is inspired by great work from Maxime Labonne and is intended to stay up to date as new models are released and evaluated.

If you have any questions, feel free to open a discussion or reach out to me on social media.