import os
import pickle
import pandas as pd
import gradio as gr
import plotly.express as px
from datetime import datetime
from huggingface_hub import HfApi
from apscheduler.schedulers.background import BackgroundScheduler
from utils import (
KEY_TO_CATEGORY_NAME,
CAT_NAME_TO_EXPLANATION,
download_latest_data_from_space,
get_constants,
update_release_date_mapping,
format_data,
)
###################
### Initialize scheduler
###################
def restart_space():
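    """Restart this Space via the HF Hub API so it reloads fresh leaderboard data on boot."""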
HfApi(token=os.getenv("HF_TOKEN", None)).restart_space(
repo_id="andrewrreed/closed-vs-open-arena-elo"
)
print(f"Space restarted on {datetime.now()}")
# restart the space every day at 7:00 (server time)
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "cron", day_of_week="mon-sun", hour=7, minute=0)
scheduler.start()
###################
### Load Data
###################
# gather ELO data
latest_elo_file_local = download_latest_data_from_space(
repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
)
with open(latest_elo_file_local, "rb") as fin:
elo_results = pickle.load(fin)
arena_dfs = {}
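# collect one leaderboard DataFrame per arena category, keyed by display name (e.g. "Overall")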
for k in KEY_TO_CATEGORY_NAME:
if k not in elo_results:
continue
arena_dfs[KEY_TO_CATEGORY_NAME[k]] = elo_results[k]["leaderboard_table_df"]
# gather arena leaderboard CSV data (model metadata: organization, license, link)
latest_leaderboard_file_local = download_latest_data_from_space(
repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
)
leaderboard_df = pd.read_csv(latest_leaderboard_file_local)
# load release date mapping data
release_date_mapping = pd.read_json("release_date_mapping.json", orient="records")
###################
### Prepare Data
###################
# update release date mapping with new models
# check for new models in ELO data
new_model_keys_to_add = [
model
for model in arena_dfs["Overall"].index.to_list()
if model not in release_date_mapping["key"].to_list()
]
if new_model_keys_to_add:
release_date_mapping = update_release_date_mapping(
new_model_keys_to_add, leaderboard_df, release_date_mapping
)
# merge leaderboard data with ELO data
merged_dfs = {}
for k, v in arena_dfs.items():
    merged_dfs[k] = (
        pd.merge(v, leaderboard_df, left_index=True, right_on="key")
        .sort_values("rating", ascending=False)
        .reset_index(drop=True)
    )
# add release dates into the merged data
for k, v in merged_dfs.items():
    merged_dfs[k] = pd.merge(
        v, release_date_mapping[["key", "Release Date"]], on="key"
    )
# format dataframes
merged_dfs = {k: format_data(v) for k, v in merged_dfs.items()}
# get constants
min_elo_score, max_elo_score, upper_models_per_month = get_constants(merged_dfs)
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
###################
### Plot Data
###################
def get_data_split(dfs, set_name):
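    """Return a defensive copy of the merged DataFrame for the selected category."""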
df = dfs[set_name].copy(deep=True)
return df.reset_index(drop=True)
def build_plot(min_score, max_models_per_month, toggle_annotations, set_selector):
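    """Build the scatter plot of ELO rating vs. release date for the chosen category,
    filtered by a minimum score and a cap on models shown per month."""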
df = get_data_split(merged_dfs, set_name=set_selector)
# filter data
filtered_df = df[(df["rating"] >= min_score)]
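    # keep at most `max_models_per_month` models per (Month-Year, License) group;
    # include_groups=True retains the grouping columns, which the annotation step below needs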
filtered_df = (
filtered_df.groupby(["Month-Year", "License"])
.apply(
lambda x: x.nlargest(max_models_per_month, "rating"), include_groups=True
)
.reset_index(drop=True)
)
# construct plot
custom_colors = {"Open": "#ff7f0e", "Proprietary": "#1f77b4"}
fig = px.scatter(
filtered_df,
x="Release Date",
y="rating",
color="License",
hover_name="Model",
hover_data=["Organization", "License", "Link"],
trendline="ols",
title=f"Open vs Proprietary LLMs by LMSYS Arena ELO Score
(as of {date_updated})",
labels={"rating": "Arena ELO", "Release Date": "Release Date"},
height=700,
template="plotly_dark",
color_discrete_map=custom_colors,
)
fig.update_layout(
plot_bgcolor="rgba(0,0,0,0)", # Set background color to transparent
paper_bgcolor="rgba(0,0,0,0)", # Set paper (plot) background color to transparent
title={"x": 0.5},
)
fig.update_traces(marker=dict(size=10, opacity=0.6))
if toggle_annotations:
# get the points to annotate (only the highest rated model per month per license)
idx_to_annotate = filtered_df.groupby(["Month-Year", "License"])[
"rating"
].idxmax()
points_to_annotate_df = filtered_df.loc[idx_to_annotate]
        for _, row in points_to_annotate_df.iterrows():
fig.add_annotation(
x=row["Release Date"],
y=row["rating"],
text=row["Model"],
showarrow=True,
arrowhead=0,
)
return fig
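# JavaScript injected on page load to force Gradio's dark theme via the __theme query param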
set_dark_mode = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
with gr.Blocks(
theme=gr.themes.Soft(
primary_hue=gr.themes.colors.sky,
secondary_hue=gr.themes.colors.green,
# spacing_size=gr.themes.sizes.spacing_sm,
text_size=gr.themes.sizes.text_sm,
font=[
gr.themes.GoogleFont("Open Sans"),
"ui-sans-serif",
"system-ui",
"sans-serif",
],
),
js=set_dark_mode,
) as demo:
gr.Markdown(
"""
This app visualizes the progress of proprietary and open-source LLMs over time as scored by the LMSYS Chatbot Arena. The idea is inspired by great work from Maxime Labonne and is intended to stay up to date as new models are released and evaluated.

If you have any questions, feel free to open a discussion or reach out to me on social media.