per_task_results #26
by lisabdunlap - opened
- app.py +341 -95
- elo_results_20240329.pkl → elo_results_20240327.pkl +2 -2
- elo_results_20240403.pkl +3 -0
- elo_results_20240409.pkl +3 -0
- elo_results_20240410.pkl +3 -0
- elo_results_20240411.pkl +3 -0
- leaderboard_table_20240329.csv → leaderboard_table_20240404.csv +4 -1
- leaderboard_table_20240409.csv +95 -0
- leaderboard_table_20240410.csv +96 -0
- leaderboard_table_20240411.csv +97 -0
- theme.json +1 -0
app.py
CHANGED
@@ -12,7 +12,6 @@ import pandas as pd
 # notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
 notebook_url = "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=o_CpbkGEbhrK"

-
 basic_component_values = [None] * 6
 leader_component_values = [None] * 5

@@ -34,14 +33,26 @@ We've collected over **500,000** human preference votes to rank LLMs with the Elo
 def make_arena_leaderboard_md(arena_df):
     total_votes = sum(arena_df["num_battles"]) // 2
     total_models = len(arena_df)
-
+    space = "   "
     leaderboard_md = f"""
-Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated:
+Total #models: **{total_models}**.{space} Total #votes: **{"{:,}".format(total_votes)}**.{space} Last updated: April 11, 2024.

+📣 **NEW!** View leaderboard for different categories (e.g., coding, long user query)!
+
+Code to recreate leaderboard tables and plots in this [notebook]({notebook_url}). Cast your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)!
 """
     return leaderboard_md

+def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"):
+    total_votes = sum(arena_df["num_battles"]) // 2
+    total_models = len(arena_df)
+    space = "   "
+    total_subset_votes = sum(arena_subset_df["num_battles"]) // 2
+    total_subset_models = len(arena_subset_df)
+    leaderboard_md = f"""### {cat_name_to_explanation[name]}
+#### [Coverage] {space} #models: **{total_subset_models} ({round(total_subset_models/total_models * 100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
+"""
+    return leaderboard_md

 def make_full_leaderboard_md(elo_results):
     leaderboard_md = f"""
@@ -202,52 +213,131 @@ def get_full_table(arena_df, model_table_df):
     values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
     return values

+def create_ranking_str(ranking, ranking_difference):
+    if ranking_difference > 0:
+        # return f"{int(ranking)} (\u2191{int(ranking_difference)})"
+        return f"{int(ranking)} \u2191"
+    elif ranking_difference < 0:
+        # return f"{int(ranking)} (\u2193{int(-ranking_difference)})"
+        return f"{int(ranking)} \u2193"
+    else:
+        return f"{int(ranking)}"
+
+def recompute_final_ranking(arena_df):
+    # compute ranking based on CI
+    ranking = {}
+    for i, model_a in enumerate(arena_df.index):
+        ranking[model_a] = 1
+        for j, model_b in enumerate(arena_df.index):
+            if i == j:
+                continue
+            if arena_df.loc[model_b]["rating_q025"] > arena_df.loc[model_a]["rating_q975"]:
+                ranking[model_a] += 1
+    return list(ranking.values())
+
+def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
+    arena_df = arena_df.sort_values(by=["final_ranking", "rating"], ascending=[True, False])
+    arena_df = arena_df[arena_df["num_battles"] > 2000]
+    arena_df["final_ranking"] = recompute_final_ranking(arena_df)
+    arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
+
+    # arena_df["final_ranking"] = range(1, len(arena_df) + 1)
     # sort by rating
+    if arena_subset_df is not None:
+        # filter out models not in the arena_df
+        arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
+        arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
+        # arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
+        # arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
+        arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
+        # keep only the models in the subset in arena_df and recompute final_ranking
+        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
+        # recompute final ranking
+        arena_df["final_ranking"] = recompute_final_ranking(arena_df)
+
+        # assign ranking by the order
+        arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
+        arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
+        # join arena_df and arena_subset_df on index
+        arena_df = arena_subset_df.join(arena_df["final_ranking"], rsuffix="_global", how="inner")
+        arena_df["ranking_difference"] = arena_df["final_ranking_global"] - arena_df["final_ranking"]
+
+        # no tie version
+        # arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
+        # arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
+
+        arena_df = arena_df.sort_values(by=["final_ranking", "rating"], ascending=[True, False])
+        arena_df["final_ranking"] = arena_df.apply(lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]), axis=1)
+
     values = []
     for i in range(len(arena_df)):
         row = []
         model_key = arena_df.index[i]
-        # license
-        row.append(
-            model_table_df[model_table_df["key"] == model_key]["License"].values[0]
-        )
-        cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
-        if cutoff_date == "-":
-            row.append("Unknown")
-        else:
-            row.append(cutoff_date)
-        values.append(row)
-    return values
+        try:  # this is a janky fix for where the model key is not in the model table (model table and arena table dont contain all the same models)
+            model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
+                0
+            ]
+            # rank
+            ranking = arena_df.iloc[i].get("final_ranking") or i+1
+            row.append(ranking)
+            if arena_subset_df is not None:
+                row.append(arena_df.iloc[i].get("ranking_difference") or 0)
+            # model display name
+            row.append(model_name)
+            # elo rating
+            row.append(round(arena_df.iloc[i]["rating"]))
+            upper_diff = round(
+                arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
+            )
+            lower_diff = round(
+                arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
+            )
+            row.append(f"+{upper_diff}/-{lower_diff}")
+            # num battles
+            row.append(round(arena_df.iloc[i]["num_battles"]))
+            # Organization
+            row.append(
+                model_table_df[model_table_df["key"] == model_key]["Organization"].values[0]
+            )
+            # license
+            row.append(
+                model_table_df[model_table_df["key"] == model_key]["License"].values[0]
+            )
+            cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
+            if cutoff_date == "-":
+                row.append("Unknown")
+            else:
+                row.append(cutoff_date)
+            values.append(row)
+        except Exception as e:
+            print(f"{model_key} - {e}")
+    return values

+key_to_category_name = {
+    "full": "Overall",
+    "coding": "Coding",
+    "long_user": "Longer Query",
+    "english": "English",
+    "chinese": "Chinese",
+    "french": "French",
+    "no_tie": "Exclude Ties",
+    "no_short": "Exclude Short",
+}
+cat_name_to_explanation = {
+    "Overall": "Overall Questions",
+    "Coding": "Coding: whether conversation contains code snippets",
+    "Longer Query": "Longer Query (>= 500 tokens)",
+    "English": "English Prompts",
+    "Chinese": "Chinese Prompts",
+    "French": "French Prompts",
+    "Exclude Ties": "Exclude Ties and Bothbad",
+    "Exclude Short": "User Query >= 5 tokens",
+}


 def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
+    arena_dfs = {}
+    category_elo_results = {}
     if elo_results_file is None:  # Do live update
         default_md = "Loading ..."
         p1 = p2 = p3 = p4 = None
@@ -255,14 +345,19 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
         with open(elo_results_file, "rb") as fin:
             elo_results = pickle.load(fin)
         if "full" in elo_results:
+            print("KEYS ", elo_results.keys())
+            for k in elo_results.keys():
+                if k not in key_to_category_name:
+                    continue
+                arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
+                category_elo_results[key_to_category_name[k]] = elo_results[k]
+
+            p1 = category_elo_results["Overall"]["win_fraction_heatmap"]
+            p2 = category_elo_results["Overall"]["battle_count_heatmap"]
+            p3 = category_elo_results["Overall"]["bootstrap_elo_rating"]
+            p4 = category_elo_results["Overall"]["average_win_rate_bar"]
+            arena_df = arena_dfs["Overall"]
+            default_md = make_default_md(arena_df, category_elo_results["Overall"])

     md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
     if leaderboard_table_file:
@@ -274,8 +369,15 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
         arena_table_vals = get_arena_table(arena_df, model_table_df)
         with gr.Tab("Arena Elo", id=0):
             md = make_arena_leaderboard_md(arena_df)
-            gr.Markdown(md, elem_id="leaderboard_markdown")
-            gr.Dataframe(
+            leaderboard_markdown = gr.Markdown(md, elem_id="leaderboard_markdown")
+            with gr.Row():
+                with gr.Column(scale=2):
+                    category_dropdown = gr.Dropdown(choices=list(arena_dfs.keys()), label="Category", value="Overall")
+                default_category_details = make_category_arena_leaderboard_md(arena_df, arena_df, name="Overall")
+                with gr.Column(scale=4, variant="panel"):
+                    category_deets = gr.Markdown(default_category_details, elem_id="category_deets")
+
+            elo_display_df = gr.Dataframe(
                 headers=[
                     "Rank",
                     "🤖 Model",
@@ -287,7 +389,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                     "Knowledge Cutoff",
                 ],
                 datatype=[
+                    "number",
                     "markdown",
                     "number",
                     "str",
@@ -299,9 +401,48 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
                 value=arena_table_vals,
                 elem_id="arena_leaderboard_dataframe",
                 height=700,
+                column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
                 wrap=True,
             )
+
+            gr.Markdown(
+                f"""Note: we take the 95% confidence interval into account when determining a model's ranking.
+A model is ranked higher only if its lower bound of model score is higher than the upper bound of the other model's score.
+See Figure 3 below for visualization of the confidence intervals. More details in [notebook]({notebook_url}).
+""",
+                elem_id="leaderboard_markdown"
+            )
+
+            leader_component_values[:] = [default_md, p1, p2, p3, p4]
+
+            if show_plot:
+                more_stats_md = gr.Markdown(
+                    f"""## More Statistics for Chatbot Arena (Overall)""",
+                    elem_id="leaderboard_header_markdown"
+                )
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown(
+                            "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles", elem_id="plot-title"
+                        )
+                        plot_1 = gr.Plot(p1, show_label=False, elem_id="plot-container")
+                    with gr.Column():
+                        gr.Markdown(
+                            "#### Figure 2: Battle Count for Each Combination of Models (without Ties)", elem_id="plot-title"
+                        )
+                        plot_2 = gr.Plot(p2, show_label=False)
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown(
+                            "#### Figure 3: Confidence Intervals on Model Strength (via Bootstrapping)", elem_id="plot-title"
+                        )
+                        plot_3 = gr.Plot(p3, show_label=False)
+                    with gr.Column():
+                        gr.Markdown(
+                            "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)", elem_id="plot-title"
+                        )
+                        plot_4 = gr.Plot(p4, show_label=False)
+
         with gr.Tab("Full Leaderboard", id=1):
             md = make_full_leaderboard_md(elo_results)
             gr.Markdown(md, elem_id="leaderboard_markdown")
@@ -332,53 +473,121 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
     else:
         pass

+    def update_leaderboard_df(arena_table_vals):
+        elo_datarame = pd.DataFrame(arena_table_vals, columns=["Rank", "Delta", "🤖 Model", "⭐ Arena Elo", "📊 95% CI", "🗳️ Votes", "Organization", "License", "Knowledge Cutoff"])
+
+        # goal: color the rows based on the rank with styler
+        def highlight_max(s):
+            # all items in s which contain up arrow should be green, down arrow should be red, otherwise black
+            return ["color: green; font-weight: bold" if "\u2191" in v else "color: red; font-weight: bold" if "\u2193" in v else "" for v in s]
+
+        def highlight_rank_max(s):
+            return ["color: green; font-weight: bold" if v > 0 else "color: red; font-weight: bold" if v < 0 else "" for v in s]
+
+        return elo_datarame.style.apply(highlight_max, subset=["Rank"]).apply(highlight_rank_max, subset=["Delta"])
+
+    def update_leaderboard_and_plots(category):
+        arena_subset_df = arena_dfs[category]
+        arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
+        elo_subset_results = category_elo_results[category]
+        arena_df = arena_dfs["Overall"]
+        arena_values = get_arena_table(arena_df, model_table_df, arena_subset_df=arena_subset_df if category != "Overall" else None)
+        if category != "Overall":
+            arena_values = update_leaderboard_df(arena_values)
+            arena_values = gr.Dataframe(
+                headers=[
+                    "Rank",
+                    "Delta",
+                    "🤖 Model",
+                    "⭐ Arena Elo",
+                    "📊 95% CI",
+                    "🗳️ Votes",
+                    "Organization",
+                    "License",
+                    "Knowledge Cutoff",
+                ],
+                datatype=[
+                    "number",
+                    "number",
+                    "markdown",
+                    "number",
+                    "str",
+                    "number",
+                    "str",
+                    "str",
+                    "str",
+                ],
+                value=arena_values,
+                elem_id="arena_leaderboard_dataframe",
+                height=700,
+                column_widths=[60, 70, 190, 110, 100, 90, 160, 150, 140],
+                wrap=True,
+            )
+        else:
+            arena_values = gr.Dataframe(
+                headers=[
+                    "Rank",
+                    "🤖 Model",
+                    "⭐ Arena Elo",
+                    "📊 95% CI",
+                    "🗳️ Votes",
+                    "Organization",
+                    "License",
+                    "Knowledge Cutoff",
+                ],
+                datatype=[
+                    "number",
+                    "markdown",
+                    "number",
+                    "str",
+                    "number",
+                    "str",
+                    "str",
+                    "str",
+                ],
+                value=arena_values,
+                elem_id="arena_leaderboard_dataframe",
+                height=700,
+                column_widths=[70, 190, 110, 100, 90, 160, 150, 140],
+                wrap=True,
+            )

+        p1 = elo_subset_results["win_fraction_heatmap"]
+        p2 = elo_subset_results["battle_count_heatmap"]
+        p3 = elo_subset_results["bootstrap_elo_rating"]
+        p4 = elo_subset_results["average_win_rate_bar"]
+        more_stats_md = f"""## More Statistics for Chatbot Arena - {category}
+"""
+        leaderboard_md = make_category_arena_leaderboard_md(arena_df, arena_subset_df, name=category)
+        return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
+
+    category_dropdown.change(update_leaderboard_and_plots, inputs=[category_dropdown], outputs=[elo_display_df, plot_1, plot_2, plot_3, plot_4, more_stats_md, category_deets])
+
+    with gr.Accordion(
+        "📚 Citation",
+        open=True,
+    ):
+        citation_md = """
+### Citation
+Please cite the following paper if you find our leaderboard or dataset helpful.
+```
+@misc{chiang2024chatbot,
+    title={Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference},
+    author={Wei-Lin Chiang and Lianmin Zheng and Ying Sheng and Anastasios Nikolas Angelopoulos and Tianle Li and Dacheng Li and Hao Zhang and Banghua Zhu and Michael Jordan and Joseph E. Gonzalez and Ion Stoica},
+    year={2024},
+    eprint={2403.04132},
+    archivePrefix={arXiv},
+    primaryClass={cs.AI}
+}
+"""
+        gr.Markdown(citation_md, elem_id="leaderboard_markdown")
+    gr.Markdown(acknowledgment_md)

     if show_plot:
         return [md_1, plot_1, plot_2, plot_3, plot_4]
     return [md_1]

+
 block_css = """
 #notice_markdown {
     font-size: 104%
@@ -390,6 +599,13 @@ block_css = """
     padding-top: 6px;
     padding-bottom: 6px;
 }
+
+#category_deets {
+    text-align: center;
+    padding: 0px;
+    padding-left: 5px;
+}
+
 #leaderboard_markdown {
     font-size: 104%
 }
@@ -397,9 +613,34 @@ block_css = """
     padding-top: 6px;
     padding-bottom: 6px;
 }
+
+#leaderboard_header_markdown {
+    font-size: 104%;
+    text-align: center;
+    display:block;
+}
+
 #leaderboard_dataframe td {
     line-height: 0.1em;
 }
+
+#plot-title {
+    text-align: center;
+    display:block;
+}
+
+#non-interactive-button {
+    display: inline-block;
+    padding: 10px 10px;
+    background-color: #f7f7f7; /* Super light grey background */
+    text-align: center;
+    font-size: 26px; /* Larger text */
+    border-radius: 0; /* Straight edges, no border radius */
+    border: 0px solid #dcdcdc; /* A light grey border to match the background */
+    user-select: none; /* The text inside the button is not selectable */
+    pointer-events: none; /* The button is non-interactive */
+}
+
 footer {
     display:none !important
 }
@@ -429,10 +670,13 @@ We thank [Kaggle](https://www.kaggle.com/), [MBZUAI](https://mbzuai.ac.ae/),

 def build_demo(elo_results_file, leaderboard_table_file):
     text_size = gr.themes.sizes.text_lg
+    theme = gr.themes.Base(text_size=text_size)
+    theme.set(button_secondary_background_fill_hover="*primary_300",
+              button_secondary_background_fill_hover_dark="*primary_700")
     with gr.Blocks(
         title="Chatbot Arena Leaderboard",
+        theme=theme,
+        # theme = gr.themes.Base.load("theme.json"),  # uncomment to use new cool theme
         css=block_css,
     ) as demo:
         leader_components = build_leaderboard_tab(
@@ -444,6 +688,8 @@ def build_demo(elo_results_file, leaderboard_table_file):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true")
+    parser.add_argument("--host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=7860)
     args = parser.parse_args()

     elo_result_files = glob.glob("elo_results_*.pkl")
@@ -455,4 +701,4 @@ if __name__ == "__main__":
     leaderboard_table_file = leaderboard_table_files[-1]

     demo = build_demo(elo_result_file, leaderboard_table_file)
-    demo.launch(share=args.share)
+    demo.launch(share=args.share, server_name=args.host, server_port=args.port)
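The core of the per-category ranking above is `recompute_final_ranking`: a model's rank is 1 plus the number of models whose lower confidence bound (`rating_q025`) exceeds its upper bound (`rating_q975`), so models with overlapping intervals tie. A minimal standalone sketch of that rule follows; the column names come from the diff, but the ratings below are purely illustrative:

```python
import pandas as pd

def recompute_final_ranking(arena_df):
    # rank = 1 + number of models whose 2.5% quantile beats this model's 97.5% quantile
    ranking = {}
    for model_a in arena_df.index:
        ranking[model_a] = 1
        for model_b in arena_df.index:
            if model_a == model_b:
                continue
            if arena_df.loc[model_b, "rating_q025"] > arena_df.loc[model_a, "rating_q975"]:
                ranking[model_a] += 1
    return list(ranking.values())

# Illustrative intervals: A and B overlap, so both get rank 1; C sits clearly below both.
toy = pd.DataFrame(
    {"rating_q025": [1240, 1225, 1150], "rating_q975": [1265, 1250, 1180]},
    index=["model_a", "model_b", "model_c"],
)
print(recompute_final_ranking(toy))  # [1, 1, 3]
```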
elo_results_20240329.pkl → elo_results_20240327.pkl
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bab4e9fa00e9d7c8244723993174af2c4f35ffc8487cc3059504b72658f06f43
+size 457743

elo_results_20240403.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce8cebf41da8c06eee0f37156e01be83cc43182e0f00444311b4ad97a83154be
+size 690286

elo_results_20240409.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6774f780b63f569666e9a85b12eddceef3af75e1d1799ff7c6e0529102950c3
+size 119947

elo_results_20240410.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a5757ab9692c6121451f2c787700507fe6b866837329ab0a47a9003a274338f
+size 120963

elo_results_20240411.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fada8d86ddb6dae319c5bda602d921859cc4280fdd53388eff446d80c3ab8192
+size 1183214
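The `elo_results_*.pkl` entries above are git-lfs pointer files; the actual pickles hold a dict of per-category results that `app.py` reads at startup (keys such as "full" and "coding", each with a `leaderboard_table_df` and the four plots). A rough sketch of that loading path, assuming the date-stamped file names added in this PR and the dict layout the diff expects:

```python
import glob
import pickle

key_to_category_name = {"full": "Overall", "coding": "Coding", "long_user": "Longer Query"}

# Pick the newest results file; the zero-padded date suffix makes lexicographic sort sufficient.
elo_result_files = sorted(glob.glob("elo_results_*.pkl"))
with open(elo_result_files[-1], "rb") as fin:
    elo_results = pickle.load(fin)

# Each category entry carries a leaderboard table plus the plots shown in the UI.
for key, category in key_to_category_name.items():
    if key in elo_results:
        df = elo_results[key]["leaderboard_table_df"]
        print(category, len(df), "models")
```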
leaderboard_table_20240329.csv → leaderboard_table_20240404.csv
RENAMED
@@ -88,4 +88,7 @@ codellama-70b-instruct,CodeLlama-70B-instruct,-,-,2024/1,Llama 2 Community,Meta,
 olmo-7b-instruct,OLMo-7B-instruct,-,-,2024/2,Apache-2.0,Allen AI,https://huggingface.co/allenai/OLMo-7B-Instruct
 claude-3-haiku-20240307,Claude 3 Haiku,-,0.752,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
 starling-lm-7b-beta,Starling-LM-7B-beta,8.12,-,2024/3,Apache-2.0,Nexusflow,https://huggingface.co/Nexusflow/Starling-LM-7B-beta
+dbrx-instruct,DBRX-instruct,-,-,2024/3,Apache-2.0,Databricks,-
+command-r,Command R,-,-,2024/3,Apache-2.0,Cohere,-
+qwen1.5-14b-chat,Qwen1.5-14B-Chat,-,-,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
+qwen1.5-32b-chat,Qwen1.5-32B-Chat,-,-,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
leaderboard_table_20240409.csv
ADDED
@@ -0,0 +1,95 @@
+key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link
+wizardlm-30b,WizardLM-30B,7.01,0.587,2023/6,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-30B-V1.0
+vicuna-13b-16k,Vicuna-13B-16k,6.92,0.545,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5-16k
+wizardlm-13b-v1.1,WizardLM-13B-v1.1,6.76,0.500,2023/7,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.1
+tulu-30b,Tulu-30B,6.43,0.581,2023/6,Non-commercial,AllenAI/UW,https://huggingface.co/allenai/tulu-30b
+guanaco-65b,Guanaco-65B,6.41,0.621,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-65b-merged
+openassistant-llama-30b,OpenAssistant-LLaMA-30B,6.41,0.560,2023/4,Non-commercial,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-6-llama-30b-xor
+wizardlm-13b-v1.0,WizardLM-13B-v1.0,6.35,0.523,2023/5,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.0
+vicuna-7b-16k,Vicuna-7B-16k,6.22,0.485,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5-16k
+baize-v2-13b,Baize-v2-13B,5.75,0.489,2023/4,Non-commercial,UCSD,https://huggingface.co/project-baize/baize-v2-13b
+xgen-7b-8k-inst,XGen-7B-8K-Inst,5.55,0.421,2023/7,Non-commercial,Salesforce,https://huggingface.co/Salesforce/xgen-7b-8k-inst
+nous-hermes-13b,Nous-Hermes-13B,5.51,0.493,2023/6,Non-commercial,NousResearch,https://huggingface.co/NousResearch/Nous-Hermes-13b
+mpt-30b-instruct,MPT-30B-Instruct,5.22,0.478,2023/6,CC-BY-SA 3.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-instruct
+falcon-40b-instruct,Falcon-40B-Instruct,5.17,0.547,2023/5,Apache 2.0,TII,https://huggingface.co/tiiuae/falcon-40b-instruct
+h2o-oasst-openllama-13b,H2O-Oasst-OpenLLaMA-13B,4.63,0.428,2023/6,Apache 2.0,h2oai,https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-13b
+gpt-4-1106-preview,GPT-4-1106-preview,9.32,-,2023/4,Proprietary,OpenAI,https://openai.com/blog/new-models-and-developer-products-announced-at-devday
+gpt-4-0314,GPT-4-0314,8.96,0.864,2021/9,Proprietary,OpenAI,https://openai.com/research/gpt-4
+claude-1,Claude-1,7.90,0.770,-,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
+gpt-4-0613,GPT-4-0613,9.18,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
+claude-2.0,Claude-2.0,8.06,0.785,-,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2
+claude-2.1,Claude-2.1,8.18,-,-,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2-1
+gpt-3.5-turbo-0613,GPT-3.5-Turbo-0613,8.39,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
+mixtral-8x7b-instruct-v0.1,Mixtral-8x7b-Instruct-v0.1,8.30,0.706,2023/12,Apache 2.0,Mistral,https://mistral.ai/news/mixtral-of-experts/
+claude-instant-1,Claude-Instant-1,7.85,0.734,-,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
+gpt-3.5-turbo-0314,GPT-3.5-Turbo-0314,7.94,0.700,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
+tulu-2-dpo-70b,Tulu-2-DPO-70B,7.89,-,2023/11,AI2 ImpACT Low-risk,AllenAI/UW,https://huggingface.co/allenai/tulu-2-dpo-70b
+yi-34b-chat,Yi-34B-Chat,-,0.735,2023/6,Yi License,01 AI,https://huggingface.co/01-ai/Yi-34B-Chat
+gemini-pro,Gemini Pro,-,0.718,2023/4,Proprietary,Google,https://blog.google/technology/ai/gemini-api-developers-cloud/
+gemini-pro-dev-api,Gemini Pro (Dev API),-,0.718,2023/4,Proprietary,Google,https://ai.google.dev/docs/gemini_api_overview
+bard-jan-24-gemini-pro,Bard (Gemini Pro),-,-,Online,Proprietary,Google,https://bard.google.com/
+wizardlm-70b,WizardLM-70B-v1.0,7.71,0.637,2023/8,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-70B-V1.0
+vicuna-33b,Vicuna-33B,7.12,0.592,2023/8,Non-commercial,LMSYS,https://huggingface.co/lmsys/vicuna-33b-v1.3
+starling-lm-7b-alpha,Starling-LM-7B-alpha,8.09,0.639,2023/11,CC-BY-NC-4.0,UC Berkeley,https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
+pplx-70b-online,pplx-70b-online,-,-,Online,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
+openchat-3.5,OpenChat-3.5,7.81,0.643,2023/11,Apache-2.0,OpenChat,https://huggingface.co/openchat/openchat_3.5
+openhermes-2.5-mistral-7b,OpenHermes-2.5-Mistral-7b,-,-,2023/11,Apache-2.0,NousResearch,https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B
+gpt-3.5-turbo-1106,GPT-3.5-Turbo-1106,8.32,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
+llama-2-70b-chat,Llama-2-70b-chat,6.86,0.630,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+solar-10.7b-instruct-v1.0,SOLAR-10.7B-Instruct-v1.0,7.58,0.662,2023/11,CC-BY-NC-4.0,Upstage AI,https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0
+dolphin-2.2.1-mistral-7b,Dolphin-2.2.1-Mistral-7B,-,-,2023/10,Apache-2.0,Cognitive Computations,https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b
+wizardlm-13b,WizardLM-13b-v1.2,7.20,0.527,2023/7,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.2
+zephyr-7b-beta,Zephyr-7b-beta,7.34,0.614,2023/10,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
+mpt-30b-chat,MPT-30B-chat,6.39,0.504,2023/6,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-chat
+vicuna-13b,Vicuna-13B,6.57,0.558,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5
+qwen-14b-chat,Qwen-14B-Chat,6.96,0.665,2023/8,Qianwen LICENSE,Alibaba,https://huggingface.co/Qwen/Qwen-14B-Chat
+zephyr-7b-alpha,Zephyr-7b-alpha,6.88,-,2023/10,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha
+codellama-34b-instruct,CodeLlama-34B-instruct,-,0.537,2023/7,Llama 2 Community,Meta,https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf
+falcon-180b-chat,falcon-180b-chat,-,0.680,2023/9,Falcon-180B TII License,TII,https://huggingface.co/tiiuae/falcon-180B-chat
+guanaco-33b,Guanaco-33B,6.53,0.576,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-33b-merged
+llama-2-13b-chat,Llama-2-13b-chat,6.65,0.536,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-13b-chat-hf
+mistral-7b-instruct,Mistral-7B-Instruct-v0.1,6.84,0.554,2023/9,Apache 2.0,Mistral,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
+pplx-7b-online,pplx-7b-online,-,-,Online,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
+llama-2-7b-chat,Llama-2-7b-chat,6.27,0.458,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+vicuna-7b,Vicuna-7B,6.17,0.498,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5
+palm-2,PaLM-Chat-Bison-001,6.40,-,2021/6,Proprietary,Google,https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models#foundation_models
+koala-13b,Koala-13B,5.35,0.447,2023/4,Non-commercial,UC Berkeley,https://bair.berkeley.edu/blog/2023/04/03/koala/
+chatglm3-6b,ChatGLM3-6B,-,-,2023/10,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm3-6b
+gpt4all-13b-snoozy,GPT4All-13B-Snoozy,5.41,0.430,2023/3,Non-commercial,Nomic AI,https://huggingface.co/nomic-ai/gpt4all-13b-snoozy
+mpt-7b-chat,MPT-7B-Chat,5.42,0.320,2023/5,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-7b-chat
+chatglm2-6b,ChatGLM2-6B,4.96,0.455,2023/6,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm2-6b
+RWKV-4-Raven-14B,RWKV-4-Raven-14B,3.98,0.256,2023/4,Apache 2.0,RWKV,https://huggingface.co/BlinkDL/rwkv-4-raven
+alpaca-13b,Alpaca-13B,4.53,0.481,2023/3,Non-commercial,Stanford,https://crfm.stanford.edu/2023/03/13/alpaca.html
+oasst-pythia-12b,OpenAssistant-Pythia-12B,4.32,0.270,2023/4,Apache 2.0,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5
+chatglm-6b,ChatGLM-6B,4.50,0.361,2023/3,Non-commercial,Tsinghua,https://huggingface.co/THUDM/chatglm-6b
+fastchat-t5-3b,FastChat-T5-3B,3.04,0.477,2023/4,Apache 2.0,LMSYS,https://huggingface.co/lmsys/fastchat-t5-3b-v1.0
+stablelm-tuned-alpha-7b,StableLM-Tuned-Alpha-7B,2.75,0.244,2023/4,CC-BY-NC-SA-4.0,Stability AI,https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b
+dolly-v2-12b,Dolly-V2-12B,3.28,0.257,2023/4,MIT,Databricks,https://huggingface.co/databricks/dolly-v2-12b
+llama-13b,LLaMA-13B,2.61,0.470,2023/2,Non-commercial,Meta,https://arxiv.org/abs/2302.13971
+mistral-medium,Mistral Medium,8.61,0.753,-,Proprietary,Mistral,https://mistral.ai/news/la-plateforme/
+llama2-70b-steerlm-chat,NV-Llama2-70B-SteerLM-Chat,7.54,0.685,2023/11,Llama 2 Community,Nvidia,https://huggingface.co/nvidia/Llama2-70B-SteerLM-Chat
+stripedhyena-nous-7b,StripedHyena-Nous-7B,-,-,2023/12,Apache 2.0,Together AI,https://huggingface.co/togethercomputer/StripedHyena-Nous-7B
+deepseek-llm-67b-chat,DeepSeek-LLM-67B-Chat,-,0.713,2023/11,DeepSeek License,DeepSeek AI,https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat
+gpt-4-0125-preview,GPT-4-0125-preview,-,-,2023/12,Proprietary,OpenAI,https://openai.com/blog/new-models-and-developer-products-announced-at-devday
+qwen1.5-72b-chat,Qwen1.5-72B-Chat,8.61,0.775,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
+qwen1.5-7b-chat,Qwen1.5-7B-Chat,7.6,0.610,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
+qwen1.5-4b-chat,Qwen1.5-4B-Chat,-,0.561,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
+openchat-3.5-0106,OpenChat-3.5-0106,7.8,0.658,2024/1,Apache-2.0,OpenChat,https://huggingface.co/openchat/openchat-3.5-0106
+nous-hermes-2-mixtral-8x7b-dpo,Nous-Hermes-2-Mixtral-8x7B-DPO,-,-,2024/1,Apache-2.0,NousResearch,https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO
+gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125,-,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5-turbo
+mistral-next,Mistral-Next,-,-,-,Proprietary,Mistral,https://chat.mistral.ai/chat
+mistral-large-2402,Mistral-Large-2402,-,0.812,-,Proprietary,Mistral,https://mistral.ai/news/mistral-large/
+gemma-7b-it,Gemma-7B-it,-,0.643,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-7b-it
+gemma-2b-it,Gemma-2B-it,-,0.423,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-2b-it
+mistral-7b-instruct-v0.2,Mistral-7B-Instruct-v0.2,7.6,-,2023/12,Apache-2.0,Mistral,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
+claude-3-sonnet-20240229,Claude 3 Sonnet,-,0.790,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
+claude-3-opus-20240229,Claude 3 Opus,-,0.868,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
+codellama-70b-instruct,CodeLlama-70B-instruct,-,-,2024/1,Llama 2 Community,Meta,https://huggingface.co/codellama/CodeLlama-70b-hf
+olmo-7b-instruct,OLMo-7B-instruct,-,-,2024/2,Apache-2.0,Allen AI,https://huggingface.co/allenai/OLMo-7B-Instruct
+claude-3-haiku-20240307,Claude 3 Haiku,-,0.752,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
+starling-lm-7b-beta,Starling-LM-7B-beta,8.12,-,2024/3,Apache-2.0,Nexusflow,https://huggingface.co/Nexusflow/Starling-LM-7B-beta
+command-r,Command R,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r
+qwen1.5-14b-chat,Qwen1.5-14B-Chat,7.91,0.676,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
+qwen1.5-32b-chat,Qwen1.5-32B-Chat,8.30,0.734,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5-32b/
+command-r-plus,Command R+,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r-plus-microsoft-azure/
+gemma-1.1-7b-it,Gemma-1.1-7B-it,-,0.643,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-1.1-7b-it
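These CSVs carry the per-model metadata that `get_arena_table` looks up by the `key` column (Model, Organization, License, Knowledge cutoff date). A small sketch of that lookup, assuming the file name added in this PR is available locally; the helper `model_metadata` is hypothetical and only mirrors the `model_table_df[model_table_df["key"] == model_key]` pattern in app.py:

```python
import pandas as pd

model_table_df = pd.read_csv("leaderboard_table_20240409.csv")

def model_metadata(model_key):
    # Same per-column lookups app.py performs when building a leaderboard row.
    row = model_table_df[model_table_df["key"] == model_key]
    if row.empty:
        return None  # app.py guards this case with a try/except around the row build
    cutoff = row["Knowledge cutoff date"].values[0]
    return {
        "Model": row["Model"].values[0],
        "Organization": row["Organization"].values[0],
        "License": row["License"].values[0],
        "Knowledge Cutoff": "Unknown" if cutoff == "-" else cutoff,
    }

print(model_metadata("starling-lm-7b-beta"))
```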
leaderboard_table_20240410.csv
ADDED
@@ -0,0 +1,96 @@
+key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link
+wizardlm-30b,WizardLM-30B,7.01,0.587,2023/6,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-30B-V1.0
+vicuna-13b-16k,Vicuna-13B-16k,6.92,0.545,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5-16k
+wizardlm-13b-v1.1,WizardLM-13B-v1.1,6.76,0.500,2023/7,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.1
+tulu-30b,Tulu-30B,6.43,0.581,2023/6,Non-commercial,AllenAI/UW,https://huggingface.co/allenai/tulu-30b
+guanaco-65b,Guanaco-65B,6.41,0.621,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-65b-merged
+openassistant-llama-30b,OpenAssistant-LLaMA-30B,6.41,0.560,2023/4,Non-commercial,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-6-llama-30b-xor
+wizardlm-13b-v1.0,WizardLM-13B-v1.0,6.35,0.523,2023/5,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.0
+vicuna-7b-16k,Vicuna-7B-16k,6.22,0.485,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5-16k
+baize-v2-13b,Baize-v2-13B,5.75,0.489,2023/4,Non-commercial,UCSD,https://huggingface.co/project-baize/baize-v2-13b
+xgen-7b-8k-inst,XGen-7B-8K-Inst,5.55,0.421,2023/7,Non-commercial,Salesforce,https://huggingface.co/Salesforce/xgen-7b-8k-inst
+nous-hermes-13b,Nous-Hermes-13B,5.51,0.493,2023/6,Non-commercial,NousResearch,https://huggingface.co/NousResearch/Nous-Hermes-13b
+mpt-30b-instruct,MPT-30B-Instruct,5.22,0.478,2023/6,CC-BY-SA 3.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-instruct
+falcon-40b-instruct,Falcon-40B-Instruct,5.17,0.547,2023/5,Apache 2.0,TII,https://huggingface.co/tiiuae/falcon-40b-instruct
+h2o-oasst-openllama-13b,H2O-Oasst-OpenLLaMA-13B,4.63,0.428,2023/6,Apache 2.0,h2oai,https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-13b
+gpt-4-1106-preview,GPT-4-1106-preview,9.32,-,2023/4,Proprietary,OpenAI,https://openai.com/blog/new-models-and-developer-products-announced-at-devday
+gpt-4-0314,GPT-4-0314,8.96,0.864,2021/9,Proprietary,OpenAI,https://openai.com/research/gpt-4
+claude-1,Claude-1,7.90,0.770,-,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
+gpt-4-0613,GPT-4-0613,9.18,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
+claude-2.0,Claude-2.0,8.06,0.785,-,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2
+claude-2.1,Claude-2.1,8.18,-,-,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2-1
+gpt-3.5-turbo-0613,GPT-3.5-Turbo-0613,8.39,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
+mixtral-8x7b-instruct-v0.1,Mixtral-8x7b-Instruct-v0.1,8.30,0.706,2023/12,Apache 2.0,Mistral,https://mistral.ai/news/mixtral-of-experts/
+claude-instant-1,Claude-Instant-1,7.85,0.734,-,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
+gpt-3.5-turbo-0314,GPT-3.5-Turbo-0314,7.94,0.700,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
+tulu-2-dpo-70b,Tulu-2-DPO-70B,7.89,-,2023/11,AI2 ImpACT Low-risk,AllenAI/UW,https://huggingface.co/allenai/tulu-2-dpo-70b
+yi-34b-chat,Yi-34B-Chat,-,0.735,2023/6,Yi License,01 AI,https://huggingface.co/01-ai/Yi-34B-Chat
+gemini-pro,Gemini Pro,-,0.718,2023/4,Proprietary,Google,https://blog.google/technology/ai/gemini-api-developers-cloud/
+gemini-pro-dev-api,Gemini Pro (Dev API),-,0.718,2023/4,Proprietary,Google,https://ai.google.dev/docs/gemini_api_overview
+bard-jan-24-gemini-pro,Bard (Gemini Pro),-,-,Online,Proprietary,Google,https://bard.google.com/
+wizardlm-70b,WizardLM-70B-v1.0,7.71,0.637,2023/8,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-70B-V1.0
+vicuna-33b,Vicuna-33B,7.12,0.592,2023/8,Non-commercial,LMSYS,https://huggingface.co/lmsys/vicuna-33b-v1.3
+starling-lm-7b-alpha,Starling-LM-7B-alpha,8.09,0.639,2023/11,CC-BY-NC-4.0,UC Berkeley,https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
+pplx-70b-online,pplx-70b-online,-,-,Online,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
+openchat-3.5,OpenChat-3.5,7.81,0.643,2023/11,Apache-2.0,OpenChat,https://huggingface.co/openchat/openchat_3.5
+openhermes-2.5-mistral-7b,OpenHermes-2.5-Mistral-7b,-,-,2023/11,Apache-2.0,NousResearch,https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B
+gpt-3.5-turbo-1106,GPT-3.5-Turbo-1106,8.32,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
+llama-2-70b-chat,Llama-2-70b-chat,6.86,0.630,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+solar-10.7b-instruct-v1.0,SOLAR-10.7B-Instruct-v1.0,7.58,0.662,2023/11,CC-BY-NC-4.0,Upstage AI,https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0
+dolphin-2.2.1-mistral-7b,Dolphin-2.2.1-Mistral-7B,-,-,2023/10,Apache-2.0,Cognitive Computations,https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b
+wizardlm-13b,WizardLM-13b-v1.2,7.20,0.527,2023/7,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.2
+zephyr-7b-beta,Zephyr-7b-beta,7.34,0.614,2023/10,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
+mpt-30b-chat,MPT-30B-chat,6.39,0.504,2023/6,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-chat
+vicuna-13b,Vicuna-13B,6.57,0.558,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5
+qwen-14b-chat,Qwen-14B-Chat,6.96,0.665,2023/8,Qianwen LICENSE,Alibaba,https://huggingface.co/Qwen/Qwen-14B-Chat
+zephyr-7b-alpha,Zephyr-7b-alpha,6.88,-,2023/10,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha
+codellama-34b-instruct,CodeLlama-34B-instruct,-,0.537,2023/7,Llama 2 Community,Meta,https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf
+falcon-180b-chat,falcon-180b-chat,-,0.680,2023/9,Falcon-180B TII License,TII,https://huggingface.co/tiiuae/falcon-180B-chat
+guanaco-33b,Guanaco-33B,6.53,0.576,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-33b-merged
+llama-2-13b-chat,Llama-2-13b-chat,6.65,0.536,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-13b-chat-hf
+mistral-7b-instruct,Mistral-7B-Instruct-v0.1,6.84,0.554,2023/9,Apache 2.0,Mistral,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
+pplx-7b-online,pplx-7b-online,-,-,Online,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
+llama-2-7b-chat,Llama-2-7b-chat,6.27,0.458,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+vicuna-7b,Vicuna-7B,6.17,0.498,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5
+palm-2,PaLM-Chat-Bison-001,6.40,-,2021/6,Proprietary,Google,https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models#foundation_models
+koala-13b,Koala-13B,5.35,0.447,2023/4,Non-commercial,UC Berkeley,https://bair.berkeley.edu/blog/2023/04/03/koala/
+chatglm3-6b,ChatGLM3-6B,-,-,2023/10,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm3-6b
+gpt4all-13b-snoozy,GPT4All-13B-Snoozy,5.41,0.430,2023/3,Non-commercial,Nomic AI,https://huggingface.co/nomic-ai/gpt4all-13b-snoozy
+mpt-7b-chat,MPT-7B-Chat,5.42,0.320,2023/5,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-7b-chat
+chatglm2-6b,ChatGLM2-6B,4.96,0.455,2023/6,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm2-6b
+RWKV-4-Raven-14B,RWKV-4-Raven-14B,3.98,0.256,2023/4,Apache 2.0,RWKV,https://huggingface.co/BlinkDL/rwkv-4-raven
+alpaca-13b,Alpaca-13B,4.53,0.481,2023/3,Non-commercial,Stanford,https://crfm.stanford.edu/2023/03/13/alpaca.html
+oasst-pythia-12b,OpenAssistant-Pythia-12B,4.32,0.270,2023/4,Apache 2.0,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5
+chatglm-6b,ChatGLM-6B,4.50,0.361,2023/3,Non-commercial,Tsinghua,https://huggingface.co/THUDM/chatglm-6b
+fastchat-t5-3b,FastChat-T5-3B,3.04,0.477,2023/4,Apache 2.0,LMSYS,https://huggingface.co/lmsys/fastchat-t5-3b-v1.0
+stablelm-tuned-alpha-7b,StableLM-Tuned-Alpha-7B,2.75,0.244,2023/4,CC-BY-NC-SA-4.0,Stability AI,https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b
+dolly-v2-12b,Dolly-V2-12B,3.28,0.257,2023/4,MIT,Databricks,https://huggingface.co/databricks/dolly-v2-12b
+llama-13b,LLaMA-13B,2.61,0.470,2023/2,Non-commercial,Meta,https://arxiv.org/abs/2302.13971
+mistral-medium,Mistral Medium,8.61,0.753,-,Proprietary,Mistral,https://mistral.ai/news/la-plateforme/
+llama2-70b-steerlm-chat,NV-Llama2-70B-SteerLM-Chat,7.54,0.685,2023/11,Llama 2 Community,Nvidia,https://huggingface.co/nvidia/Llama2-70B-SteerLM-Chat
+stripedhyena-nous-7b,StripedHyena-Nous-7B,-,-,2023/12,Apache 2.0,Together AI,https://huggingface.co/togethercomputer/StripedHyena-Nous-7B
+deepseek-llm-67b-chat,DeepSeek-LLM-67B-Chat,-,0.713,2023/11,DeepSeek License,DeepSeek AI,https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat
+gpt-4-0125-preview,GPT-4-0125-preview,-,-,2023/12,Proprietary,OpenAI,https://openai.com/blog/new-models-and-developer-products-announced-at-devday
+qwen1.5-72b-chat,Qwen1.5-72B-Chat,8.61,0.775,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
+qwen1.5-7b-chat,Qwen1.5-7B-Chat,7.6,0.610,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
+qwen1.5-4b-chat,Qwen1.5-4B-Chat,-,0.561,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
+openchat-3.5-0106,OpenChat-3.5-0106,7.8,0.658,2024/1,Apache-2.0,OpenChat,https://huggingface.co/openchat/openchat-3.5-0106
+nous-hermes-2-mixtral-8x7b-dpo,Nous-Hermes-2-Mixtral-8x7B-DPO,-,-,2024/1,Apache-2.0,NousResearch,https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO
+gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125,-,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5-turbo
+mistral-next,Mistral-Next,-,-,-,Proprietary,Mistral,https://chat.mistral.ai/chat
+mistral-large-2402,Mistral-Large-2402,-,0.812,-,Proprietary,Mistral,https://mistral.ai/news/mistral-large/
+gemma-7b-it,Gemma-7B-it,-,0.643,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-7b-it
+gemma-2b-it,Gemma-2B-it,-,0.423,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-2b-it
+mistral-7b-instruct-v0.2,Mistral-7B-Instruct-v0.2,7.6,-,2023/12,Apache-2.0,Mistral,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
+claude-3-sonnet-20240229,Claude 3 Sonnet,-,0.790,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
+claude-3-opus-20240229,Claude 3 Opus,-,0.868,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
+codellama-70b-instruct,CodeLlama-70B-instruct,-,-,2024/1,Llama 2 Community,Meta,https://huggingface.co/codellama/CodeLlama-70b-hf
+olmo-7b-instruct,OLMo-7B-instruct,-,-,2024/2,Apache-2.0,Allen AI,https://huggingface.co/allenai/OLMo-7B-Instruct
+claude-3-haiku-20240307,Claude 3 Haiku,-,0.752,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
+starling-lm-7b-beta,Starling-LM-7B-beta,8.12,-,2024/3,Apache-2.0,Nexusflow,https://huggingface.co/Nexusflow/Starling-LM-7B-beta
+command-r,Command R,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r
+qwen1.5-14b-chat,Qwen1.5-14B-Chat,7.91,0.676,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
+qwen1.5-32b-chat,Qwen1.5-32B-Chat,8.30,0.734,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5-32b/
+command-r-plus,Command R+,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r-plus-microsoft-azure/
+gemma-1.1-7b-it,Gemma-1.1-7B-it,-,0.643,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-1.1-7b-it
+dbrx-instruct,DBRX-Instruct-Preview,-,0.737,2023/12,DBRX LICENSE,Databricks,https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm
leaderboard_table_20240411.csv
ADDED
@@ -0,0 +1,97 @@
key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link
wizardlm-30b,WizardLM-30B,7.01,0.587,2023/6,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-30B-V1.0
vicuna-13b-16k,Vicuna-13B-16k,6.92,0.545,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5-16k
wizardlm-13b-v1.1,WizardLM-13B-v1.1,6.76,0.500,2023/7,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.1
tulu-30b,Tulu-30B,6.43,0.581,2023/6,Non-commercial,AllenAI/UW,https://huggingface.co/allenai/tulu-30b
guanaco-65b,Guanaco-65B,6.41,0.621,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-65b-merged
openassistant-llama-30b,OpenAssistant-LLaMA-30B,6.41,0.560,2023/4,Non-commercial,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-6-llama-30b-xor
wizardlm-13b-v1.0,WizardLM-13B-v1.0,6.35,0.523,2023/5,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.0
vicuna-7b-16k,Vicuna-7B-16k,6.22,0.485,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5-16k
baize-v2-13b,Baize-v2-13B,5.75,0.489,2023/4,Non-commercial,UCSD,https://huggingface.co/project-baize/baize-v2-13b
xgen-7b-8k-inst,XGen-7B-8K-Inst,5.55,0.421,2023/7,Non-commercial,Salesforce,https://huggingface.co/Salesforce/xgen-7b-8k-inst
nous-hermes-13b,Nous-Hermes-13B,5.51,0.493,2023/6,Non-commercial,NousResearch,https://huggingface.co/NousResearch/Nous-Hermes-13b
mpt-30b-instruct,MPT-30B-Instruct,5.22,0.478,2023/6,CC-BY-SA 3.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-instruct
falcon-40b-instruct,Falcon-40B-Instruct,5.17,0.547,2023/5,Apache 2.0,TII,https://huggingface.co/tiiuae/falcon-40b-instruct
h2o-oasst-openllama-13b,H2O-Oasst-OpenLLaMA-13B,4.63,0.428,2023/6,Apache 2.0,h2oai,https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-13b
gpt-4-1106-preview,GPT-4-1106-preview,9.32,-,2023/4,Proprietary,OpenAI,https://openai.com/blog/new-models-and-developer-products-announced-at-devday
gpt-4-0314,GPT-4-0314,8.96,0.864,2021/9,Proprietary,OpenAI,https://openai.com/research/gpt-4
claude-1,Claude-1,7.90,0.770,-,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
gpt-4-0613,GPT-4-0613,9.18,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
claude-2.0,Claude-2.0,8.06,0.785,-,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2
claude-2.1,Claude-2.1,8.18,-,-,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2-1
gpt-3.5-turbo-0613,GPT-3.5-Turbo-0613,8.39,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
mixtral-8x7b-instruct-v0.1,Mixtral-8x7b-Instruct-v0.1,8.30,0.706,2023/12,Apache 2.0,Mistral,https://mistral.ai/news/mixtral-of-experts/
claude-instant-1,Claude-Instant-1,7.85,0.734,-,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
gpt-3.5-turbo-0314,GPT-3.5-Turbo-0314,7.94,0.700,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
tulu-2-dpo-70b,Tulu-2-DPO-70B,7.89,-,2023/11,AI2 ImpACT Low-risk,AllenAI/UW,https://huggingface.co/allenai/tulu-2-dpo-70b
yi-34b-chat,Yi-34B-Chat,-,0.735,2023/6,Yi License,01 AI,https://huggingface.co/01-ai/Yi-34B-Chat
gemini-pro,Gemini Pro,-,0.718,2023/4,Proprietary,Google,https://blog.google/technology/ai/gemini-api-developers-cloud/
gemini-pro-dev-api,Gemini Pro (Dev API),-,0.718,2023/4,Proprietary,Google,https://ai.google.dev/docs/gemini_api_overview
bard-jan-24-gemini-pro,Bard (Gemini Pro),-,-,Online,Proprietary,Google,https://bard.google.com/
wizardlm-70b,WizardLM-70B-v1.0,7.71,0.637,2023/8,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-70B-V1.0
vicuna-33b,Vicuna-33B,7.12,0.592,2023/8,Non-commercial,LMSYS,https://huggingface.co/lmsys/vicuna-33b-v1.3
starling-lm-7b-alpha,Starling-LM-7B-alpha,8.09,0.639,2023/11,CC-BY-NC-4.0,UC Berkeley,https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
pplx-70b-online,pplx-70b-online,-,-,Online,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
openchat-3.5,OpenChat-3.5,7.81,0.643,2023/11,Apache-2.0,OpenChat,https://huggingface.co/openchat/openchat_3.5
openhermes-2.5-mistral-7b,OpenHermes-2.5-Mistral-7b,-,-,2023/11,Apache-2.0,NousResearch,https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B
gpt-3.5-turbo-1106,GPT-3.5-Turbo-1106,8.32,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
llama-2-70b-chat,Llama-2-70b-chat,6.86,0.630,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
solar-10.7b-instruct-v1.0,SOLAR-10.7B-Instruct-v1.0,7.58,0.662,2023/11,CC-BY-NC-4.0,Upstage AI,https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0
dolphin-2.2.1-mistral-7b,Dolphin-2.2.1-Mistral-7B,-,-,2023/10,Apache-2.0,Cognitive Computations,https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b
wizardlm-13b,WizardLM-13b-v1.2,7.20,0.527,2023/7,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.2
zephyr-7b-beta,Zephyr-7b-beta,7.34,0.614,2023/10,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
mpt-30b-chat,MPT-30B-chat,6.39,0.504,2023/6,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-chat
vicuna-13b,Vicuna-13B,6.57,0.558,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5
qwen-14b-chat,Qwen-14B-Chat,6.96,0.665,2023/8,Qianwen LICENSE,Alibaba,https://huggingface.co/Qwen/Qwen-14B-Chat
zephyr-7b-alpha,Zephyr-7b-alpha,6.88,-,2023/10,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha
codellama-34b-instruct,CodeLlama-34B-instruct,-,0.537,2023/7,Llama 2 Community,Meta,https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf
falcon-180b-chat,falcon-180b-chat,-,0.680,2023/9,Falcon-180B TII License,TII,https://huggingface.co/tiiuae/falcon-180B-chat
guanaco-33b,Guanaco-33B,6.53,0.576,2023/5,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-33b-merged
llama-2-13b-chat,Llama-2-13b-chat,6.65,0.536,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-13b-chat-hf
mistral-7b-instruct,Mistral-7B-Instruct-v0.1,6.84,0.554,2023/9,Apache 2.0,Mistral,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
pplx-7b-online,pplx-7b-online,-,-,Online,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
llama-2-7b-chat,Llama-2-7b-chat,6.27,0.458,2023/7,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
vicuna-7b,Vicuna-7B,6.17,0.498,2023/7,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5
palm-2,PaLM-Chat-Bison-001,6.40,-,2021/6,Proprietary,Google,https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models#foundation_models
koala-13b,Koala-13B,5.35,0.447,2023/4,Non-commercial,UC Berkeley,https://bair.berkeley.edu/blog/2023/04/03/koala/
chatglm3-6b,ChatGLM3-6B,-,-,2023/10,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm3-6b
gpt4all-13b-snoozy,GPT4All-13B-Snoozy,5.41,0.430,2023/3,Non-commercial,Nomic AI,https://huggingface.co/nomic-ai/gpt4all-13b-snoozy
mpt-7b-chat,MPT-7B-Chat,5.42,0.320,2023/5,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-7b-chat
chatglm2-6b,ChatGLM2-6B,4.96,0.455,2023/6,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm2-6b
RWKV-4-Raven-14B,RWKV-4-Raven-14B,3.98,0.256,2023/4,Apache 2.0,RWKV,https://huggingface.co/BlinkDL/rwkv-4-raven
alpaca-13b,Alpaca-13B,4.53,0.481,2023/3,Non-commercial,Stanford,https://crfm.stanford.edu/2023/03/13/alpaca.html
oasst-pythia-12b,OpenAssistant-Pythia-12B,4.32,0.270,2023/4,Apache 2.0,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5
chatglm-6b,ChatGLM-6B,4.50,0.361,2023/3,Non-commercial,Tsinghua,https://huggingface.co/THUDM/chatglm-6b
fastchat-t5-3b,FastChat-T5-3B,3.04,0.477,2023/4,Apache 2.0,LMSYS,https://huggingface.co/lmsys/fastchat-t5-3b-v1.0
stablelm-tuned-alpha-7b,StableLM-Tuned-Alpha-7B,2.75,0.244,2023/4,CC-BY-NC-SA-4.0,Stability AI,https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b
dolly-v2-12b,Dolly-V2-12B,3.28,0.257,2023/4,MIT,Databricks,https://huggingface.co/databricks/dolly-v2-12b
llama-13b,LLaMA-13B,2.61,0.470,2023/2,Non-commercial,Meta,https://arxiv.org/abs/2302.13971
mistral-medium,Mistral Medium,8.61,0.753,-,Proprietary,Mistral,https://mistral.ai/news/la-plateforme/
llama2-70b-steerlm-chat,NV-Llama2-70B-SteerLM-Chat,7.54,0.685,2023/11,Llama 2 Community,Nvidia,https://huggingface.co/nvidia/Llama2-70B-SteerLM-Chat
stripedhyena-nous-7b,StripedHyena-Nous-7B,-,-,2023/12,Apache 2.0,Together AI,https://huggingface.co/togethercomputer/StripedHyena-Nous-7B
deepseek-llm-67b-chat,DeepSeek-LLM-67B-Chat,-,0.713,2023/11,DeepSeek License,DeepSeek AI,https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat
gpt-4-0125-preview,GPT-4-0125-preview,-,-,2023/12,Proprietary,OpenAI,https://openai.com/blog/new-models-and-developer-products-announced-at-devday
qwen1.5-72b-chat,Qwen1.5-72B-Chat,8.61,0.775,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
qwen1.5-7b-chat,Qwen1.5-7B-Chat,7.6,0.610,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
qwen1.5-4b-chat,Qwen1.5-4B-Chat,-,0.561,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
openchat-3.5-0106,OpenChat-3.5-0106,7.8,0.658,2024/1,Apache-2.0,OpenChat,https://huggingface.co/openchat/openchat-3.5-0106
nous-hermes-2-mixtral-8x7b-dpo,Nous-Hermes-2-Mixtral-8x7B-DPO,-,-,2024/1,Apache-2.0,NousResearch,https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO
gpt-3.5-turbo-0125,GPT-3.5-Turbo-0125,-,-,2021/9,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5-turbo
mistral-next,Mistral-Next,-,-,-,Proprietary,Mistral,https://chat.mistral.ai/chat
mistral-large-2402,Mistral-Large-2402,-,0.812,-,Proprietary,Mistral,https://mistral.ai/news/mistral-large/
gemma-7b-it,Gemma-7B-it,-,0.643,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-7b-it
gemma-2b-it,Gemma-2B-it,-,0.423,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-2b-it
mistral-7b-instruct-v0.2,Mistral-7B-Instruct-v0.2,7.6,-,2023/12,Apache-2.0,Mistral,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
claude-3-sonnet-20240229,Claude 3 Sonnet,-,0.790,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
claude-3-opus-20240229,Claude 3 Opus,-,0.868,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
codellama-70b-instruct,CodeLlama-70B-instruct,-,-,2024/1,Llama 2 Community,Meta,https://huggingface.co/codellama/CodeLlama-70b-hf
olmo-7b-instruct,OLMo-7B-instruct,-,-,2024/2,Apache-2.0,Allen AI,https://huggingface.co/allenai/OLMo-7B-Instruct
claude-3-haiku-20240307,Claude 3 Haiku,-,0.752,2023/8,Proprietary,Anthropic,https://www.anthropic.com/news/claude-3-family
starling-lm-7b-beta,Starling-LM-7B-beta,8.12,-,2024/3,Apache-2.0,Nexusflow,https://huggingface.co/Nexusflow/Starling-LM-7B-beta
command-r,Command R,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r
qwen1.5-14b-chat,Qwen1.5-14B-Chat,7.91,0.676,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5/
qwen1.5-32b-chat,Qwen1.5-32B-Chat,8.30,0.734,2024/2,Qianwen LICENSE,Alibaba,https://qwenlm.github.io/blog/qwen1.5-32b/
command-r-plus,Command R+,-,-,2024/3,CC-BY-NC-4.0,Cohere,https://txt.cohere.com/command-r-plus-microsoft-azure/
gemma-1.1-7b-it,Gemma-1.1-7B-it,-,0.643,2024/2,Gemma license,Google,https://huggingface.co/google/gemma-1.1-7b-it
dbrx-instruct-preview,DBRX-Instruct-Preview,-,0.737,2023/12,DBRX LICENSE,Databricks,https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm
gpt-4-turbo-2024-04-09,GPT-4-Turbo-2024-04-09,-,-,2023/12,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4
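The added snapshot above keeps the same `key,Model,MT-bench (score),MMLU,Knowledge cutoff date,License,Organization,Link` schema as the earlier leaderboard tables, with `-` marking unreported scores. A minimal sketch of how such a snapshot could be inspected with pandas; the file path and the column handling are illustrative assumptions, not something this PR adds:

```python
import pandas as pd

# Load the newest leaderboard snapshot (path assumed relative to the Space root).
df = pd.read_csv("leaderboard_table_20240411.csv")

# "-" marks a missing benchmark score; coerce to NaN so numeric sorting works.
df["MMLU"] = pd.to_numeric(df["MMLU"], errors="coerce")
df["MT-bench (score)"] = pd.to_numeric(df["MT-bench (score)"], errors="coerce")

# Show the ten models with the highest reported MMLU.
print(df.sort_values("MMLU", ascending=False).head(10)[["Model", "MMLU", "Organization"]])
```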
theme.json
ADDED
@@ -0,0 +1 @@
{"theme": {"_font": [{"__gradio_font__": true, "name": "Rubik", "class": "google"}, {"__gradio_font__": true, "name": "ui-sans-serif", "class": "font"}, {"__gradio_font__": true, "name": "system-ui", "class": "font"}, {"__gradio_font__": true, "name": "sans-serif", "class": "font"}], "_font_mono": [{"__gradio_font__": true, "name": "Inconsolata", "class": "google"}, {"__gradio_font__": true, "name": "ui-monospace", "class": "font"}, {"__gradio_font__": true, "name": "Consolas", "class": "font"}, {"__gradio_font__": true, "name": "monospace", "class": "font"}], "_stylesheets": ["https://fonts.googleapis.com/css2?family=Rubik:wght@400;500&display=swap", "https://fonts.googleapis.com/css2?family=Inconsolata:wght@400;500&display=swap"], "text_size": "20px", "background_fill_primary": "white", "background_fill_primary_dark": "*neutral_950", "background_fill_secondary": "*neutral_50", "background_fill_secondary_dark": "*neutral_900", "block_background_fill": "*background_fill_primary", "block_background_fill_dark": "*neutral_800", "block_border_color": "*border_color_primary", "block_border_color_dark": "*border_color_primary", "block_border_width": "1px", "block_border_width_dark": "1px", "block_info_text_color": "*body_text_color_subdued", "block_info_text_color_dark": "*body_text_color_subdued", "block_info_text_size": "*text_sm", "block_info_text_weight": "400", "block_label_background_fill": "*background_fill_primary", "block_label_background_fill_dark": "*background_fill_secondary", "block_label_border_color": "*border_color_primary", "block_label_border_color_dark": "*border_color_primary", "block_label_border_width": "1px", "block_label_border_width_dark": "1px", "block_label_margin": "0", "block_label_padding": "*spacing_sm *spacing_lg", "block_label_radius": "calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px) 0", "block_label_right_radius": "0 calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px)", "block_label_shadow": "*block_shadow", "block_label_text_color": "*neutral_500", "block_label_text_color_dark": "*neutral_200", "block_label_text_size": "*text_sm", "block_label_text_weight": "400", "block_padding": "*spacing_xl calc(*spacing_xl + 2px)", "block_radius": "*radius_lg", "block_shadow": "none", "block_shadow_dark": "none", "block_title_background_fill": "none", "block_title_background_fill_dark": "none", "block_title_border_color": "none", "block_title_border_color_dark": "none", "block_title_border_width": "0px", "block_title_border_width_dark": "0px", "block_title_padding": "0", "block_title_radius": "none", "block_title_text_color": "*neutral_500", "block_title_text_color_dark": "*neutral_200", "block_title_text_size": "*text_md", "block_title_text_weight": "400", "body_background_fill": "*background_fill_primary", "body_background_fill_dark": "*background_fill_primary", "body_text_color": "*neutral_700", "body_text_color_dark": "*neutral_200", "body_text_color_subdued": "*neutral_400", "body_text_color_subdued_dark": "*neutral_500", "body_text_size": "*text_md", "body_text_weight": "400", "border_color_accent": "*primary_300", "border_color_accent_dark": "*neutral_600", "border_color_primary": "*neutral_200", "border_color_primary_dark": "*neutral_700", "button_border_width": "*input_border_width", "button_border_width_dark": "*input_border_width", "button_cancel_background_fill": "*button_secondary_background_fill", "button_cancel_background_fill_dark": "*button_secondary_background_fill", "button_cancel_background_fill_hover": "*button_cancel_background_fill", 
"button_cancel_background_fill_hover_dark": "*button_cancel_background_fill", "button_cancel_border_color": "*button_secondary_border_color", "button_cancel_border_color_dark": "*button_secondary_border_color", "button_cancel_border_color_hover": "*button_cancel_border_color", "button_cancel_border_color_hover_dark": "*button_cancel_border_color", "button_cancel_text_color": "*button_secondary_text_color", "button_cancel_text_color_dark": "*button_secondary_text_color", "button_cancel_text_color_hover": "*button_cancel_text_color", "button_cancel_text_color_hover_dark": "*button_cancel_text_color", "button_large_padding": "*spacing_lg calc(2 * *spacing_lg)", "button_large_radius": "*radius_lg", "button_large_text_size": "*text_lg", "button_large_text_weight": "500", "button_primary_background_fill": "*primary_200", "button_primary_background_fill_dark": "*primary_700", "button_primary_background_fill_hover": "*button_primary_background_fill", "button_primary_background_fill_hover_dark": "*button_primary_background_fill", "button_primary_border_color": "*primary_200", "button_primary_border_color_dark": "*primary_600", "button_primary_border_color_hover": "*button_primary_border_color", "button_primary_border_color_hover_dark": "*button_primary_border_color", "button_primary_text_color": "*primary_600", "button_primary_text_color_dark": "white", "button_primary_text_color_hover": "*button_primary_text_color", "button_primary_text_color_hover_dark": "*button_primary_text_color", "button_secondary_background_fill": "*neutral_200", "button_secondary_background_fill_dark": "*neutral_600", "button_secondary_background_fill_hover": "*neutral_300", "button_secondary_background_fill_hover_dark": "*neutral_500", "button_secondary_border_color": "*neutral_200", "button_secondary_border_color_dark": "*neutral_600", "button_secondary_border_color_hover": "*button_secondary_border_color", "button_secondary_border_color_hover_dark": "*button_secondary_border_color", "button_secondary_text_color": "*neutral_700", "button_secondary_text_color_dark": "white", "button_secondary_text_color_hover": "*button_secondary_text_color", "button_secondary_text_color_hover_dark": "*button_secondary_text_color", "button_shadow": "none", "button_shadow_active": "none", "button_shadow_hover": "none", "button_small_padding": "*spacing_sm calc(2 * *spacing_sm)", "button_small_radius": "*radius_lg", "button_small_text_size": "*text_md", "button_small_text_weight": "400", "button_transition": "background-color 0.2s ease", "checkbox_background_color": "*background_fill_primary", "checkbox_background_color_dark": "*neutral_800", "checkbox_background_color_focus": "*checkbox_background_color", "checkbox_background_color_focus_dark": "*checkbox_background_color", "checkbox_background_color_hover": "*checkbox_background_color", "checkbox_background_color_hover_dark": "*checkbox_background_color", "checkbox_background_color_selected": "*secondary_600", "checkbox_background_color_selected_dark": "*secondary_600", "checkbox_border_color": "*neutral_300", "checkbox_border_color_dark": "*neutral_700", "checkbox_border_color_focus": "*secondary_500", "checkbox_border_color_focus_dark": "*secondary_500", "checkbox_border_color_hover": "*neutral_300", "checkbox_border_color_hover_dark": "*neutral_600", "checkbox_border_color_selected": "*secondary_600", "checkbox_border_color_selected_dark": "*secondary_600", "checkbox_border_radius": "*radius_sm", "checkbox_border_width": "*input_border_width", "checkbox_border_width_dark": 
"*input_border_width", "checkbox_check": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e\")", "checkbox_label_background_fill": "*button_secondary_background_fill", "checkbox_label_background_fill_dark": "*button_secondary_background_fill", "checkbox_label_background_fill_hover": "*button_secondary_background_fill_hover", "checkbox_label_background_fill_hover_dark": "*button_secondary_background_fill_hover", "checkbox_label_background_fill_selected": "*checkbox_label_background_fill", "checkbox_label_background_fill_selected_dark": "*checkbox_label_background_fill", "checkbox_label_border_color": "*border_color_primary", "checkbox_label_border_color_dark": "*border_color_primary", "checkbox_label_border_color_hover": "*checkbox_label_border_color", "checkbox_label_border_color_hover_dark": "*checkbox_label_border_color", "checkbox_label_border_width": "*input_border_width", "checkbox_label_border_width_dark": "*input_border_width", "checkbox_label_gap": "*spacing_lg", "checkbox_label_padding": "*spacing_md calc(2 * *spacing_md)", "checkbox_label_shadow": "none", "checkbox_label_text_color": "*body_text_color", "checkbox_label_text_color_dark": "*body_text_color", "checkbox_label_text_color_selected": "*checkbox_label_text_color", "checkbox_label_text_color_selected_dark": "*checkbox_label_text_color", "checkbox_label_text_size": "*text_md", "checkbox_label_text_weight": "400", "checkbox_shadow": "*input_shadow", "color_accent": "*primary_500", "color_accent_soft": "*primary_50", "color_accent_soft_dark": "*neutral_700", "container_radius": "*radius_lg", "embed_radius": "*radius_md", "error_background_fill": "#fee2e2", "error_background_fill_dark": "*background_fill_primary", "error_border_color": "#fecaca", "error_border_color_dark": "*border_color_primary", "error_border_width": "1px", "error_border_width_dark": "1px", "error_text_color": "#ef4444", "error_text_color_dark": "#ef4444", "font": "'Rubik', 'ui-sans-serif', 'system-ui', sans-serif", "font_mono": "'Inconsolata', 'ui-monospace', 'Consolas', monospace", "form_gap_width": "0px", "input_background_fill": "*neutral_100", "input_background_fill_dark": "*neutral_700", "input_background_fill_focus": "*secondary_500", "input_background_fill_focus_dark": "*secondary_600", "input_background_fill_hover": "*input_background_fill", "input_background_fill_hover_dark": "*input_background_fill", "input_border_color": "*border_color_primary", "input_border_color_dark": "*border_color_primary", "input_border_color_focus": "*secondary_300", "input_border_color_focus_dark": "*neutral_700", "input_border_color_hover": "*input_border_color", "input_border_color_hover_dark": "*input_border_color", "input_border_width": "0px", "input_border_width_dark": "0px", "input_padding": "*spacing_xl", "input_placeholder_color": "*neutral_400", "input_placeholder_color_dark": "*neutral_500", "input_radius": "*radius_lg", "input_shadow": "none", "input_shadow_dark": "none", "input_shadow_focus": "*input_shadow", "input_shadow_focus_dark": "*input_shadow", "input_text_size": "*text_md", "input_text_weight": "400", "layout_gap": "*spacing_xxl", "link_text_color": "*secondary_600", "link_text_color_active": "*secondary_600", "link_text_color_active_dark": "*secondary_500", "link_text_color_dark": "*secondary_500", "link_text_color_hover": "*secondary_700", 
"link_text_color_hover_dark": "*secondary_400", "link_text_color_visited": "*secondary_500", "link_text_color_visited_dark": "*secondary_600", "loader_color": "*color_accent", "loader_color_dark": "*color_accent", "name": "base", "neutral_100": "#f5f5f4", "neutral_200": "#e7e5e4", "neutral_300": "#d6d3d1", "neutral_400": "#a8a29e", "neutral_50": "#fafaf9", "neutral_500": "#78716c", "neutral_600": "#57534e", "neutral_700": "#44403c", "neutral_800": "#292524", "neutral_900": "#1c1917", "neutral_950": "#0f0e0d", "panel_background_fill": "*background_fill_secondary", "panel_background_fill_dark": "*background_fill_secondary", "panel_border_color": "*border_color_primary", "panel_border_color_dark": "*border_color_primary", "panel_border_width": "0", "panel_border_width_dark": "0", "primary_100": "#e0f2fe", "primary_200": "#bae6fd", "primary_300": "#7dd3fc", "primary_400": "#38bdf8", "primary_50": "#f0f9ff", "primary_500": "#0ea5e9", "primary_600": "#0284c7", "primary_700": "#0369a1", "primary_800": "#075985", "primary_900": "#0c4a6e", "primary_950": "#0b4165", "prose_header_text_weight": "500", "prose_text_size": "*text_md", "prose_text_weight": "400", "radio_circle": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e\")", "radius_lg": "3px", "radius_md": "3px", "radius_sm": "3px", "radius_xl": "3px", "radius_xs": "3px", "radius_xxl": "3px", "radius_xxs": "3px", "secondary_100": "#e0f2fe", "secondary_200": "#bae6fd", "secondary_300": "#7dd3fc", "secondary_400": "#38bdf8", "secondary_50": "#f0f9ff", "secondary_500": "#0ea5e9", "secondary_600": "#0284c7", "secondary_700": "#0369a1", "secondary_800": "#075985", "secondary_900": "#0c4a6e", "secondary_950": "#0b4165", "section_header_text_size": "*text_md", "section_header_text_weight": "400", "shadow_drop": "rgba(0,0,0,0.05) 0px 1px 2px 0px", "shadow_drop_lg": "0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1)", "shadow_inset": "rgba(0,0,0,0.05) 0px 2px 4px 0px inset", "shadow_spread": "3px", "shadow_spread_dark": "1px", "slider_color": "*primary_600", "slider_color_dark": "*primary_600", "spacing_lg": "8px", "spacing_md": "6px", "spacing_sm": "4px", "spacing_xl": "10px", "spacing_xs": "2px", "spacing_xxl": "16px", "spacing_xxs": "1px", "stat_background_fill": "*primary_300", "stat_background_fill_dark": "*primary_500", "table_border_color": "*neutral_300", "table_border_color_dark": "*neutral_700", "table_even_background_fill": "white", "table_even_background_fill_dark": "*neutral_950", "table_odd_background_fill": "*neutral_50", "table_odd_background_fill_dark": "*neutral_900", "table_radius": "*radius_lg", "table_row_focus": "*color_accent_soft", "table_row_focus_dark": "*color_accent_soft", "text_lg": "20px", "text_md": "16px", "text_sm": "14px", "text_xl": "24px", "text_xs": "12px", "text_xxl": "28px", "text_xxs": "10px"}, "version": "0.0.1"}
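The new `theme.json` is a single-line Gradio theme dump (Rubik/Inconsolata fonts, 3px radii, a sky-blue primary palette). A minimal sketch of reusing it, assuming the file sits next to the app and that Gradio's `gr.themes.Base.load` helper is available for reading theme dumps; the PR itself only ships the file:

```python
import json

import gradio as gr

# Peek at a few of the overridden design tokens straight from the JSON dump.
with open("theme.json") as f:
    theme_spec = json.load(f)["theme"]
print(theme_spec["font"], theme_spec["text_size"], theme_spec["radius_lg"])

# Load the same file as a theme object and attach it to a Blocks demo.
theme = gr.themes.Base.load("theme.json")
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("Leaderboard styled with the bundled theme.json")
```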