weichiang committed on
Commit
1edf6fb
•
1 Parent(s): 5cc2304
Files changed (3)
  1. app.py +196 -75
  2. elo_results_20240109.pkl +3 -0
  3. leaderboard_table_20240109.csv +69 -0
app.py CHANGED
@@ -6,6 +6,7 @@ import pickle
 
 import gradio as gr
 import numpy as np
+import pandas as pd
 
 
 # notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
@@ -16,17 +17,41 @@ basic_component_values = [None] * 6
 leader_component_values = [None] * 5
 
 
-def make_leaderboard_md(elo_results):
+def make_default_md(arena_df, elo_results):
+    total_votes = sum(arena_df["num_battles"]) // 2
+    total_models = len(arena_df)
+
     leaderboard_md = f"""
-# Leaderboard
-| [Vote](https://chat.lmsys.org/?arena) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
+# 🏆 LMSYS Chatbot Arena Leaderboard
+| [Vote](https://chat.lmsys.org) | [Blog](https://lmsys.org/blog/2023-05-03-arena/) | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) | [Twitter](https://twitter.com/lmsysorg) | [Discord](https://discord.gg/HSWAKCrnFx) |
+
+LMSYS [Chatbot Arena](https://lmsys.org/blog/2023-05-03-arena/) is a crowdsourced open platform for LLM evals.
+We've collected over **200,000** human preference votes to rank LLMs with the Elo ranking system.
+"""
+    return leaderboard_md
+
 
-🏆 This leaderboard is based on the following three benchmarks.
-- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 130K+ user votes to compute Elo ratings.
-- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
-- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
+def make_arena_leaderboard_md(arena_df):
+    total_votes = sum(arena_df["num_battles"]) // 2
+    total_models = len(arena_df)
 
-💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Dec 20, 2023.
+    leaderboard_md = f"""
+Total #models: **{total_models}**. Total #votes: **{total_votes}**. Last updated: Jan 9, 2024.
+
+Contribute your vote 🗳️ at [chat.lmsys.org](https://chat.lmsys.org)! Find more analysis in the [notebook]({notebook_url}).
+"""
+    return leaderboard_md
+
+
+def make_full_leaderboard_md(elo_results):
+    leaderboard_md = f"""
+Two more benchmarks are displayed: **MT-Bench** and **MMLU**.
+- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
+- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
+
+💻 Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
+The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval).
+Higher values are better for all benchmarks. Empty cells mean not available.
 """
     return leaderboard_md
 
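The new `make_default_md` copy says the ranking comes from pairwise human preference votes scored with the Elo system. For readers unfamiliar with that system, here is a minimal, illustrative sketch of sequential Elo updates over battle records; it is not this Space's actual computation (the real ratings come from the linked Colab notebook), and the record format (`model_a`, `model_b`, `winner`) is an assumption made for the example.

```python
# Illustrative only: sequential Elo updates over pairwise battle records.
from collections import defaultdict

def compute_elo_sketch(battles, k=4, scale=400, base=10, init_rating=1000):
    rating = defaultdict(lambda: init_rating)
    for model_a, model_b, winner in battles:
        ra, rb = rating[model_a], rating[model_b]
        ea = 1 / (1 + base ** ((rb - ra) / scale))  # expected score for model_a
        if winner == "model_a":
            sa = 1.0
        elif winner == "model_b":
            sa = 0.0
        else:  # tie
            sa = 0.5
        rating[model_a] = ra + k * (sa - ea)
        rating[model_b] = rb + k * (ea - sa)
    return dict(rating)

# toy usage with made-up battles
print(compute_elo_sketch([
    ("gpt-4-0613", "vicuna-13b", "model_a"),
    ("gpt-4-0613", "vicuna-13b", "tie"),
]))
```

The confidence intervals shown in the Arena tab come from recomputing ratings on resampled battles, which is what Figure 3 ("Bootstrap of Elo Estimates") visualizes.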
 
@@ -152,94 +177,190 @@ def build_basic_stats_tab():
     md4 = gr.Markdown(empty)
     return [md0, plot_1, md1, md2, md3, md4]
 
+def get_full_table(arena_df, model_table_df):
+    values = []
+    for i in range(len(model_table_df)):
+        row = []
+        model_key = model_table_df.iloc[i]["key"]
+        model_name = model_table_df.iloc[i]["Model"]
+        # model display name
+        row.append(model_name)
+        if model_key in arena_df.index:
+            idx = arena_df.index.get_loc(model_key)
+            row.append(round(arena_df.iloc[idx]["rating"], 1))
+        else:
+            row.append(np.nan)
+        row.append(model_table_df.iloc[i]["MT-bench (score)"])
+        row.append(model_table_df.iloc[i]["MMLU"])
+        # Organization
+        row.append(model_table_df.iloc[i]["Organization"])
+        # license
+        row.append(model_table_df.iloc[i]["License"])
+
+        values.append(row)
+    values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
+    return values
+
+
+def get_arena_table(arena_df, model_table_df):
+    # sort by rating
+    arena_df = arena_df.sort_values(by=["rating"], ascending=False)
+    values = []
+    for i in range(len(arena_df)):
+        row = []
+        model_key = arena_df.index[i]
+        model_name = model_table_df[model_table_df["key"] == model_key]["Model"].values[
+            0
+        ]
 
+        # rank
+        row.append(i + 1)
+        # model display name
+        row.append(model_name)
+        # elo rating
+        row.append(round(arena_df.iloc[i]["rating"], 1))
+        upper_diff = round(
+            arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"], 1
+        )
+        lower_diff = round(
+            arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"], 1
+        )
+        row.append(f"+{upper_diff}/-{lower_diff}")
+        # num battles
+        row.append(round(arena_df.iloc[i]["num_battles"]))
+        # Organization
+        row.append(
+            model_table_df[model_table_df["key"] == model_key]["Organization"].values[0]
+        )
+        # license
+        row.append(
+            model_table_df[model_table_df["key"] == model_key]["License"].values[0]
+        )
+
+        values.append(row)
+    return values
 
-def build_leaderboard_tab(elo_results_file, leaderboard_table_file):
+def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
     if elo_results_file is None:  # Do live update
-        md = "Loading ..."
+        default_md = "Loading ..."
         p1 = p2 = p3 = p4 = None
     else:
         with open(elo_results_file, "rb") as fin:
             elo_results = pickle.load(fin)
 
-        md = make_leaderboard_md(elo_results)
         p1 = elo_results["win_fraction_heatmap"]
         p2 = elo_results["battle_count_heatmap"]
         p3 = elo_results["bootstrap_elo_rating"]
         p4 = elo_results["average_win_rate_bar"]
+        arena_df = elo_results["leaderboard_table_df"]
+        default_md = make_default_md(arena_df, elo_results)
 
-    md_1 = gr.Markdown(md, elem_id="leaderboard_markdown")
-
+    md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
     if leaderboard_table_file:
         data = load_leaderboard_table_csv(leaderboard_table_file)
-        headers = [
-            "Model",
-            "Arena Elo rating",
-            "MT-bench (score)",
-            "MMLU",
-            "License",
-        ]
-        values = []
-        for item in data:
-            row = []
-            for key in headers:
-                value = item[key]
-                row.append(value)
-            values.append(row)
-        values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
-
-        headers[1] = "⭐ " + headers[1]
-        headers[2] = "📈 " + headers[2]
-
-        gr.Dataframe(
-            headers=headers,
-            datatype=["markdown", "number", "number", "number", "str"],
-            value=values,
-            elem_id="leaderboard_dataframe",
-        )
-        gr.Markdown(
-            "If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).",
-            elem_id="leaderboard_markdown"
-        )
+        model_table_df = pd.DataFrame(data)
+
+        with gr.Tabs() as tabs:
+            # arena table
+            arena_table_vals = get_arena_table(arena_df, model_table_df)
+            with gr.Tab("Arena Elo", id=0):
+                md = make_arena_leaderboard_md(arena_df)
+                gr.Markdown(md, elem_id="leaderboard_markdown")
+                gr.Dataframe(
+                    headers=[
+                        "Rank",
+                        "🤖 Model",
+                        "⭐ Arena Elo",
+                        "📊 95% CI",
+                        "🗳️ Votes",
+                        "Organization",
+                        "License",
+                    ],
+                    datatype=[
+                        "str",
+                        "markdown",
+                        "number",
+                        "str",
+                        "number",
+                        "str",
+                        "str",
+                    ],
+                    value=arena_table_vals,
+                    elem_id="arena_leaderboard_dataframe",
+                    height=700,
+                    column_widths=[50, 200, 100, 100, 100, 150, 150],
+                    wrap=True,
+                )
+            with gr.Tab("Full Leaderboard", id=1):
+                md = make_full_leaderboard_md(elo_results)
+                gr.Markdown(md, elem_id="leaderboard_markdown")
+                full_table_vals = get_full_table(arena_df, model_table_df)
+                gr.Dataframe(
+                    headers=[
+                        "🤖 Model",
+                        "⭐ Arena Elo",
+                        "📈 MT-bench",
+                        "📚 MMLU",
+                        "Organization",
+                        "License",
+                    ],
+                    datatype=["markdown", "number", "number", "number", "str", "str"],
+                    value=full_table_vals,
+                    elem_id="full_leaderboard_dataframe",
+                    column_widths=[200, 100, 100, 100, 150, 150],
+                    height=700,
+                    wrap=True,
+                )
+        if not show_plot:
+            gr.Markdown(
+                """ ## Visit our [HF space](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard) for more analysis!
+                If you want to see more models, please help us [add them](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
+                """,
+                elem_id="leaderboard_markdown",
+            )
     else:
         pass
 
-    gr.Markdown(
-        f"""## More Statistics for Chatbot Arena\n
-We added some additional figures to show more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
-Please note that you may see different orders from different ranking methods. This is expected for models that perform similarly, as demonstrated by the confidence interval in the bootstrap figure. Going forward, we prefer the classical Elo calculation because of its scalability and interpretability. You can find more discussions in this blog [post](https://lmsys.org/blog/2023-05-03-arena/).
-""",
-        elem_id="leaderboard_markdown"
-    )
-
-    leader_component_values[:] = [md, p1, p2, p3, p4]
+    leader_component_values[:] = [default_md, p1, p2, p3, p4]
 
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown(
-                "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
-            )
-            plot_1 = gr.Plot(p1, show_label=False)
-        with gr.Column():
-            gr.Markdown(
-                "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
-            )
-            plot_2 = gr.Plot(p2, show_label=False)
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown(
-                "#### Figure 3: Bootstrap of MLE Elo Estimates (1000 Rounds of Random Sampling)"
-            )
-            plot_3 = gr.Plot(p3, show_label=False)
-        with gr.Column():
-            gr.Markdown(
-                "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
-            )
-            plot_4 = gr.Plot(p4, show_label=False)
+    if show_plot:
+        gr.Markdown(
+            f"""## More Statistics for Chatbot Arena\n
+Below are figures for more statistics. The code for generating them is also included in this [notebook]({notebook_url}).
+You can find more discussions in this blog [post](https://lmsys.org/blog/2023-12-07-leaderboard/).
+""",
+            elem_id="leaderboard_markdown"
+        )
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown(
+                    "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
+                )
+                plot_1 = gr.Plot(p1, show_label=False)
+            with gr.Column():
+                gr.Markdown(
+                    "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
+                )
+                plot_2 = gr.Plot(p2, show_label=False)
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown(
+                    "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
+                )
+                plot_3 = gr.Plot(p3, show_label=False)
+            with gr.Column():
+                gr.Markdown(
+                    "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
+                )
+                plot_4 = gr.Plot(p4, show_label=False)
+
+    from fastchat.serve.gradio_web_server import acknowledgment_md
 
     gr.Markdown(acknowledgment_md)
 
-    return [md_1, plot_1, plot_2, plot_3, plot_4]
+    if show_plot:
+        return [md_1, plot_1, plot_2, plot_3, plot_4]
+    return [md_1]
 
 block_css = """
 #notice_markdown {
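`get_arena_table` assumes that `arena_df` (taken from `elo_results["leaderboard_table_df"]`) is a pandas DataFrame indexed by model key with `rating`, `rating_q025`, `rating_q975`, and `num_battles` columns, and it renders the 95% CI as the distance from the rating to the 2.5%/97.5% quantiles. Below is a small sketch with invented ratings that shows the expected shapes; the model metadata values are copied from the leaderboard_table_20240109.csv added in this commit, and the function is assumed to be in scope from app.py above.

```python
# Mock inputs illustrating the shapes get_arena_table()/get_full_table() expect.
# Ratings, quantiles, and vote counts below are invented for demonstration.
import pandas as pd

arena_df = pd.DataFrame(
    {
        "rating": [1243.0, 1192.0],
        "rating_q025": [1238.2, 1185.4],
        "rating_q975": [1247.9, 1198.1],
        "num_battles": [30000, 25000],
    },
    index=["gpt-4-turbo", "mixtral-8x7b-instruct-v0.1"],
)
model_table_df = pd.DataFrame(
    {
        "key": ["gpt-4-turbo", "mixtral-8x7b-instruct-v0.1"],
        "Model": ["GPT-4-Turbo", "Mixtral-8x7b-Instruct-v0.1"],
        "MT-bench (score)": [9.32, 8.30],
        "MMLU": [float("nan"), 0.706],
        "Organization": ["OpenAI", "Mistral"],
        "License": ["Proprietary", "Apache 2.0"],
    }
)

# Each arena row becomes: rank, name, rating, "+upper/-lower" CI, votes, org, license.
print(get_arena_table(arena_df, model_table_df)[0])
# e.g. [1, 'GPT-4-Turbo', 1243.0, '+4.9/-4.8', 30000, 'OpenAI', 'Proprietary']
```

The same `model_table_df` feeds `get_full_table`, which falls back to `np.nan` for models that do not appear in `arena_df`.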
@@ -300,7 +421,7 @@ def build_demo(elo_results_file, leaderboard_table_file):
         css=block_css,
     ) as demo:
         leader_components = build_leaderboard_tab(
-            elo_results_file, leaderboard_table_file
+            elo_results_file, leaderboard_table_file, show_plot=True
        )
     return demo
 
 
elo_results_20240109.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a334a1a5000f62dd9491d6fb2c7b136cce3fd37647ffec0e9c0c084919783ea
+size 264666
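`elo_results_20240109.pkl` is stored through Git LFS, so the diff only shows the pointer above. Judging from the keys read in `build_leaderboard_tab`, the pickle holds the four plot objects plus `leaderboard_table_df`. A hedged sketch of loading it locally and wiring both new files into the demo follows; the `demo.launch()` call and the local filenames are assumptions about local use, not this Space's documented entry point, and the functions from app.py above are assumed to be importable.

```python
# Sketch: inspect the pickled Elo results and launch the leaderboard locally.
import pickle

with open("elo_results_20240109.pkl", "rb") as fin:
    elo_results = pickle.load(fin)

# Keys used by build_leaderboard_tab(): win_fraction_heatmap, battle_count_heatmap,
# bootstrap_elo_rating, average_win_rate_bar, leaderboard_table_df
print(sorted(elo_results.keys()))
print(elo_results["leaderboard_table_df"].head())

# build_demo() comes from app.py above.
demo = build_demo("elo_results_20240109.pkl", "leaderboard_table_20240109.csv")
demo.launch()
```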
leaderboard_table_20240109.csv ADDED
@@ -0,0 +1,69 @@
+key,Model,MT-bench (score),MMLU,License,Organization,Link
+wizardlm-30b,WizardLM-30B,7.01,0.587,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-30B-V1.0
+vicuna-13b-16k,Vicuna-13B-16k,6.92,0.545,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5-16k
+wizardlm-13b-v1.1,WizardLM-13B-v1.1,6.76,0.500,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.1
+tulu-30b,Tulu-30B,6.43,0.581,Non-commercial,AllenAI/UW,https://huggingface.co/allenai/tulu-30b
+guanaco-65b,Guanaco-65B,6.41,0.621,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-65b-merged
+openassistant-llama-30b,OpenAssistant-LLaMA-30B,6.41,0.560,Non-commercial,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-6-llama-30b-xor
+wizardlm-13b-v1.0,WizardLM-13B-v1.0,6.35,0.523,Non-commercial,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.0
+vicuna-7b-16k,Vicuna-7B-16k,6.22,0.485,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5-16k
+baize-v2-13b,Baize-v2-13B,5.75,0.489,Non-commercial,UCSD,https://huggingface.co/project-baize/baize-v2-13b
+xgen-7b-8k-inst,XGen-7B-8K-Inst,5.55,0.421,Non-commercial,Salesforce,https://huggingface.co/Salesforce/xgen-7b-8k-inst
+nous-hermes-13b,Nous-Hermes-13B,5.51,0.493,Non-commercial,NousResearch,https://huggingface.co/NousResearch/Nous-Hermes-13b
+mpt-30b-instruct,MPT-30B-Instruct,5.22,0.478,CC-BY-SA 3.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-instruct
+falcon-40b-instruct,Falcon-40B-Instruct,5.17,0.547,Apache 2.0,TII,https://huggingface.co/tiiuae/falcon-40b-instruct
+h2o-oasst-openllama-13b,H2O-Oasst-OpenLLaMA-13B,4.63,0.428,Apache 2.0,h2oai,https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-13b
+gpt-4-turbo,GPT-4-Turbo,9.32,-,Proprietary,OpenAI,https://openai.com/blog/new-models-and-developer-products-announced-at-devday
+gpt-4-0314,GPT-4-0314,8.96,0.864,Proprietary,OpenAI,https://openai.com/research/gpt-4
+claude-1,Claude-1,7.90,0.770,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
+gpt-4-0613,GPT-4-0613,9.18,-,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
+claude-2.0,Claude-2.0,8.06,0.785,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2
+claude-2.1,Claude-2.1,8.18,-,Proprietary,Anthropic,https://www.anthropic.com/index/claude-2-1
+gpt-3.5-turbo-0613,GPT-3.5-Turbo-0613,8.39,-,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
+mixtral-8x7b-instruct-v0.1,Mixtral-8x7b-Instruct-v0.1,8.30,0.706,Apache 2.0,Mistral,https://mistral.ai/news/mixtral-of-experts/
+claude-instant-1,Claude-Instant-1,7.85,0.734,Proprietary,Anthropic,https://www.anthropic.com/index/introducing-claude
+gpt-3.5-turbo-0314,GPT-3.5-Turbo-0314,7.94,0.700,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
+tulu-2-dpo-70b,Tulu-2-DPO-70B,7.89,-,AI2 ImpACT Low-risk,AllenAI/UW,https://huggingface.co/allenai/tulu-2-dpo-70b
+yi-34b-chat,Yi-34B-Chat,-,0.735,Yi License,01 AI,https://huggingface.co/01-ai/Yi-34B-Chat
+gemini-pro,Gemini Pro,-,0.718,Proprietary,Google,https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-multimodal
+gemini-pro-dev-api,Gemini Pro (Dev),-,0.718,Proprietary,Google,https://ai.google.dev/docs/gemini_api_overview
+wizardlm-70b,WizardLM-70B-v1.0,7.71,0.637,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-70B-V1.0
+vicuna-33b,Vicuna-33B,7.12,0.592,Non-commercial,LMSYS,https://huggingface.co/lmsys/vicuna-33b-v1.3
+starling-lm-7b-alpha,Starling-LM-7B-alpha,8.09,0.639,CC-BY-NC-4.0,UC Berkeley,https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha
+pplx-70b-online,pplx-70b-online,-,-,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
+openchat-3.5,OpenChat-3.5,7.81,0.643,Apache-2.0,OpenChat,https://huggingface.co/openchat/openchat_3.5
+openhermes-2.5-mistral-7b,OpenHermes-2.5-Mistral-7b,-,-,Apache-2.0,NousResearch,https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B
+gpt-3.5-turbo-1106,GPT-3.5-Turbo-1106,8.32,-,Proprietary,OpenAI,https://platform.openai.com/docs/models/gpt-3-5
+llama-2-70b-chat,Llama-2-70b-chat,6.86,0.630,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
+solar-10.7b-instruct-v1.0,SOLAR-10.7B-Instruct-v1.0,7.58,0.662,CC-BY-NC-4.0,Upstage AI,https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0
+dolphin-2.2.1-mistral-7b,Dolphin-2.2.1-Mistral-7B,-,-,Apache-2.0,Cognitive Computations,https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b
+wizardlm-13b,WizardLM-13b-v1.2,7.20,0.527,Llama 2 Community,Microsoft,https://huggingface.co/WizardLM/WizardLM-13B-V1.2
+zephyr-7b-beta,Zephyr-7b-beta,7.34,0.614,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
+mpt-30b-chat,MPT-30B-chat,6.39,0.504,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-30b-chat
+vicuna-13b,Vicuna-13B,6.57,0.558,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-13b-v1.5
+qwen-14b-chat,Qwen-14B-Chat,6.96,0.665,Qianwen LICENSE,Alibaba,https://huggingface.co/Qwen/Qwen-14B-Chat
+zephyr-7b-alpha,Zephyr-7b-alpha,6.88,-,MIT,HuggingFace,https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha
+codellama-34b-instruct,CodeLlama-34B-instruct,-,0.537,Llama 2 Community,Meta,https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf
+falcon-180b-chat,falcon-180b-chat,-,0.680,Falcon-180B TII License,TII,https://huggingface.co/tiiuae/falcon-180B-chat
+guanaco-33b,Guanaco-33B,6.53,0.576,Non-commercial,UW,https://huggingface.co/timdettmers/guanaco-33b-merged
+llama-2-13b-chat,Llama-2-13b-chat,6.65,0.536,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-13b-chat-hf
+mistral-7b-instruct,Mistral-7B-Instruct-v0.1,6.84,0.554,Apache 2.0,Mistral,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1
+pplx-7b-online,pplx-7b-online,-,-,Proprietary,Perplexity AI,https://blog.perplexity.ai/blog/introducing-pplx-online-llms
+llama-2-7b-chat,Llama-2-7b-chat,6.27,0.458,Llama 2 Community,Meta,https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+vicuna-7b,Vicuna-7B,6.17,0.498,Llama 2 Community,LMSYS,https://huggingface.co/lmsys/vicuna-7b-v1.5
+palm-2,PaLM-Chat-Bison-001,6.40,-,Proprietary,Google,https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models#foundation_models
+koala-13b,Koala-13B,5.35,0.447,Non-commercial,UC Berkeley,https://bair.berkeley.edu/blog/2023/04/03/koala/
+chatglm3-6b,ChatGLM3-6B,-,-,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm3-6b
+gpt4all-13b-snoozy,GPT4All-13B-Snoozy,5.41,0.430,Non-commercial,Nomic AI,https://huggingface.co/nomic-ai/gpt4all-13b-snoozy
+mpt-7b-chat,MPT-7B-Chat,5.42,0.320,CC-BY-NC-SA-4.0,MosaicML,https://huggingface.co/mosaicml/mpt-7b-chat
+chatglm2-6b,ChatGLM2-6B,4.96,0.455,Apache-2.0,Tsinghua,https://huggingface.co/THUDM/chatglm2-6b
+RWKV-4-Raven-14B,RWKV-4-Raven-14B,3.98,0.256,Apache 2.0,RWKV,https://huggingface.co/BlinkDL/rwkv-4-raven
+alpaca-13b,Alpaca-13B,4.53,0.481,Non-commercial,Stanford,https://crfm.stanford.edu/2023/03/13/alpaca.html
+oasst-pythia-12b,OpenAssistant-Pythia-12B,4.32,0.270,Apache 2.0,OpenAssistant,https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5
+chatglm-6b,ChatGLM-6B,4.50,0.361,Non-commercial,Tsinghua,https://huggingface.co/THUDM/chatglm-6b
+fastchat-t5-3b,FastChat-T5-3B,3.04,0.477,Apache 2.0,LMSYS,https://huggingface.co/lmsys/fastchat-t5-3b-v1.0
+stablelm-tuned-alpha-7b,StableLM-Tuned-Alpha-7B,2.75,0.244,CC-BY-NC-SA-4.0,Stability AI,https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b
+dolly-v2-12b,Dolly-V2-12B,3.28,0.257,MIT,Databricks,https://huggingface.co/databricks/dolly-v2-12b
+llama-13b,LLaMA-13B,2.61,0.470,Non-commercial,Meta,https://arxiv.org/abs/2302.13971
+mistral-medium,Mistral Medium,8.61,0.753,Proprietary,Mistral,https://mistral.ai/news/la-plateforme/
+llama2-70b-steerlm-chat,Llama2-70B-SteerLM-Chat,7.54,-,Llama 2 Community,Nvidia,https://huggingface.co/nvidia/Llama2-70B-SteerLM-Chat
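The CSV carries one row per model with `key,Model,MT-bench (score),MMLU,License,Organization,Link` columns; `build_leaderboard_tab` wraps the loaded rows in `pd.DataFrame(data)` and joins them to `arena_df` on `key`. `load_leaderboard_table_csv` itself is not part of this diff, so the following is only a hypothetical stand-in that reads the file and renders each model name as a markdown link, matching the `"markdown"` column type declared for the Model column above.

```python
# Hedged stand-in for load_leaderboard_table_csv (not the repo's actual helper).
import csv

def load_leaderboard_table_csv_sketch(filename):
    rows = []
    with open(filename, newline="") as f:
        for item in csv.DictReader(f):
            # Render the display name as a markdown link to the model page.
            item["Model"] = f'[{item["Model"]}]({item["Link"]})'
            # "-" marks a missing score; convert the rest to floats.
            for col in ("MT-bench (score)", "MMLU"):
                item[col] = float(item[col]) if item[col] != "-" else float("nan")
            rows.append(item)
    return rows

data = load_leaderboard_table_csv_sketch("leaderboard_table_20240109.csv")
print(len(data), data[0]["Model"])  # 68 model rows plus the header line above
```

Missing scores are stored as `-` in the CSV, which the leaderboard displays as empty cells.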