Spaces:
Running
Running
Update app.py and score DB 240517
Browse files- app.py +19 -8
- db/score_240516.csv +0 -84
- db/score_240517.csv +84 -0
app.py
CHANGED
@@ -17,18 +17,15 @@ my_theme = gr.themes.Soft(
|
|
17 |
)
|
18 |
sns.set(color_codes=True, font_scale=1.2)
|
19 |
|
20 |
-
TARGET_DATE = "
|
21 |
-
SCORE_PATH = f"db/
|
22 |
score_df = pd.read_csv(SCORE_PATH)
|
23 |
score_df["H6-Avg"] = (
|
24 |
score_df[["ARC-c", "HellaSwag", "MMLU", "TruthfulQA", "WinoGrande", "GSM-8K"]]
|
25 |
.mean(axis=1)
|
26 |
.round(2)
|
27 |
)
|
28 |
-
AVAILABLE_SCORES = score_df.columns[
|
29 |
-
for score in AVAILABLE_SCORES:
|
30 |
-
if "Arena Elo" in score:
|
31 |
-
AVAILABLE_SCORES.remove(score)
|
32 |
AVAILABLE_MODELS = score_df["Model"].to_list()
|
33 |
|
34 |
DEFAULT_SCORES = ["ARC-c", "HellaSwag", "MMLU", "TruthfulQA", "WinoGrande", "GSM-8K"]
|
@@ -84,7 +81,7 @@ def get_corr_table(benchmarks_list=None):
|
|
84 |
if benchmarks_list:
|
85 |
benchmarks_list = [f"Arena Elo ({TARGET_DATE})"] + benchmarks_list
|
86 |
else:
|
87 |
-
benchmarks_list = score_df.columns[
|
88 |
|
89 |
corr_table = score_df[benchmarks_list].pairwise_corr(method="pearson")
|
90 |
|
@@ -96,7 +93,7 @@ def get_corr_figure(benchmarks_list=None):
|
|
96 |
if benchmarks_list:
|
97 |
benchmarks_list = [f"Arena Elo ({TARGET_DATE})"] + benchmarks_list
|
98 |
else:
|
99 |
-
benchmarks_list = score_df.columns[
|
100 |
|
101 |
corr_values = score_df[benchmarks_list].corr()
|
102 |
|
@@ -144,6 +141,12 @@ with gr.Blocks(theme=my_theme) as demo:
|
|
144 |
The Universe of Evaluation. All about the evaluation for LLMs.\n
|
145 |
Run an evaluation for your LLM with **`Evalverse`** [[Github](https://github.com/UpstageAI/evalverse) • [Paper](https://arxiv.org/abs/2404.00943) • [Docs](https://evalverse.gitbook.io/evalverse-docs)].
|
146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
"""
|
148 |
)
|
149 |
with gr.Tab("π LLM Evaluation Report"):
|
@@ -231,6 +234,13 @@ with gr.Blocks(theme=my_theme) as demo:
|
|
231 |
Reference: https://pingouin-stats.org/build/html/generated/pingouin.pairwise_corr.html#pingouin.pairwise_corr
|
232 |
"""
|
233 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
with gr.Tab("π About Evalverse"):
|
235 |
gr.Markdown(
|
236 |
"""
|
@@ -316,6 +326,7 @@ with gr.Blocks(theme=my_theme) as demo:
|
|
316 |
- [EQ-Bench Leaderboard](https://eqbench.com/)
|
317 |
- [Arena-Hard Leaderboard](https://lmsys.org/blog/2024-04-19-arena-hard/#full-leaderboard-with-gpt-4-turbo-as-judge)
|
318 |
- [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/)
|
|
|
319 |
- Results from [Evalverse](https://github.com/UpstageAI/evalverse)
|
320 |
|
321 |
"""
|
|
|
17 |
)
|
18 |
sns.set(color_codes=True, font_scale=1.2)
|
19 |
|
20 |
+
TARGET_DATE = "240515"
|
21 |
+
SCORE_PATH = f"db/score_240517.csv"
|
22 |
score_df = pd.read_csv(SCORE_PATH)
|
23 |
score_df["H6-Avg"] = (
|
24 |
score_df[["ARC-c", "HellaSwag", "MMLU", "TruthfulQA", "WinoGrande", "GSM-8K"]]
|
25 |
.mean(axis=1)
|
26 |
.round(2)
|
27 |
)
|
28 |
+
AVAILABLE_SCORES = score_df.columns[8:].tolist()
|
|
|
|
|
|
|
29 |
AVAILABLE_MODELS = score_df["Model"].to_list()
|
30 |
|
31 |
DEFAULT_SCORES = ["ARC-c", "HellaSwag", "MMLU", "TruthfulQA", "WinoGrande", "GSM-8K"]
|
|
|
81 |
if benchmarks_list:
|
82 |
benchmarks_list = [f"Arena Elo ({TARGET_DATE})"] + benchmarks_list
|
83 |
else:
|
84 |
+
benchmarks_list = score_df.columns[4:]
|
85 |
|
86 |
corr_table = score_df[benchmarks_list].pairwise_corr(method="pearson")
|
87 |
|
|
|
93 |
if benchmarks_list:
|
94 |
benchmarks_list = [f"Arena Elo ({TARGET_DATE})"] + benchmarks_list
|
95 |
else:
|
96 |
+
benchmarks_list = score_df.columns[4:]
|
97 |
|
98 |
corr_values = score_df[benchmarks_list].corr()
|
99 |
|
|
|
141 |
The Universe of Evaluation. All about the evaluation for LLMs.\n
|
142 |
Run an evaluation for your LLM with **`Evalverse`** [[Github](https://github.com/UpstageAI/evalverse) • [Paper](https://arxiv.org/abs/2404.00943) • [Docs](https://evalverse.gitbook.io/evalverse-docs)].
|
143 |
|
144 |
+
### π Newly updated
|
145 |
+
[2024.05.17]
|
146 |
+
- Weekly scores: `Arena Elo (240515)`, `Arena Elo (240508)`, `Arena Elo (240501)`
|
147 |
+
- New benchmarks: [`AlpacaEval 2.0`](https://tatsu-lab.github.io/alpaca_eval/), [`MMLU-Pro`](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)
|
148 |
+
- New models: `GPT-4o-0513`, `Grok-1`, `OpenELM`, `Qwen-Max-0428`, `Snowflake-Arctic-Instruct`, `Yi-Large`
|
149 |
+
- New tab: `π Full leaderboard`
|
150 |
"""
|
151 |
)
|
152 |
with gr.Tab("π LLM Evaluation Report"):
|
|
|
234 |
Reference: https://pingouin-stats.org/build/html/generated/pingouin.pairwise_corr.html#pingouin.pairwise_corr
|
235 |
"""
|
236 |
)
|
237 |
+
|
238 |
+
with gr.Tab("π Full leaderboard"):
|
239 |
+
lb_selected = ["Arena Elo (240515)", "MT-Bench", "MMLU", "Arena-Hard", "EQ-Bench", "MAGI-Hard", "LC-AlpacaEval-2.0", "MMLU-Pro", "H6-Avg"]
|
240 |
+
lb = score_df[["Organization", "Model", "Size"] + lb_selected]
|
241 |
+
lb = lb.sort_values(lb_selected, ascending=False)
|
242 |
+
gr.DataFrame(lb)
|
243 |
+
|
244 |
with gr.Tab("π About Evalverse"):
|
245 |
gr.Markdown(
|
246 |
"""
|
|
|
326 |
- [EQ-Bench Leaderboard](https://eqbench.com/)
|
327 |
- [Arena-Hard Leaderboard](https://lmsys.org/blog/2024-04-19-arena-hard/#full-leaderboard-with-gpt-4-turbo-as-judge)
|
328 |
- [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/)
|
329 |
+
- [MMLU-Pro Leaderboard](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro#4-leaderboard)
|
330 |
- Results from [Evalverse](https://github.com/UpstageAI/evalverse)
|
331 |
|
332 |
"""
|
db/score_240516.csv
DELETED
@@ -1,84 +0,0 @@
|
|
1 |
-
Organization,Model,Size,Arena Elo (240516),Arena Elo (240429),MT-Bench,Arena-Hard,EQ-Bench,MAGI-Hard,ARC-c,HellaSwag,MMLU,TruthfulQA,WinoGrande,GSM-8K,AlpacaEval-1.0,LC-AlpacaEval-1.0,AlpacaEval-2.0,LC-AlpacaEval-2.0
|
2 |
-
Anthropic,Claude-1,,1150,1150,7.9,,76.83,,,,77,,,,91.6,76.8,17,27.3
|
3 |
-
Anthropic,Claude-2.0,,1132,1132,8.06,24,72.89,,,,78.5,,,,91.4,74.3,17.2,28.2
|
4 |
-
Anthropic,Claude-2.1,,1119,1119,8.18,22.8,73.96,,,,,,,,87.1,66,15.7,25.3
|
5 |
-
Anthropic,Claude-3-Haiku-20240307,,1180,1181,,41.5,63.65,47.71,89.2,85.9,75.2,,74.2,,,,,
|
6 |
-
Anthropic,Claude-3-Opus-20240229 ,,1248,1251,9.43,60.4,82.19,76.55,96.4,95.4,86.8,,88.5,,,,29.1,40.5
|
7 |
-
Anthropic,Claude-3-Sonnet-20240229,,1200,1202,9.18,46.8,80.45,61.01,93.2,89,79,,75.1,,,,25.6,34.9
|
8 |
-
Cohere,Command-R-Plus,104B,1189,1192,,33.1,76.11,49.7,70.39,87.96,74.02,56.95,83.82,47.31,,,,
|
9 |
-
Cohere,Command-R-v01,35B,1148,1148,,17,56.05,43.27,65.53,87,68.2,52.32,81.53,56.63,,,12.9,10.9
|
10 |
-
Databricks,DBRX-Instruct ,132B,1103,1102,8.26,23.9,76.82,57.13,68.9,89,73.7,66.9,81.8,66.9,,,18.4,25.4
|
11 |
-
DeepSeekAI,DeepSeek-LLM-67B-Chat,67B,1080,1079,8.35,,,,67.75,86.82,72.42,55.85,84.21,63.68,,,12.1,17.8
|
12 |
-
DeepSeekAI,DeepSeek-LLM-7B-Chat,7B,,,,,,,55.8,79.38,51.75,47.98,74.82,46.55,,,,
|
13 |
-
DeepSeekAI,DeepSeek-V2 Chat (RL),236B (21B),,,8.95,,,,,,77.8,,,,,,,
|
14 |
-
DeepSeekAI,DeepSeek-V2 Chat (SFT),236B (21B),,,8.62,,,,,,78.4,,,,,,,
|
15 |
-
Google,Gemini-1.0-Pro,,1135,1136,,17.8,,,,,71.8,,,,79.7,58,18.2,24.4
|
16 |
-
Google,Gemini-1.5-Pro-API-0409-Preview,,1249,1248,,,,,,,81.9,,,,,,,
|
17 |
-
Google,Gemma-1.1-7B-it,7B,1084,1085,,,59.17,38.43,60.07,76.14,60.92,50.74,69.69,42.99,,,,
|
18 |
-
Google,Gemma-2B-it,2B,999,,,3,23.26,24.16,43.94,62.7,37.65,45.82,60.93,5.46,,,3.4,5.4
|
19 |
-
Google,Gemma-7B-it,7B,1043,1043,,7.5,61.72,24.85,53.2,81.2,64.3,31.81,72.3,46.4,,,6.9,10.4
|
20 |
-
OpenAI,GPT-3.5-Turbo-0125,,1108,1106,,23.3,64.97,42.65,,,,,,,,,,
|
21 |
-
OpenAI,GPT-3.5-Turbo-0301,,1109,1108,7.94,18.1,70.67,46.66,85.2,85.5,70,47,81.6,57.1,89.4,79.2,9.6,18.1
|
22 |
-
OpenAI,GPT-3.5-Turbo-0613,,1120,1120,8.39,24.8,69.35,40.55,,,,,,,93.4,81.7,14.1,22.7
|
23 |
-
OpenAI,GPT-3.5-Turbo-1106,,1073,1072,8.32,18.9,71.74,43.17,,,,,,,86.3,75.6,9.2,19.3
|
24 |
-
OpenAI,GPT-4-0125-Preview,,1246,1247,,78,83.87,76.83,,,,,,,95.3,86.5,23.6,38.1
|
25 |
-
OpenAI,GPT-4-0314,,1189,1189,8.96,50,85.73,75.67,96.3,95.3,86.4,59,87.5,92,94.8,85.3,22.1,35.3
|
26 |
-
OpenAI,GPT-4-0613,,1165,1165,9.18,37.9,84.79,77.85,,,,,,,93.8,91.4,15.8,30.2
|
27 |
-
OpenAI,GPT-4-1106-Preview,,1252,1253,9.32,,86.05,74.96,,,,,,,97.7,89.9,50,50
|
28 |
-
OpenAI,GPT-4-Turbo-2024-04-09,,1258,1257,,82.6,86.35,77.74,,,,,,,,,46.1,55
|
29 |
-
OpenAI,GPT-4o-0513,,,,,,83.51,80.86,,,87.2,,,,,,51.3,57.5
|
30 |
-
xAI,Grok-1,314B,,,,,,,,,73,,,,,,,
|
31 |
-
InternLM,InternLM2-Chat-20B,20B,,,7.9,,,,,,66.5,,,,,,21.7,18.7
|
32 |
-
InternLM,InternLM2-Chat-7B,7B,,,7.7,,62.61,38.43,,,63.7,,,,,,,
|
33 |
-
Meta,Llama-2-13b-chat-hf,13B,1057,1054,6.65,,49.12,28.2,59.04,81.94,54.64,44.12,74.51,15.24,81.1,49.8,7.7,8.4
|
34 |
-
Meta,Llama-2-70b-chat-hf,70B,1089,1088,6.86,11.6,73.59,35.4,64.59,85.88,63.91,52.8,80.51,26.69,92.7,74.1,13.9,14.7
|
35 |
-
Meta,Llama-2-7b-chat-hf ,7B,1041,1040,6.27,4.6,36.32,27.5,52.9,78.55,48.32,45.57,71.74,7.35,71.4,29.3,5,5.4
|
36 |
-
Meta,Llama-3-70b-instruct,70B,1208,1207,,41.1,82.13,67.97,71.42,85.69,80.06,61.81,82.87,85.44,,,33.2,34.4
|
37 |
-
Meta,Llama-3-8b-instruct,8B,1154,1146,,20.6,68.88,63.84,60.75,78.55,67.07,51.65,74.51,68.69,,,22.6,22.9
|
38 |
-
Mistral,Mistral-7B-Instruct-v0.1,7B,1012,1011,6.84,,52.15,30.69,54.52,75.63,55.38,56.28,73.72,14.25,,,,
|
39 |
-
Mistral,Mistral-7B-Instruct-v0.2,7B,1074,1073,7.6,12.6,68.18,34.69,63.14,84.88,60.78,68.26,77.19,40.03,92.8,83,14.7,17.1
|
40 |
-
Mistral,Mistral-large-2402,,1156,1158,8.66,37.7,85.17,67.69,94,89.2,81.2,50.5,86.7,81,,,21.4,32.7
|
41 |
-
Mistral,Mistral-medium,,1147,1148,8.61,31.9,82.57,62.15,89.9,88,75.3,,88,66.7,96.8,91.5,21.9,28.6
|
42 |
-
Mistral,Mixtral-8x22B-Instruct-v0.1,141B,1146,1147,,36.4,78.79,62.41,72.7,89.08,77.77,68.14,85.16,82.03,,,22.2,30.9
|
43 |
-
Mistral,Mixtral-8x7b-Instruct-v0.1,47B,1114,1114,8.3,23.4,72.37,45.74,70.22,87.63,71.16,64.58,81.37,60.73,94.8,82.6,18.3,23.7
|
44 |
-
OpenChat,OpenChat-3.5-0106,7B,1098,1098,7.8,,,,66.04,82.93,65.04,51.9,81.77,68.16,,,,
|
45 |
-
Apple,OpenELM-1_1B-Instruct,1.1B,,,,,,,41.55,71.83,25.65,45.95,64.72,,,,,
|
46 |
-
Apple,OpenELM-3B-Instruct,3B,,,,,,,47.7,76.87,24.8,38.76,67.96,,,,,
|
47 |
-
OrionStarAI,Orion-14B-Chat,14B,,,7.37,,59.71,40.74,,,61.7,,,,,,,
|
48 |
-
Microsoft,Phi-3-Mini-128k-Instruct,3.8B,1052,1064,8.38,15.4,,,63.14,80.09,68.7,54.12,72.85,69.52,,,,
|
49 |
-
Microsoft,Phi-3-Mini-4k-Instruct,3.8B,,,,,58.15,53.26,62.97,80.6,69.08,59.88,72.38,74.53,,,,
|
50 |
-
Alibaba,Qwen-14B-Chat,14B,1039,1038,6.96,,63.47,39.74,,,66.5,,,,,,7.5,12.4
|
51 |
-
Alibaba,Qwen-7B-Chat,7B,,,,,50.11,33.44,,,57,,,,,,,
|
52 |
-
Alibaba,Qwen-Max-0428,,1186,,8.96,,,,,,,,,,,,,
|
53 |
-
Alibaba,Qwen1.5-1.8B-Chat,1.8B,,,,,24.12,31.56,38.74,60.02,45.87,40.62,59.67,19.03,,,3.7,2.6
|
54 |
-
Alibaba,Qwen1.5-110B-Chat,110B,1172,,8.88,,83.68,66.09,72.01,84.67,78.04,65.86,77.35,30.1,,,33.8,43.9
|
55 |
-
Alibaba,Qwen1.5-14B-Chat,14B,1119,1119,7.91,,74.99,49.27,58.79,82.33,68.52,60.38,73.32,30.86,,,18.6,23.9
|
56 |
-
Alibaba,Qwen1.5-32B-Chat,32B,1134,1135,8.3,,75.59,60.72,66.04,85.49,74.99,66.95,77.19,7.05,,,,
|
57 |
-
Alibaba,Qwen1.5-4B-Chat,4B,1002,,,,28.75,32.66,43.26,69.73,55.55,44.79,64.96,2.43,,,,
|
58 |
-
Alibaba,Qwen1.5-72B-Chat ,72B,1152,1153,8.61,36.1,82.81,63.47,68.52,86.42,77.44,63.9,79.08,20.39,,,31.8,36.7
|
59 |
-
Alibaba,Qwen1.5-7B-Chat,7B,1079,1073,7.6,,54.41,41.59,55.89,78.56,61.7,53.65,67.8,13.19,,,11.8,14.7
|
60 |
-
RekaAI,Reka-Core-20240501,,1199,,,,,,,,83.2,,,,,,,
|
61 |
-
RekaAI,Reka-Edge,7B,,,7.6,,,,,,65.7,,,,,,,
|
62 |
-
RekaAI,Reka-Flash,21B,1149,1149,8.2,,,,,,73.5,,,,,,,
|
63 |
-
Snowflake,Snowflake-Arctic-Instruct,480B (17B),1098,,,17.6,,,,,67.3,,,,,,,
|
64 |
-
Upstage,SOLAR-10.7B-Instruct-v1.0,10.7B,1065,1065,7.58,,73.53,39.62,71.08,88.16,66.21,71.43,83.58,64.75,,,,
|
65 |
-
Nexusflow,Starling-LM-7B-alpha,7B,1091,1091,8.09,12.8,73.9,37.06,63.82,84.9,64.67,46.39,80.58,62.4,,,14.2,14.7
|
66 |
-
Nexusflow,Starling-LM-7B-beta,7B,1118,1119,8.12,23,73.82,40.12,67.24,83.47,65.14,55.47,81.29,66.64,,,,
|
67 |
-
AllenAI,Tulu-2-DPO-70B,70B,1103,1102,7.89,15,76.63,50.23,72.1,88.99,69.84,65.78,83.27,62.62,95,84.3,16,21.2
|
68 |
-
LMSys,Vicuna-13B-v1.5,13B,1048,1047,6.57,,67.39,28.75,57.08,81.24,56.67,51.51,74.66,11.3,,,6.7,10.5
|
69 |
-
LMSys,Vicuna-33B-v1.3,33B,1094,1093,7.12,8.6,67.07,31.66,,,59.2,,,,89,,12.7,17.6
|
70 |
-
LMSys,Vicuna-7B-v1.1,7B,1009,1009,6.17,,26.12,27.38,53.67,77.46,45.63,48.94,70.96,5.53,64.4,,4.2,6.3
|
71 |
-
Microsoft,WizardLM-13b-v1.2,13B,1062,1061,7.2,,63.71,29.1,,,52.7,,,,89.2,,12,14.5
|
72 |
-
Microsoft,WizardLM-2-70B,70B,,,8.92,,,,,,,,,,,,,
|
73 |
-
Microsoft,WizardLM-2-7B,7B,,,8.28,,69.31,35.4,63.23,83.41,61.75,57.01,73.48,43.59,,,,
|
74 |
-
Microsoft,WizardLM-2-8x22B,141B,,,9.12,,77.91,59.16,72.44,89.05,76.77,60.5,82.24,84.61,,,,
|
75 |
-
Microsoft,WizardLM-70B-v1.0,70B,1109,1108,7.71,,,,64.52,83.21,63.32,54.6,,,,,14.4,17.6
|
76 |
-
01.AI,Yi-1.5-34B-Chat,34B,,,8.5,42.6,72.93,64.85,70.48,85.97,77.08,62.16,81.61,71.65,,,,
|
77 |
-
01.AI,Yi-1.5-6B-Chat,6B,,,7.5,17.9,59.45,46.18,,,62.8,,,,,,,
|
78 |
-
01.AI,Yi-1.5-9B-Chat,9B,,,8.2,34.4,70.37,56.13,,,69.5,,,,,,,
|
79 |
-
01.AI,Yi-34B-Chat,34B,1113,1110,7.88,23.1,71.62,57.1,65.1,84.08,74.87,55.41,79.79,19.79,94.1,76.4,29.7,27.2
|
80 |
-
01.AI,Yi-6B-Chat,6B,,,,,61.79,38.74,,,60.99,,,,,,,
|
81 |
-
01.AI,Yi-Large,,,,9.26,,,,,,83.8,,,,,,57.5,51.9
|
82 |
-
HuggingFace,Zephyr-7b-alpha,7B,1043,1042,6.88,,56.82,35.15,61.01,84.04,61.39,57.9,78.61,14.03,85.8,73.5,8.4,10.3
|
83 |
-
HuggingFace,Zephyr-7b-beta,7B,1054,1054,7.34,,58.33,35.97,62.03,84.36,61.07,57.45,77.74,29.04,90.6,76.3,11,13.2
|
84 |
-
HuggingFace,Zephyr-ORPO-141b-A35b-v0.1,141B,1129,1125,8.17,,,,,,,,,,,,,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
db/score_240517.csv
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Organization,Model,Model-in-Arena,Size,Arena Elo (240515),Arena Elo (240508),Arena Elo (240501),Arena Elo (240426),MT-Bench,Arena-Hard,EQ-Bench,MAGI-Hard,ARC-c,HellaSwag,MMLU,TruthfulQA,WinoGrande,GSM-8K,AlpacaEval-1.0,LC-AlpacaEval-1.0,AlpacaEval-2.0,LC-AlpacaEval-2.0,MMLU-Pro
|
2 |
+
Anthropic,Claude-1,claude-1,,1151,1150,1151,1150,7.9,,76.83,,,,77,,,,91.6,76.8,17,27.3,
|
3 |
+
Anthropic,Claude-2.0,claude-2.0,,1133,1132,1132,1132,8.06,24,72.89,,,,78.5,,,,91.4,74.3,17.2,28.2,
|
4 |
+
Anthropic,Claude-2.1,claude-2.1,,1119,1119,1119,1119,8.18,22.8,73.96,,,,,,,,87.1,66,15.7,25.3,
|
5 |
+
Anthropic,Claude-3-Haiku-20240307,claude-3-haiku-20240307,,1181,1180,1181,1181,,41.5,63.65,47.71,89.2,85.9,75.2,,74.2,,,,,,
|
6 |
+
Anthropic,Claude-3-Opus-20240229 ,claude-3-opus-20240229,,1246,1248,1251,1251,9.43,60.4,82.19,76.55,96.4,95.4,86.8,,88.5,,,,29.1,40.5,
|
7 |
+
Anthropic,Claude-3-Sonnet-20240229,claude-3-sonnet-20240229,,1199,1200,1201,1202,9.18,46.8,80.45,61.01,93.2,89,79,,75.1,,,,25.6,34.9,57.93
|
8 |
+
Cohere,Command-R-Plus,command-r-plus,104B,1189,1189,1191,1192,,33.1,76.11,49.7,70.39,87.96,74.02,56.95,83.82,47.31,,,,,
|
9 |
+
Cohere,Command-R-v01,command-r,35B,1148,1148,1148,1148,,17,56.05,43.27,65.53,87,68.2,52.32,81.53,56.63,90.6,61.9,12.9,10.9,35.31
|
10 |
+
Databricks,DBRX-Instruct ,dbrx-instruct-preview,132B,1103,1103,1103,1102,8.26,23.9,76.82,57.13,68.9,89,73.7,66.9,81.8,66.9,,,18.4,25.4,
|
11 |
+
DeepSeekAI,DeepSeek-LLM-67B-Chat,deepseek-llm-67b-chat,67B,1080,1080,1080,1079,8.35,,,,67.75,86.82,72.42,55.85,84.21,63.68,,,12.1,17.8,
|
12 |
+
DeepSeekAI,DeepSeek-LLM-7B-Chat,,7B,,,,,,,,,55.8,79.38,51.75,47.98,74.82,46.55,,,,,
|
13 |
+
DeepSeekAI,DeepSeek-V2 Chat (RL),,236B (21B),,,,,8.95,,,,,,77.8,,,,,,,,
|
14 |
+
DeepSeekAI,DeepSeek-V2 Chat (SFT),,236B (21B),,,,,8.62,,,,,,78.4,,,,,,,,
|
15 |
+
Google,Gemini-1.0-Pro,gemini-pro-dev-api,,1135,1135,1136,1136,,17.8,,,,,71.8,,,,79.7,58,18.2,24.4,54.44
|
16 |
+
Google,Gemini-1.5-Pro-API-0409-Preview,gemini-1.5-pro-api-0409-preview,,1248,1249,1250,1248,,,,,,,81.9,,,,,,,,
|
17 |
+
Google,Gemma-1.1-7B-it,gemma-1.1-7b-it,7B,1091,1084,1083,1085,,,59.17,38.43,60.07,76.14,60.92,50.74,69.69,42.99,,,,,
|
18 |
+
Google,Gemma-2B-it,gemma-2b-it,2B,1000,999,1000,999,,3,23.26,24.16,43.94,62.7,37.65,45.82,60.93,5.46,,,3.4,5.4,
|
19 |
+
Google,Gemma-7B-it,gemma-7b-it,7B,1043,1043,1044,1043,,7.5,61.72,24.85,53.2,81.2,64.3,31.81,72.3,46.4,,,6.9,10.4,
|
20 |
+
OpenAI,GPT-3.5-Turbo-0125,gpt-3.5-turbo-0125,,1110,1108,1107,1106,,23.3,64.97,42.65,,,,,,,,,,,
|
21 |
+
OpenAI,GPT-3.5-Turbo-0301,gpt-3.5-turbo-0314,,1110,1109,1109,1108,7.94,18.1,70.67,46.66,85.2,85.5,70,47,81.6,57.1,89.4,79.2,9.6,18.1,
|
22 |
+
OpenAI,GPT-3.5-Turbo-0613,gpt-3.5-turbo-0613,,1120,1120,1120,1120,8.39,24.8,69.35,40.55,,,,,,,93.4,81.7,14.1,22.7,
|
23 |
+
OpenAI,GPT-3.5-Turbo-1106,gpt-3.5-turbo-1106,,1073,1073,1073,1072,8.32,18.9,71.74,43.17,,,,,,,86.3,75.6,9.2,19.3,
|
24 |
+
OpenAI,GPT-4-0125-Preview,gpt-4-0125-preview,,1244,1246,1247,1247,,78,83.87,76.83,,,,,,,95.3,86.5,23.6,38.1,
|
25 |
+
OpenAI,GPT-4-0314,gpt-4-0314,,1189,1189,1190,1189,8.96,50,85.73,75.67,96.3,95.3,86.4,59,87.5,92,94.8,85.3,22.1,35.3,
|
26 |
+
OpenAI,GPT-4-0613,gpt-4-0613,,1165,1165,1165,1165,9.18,37.9,84.79,77.85,,,,,,,93.8,91.4,15.8,30.2,
|
27 |
+
OpenAI,GPT-4-1106-Preview,gpt-4-1106-preview,,1250,1252,1253,1253,9.32,,86.05,74.96,,,,,,,97.7,89.9,50,50,
|
28 |
+
OpenAI,GPT-4-Turbo-2024-04-09,gpt-4-turbo-2024-04-09,,1252,1258,1259,1257,,82.6,86.35,77.74,,,,,,,,,46.1,55,62.58
|
29 |
+
OpenAI,GPT-4o-0513,gpt-4o-2024-05-13,,1289,,,,,,83.51,80.86,,,87.2,,,,,,51.3,57.5,71.49
|
30 |
+
xAI,Grok-1,,314B,,,,,,,,,,,73,,,,,,,,
|
31 |
+
InternLM,InternLM2-Chat-20B,,20B,,,,,7.9,,,,,,66.5,,,,,,21.7,18.7,
|
32 |
+
InternLM,InternLM2-Chat-7B,,7B,,,,,7.7,,62.61,38.43,,,63.7,,,,,,,,
|
33 |
+
Meta,Llama-2-13b-chat-hf,llama-2-13b-chat,13B,1065,1057,1057,1054,6.65,,49.12,28.2,59.04,81.94,54.64,44.12,74.51,15.24,81.1,49.8,7.7,8.4,24.45
|
34 |
+
Meta,Llama-2-70b-chat-hf,llama-2-70b-chat,70B,1094,1089,1089,1088,6.86,11.6,73.59,35.4,64.59,85.88,63.91,52.8,80.51,26.69,92.7,74.1,13.9,14.7,36.09
|
35 |
+
Meta,Llama-2-7b-chat-hf ,llama-2-7b-chat,7B,1042,1041,1041,1040,6.27,4.6,36.32,27.5,52.9,78.55,48.32,45.57,71.74,7.35,71.4,29.3,5,5.4,19.4
|
36 |
+
Meta,Llama-3-70b-instruct,llama-3-70b-instruct,70B,1203,1208,1210,1207,,41.1,82.13,67.97,71.42,85.69,80.06,61.81,82.87,85.44,,,33.2,34.4,55.41
|
37 |
+
Meta,Llama-3-8b-instruct,llama-3-8b-instruct,8B,1154,1154,1153,1146,,20.6,68.88,63.84,60.75,78.55,67.07,51.65,74.51,68.69,,,22.6,22.9,38.31
|
38 |
+
Mistral,Mistral-7B-Instruct-v0.1,mistral-7b-instruct,7B,1014,1012,1012,1011,6.84,,52.15,30.69,54.52,75.63,55.38,56.28,73.72,14.25,,,,,
|
39 |
+
Mistral,Mistral-7B-Instruct-v0.2,mistral-7b-instruct-v0.2,7B,1074,1074,1074,1073,7.6,12.6,68.18,34.69,63.14,84.88,60.78,68.26,77.19,40.03,92.8,83,14.7,17.1,29.81
|
40 |
+
Mistral,Mistral-large-2402,mistral-large-2402,,1154,1156,1157,1158,8.66,37.7,85.17,67.69,94,89.2,81.2,50.5,86.7,81,,,21.4,32.7,
|
41 |
+
Mistral,Mistral-medium,mistral-medium,,1146,1147,1148,1148,8.61,31.9,82.57,62.15,89.9,88,75.3,,88,66.7,96.8,91.5,21.9,28.6,
|
42 |
+
Mistral,Mixtral-8x22B-Instruct-v0.1,mixtral-8x22b-instruct-v0.1,141B,1144,1146,1146,1147,,36.4,78.79,62.41,72.7,89.08,77.77,68.14,85.16,82.03,,,22.2,30.9,
|
43 |
+
Mistral,Mixtral-8x7b-Instruct-v0.1,mixtral-8x7b-instruct-v0.1,47B,1114,1114,1114,1114,8.3,23.4,72.37,45.74,70.22,87.63,71.16,64.58,81.37,60.73,94.8,82.6,18.3,23.7,40.4
|
44 |
+
OpenChat,OpenChat-3.5-0106,openchat-3.5-0106,7B,1098,1098,1099,1098,7.8,,,,66.04,82.93,65.04,51.9,81.77,68.16,,,,,
|
45 |
+
Apple,OpenELM-1_1B-Instruct,,1.1B,,,,,,,,,41.55,71.83,25.65,45.95,64.72,,,,,,
|
46 |
+
Apple,OpenELM-3B-Instruct,,3B,,,,,,,,,47.7,76.87,24.8,38.76,67.96,,,,,,
|
47 |
+
OrionStarAI,Orion-14B-Chat,,14B,,,,,7.37,,59.71,40.74,,,61.7,,,,,,,,
|
48 |
+
Microsoft,Phi-3-Mini-128k-Instruct,phi-3-mini-128k-instruct,3.8B,1053,1052,1050,1064,8.38,15.4,,,63.14,80.09,68.7,54.12,72.85,69.52,,,,,
|
49 |
+
Microsoft,Phi-3-Mini-4k-Instruct,,3.8B,,,,,,,58.15,53.26,62.97,80.6,69.08,59.88,72.38,74.53,,,,,40.96
|
50 |
+
Alibaba,Qwen-14B-Chat,qwen-14b-chat,14B,1040,1039,1039,1038,6.96,,63.47,39.74,,,66.5,,,,,,7.5,12.4,
|
51 |
+
Alibaba,Qwen-7B-Chat,,7B,,,,,,,50.11,33.44,,,57,,,,,,,,
|
52 |
+
Alibaba,Qwen-Max-0428,qwen-max-0428,,1187,1186,,,8.96,,,,,,,,,,,,,,
|
53 |
+
Alibaba,Qwen1.5-1.8B-Chat,,1.8B,,,,,,,24.12,31.56,38.74,60.02,45.87,40.62,59.67,19.03,,,3.7,2.6,
|
54 |
+
Alibaba,Qwen1.5-110B-Chat,qwen1.5-110b-chat,110B,1171,1172,,,8.88,,83.68,66.09,72.01,84.67,78.04,65.86,77.35,30.1,,,33.8,43.9,
|
55 |
+
Alibaba,Qwen1.5-14B-Chat,qwen1.5-14b-chat,14B,1119,1119,1118,1119,7.91,,74.99,49.27,58.79,82.33,68.52,60.38,73.32,30.86,,,18.6,23.9,
|
56 |
+
Alibaba,Qwen1.5-32B-Chat,qwen1.5-32b-chat,32B,1135,1134,1134,1135,8.3,,75.59,60.72,66.04,85.49,74.99,66.95,77.19,7.05,,,,,
|
57 |
+
Alibaba,Qwen1.5-4B-Chat,qwen1.5-4b-chat,4B,1003,1002,1003,1003,,,28.75,32.66,43.26,69.73,55.55,44.79,64.96,2.43,,,,,
|
58 |
+
Alibaba,Qwen1.5-72B-Chat ,qwen1.5-72b-chat,72B,1152,1152,1152,1153,8.61,36.1,82.81,63.47,68.52,86.42,77.44,63.9,79.08,20.39,,,31.8,36.7,
|
59 |
+
Alibaba,Qwen1.5-7B-Chat,qwen1.5-7b-chat,7B,1079,1079,1077,1073,7.6,,54.41,41.59,55.89,78.56,61.7,53.65,67.8,13.19,,,11.8,14.7,
|
60 |
+
RekaAI,Reka-Core-20240501,reka-core-20240501,,1195,1199,,,,,,,,,83.2,,,,,,,,
|
61 |
+
RekaAI,Reka-Edge,,7B,,,,,7.6,,,,,,65.7,,,,,,,,
|
62 |
+
RekaAI,Reka-Flash,reka-flash-21b-20240226,21B,1148,1149,1147,1149,8.2,,,,,,73.5,,,,,,,,
|
63 |
+
Snowflake,Snowflake-Arctic-Instruct,snowflake-arctic-instruct,480B (17B),1098,1098,1098,,,17.6,,,,,67.3,,,,,,,,
|
64 |
+
Upstage,SOLAR-10.7B-Instruct-v1.0,solar-10.7b-instruct-v1.0,10.7B,1066,1065,1065,1065,7.58,,73.53,39.62,71.08,88.16,66.21,71.43,83.58,64.75,,,,,
|
65 |
+
Nexusflow,Starling-LM-7B-alpha,starling-lm-7b-alpha,7B,1092,1091,1092,1091,8.09,12.8,73.9,37.06,63.82,84.9,64.67,46.39,80.58,62.4,,,14.2,14.7,
|
66 |
+
Nexusflow,Starling-LM-7B-beta,starling-lm-7b-beta,7B,1118,1118,1119,1119,8.12,23,73.82,40.12,67.24,83.47,65.14,55.47,81.29,66.64,,,,,
|
67 |
+
AllenAI,Tulu-2-DPO-70B,tulu-2-dpo-70b,70B,1103,1103,1103,1102,7.89,15,76.63,50.23,72.1,88.99,69.84,65.78,83.27,62.62,95,84.3,16,21.2,
|
68 |
+
LMSys,Vicuna-13B-v1.5,vicuna-13b,13B,1050,1048,1048,1047,6.57,,67.39,28.75,57.08,81.24,56.67,51.51,74.66,11.3,,,6.7,10.5,
|
69 |
+
LMSys,Vicuna-33B-v1.3,vicuna-33b,33B,1095,1094,1094,1093,7.12,8.6,67.07,31.66,,,59.2,,,,89,,12.7,17.6,
|
70 |
+
LMSys,Vicuna-7B-v1.1,vicuna-7b,7B,1011,1009,1009,1009,6.17,,26.12,27.38,53.67,77.46,45.63,48.94,70.96,5.53,64.4,,4.2,6.3,
|
71 |
+
Microsoft,WizardLM-13b-v1.2,wizardlm-13b,13B,1063,1062,1062,1061,7.2,,63.71,29.1,,,52.7,,,,89.2,,12,14.5,
|
72 |
+
Microsoft,WizardLM-2-70B,,70B,,,,,8.92,,,,,,,,,,,,,,
|
73 |
+
Microsoft,WizardLM-2-7B,,7B,,,,,8.28,,69.31,35.4,63.23,83.41,61.75,57.01,73.48,43.59,,,,,
|
74 |
+
Microsoft,WizardLM-2-8x22B,,141B,,,,,9.12,,77.91,59.16,72.44,89.05,76.77,60.5,82.24,84.61,,,,,
|
75 |
+
Microsoft,WizardLM-70B-v1.0,wizardlm-70b,70B,1110,1109,1109,1108,7.71,,,,64.52,83.21,63.32,54.6,,,,,14.4,17.6,
|
76 |
+
01.AI,Yi-1.5-34B-Chat,,34B,,,,,8.5,42.6,72.93,64.85,70.48,85.97,77.08,62.16,81.61,71.65,,,,,50.7
|
77 |
+
01.AI,Yi-1.5-6B-Chat,,6B,,,,,7.5,17.9,59.45,46.18,,,62.8,,,,,,,,36.71
|
78 |
+
01.AI,Yi-1.5-9B-Chat,,9B,,,,,8.2,34.4,70.37,56.13,,,69.5,,,,,,,,44.07
|
79 |
+
01.AI,Yi-34B-Chat,yi-34b-chat,34B,1116,1113,1111,1110,7.88,23.1,71.62,57.1,65.1,84.08,74.87,55.41,79.79,19.79,94.1,76.4,29.7,27.2,
|
80 |
+
01.AI,Yi-6B-Chat,,6B,,,,,,,61.79,38.74,,,60.99,,,,,,,,
|
81 |
+
01.AI,Yi-Large,,,,,,,9.26,,,,,,83.8,,,,,,57.5,51.9,57.53
|
82 |
+
HuggingFace,Zephyr-7b-alpha,zephyr-7b-alpha,7B,1046,1043,1043,1042,6.88,,56.82,35.15,61.01,84.04,61.39,57.9,78.61,14.03,85.8,73.5,8.4,10.3,
|
83 |
+
HuggingFace,Zephyr-7b-beta,zephyr-7b-beta,7B,1056,1054,1054,1054,7.34,,58.33,35.97,62.03,84.36,61.07,57.45,77.74,29.04,90.6,76.3,11,13.2,
|
84 |
+
HuggingFace,Zephyr-ORPO-141b-A35b-v0.1,zephyr-orpo-141b-A35b-v0.1,141B,1129,1129,1129,1125,8.17,,,,,,,,,,,,,,
|