Spaces:
Running
Running
,model,score,scenario,source,aggragated_from | |
0,gpt-4-turbo-2024-04-09,82.6,arena-hard,arena_hard_2404,[] | |
1,gpt-4-0125-preview,78.0,arena-hard,arena_hard_2404,[] | |
2,gemini-1.5-pro-api-preview,72.0,arena-hard,arena_hard_2404,[] | |
3,yi-large,63.7,arena-hard,arena_hard_2404,[] | |
4,claude-3-opus-20240229,60.4,arena-hard,arena_hard_2404,[] | |
5,glm-4,55.7,arena-hard,arena_hard_2404,[] | |
6,gpt-4-0314,50.0,arena-hard,arena_hard_2404,[] | |
7,gemini-1.5-flash-api-preview,49.6,arena-hard,arena_hard_2404,[] | |
8,claude-3-sonnet-20240229,46.8,arena-hard,arena_hard_2404,[] | |
9,claude-3-haiku-20240307,41.5,arena-hard,arena_hard_2404,[] | |
10,llama-3-70b-chat-hf,41.1,arena-hard,arena_hard_2404,[] | |
11,gpt-4-0613,37.9,arena-hard,arena_hard_2404,[] | |
12,mistral-large-2402,37.7,arena-hard,arena_hard_2404,[] | |
13,mixtral-8x22b-instruct-v0.1,36.4,arena-hard,arena_hard_2404,[] | |
14,qwen1.5-72b-chat,36.1,arena-hard,arena_hard_2404,[] | |
15,command-r-plus,33.1,arena-hard,arena_hard_2404,[] | |
16,mistral-medium,31.9,arena-hard,arena_hard_2404,[] | |
17,mistral-next,27.4,arena-hard,arena_hard_2404,[] | |
18,gpt-3.5-turbo-0613,24.8,arena-hard,arena_hard_2404,[] | |
19,claude-2.0,24.0,arena-hard,arena_hard_2404,[] | |
20,dbrx-instructruct,23.9,arena-hard,arena_hard_2404,[] | |
21,mixtral-8x7b-instruct-v0.1,23.4,arena-hard,arena_hard_2404,[] | |
22,gpt-3.5-turbo-0125,23.3,arena-hard,arena_hard_2404,[] | |
23,yi-34b-chat,23.1,arena-hard,arena_hard_2404,[] | |
24,starling-lm-7b-beta,23.0,arena-hard,arena_hard_2404,[] | |
25,claude-2.1,22.8,arena-hard,arena_hard_2404,[] | |
26,snorkel-mistral-pairrm-dpo,20.7,arena-hard,arena_hard_2404,[] | |
27,llama-3-8b-chat-hf,20.6,arena-hard,arena_hard_2404,[] | |
28,gpt-3.5-turbo-1106,18.9,arena-hard,arena_hard_2404,[] | |
29,gpt-3.5-turbo-0301,18.1,arena-hard,arena_hard_2404,[] | |
30,gemini-1.0-pro,17.8,arena-hard,arena_hard_2404,[] | |
31,snowflake-arctic-instruct,17.6,arena-hard,arena_hard_2404,[] | |
32,command-r,17.0,arena-hard,arena_hard_2404,[] | |
33,phi-3-mini-128k-instruct,15.4,arena-hard,arena_hard_2404,[] | |
34,tulu-2-dpo-70b,15.0,arena-hard,arena_hard_2404,[] | |
35,starling-lm-7b-alpha,12.8,arena-hard,arena_hard_2404,[] | |
36,mistral-7b-instruct,12.6,arena-hard,arena_hard_2404,[] | |
37,gemma-1.1-7b-it,12.1,arena-hard,arena_hard_2404,[] | |
38,llama-2-70b-chat-hf,11.6,arena-hard,arena_hard_2404,[] | |
39,vicuna-33b-v1.3,8.6,arena-hard,arena_hard_2404,[] | |
40,gemma-7b-it,7.5,arena-hard,arena_hard_2404,[] | |
41,llama-2-7b-chat-hf,4.6,arena-hard,arena_hard_2404,[] | |
42,gemma-1.1-2b-it,3.4,arena-hard,arena_hard_2404,[] | |
43,gemma-2b-it,3.0,arena-hard,arena_hard_2404,[] | |
0,gpt-4o-2024-05-13,64.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
1,claude-3-opus,63.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
2,gpt-4-turbo-2024-04-09,62.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
3,gemini-1.5-pro-api-0409,58.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
4,yi-large-preview,56.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
5,llama-3-70b-instruct,55.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
6,qwen-max-0428,55.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
7,claude-3-sonnet,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
8,reka-core-20240415,52.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
9,mammoth2-8x7b-plus,51.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
10,deepseek-v2,51.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
11,command-r-plus,51.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
12,yi-1.5-34b-chat,51.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
13,mistral-large,50.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
14,qwen1.5-72b-chat,48.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
15,mistral-medium,47.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
16,gemini-1.0-pro,46.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
17,reka-flash-20240226,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
18,mistral-small,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
19,llama-3-8b-instruct,45.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
20,command-r,45.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
21,qwen1.5-32b-chat,43.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
22,gpt-3.5-turbo-0125,43.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
23,claude-3-haiku,42.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
24,yi-34b-chat,42.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
25,mixtral-8x7b-instruct-v0.1,42.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
26,starling-lm-7b-beta,41.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
27,yi-1.5-9b-chat,40.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
28,gemma-1.1-7b-it,39.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
29,vicuna-33b-v1.3,38.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
30,llama-2-70b-chat,38.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
31,map-neo-instruct-v0.1,37.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
32,mistral-7b-instruct-v0.2,36.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
33,qwen1.5-7b-chat,35.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
34,reka-edge-20240208,32.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
35,zephyr-7b-beta,31.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
36,llama-2-7b-chat,30.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
37,yi-6b-chat,30.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
38,qwen1.5-moe-a2.7b-chat,29.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
39,gemma-1.1-2b-it,28.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
40,vicuna-7b-v1.5,27.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
41,olmo-7b-instruct,26.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
42,qwen1.5-4b-chat,24.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
43,jetmoe-8b-chat,24.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
44,mpt-7b-chat,23.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
45,llama-3-70b,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
46,qwen1.5-72b,41.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
47,yi-34b,47.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
48,qwen1.5-32b,41.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
49,mixtral-8x7b,40.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
50,llama-2-70b,41.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
51,qwen1.5-moe-a2.7b,33.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
52,qwen1.5-7b,33.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
53,llama-3-8b,31.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
54,mistral-7b,27.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
55,gemma-7b,32.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
56,yi-6b,30.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
57,qwen1.5-4b,23.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
58,jetmoe-8b,27.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
59,deepseek-7b,21.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
60,phi-2,21.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
61,deepseekmoe-16b,24.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
62,llama-2-7b,22.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
63,gemma-2b,22.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
64,olmo-7b,21.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
65,mpt-7b,17.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
66,gpt-4o-2024-05-13,87.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
67,claude-3-opus,88.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
68,gpt-4-turbo-2024-04-09,88.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
69,gemini-1.5-pro-api-0409,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
70,yi-large-preview,84.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
71,llama-3-70b-instruct,84.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
72,qwen-max-0428,86.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
73,claude-3-sonnet,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
74,reka-core-20240415,83.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
75,mammoth2-8x7b-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
76,deepseek-v2,83.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
77,command-r-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
78,yi-1.5-34b-chat,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
79,mistral-large,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
80,qwen1.5-72b-chat,84.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
81,mistral-medium,81.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
82,gemini-1.0-pro,78.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
83,reka-flash-20240226,79.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
84,mistral-small,81.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
85,llama-3-8b-instruct,75.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
86,command-r,77.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
87,qwen1.5-32b-chat,81.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
88,gpt-3.5-turbo-0125,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
89,claude-3-haiku,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
90,yi-34b-chat,80.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
91,mixtral-8x7b-instruct-v0.1,76.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
92,starling-lm-7b-beta,74.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
93,yi-1.5-9b-chat,74.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
94,gemma-1.1-7b-it,69.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
95,vicuna-33b-v1.3,66.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
96,llama-2-70b-chat,74.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
97,map-neo-instruct-v0.1,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
98,mistral-7b-instruct-v0.2,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
99,qwen1.5-7b-chat,71.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
100,reka-edge-20240208,68.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
101,zephyr-7b-beta,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
102,llama-2-7b-chat,61.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
103,yi-6b-chat,65.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
104,qwen1.5-moe-a2.7b-chat,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
105,gemma-1.1-2b-it,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
106,vicuna-7b-v1.5,60.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
107,olmo-7b-instruct,55.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
108,qwen1.5-4b-chat,57.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
109,jetmoe-8b-chat,51.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
110,mpt-7b-chat,43.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
111,llama-3-70b,82.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
112,qwen1.5-72b,79.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
113,yi-34b,78.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
114,qwen1.5-32b,77.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
115,mixtral-8x7b,74.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
116,llama-2-70b,73.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
117,qwen1.5-moe-a2.7b,70.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
118,qwen1.5-7b,68.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
119,llama-3-8b,65.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
120,mistral-7b,64.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
121,gemma-7b,64.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
122,yi-6b,63.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
123,qwen1.5-4b,58.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
124,jetmoe-8b,57.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
125,deepseek-7b,52.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
126,phi-2,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
127,deepseekmoe-16b,51.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
128,llama-2-7b,43.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
129,gemma-2b,38.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
130,olmo-7b,31.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
131,mpt-7b,30.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']" | |
264,gpt-4o-2024-05-13,85.4,mmlu-mixed,mixeval_240601,[] | |
265,claude-3-opus,83.2,mmlu-mixed,mixeval_240601,[] | |
266,gpt-4-turbo-2024-04-09,82.8,mmlu-mixed,mixeval_240601,[] | |
267,gemini-1.5-pro-api-0409,79.2,mmlu-mixed,mixeval_240601,[] | |
268,yi-large-preview,80.9,mmlu-mixed,mixeval_240601,[] | |
269,llama-3-70b-instruct,80.5,mmlu-mixed,mixeval_240601,[] | |
270,qwen-max-0428,80.6,mmlu-mixed,mixeval_240601,[] | |
271,claude-3-sonnet,74.7,mmlu-mixed,mixeval_240601,[] | |
272,reka-core-20240415,79.3,mmlu-mixed,mixeval_240601,[] | |
273,mammoth2-8x7b-plus,74.5,mmlu-mixed,mixeval_240601,[] | |
274,deepseek-v2,77.3,mmlu-mixed,mixeval_240601,[] | |
275,command-r-plus,78.9,mmlu-mixed,mixeval_240601,[] | |
276,yi-1.5-34b-chat,76.4,mmlu-mixed,mixeval_240601,[] | |
277,mistral-large,80.2,mmlu-mixed,mixeval_240601,[] | |
278,qwen1.5-72b-chat,80.1,mmlu-mixed,mixeval_240601,[] | |
279,mistral-medium,76.3,mmlu-mixed,mixeval_240601,[] | |
280,gemini-1.0-pro,74.9,mmlu-mixed,mixeval_240601,[] | |
281,reka-flash-20240226,75.4,mmlu-mixed,mixeval_240601,[] | |
282,mistral-small,75.2,mmlu-mixed,mixeval_240601,[] | |
283,llama-3-8b-instruct,71.9,mmlu-mixed,mixeval_240601,[] | |
284,command-r,75.0,mmlu-mixed,mixeval_240601,[] | |
285,qwen1.5-32b-chat,78.0,mmlu-mixed,mixeval_240601,[] | |
286,gpt-3.5-turbo-0125,74.5,mmlu-mixed,mixeval_240601,[] | |
287,claude-3-haiku,76.1,mmlu-mixed,mixeval_240601,[] | |
288,yi-34b-chat,73.6,mmlu-mixed,mixeval_240601,[] | |
289,mixtral-8x7b-instruct-v0.1,72.0,mmlu-mixed,mixeval_240601,[] | |
290,starling-lm-7b-beta,69.0,mmlu-mixed,mixeval_240601,[] | |
291,yi-1.5-9b-chat,72.6,mmlu-mixed,mixeval_240601,[] | |
292,gemma-1.1-7b-it,66.9,mmlu-mixed,mixeval_240601,[] | |
293,vicuna-33b-v1.3,59.2,mmlu-mixed,mixeval_240601,[] | |
294,llama-2-70b-chat,69.8,mmlu-mixed,mixeval_240601,[] | |
295,map-neo-instruct-v0.1,66.7,mmlu-mixed,mixeval_240601,[] | |
296,mistral-7b-instruct-v0.2,67.3,mmlu-mixed,mixeval_240601,[] | |
297,qwen1.5-7b-chat,68.7,mmlu-mixed,mixeval_240601,[] | |
298,reka-edge-20240208,63.6,mmlu-mixed,mixeval_240601,[] | |
299,zephyr-7b-beta,64.9,mmlu-mixed,mixeval_240601,[] | |
300,llama-2-7b-chat,59.4,mmlu-mixed,mixeval_240601,[] | |
301,yi-6b-chat,65.4,mmlu-mixed,mixeval_240601,[] | |
302,qwen1.5-moe-a2.7b-chat,69.5,mmlu-mixed,mixeval_240601,[] | |
303,gemma-1.1-2b-it,51.5,mmlu-mixed,mixeval_240601,[] | |
304,vicuna-7b-v1.5,58.7,mmlu-mixed,mixeval_240601,[] | |
305,olmo-7b-instruct,57.1,mmlu-mixed,mixeval_240601,[] | |
306,qwen1.5-4b-chat,61.4,mmlu-mixed,mixeval_240601,[] | |
307,jetmoe-8b-chat,58.5,mmlu-mixed,mixeval_240601,[] | |
308,mpt-7b-chat,37.8,mmlu-mixed,mixeval_240601,[] | |
309,llama-3-70b,79.8,mmlu-mixed,mixeval_240601,[] | |
310,qwen1.5-72b,78.8,mmlu-mixed,mixeval_240601,[] | |
311,yi-34b,79.3,mmlu-mixed,mixeval_240601,[] | |
312,qwen1.5-32b,77.2,mmlu-mixed,mixeval_240601,[] | |
313,mixtral-8x7b,71.6,mmlu-mixed,mixeval_240601,[] | |
314,llama-2-70b,70.8,mmlu-mixed,mixeval_240601,[] | |
315,qwen1.5-moe-a2.7b,69.4,mmlu-mixed,mixeval_240601,[] | |
316,qwen1.5-7b,67.0,mmlu-mixed,mixeval_240601,[] | |
317,llama-3-8b,69.5,mmlu-mixed,mixeval_240601,[] | |
318,mistral-7b,68.5,mmlu-mixed,mixeval_240601,[] | |
319,gemma-7b,67.4,mmlu-mixed,mixeval_240601,[] | |
320,yi-6b,71.2,mmlu-mixed,mixeval_240601,[] | |
321,qwen1.5-4b,59.6,mmlu-mixed,mixeval_240601,[] | |
322,jetmoe-8b,55.3,mmlu-mixed,mixeval_240601,[] | |
323,deepseek-7b,53.3,mmlu-mixed,mixeval_240601,[] | |
324,phi-2,62.5,mmlu-mixed,mixeval_240601,[] | |
325,deepseekmoe-16b,49.9,mmlu-mixed,mixeval_240601,[] | |
326,llama-2-7b,40.8,mmlu-mixed,mixeval_240601,[] | |
327,gemma-2b,37.4,mmlu-mixed,mixeval_240601,[] | |
328,olmo-7b,29.7,mmlu-mixed,mixeval_240601,[] | |
329,mpt-7b,30.9,mmlu-mixed,mixeval_240601,[] | |
594,gpt-4o-2024-05-13,57.1,mmlu-hard-mixed,mixeval_240601,[] | |
595,claude-3-opus,55.0,mmlu-hard-mixed,mixeval_240601,[] | |
596,gpt-4-turbo-2024-04-09,45.5,mmlu-hard-mixed,mixeval_240601,[] | |
597,gemini-1.5-pro-api-0409,44.6,mmlu-hard-mixed,mixeval_240601,[] | |
598,yi-large-preview,48.5,mmlu-hard-mixed,mixeval_240601,[] | |
599,llama-3-70b-instruct,46.3,mmlu-hard-mixed,mixeval_240601,[] | |
600,qwen-max-0428,41.6,mmlu-hard-mixed,mixeval_240601,[] | |
601,claude-3-sonnet,40.7,mmlu-hard-mixed,mixeval_240601,[] | |
602,reka-core-20240415,46.3,mmlu-hard-mixed,mixeval_240601,[] | |
603,mammoth2-8x7b-plus,41.1,mmlu-hard-mixed,mixeval_240601,[] | |
604,deepseek-v2,42.0,mmlu-hard-mixed,mixeval_240601,[] | |
605,command-r-plus,42.0,mmlu-hard-mixed,mixeval_240601,[] | |
606,yi-1.5-34b-chat,38.1,mmlu-hard-mixed,mixeval_240601,[] | |
607,mistral-large,42.4,mmlu-hard-mixed,mixeval_240601,[] | |
608,qwen1.5-72b-chat,37.7,mmlu-hard-mixed,mixeval_240601,[] | |
609,mistral-medium,38.5,mmlu-hard-mixed,mixeval_240601,[] | |
610,gemini-1.0-pro,35.5,mmlu-hard-mixed,mixeval_240601,[] | |
611,reka-flash-20240226,34.6,mmlu-hard-mixed,mixeval_240601,[] | |
612,mistral-small,33.8,mmlu-hard-mixed,mixeval_240601,[] | |
613,llama-3-8b-instruct,40.7,mmlu-hard-mixed,mixeval_240601,[] | |
614,command-r,39.0,mmlu-hard-mixed,mixeval_240601,[] | |
615,qwen1.5-32b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[] | |
616,gpt-3.5-turbo-0125,35.1,mmlu-hard-mixed,mixeval_240601,[] | |
617,claude-3-haiku,30.7,mmlu-hard-mixed,mixeval_240601,[] | |
618,yi-34b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[] | |
619,mixtral-8x7b-instruct-v0.1,37.2,mmlu-hard-mixed,mixeval_240601,[] | |
620,starling-lm-7b-beta,34.2,mmlu-hard-mixed,mixeval_240601,[] | |
621,yi-1.5-9b-chat,36.8,mmlu-hard-mixed,mixeval_240601,[] | |
622,gemma-1.1-7b-it,39.0,mmlu-hard-mixed,mixeval_240601,[] | |
623,vicuna-33b-v1.3,39.4,mmlu-hard-mixed,mixeval_240601,[] | |
624,llama-2-70b-chat,27.7,mmlu-hard-mixed,mixeval_240601,[] | |
625,map-neo-instruct-v0.1,32.5,mmlu-hard-mixed,mixeval_240601,[] | |
626,mistral-7b-instruct-v0.2,29.4,mmlu-hard-mixed,mixeval_240601,[] | |
627,qwen1.5-7b-chat,29.0,mmlu-hard-mixed,mixeval_240601,[] | |
628,reka-edge-20240208,26.4,mmlu-hard-mixed,mixeval_240601,[] | |
629,zephyr-7b-beta,24.2,mmlu-hard-mixed,mixeval_240601,[] | |
630,llama-2-7b-chat,30.3,mmlu-hard-mixed,mixeval_240601,[] | |
631,yi-6b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[] | |
632,qwen1.5-moe-a2.7b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[] | |
633,gemma-1.1-2b-it,30.3,mmlu-hard-mixed,mixeval_240601,[] | |
634,vicuna-7b-v1.5,23.4,mmlu-hard-mixed,mixeval_240601,[] | |
635,olmo-7b-instruct,27.3,mmlu-hard-mixed,mixeval_240601,[] | |
636,qwen1.5-4b-chat,17.3,mmlu-hard-mixed,mixeval_240601,[] | |
637,jetmoe-8b-chat,25.5,mmlu-hard-mixed,mixeval_240601,[] | |
638,mpt-7b-chat,24.7,mmlu-hard-mixed,mixeval_240601,[] | |
639,llama-3-70b,39.8,mmlu-hard-mixed,mixeval_240601,[] | |
640,qwen1.5-72b,42.4,mmlu-hard-mixed,mixeval_240601,[] | |
641,yi-34b,42.4,mmlu-hard-mixed,mixeval_240601,[] | |
642,qwen1.5-32b,37.2,mmlu-hard-mixed,mixeval_240601,[] | |
643,mixtral-8x7b,34.6,mmlu-hard-mixed,mixeval_240601,[] | |
644,llama-2-70b,29.0,mmlu-hard-mixed,mixeval_240601,[] | |
645,qwen1.5-moe-a2.7b,30.7,mmlu-hard-mixed,mixeval_240601,[] | |
646,qwen1.5-7b,28.6,mmlu-hard-mixed,mixeval_240601,[] | |
647,llama-3-8b,38.5,mmlu-hard-mixed,mixeval_240601,[] | |
648,mistral-7b,27.7,mmlu-hard-mixed,mixeval_240601,[] | |
649,gemma-7b,28.1,mmlu-hard-mixed,mixeval_240601,[] | |
650,yi-6b,37.2,mmlu-hard-mixed,mixeval_240601,[] | |
651,qwen1.5-4b,22.9,mmlu-hard-mixed,mixeval_240601,[] | |
652,jetmoe-8b,27.3,mmlu-hard-mixed,mixeval_240601,[] | |
653,deepseek-7b,26.4,mmlu-hard-mixed,mixeval_240601,[] | |
654,phi-2,29.0,mmlu-hard-mixed,mixeval_240601,[] | |
655,deepseekmoe-16b,30.7,mmlu-hard-mixed,mixeval_240601,[] | |
656,llama-2-7b,24.7,mmlu-hard-mixed,mixeval_240601,[] | |
657,gemma-2b,27.3,mmlu-hard-mixed,mixeval_240601,[] | |
658,olmo-7b,25.1,mmlu-hard-mixed,mixeval_240601,[] | |
659,mpt-7b,24.2,mmlu-hard-mixed,mixeval_240601,[] | |
593,gpt-4-0314,0.57,agieval,BLZ_240312,[] | |
594,gpt-4-0613,0.57,agieval,BLZ_240312,[] | |
596,claude-1,0.49700000000000005,agieval,BLZ_240312,[] | |
601,mixtral-8x7b-instruct-v0.1,0.45299999999999996,agieval,BLZ_240312,[] | |
602,yi-34b-chat,0.508,agieval,BLZ_240312,[] | |
605,gpt-3.5-turbo-0314,0.43200000000000005,agieval,BLZ_240312,[] | |
608,vicuna-33b,0.373,agieval,BLZ_240312,[] | |
609,starling-lm-7b-alpha,0.401,agieval,BLZ_240312,[] | |
611,llama-2-70b-chat,0.45,agieval,BLZ_240312,[] | |
613,openhermes-2.5-mistral-7b,0.43,agieval,BLZ_240312,[] | |
614,openchat-3.5,0.42700000000000005,agieval,BLZ_240312,[] | |
617,solar-10.7b-instruct-v1.0,0.47600000000000003,agieval,BLZ_240312,[] | |
618,dolphin-2.2.1-mistral-7b,0.392,agieval,BLZ_240312,[] | |
620,zephyr-7b-beta,0.406,agieval,BLZ_240312,[] | |
623,llama-2-13b-chat,0.336,agieval,BLZ_240312,[] | |
624,vicuna-13b,0.368,agieval,BLZ_240312,[] | |
626,zephyr-7b-alpha,0.38,agieval,BLZ_240312,[] | |
627,qwen-14b-chat,0.396,agieval,BLZ_240312,[] | |
630,llama-2-7b-chat,0.29600000000000004,agieval,BLZ_240312,[] | |
632,mistral-7b-instruct-v0.1,0.335,agieval,BLZ_240312,[] | |
634,vicuna-7b,0.314,agieval,BLZ_240312,[] | |
636,chatglm3-6b,0.414,agieval,BLZ_240312,[] | |
643,chatglm-6b,0.325,agieval,BLZ_240312,[] | |
647,llama-13b,0.205,agieval,BLZ_240312,[] | |
886,gpt-4-1106-preview,0.977,alpacav1,BLZ_240312,[] | |
888,gpt-4-0314,0.9528,alpacav1,BLZ_240312,[] | |
889,gpt-4-0613,0.9528,alpacav1,BLZ_240312,[] | |
890,mistral-medium,0.9682999999999999,alpacav1,BLZ_240312,[] | |
891,claude-1,0.8839,alpacav1,BLZ_240312,[] | |
892,claude-2.0,0.9136,alpacav1,BLZ_240312,[] | |
893,gemini-pro-dev-api,0.7966,alpacav1,BLZ_240312,[] | |
894,claude-2.1,0.8708,alpacav1,BLZ_240312,[] | |
895,gpt-3.5-turbo-0613,0.8937,alpacav1,BLZ_240312,[] | |
896,mixtral-8x7b-instruct-v0.1,0.9478,alpacav1,BLZ_240312,[] | |
897,yi-34b-chat,0.9408,alpacav1,BLZ_240312,[] | |
898,gemini-pro,0.7966,alpacav1,BLZ_240312,[] | |
900,gpt-3.5-turbo-0314,0.8937,alpacav1,BLZ_240312,[] | |
902,tulu-2-dpo-70b,0.9503,alpacav1,BLZ_240312,[] | |
903,vicuna-33b,0.8898999999999999,alpacav1,BLZ_240312,[] | |
904,starling-lm-7b-alpha,0.9198999999999999,alpacav1,BLZ_240312,[] | |
906,llama-2-70b-chat,0.9266,alpacav1,BLZ_240312,[] | |
909,openchat-3.5,0.8851,alpacav1,BLZ_240312,[] | |
911,gpt-3.5-turbo-1106,0.8626,alpacav1,BLZ_240312,[] | |
914,wizardlm-13b-v1.2,0.8917,alpacav1,BLZ_240312,[] | |
915,zephyr-7b-beta,0.9059999999999999,alpacav1,BLZ_240312,[] | |
918,llama-2-13b-chat,0.8109000000000001,alpacav1,BLZ_240312,[] | |
921,zephyr-7b-alpha,0.8576,alpacav1,BLZ_240312,[] | |
924,guanaco-33b,0.6596,alpacav1,BLZ_240312,[] | |
925,llama-2-7b-chat,0.7137,alpacav1,BLZ_240312,[] | |
934,chatglm2-6b,0.47130000000000005,alpacav1,BLZ_240312,[] | |
937,openassistant-pythia-12b,0.2596,alpacav1,BLZ_240312,[] | |
827,gpt-4-1106-preview,0.5,alpacav2,BLZ_240312,[] | |
829,gpt-4-0314,0.221,alpacav2,BLZ_240312,[] | |
830,gpt-4-0613,0.158,alpacav2,BLZ_240312,[] | |
831,mistral-medium,0.21899999999999997,alpacav2,BLZ_240312,[] | |
832,claude-1,0.17,alpacav2,BLZ_240312,[] | |
833,claude-2.0,0.172,alpacav2,BLZ_240312,[] | |
834,gemini-pro-dev-api,0.16899999999999998,alpacav2,BLZ_240312,[] | |
835,claude-2.1,0.157,alpacav2,BLZ_240312,[] | |
836,gpt-3.5-turbo-0613,0.141,alpacav2,BLZ_240312,[] | |
837,mixtral-8x7b-instruct-v0.1,0.183,alpacav2,BLZ_240312,[] | |
838,yi-34b-chat,0.297,alpacav2,BLZ_240312,[] | |
839,gemini-pro,0.16899999999999998,alpacav2,BLZ_240312,[] | |
840,claude-instant-1,0.161,alpacav2,BLZ_240312,[] | |
841,gpt-3.5-turbo-0314,0.096,alpacav2,BLZ_240312,[] | |
842,wizardlm-70b-v1.0,0.14400000000000002,alpacav2,BLZ_240312,[] | |
843,tulu-2-dpo-70b,0.16,alpacav2,BLZ_240312,[] | |
844,vicuna-33b,0.127,alpacav2,BLZ_240312,[] | |
845,starling-lm-7b-alpha,0.142,alpacav2,BLZ_240312,[] | |
846,deepseek-llm-67b-chat,0.121,alpacav2,BLZ_240312,[] | |
847,llama-2-70b-chat,0.139,alpacav2,BLZ_240312,[] | |
849,openhermes-2.5-mistral-7b,0.10300000000000001,alpacav2,BLZ_240312,[] | |
852,gpt-3.5-turbo-1106,0.092,alpacav2,BLZ_240312,[] | |
854,dolphin-2.2.1-mistral-7b,0.09,alpacav2,BLZ_240312,[] | |
855,wizardlm-13b-v1.2,0.12,alpacav2,BLZ_240312,[] | |
856,zephyr-7b-beta,0.11,alpacav2,BLZ_240312,[] | |
859,llama-2-13b-chat,0.077,alpacav2,BLZ_240312,[] | |
860,vicuna-13b,0.067,alpacav2,BLZ_240312,[] | |
862,zephyr-7b-alpha,0.084,alpacav2,BLZ_240312,[] | |
863,qwen-14b-chat,0.075,alpacav2,BLZ_240312,[] | |
865,guanaco-33b,0.05,alpacav2,BLZ_240312,[] | |
866,llama-2-7b-chat,0.0496,alpacav2,BLZ_240312,[] | |
870,vicuna-7b,0.048,alpacav2,BLZ_240312,[] | |
875,chatglm2-6b,0.027999999999999997,alpacav2,BLZ_240312,[] | |
878,openassistant-pythia-12b,0.018000000000000002,alpacav2,BLZ_240312,[] | |
1299,gpt-4-1106-preview,0.32799999999999996,alpacaeval2-lc,BLZ_240312,[] | |
1301,gpt-4-0314,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[] | |
1302,gpt-4-0613,0.18600000000000003,alpacaeval2-lc,BLZ_240312,[] | |
1303,mistral-medium,0.196,alpacaeval2-lc,BLZ_240312,[] | |
1304,claude-1,0.21100000000000002,alpacaeval2-lc,BLZ_240312,[] | |
1305,claude-2.0,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[] | |
1306,gemini-pro-dev-api,0.172,alpacaeval2-lc,BLZ_240312,[] | |
1307,claude-2.1,0.193,alpacaeval2-lc,BLZ_240312,[] | |
1308,gpt-3.5-turbo-0613,0.14300000000000002,alpacaeval2-lc,BLZ_240312,[] | |
1309,mixtral-8x7b-instruct-v0.1,0.168,alpacaeval2-lc,BLZ_240312,[] | |
1310,yi-34b-chat,0.188,alpacaeval2-lc,BLZ_240312,[] | |
1312,claude-instant-1,0.195,alpacaeval2-lc,BLZ_240312,[] | |
1313,gpt-3.5-turbo-0314,0.156,alpacaeval2-lc,BLZ_240312,[] | |
1314,wizardlm-70b-v1.0,0.125,alpacaeval2-lc,BLZ_240312,[] | |
1315,tulu-2-dpo-70b,0.151,alpacaeval2-lc,BLZ_240312,[] | |
1316,vicuna-33b,0.115,alpacaeval2-lc,BLZ_240312,[] | |
1317,starling-lm-7b-alpha,0.10099999999999999,alpacaeval2-lc,BLZ_240312,[] | |
1318,deepseek-llm-67b-chat,0.141,alpacaeval2-lc,BLZ_240312,[] | |
1319,llama-2-70b-chat,0.10400000000000001,alpacaeval2-lc,BLZ_240312,[] | |
1321,openhermes-2.5-mistral-7b,0.126,alpacaeval2-lc,BLZ_240312,[] | |
1324,gpt-3.5-turbo-1106,0.155,alpacaeval2-lc,BLZ_240312,[] | |
1326,dolphin-2.2.1-mistral-7b,0.10800000000000001,alpacaeval2-lc,BLZ_240312,[] | |
1327,wizardlm-13b-v1.2,0.099,alpacaeval2-lc,BLZ_240312,[] | |
1328,zephyr-7b-beta,0.102,alpacaeval2-lc,BLZ_240312,[] | |
1331,llama-2-13b-chat,0.068,alpacaeval2-lc,BLZ_240312,[] | |
1332,vicuna-13b,0.085,alpacaeval2-lc,BLZ_240312,[] | |
1334,zephyr-7b-alpha,0.086,alpacaeval2-lc,BLZ_240312,[] | |
1335,qwen-14b-chat,0.1,alpacaeval2-lc,BLZ_240312,[] | |
1338,llama-2-7b-chat,0.045,alpacaeval2-lc,BLZ_240312,[] | |
1342,vicuna-7b,0.06,alpacaeval2-lc,BLZ_240312,[] | |
0,gpt-4-0125-preview,1.0,arena-elo,BLZ_240312,[] | |
1,gpt-4-1106-preview,0.9992019154030327,arena-elo,BLZ_240312,[] | |
2,bard-gemini-pro,0.9768555466879489,arena-elo,BLZ_240312,[] | |
3,gpt-4-0314,0.9497206703910615,arena-elo,BLZ_240312,[] | |
4,gpt-4-0613,0.9273743016759777,arena-elo,BLZ_240312,[] | |
5,mistral-medium,0.9177972865123704,arena-elo,BLZ_240312,[] | |
6,claude-1,0.9169992019154031,arena-elo,BLZ_240312,[] | |
7,claude-2.0,0.9034317637669593,arena-elo,BLZ_240312,[] | |
8,gemini-pro-dev-api,0.8938547486033519,arena-elo,BLZ_240312,[] | |
9,claude-2.1,0.8930566640063847,arena-elo,BLZ_240312,[] | |
10,gpt-3.5-turbo-0613,0.8922585794094174,arena-elo,BLZ_240312,[] | |
11,mixtral-8x7b-instruct-v0.1,0.8922585794094174,arena-elo,BLZ_240312,[] | |
12,yi-34b-chat,0.8898643256185156,arena-elo,BLZ_240312,[] | |
13,gemini-pro,0.8890662410215483,arena-elo,BLZ_240312,[] | |
14,claude-instant-1,0.8850758180367119,arena-elo,BLZ_240312,[] | |
15,gpt-3.5-turbo-0314,0.8818834796488427,arena-elo,BLZ_240312,[] | |
16,wizardlm-70b-v1.0,0.8818834796488427,arena-elo,BLZ_240312,[] | |
17,tulu-2-dpo-70b,0.8810853950518756,arena-elo,BLZ_240312,[] | |
18,vicuna-33b,0.8723064644852354,arena-elo,BLZ_240312,[] | |
19,starling-lm-7b-alpha,0.8699122106943336,arena-elo,BLZ_240312,[] | |
20,deepseek-llm-67b-chat,0.8635275339185954,arena-elo,BLZ_240312,[] | |
21,llama-2-70b-chat,0.8635275339185954,arena-elo,BLZ_240312,[] | |
22,nv-llama2-70b-steerlm-chat,0.8603351955307262,arena-elo,BLZ_240312,[] | |
23,openhermes-2.5-mistral-7b,0.8603351955307262,arena-elo,BLZ_240312,[] | |
24,openchat-3.5,0.8587390263367917,arena-elo,BLZ_240312,[] | |
25,pplx-70b-online,0.8587390263367917,arena-elo,BLZ_240312,[] | |
26,gpt-3.5-turbo-1106,0.8547486033519553,arena-elo,BLZ_240312,[] | |
27,solar-10.7b-instruct-v1.0,0.8499600957701516,arena-elo,BLZ_240312,[] | |
28,dolphin-2.2.1-mistral-7b,0.8499600957701516,arena-elo,BLZ_240312,[] | |
29,wizardlm-13b-v1.2,0.8443735035913806,arena-elo,BLZ_240312,[] | |
30,zephyr-7b-beta,0.8387869114126097,arena-elo,BLZ_240312,[] | |
31,mpt-30b-chat,0.8332003192338387,arena-elo,BLZ_240312,[] | |
32,codellama-34b-instruct,0.8324022346368715,arena-elo,BLZ_240312,[] | |
33,llama-2-13b-chat,0.8316041500399042,arena-elo,BLZ_240312,[] | |
34,vicuna-13b,0.8300079808459697,arena-elo,BLZ_240312,[] | |
35,pplx-7b-online,0.8284118116520351,arena-elo,BLZ_240312,[] | |
36,zephyr-7b-alpha,0.8276137270550679,arena-elo,BLZ_240312,[] | |
37,qwen-14b-chat,0.825219473264166,arena-elo,BLZ_240312,[] | |
38,falcon-180b-chat,0.8236233040702314,arena-elo,BLZ_240312,[] | |
39,guanaco-33b,0.8236233040702314,arena-elo,BLZ_240312,[] | |
40,llama-2-7b-chat,0.8172386272944933,arena-elo,BLZ_240312,[] | |
41,stripedhyena-nous-7b,0.8140462889066241,arena-elo,BLZ_240312,[] | |
42,mistral-7b-instruct-v0.1,0.8028731045490822,arena-elo,BLZ_240312,[] | |
43,palm-chat-bison-001,0.8028731045490822,arena-elo,BLZ_240312,[] | |
44,vicuna-7b,0.8020750199521149,arena-elo,BLZ_240312,[] | |
45,koala-13b,0.770949720670391,arena-elo,BLZ_240312,[] | |
46,chatglm3-6b,0.7661612130885874,arena-elo,BLZ_240312,[] | |
47,gpt4all-13b-snoozy,0.74780526735834,arena-elo,BLZ_240312,[] | |
48,mpt-7b-chat,0.7430167597765364,arena-elo,BLZ_240312,[] | |
49,chatglm2-6b,0.7422186751795691,arena-elo,BLZ_240312,[] | |
50,rwkv-4-raven-14b,0.7382282521947326,arena-elo,BLZ_240312,[] | |
51,alpaca-13b,0.7214684756584198,arena-elo,BLZ_240312,[] | |
52,openassistant-pythia-12b,0.7158818834796489,arena-elo,BLZ_240312,[] | |
53,chatglm-6b,0.704708699122107,arena-elo,BLZ_240312,[] | |
54,fastchat-t5-3b,0.6975259377494014,arena-elo,BLZ_240312,[] | |
55,stablelm-tuned-alpha-7b,0.6743814844373504,arena-elo,BLZ_240312,[] | |
56,dolly-v2-12b,0.6568236233040702,arena-elo,BLZ_240312,[] | |
57,llama-13b,0.6384676775738228,arena-elo,BLZ_240312,[] | |
542,mixtral-8x7b-instruct-v0.1,0.7641,gpt4all,BLZ_240312,[] | |
543,yi-34b-chat,0.7212999999999999,gpt4all,BLZ_240312,[] | |
550,starling-lm-7b-alpha,0.7272,gpt4all,BLZ_240312,[] | |
554,openhermes-2.5-mistral-7b,0.7312000000000001,gpt4all,BLZ_240312,[] | |
555,openchat-3.5,0.7292000000000001,gpt4all,BLZ_240312,[] | |
558,solar-10.7b-instruct-v1.0,0.7511,gpt4all,BLZ_240312,[] | |
559,dolphin-2.2.1-mistral-7b,0.7223999999999999,gpt4all,BLZ_240312,[] | |
561,zephyr-7b-beta,0.7182999999999999,gpt4all,BLZ_240312,[] | |
565,vicuna-13b,0.631,gpt4all,BLZ_240312,[] | |
567,zephyr-7b-alpha,0.7223999999999999,gpt4all,BLZ_240312,[] | |
573,mistral-7b-instruct-v0.1,0.6795,gpt4all,BLZ_240312,[] | |
575,vicuna-7b,0.61,gpt4all,BLZ_240312,[] | |
576,koala-13b,0.62,gpt4all,BLZ_240312,[] | |
578,gpt4all-13b-snoozy,0.653,gpt4all,BLZ_240312,[] | |
579,mpt-7b-chat,0.648,gpt4all,BLZ_240312,[] | |
583,openassistant-pythia-12b,0.61,gpt4all,BLZ_240312,[] | |
585,fastchat-t5-3b,0.537,gpt4all,BLZ_240312,[] | |
586,stablelm-tuned-alpha-7b,0.513,gpt4all,BLZ_240312,[] | |
588,llama-13b,0.63,gpt4all,BLZ_240312,[] | |
129,mixtral-8x7b-instruct-v0.1,0.7262000000000001,hugging-6,BLZ_240312,[] | |
130,yi-34b-chat,0.6531999999999999,hugging-6,BLZ_240312,[] | |
134,wizardlm-70b-v1.0,0.6125,hugging-6,BLZ_240312,[] | |
135,tulu-2-dpo-70b,0.7376999999999999,hugging-6,BLZ_240312,[] | |
136,vicuna-33b,0.585,hugging-6,BLZ_240312,[] | |
137,starling-lm-7b-alpha,0.6713,hugging-6,BLZ_240312,[] | |
139,llama-2-70b-chat,0.624,hugging-6,BLZ_240312,[] | |
141,openhermes-2.5-mistral-7b,0.6152000000000001,hugging-6,BLZ_240312,[] | |
142,openchat-3.5,0.6124,hugging-6,BLZ_240312,[] | |
145,solar-10.7b-instruct-v1.0,0.742,hugging-6,BLZ_240312,[] | |
146,dolphin-2.2.1-mistral-7b,0.6493000000000001,hugging-6,BLZ_240312,[] | |
147,wizardlm-13b-v1.2,0.5476,hugging-6,BLZ_240312,[] | |
148,zephyr-7b-beta,0.6195,hugging-6,BLZ_240312,[] | |
149,mpt-30b-chat,0.5538000000000001,hugging-6,BLZ_240312,[] | |
150,codellama-34b-instruct,0.5729,hugging-6,BLZ_240312,[] | |
151,llama-2-13b-chat,0.5490999999999999,hugging-6,BLZ_240312,[] | |
152,vicuna-13b,0.5539999999999999,hugging-6,BLZ_240312,[] | |
154,zephyr-7b-alpha,0.595,hugging-6,BLZ_240312,[] | |
156,falcon-180b-chat,0.6785,hugging-6,BLZ_240312,[] | |
158,llama-2-7b-chat,0.5074000000000001,hugging-6,BLZ_240312,[] | |
160,mistral-7b-instruct-v0.1,0.5496,hugging-6,BLZ_240312,[] | |
162,vicuna-7b,0.521,hugging-6,BLZ_240312,[] | |
176,yi-34bx2-moe-60b,0.7672,hugging-6,BLZ_240312,[] | |
947,gpt-4-0314,0.93,llmonitor,BLZ_240312,[] | |
948,gpt-4-0613,0.89,llmonitor,BLZ_240312,[] | |
950,claude-1,0.66,llmonitor,BLZ_240312,[] | |
951,claude-2.0,0.68,llmonitor,BLZ_240312,[] | |
954,gpt-3.5-turbo-0613,0.81,llmonitor,BLZ_240312,[] | |
958,claude-instant-1,0.6,llmonitor,BLZ_240312,[] | |
959,gpt-3.5-turbo-0314,0.79,llmonitor,BLZ_240312,[] | |
965,llama-2-70b-chat,0.6,llmonitor,BLZ_240312,[] | |
975,mpt-30b-chat,0.4,llmonitor,BLZ_240312,[] | |
976,codellama-34b-instruct,0.34,llmonitor,BLZ_240312,[] | |
977,llama-2-13b-chat,0.5,llmonitor,BLZ_240312,[] | |
978,vicuna-13b,0.5,llmonitor,BLZ_240312,[] | |
982,falcon-180b-chat,0.67,llmonitor,BLZ_240312,[] | |
983,guanaco-33b,0.43,llmonitor,BLZ_240312,[] | |
984,llama-2-7b-chat,0.5,llmonitor,BLZ_240312,[] | |
986,mistral-7b-instruct-v0.1,0.57,llmonitor,BLZ_240312,[] | |
987,palm-chat-bison-001,0.57,llmonitor,BLZ_240312,[] | |
988,vicuna-7b,0.41,llmonitor,BLZ_240312,[] | |
989,koala-13b,0.31,llmonitor,BLZ_240312,[] | |
992,mpt-7b-chat,0.43,llmonitor,BLZ_240312,[] | |
1000,dolly-v2-12b,0.23,llmonitor,BLZ_240312,[] | |
59,gpt-4-0125-preview,0.0929,mt-bench,BLZ_240312,[] | |
60,gpt-4-1106-preview,0.0932,mt-bench,BLZ_240312,[] | |
62,gpt-4-0314,0.08960000000000001,mt-bench,BLZ_240312,[] | |
63,gpt-4-0613,0.09179999999999999,mt-bench,BLZ_240312,[] | |
64,mistral-medium,0.0861,mt-bench,BLZ_240312,[] | |
65,claude-1,0.079,mt-bench,BLZ_240312,[] | |
66,claude-2.0,0.0806,mt-bench,BLZ_240312,[] | |
67,gemini-pro-dev-api,0.08039999999999999,mt-bench,BLZ_240312,[] | |
68,claude-2.1,0.0818,mt-bench,BLZ_240312,[] | |
69,gpt-3.5-turbo-0613,0.0839,mt-bench,BLZ_240312,[] | |
70,mixtral-8x7b-instruct-v0.1,0.083,mt-bench,BLZ_240312,[] | |
71,yi-34b-chat,0.07769999999999999,mt-bench,BLZ_240312,[] | |
72,gemini-pro,0.08039999999999999,mt-bench,BLZ_240312,[] | |
73,claude-instant-1,0.0785,mt-bench,BLZ_240312,[] | |
74,gpt-3.5-turbo-0314,0.0794,mt-bench,BLZ_240312,[] | |
75,wizardlm-70b-v1.0,0.0771,mt-bench,BLZ_240312,[] | |
76,tulu-2-dpo-70b,0.0789,mt-bench,BLZ_240312,[] | |
77,vicuna-33b,0.0712,mt-bench,BLZ_240312,[] | |
78,starling-lm-7b-alpha,0.0809,mt-bench,BLZ_240312,[] | |
79,deepseek-llm-67b-chat,0.08529999999999999,mt-bench,BLZ_240312,[] | |
80,llama-2-70b-chat,0.06860000000000001,mt-bench,BLZ_240312,[] | |
81,nv-llama2-70b-steerlm-chat,0.0754,mt-bench,BLZ_240312,[] | |
82,openhermes-2.5-mistral-7b,0.07690000000000001,mt-bench,BLZ_240312,[] | |
83,openchat-3.5,0.0781,mt-bench,BLZ_240312,[] | |
84,pplx-70b-online,0.0588,mt-bench,BLZ_240312,[] | |
85,gpt-3.5-turbo-1106,0.0832,mt-bench,BLZ_240312,[] | |
86,solar-10.7b-instruct-v1.0,0.0758,mt-bench,BLZ_240312,[] | |
88,wizardlm-13b-v1.2,0.07200000000000001,mt-bench,BLZ_240312,[] | |
89,zephyr-7b-beta,0.07339999999999999,mt-bench,BLZ_240312,[] | |
90,mpt-30b-chat,0.0639,mt-bench,BLZ_240312,[] | |
92,llama-2-13b-chat,0.0665,mt-bench,BLZ_240312,[] | |
93,vicuna-13b,0.06570000000000001,mt-bench,BLZ_240312,[] | |
95,zephyr-7b-alpha,0.0688,mt-bench,BLZ_240312,[] | |
96,qwen-14b-chat,0.0696,mt-bench,BLZ_240312,[] | |
98,guanaco-33b,0.0653,mt-bench,BLZ_240312,[] | |
99,llama-2-7b-chat,0.06269999999999999,mt-bench,BLZ_240312,[] | |
101,mistral-7b-instruct-v0.1,0.0684,mt-bench,BLZ_240312,[] | |
102,palm-chat-bison-001,0.064,mt-bench,BLZ_240312,[] | |
103,vicuna-7b,0.0617,mt-bench,BLZ_240312,[] | |
104,koala-13b,0.0535,mt-bench,BLZ_240312,[] | |
106,gpt4all-13b-snoozy,0.0541,mt-bench,BLZ_240312,[] | |
107,mpt-7b-chat,0.0542,mt-bench,BLZ_240312,[] | |
108,chatglm2-6b,0.0496,mt-bench,BLZ_240312,[] | |
109,rwkv-4-raven-14b,0.0398,mt-bench,BLZ_240312,[] | |
110,alpaca-13b,0.0453,mt-bench,BLZ_240312,[] | |
111,openassistant-pythia-12b,0.0432,mt-bench,BLZ_240312,[] | |
112,chatglm-6b,0.045,mt-bench,BLZ_240312,[] | |
113,fastchat-t5-3b,0.0304,mt-bench,BLZ_240312,[] | |
114,stablelm-tuned-alpha-7b,0.0275,mt-bench,BLZ_240312,[] | |
115,dolly-v2-12b,0.032799999999999996,mt-bench,BLZ_240312,[] | |
116,llama-13b,0.026099999999999998,mt-bench,BLZ_240312,[] | |
0,gpt-4-0613,0.957,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
1,llama-3-70b,0.902,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
2,mixtral-8x22b,0.855,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
3,palmyra-x-v3-72b,0.826,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
4,gpt-4-turbo-1106-preview,0.821,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
5,palm-2-unicorn,0.781,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
6,claude-3-opus-20240229,0.762,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
7,qwen1.5-72b,0.757,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
8,palmyra-x-v2-33b,0.736,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
9,yi-34b,0.723,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
10,qwen1.5-32b,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
11,claude-v1.3,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
12,mixtral-8x7b-32k-seqlen,0.679,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
13,palm-2-bison,0.655,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
14,claude-2.0,0.651,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
15,deepseek-llm-67b-chat,0.645,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
16,llama-2-70b,0.609,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
17,claude-2.1,0.594,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
18,gpt-3.5-text-davinci-003,0.577,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
19,qwen1.5-14b,0.574,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
20,claude-instant-1.2,0.551,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
21,llama-3-8b,0.519,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
22,gpt-3.5-turbo-0613,0.502,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
23,gemma-7b,0.47,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
24,claude-3-sonnet-20240229,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
25,gpt-3.5-text-davinci-002,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
26,llama-65b,0.466,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
27,mistral-large-2402,0.46,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
28,cohere-command,0.421,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
29,dbrx-instructruct,0.419,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
30,mistral-v0.1-7b,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
31,mistral-small-2402,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
32,mistral-medium-2312,0.383,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
33,qwen1.5-7b,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
34,claude-3-haiku-20240307,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
35,yi-6b,0.351,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
36,llama-2-13b,0.332,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
37,jurassic-2-jumbo-178b,0.317,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
38,falcon-40b,0.306,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
39,phi-2,0.26,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
40,jurassic-2-grande-17b,0.253,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
41,llama-2-7b,0.234,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
42,luminous-supreme-70b,0.213,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
43,cohere-command-light,0.166,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
44,luminous-extended-30b,0.119,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
45,falcon-7b,0.1,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
46,olmo-7b,0.083,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
47,luminous-base-13b,0.072,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']" | |
0,llama-2-70b,0.944,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
1,llama-65b,0.908,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
2,text-davinci-002,0.905,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
3,mistral-v0.1-7b,0.884,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
4,cohere-command-beta-52.4b,0.874,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
5,text-davinci-003,0.872,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
6,jurassic-2-jumbo-178b,0.824,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
7,llama-2-13b,0.823,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
8,tnlg-v2-530b,0.787,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
9,gpt-3.5-turbo-0613,0.783,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
10,llama-30b,0.781,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
11,anthropic-lm-v4-s3-52b,0.78,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
12,gpt-3.5-turbo-0301,0.76,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
13,jurassic-2-grande-17b,0.743,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
14,palmyra-x-43b,0.732,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
15,falcon-40b,0.729,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
16,falcon-instruct-40b,0.727,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
17,mpt-instruct-30b,0.716,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
18,mpt-30b,0.714,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
19,j1-grande-v2-beta-17b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
20,vicuna-v1.3-13b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
21,cohere-command-beta-6.1b,0.675,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
22,cohere-xlarge-v20221108-52.4b,0.664,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
23,luminous-supreme-70b,0.662,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
24,vicuna-v1.3-7b,0.625,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
25,opt-175b,0.609,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
26,llama-2-7b,0.607,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
27,llama-13b,0.595,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
28,instructpalmyra-30b,0.568,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
29,cohere-xlarge-v20220609-52.4b,0.56,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
30,jurassic-2-large-7.5b,0.553,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
31,davinci-175b,0.538,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
32,llama-7b,0.533,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
33,redpajama-incite-instruct-7b,0.524,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
34,j1-jumbo-v1-178b,0.517,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
35,glm-130b,0.512,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
36,luminous-extended-30b,0.485,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
37,opt-66b,0.448,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
38,bloom-176b,0.446,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
39,j1-grande-v1-17b,0.433,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
40,alpaca-7b,0.381,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
41,falcon-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
42,redpajama-incite-base-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
43,cohere-large-v20220720-13.1b,0.372,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
44,redpajama-incite-instruct-v1-3b,0.366,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
45,text-curie-001,0.36,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
46,gpt-neox-20b,0.351,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
47,luminous-base-13b,0.315,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
48,cohere-medium-v20221108-6.1b,0.312,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
49,redpajama-incite-base-v1-3b,0.311,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
50,tnlg-v2-6.7b,0.309,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
51,j1-large-v1-7.5b,0.285,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
52,gpt-j-6b,0.273,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
53,pythia-12b,0.257,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
54,curie-6.7b,0.247,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
55,falcon-instruct-7b,0.244,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
56,cohere-medium-v20220720-6.1b,0.23,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
57,text-babbage-001,0.229,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
58,t0pp-11b,0.197,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
59,pythia-6.9b,0.196,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
60,ul2-20b,0.167,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
61,t5-11b,0.131,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
62,babbage-1.3b,0.114,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
63,cohere-small-v20220720-410m,0.109,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
64,ada-350m,0.108,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
65,text-ada-001,0.107,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
66,yalm-100b,0.075,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']" | |
67,llama-2-70b,0.582,mmlu,helm_classic_240130,[] | |
68,llama-65b,0.584,mmlu,helm_classic_240130,[] | |
69,text-davinci-002,0.568,mmlu,helm_classic_240130,[] | |
70,mistral-v0.1-7b,0.572,mmlu,helm_classic_240130,[] | |
71,cohere-command-beta-52.4b,0.452,mmlu,helm_classic_240130,[] | |
72,text-davinci-003,0.569,mmlu,helm_classic_240130,[] | |
73,jurassic-2-jumbo-178b,0.48,mmlu,helm_classic_240130,[] | |
74,llama-2-13b,0.507,mmlu,helm_classic_240130,[] | |
75,tnlg-v2-530b,0.469,mmlu,helm_classic_240130,[] | |
76,gpt-3.5-turbo-0613,0.391,mmlu,helm_classic_240130,[] | |
77,llama-30b,0.531,mmlu,helm_classic_240130,[] | |
78,anthropic-lm-v4-s3-52b,0.481,mmlu,helm_classic_240130,[] | |
79,gpt-3.5-turbo-0301,0.59,mmlu,helm_classic_240130,[] | |
80,jurassic-2-grande-17b,0.475,mmlu,helm_classic_240130,[] | |
81,palmyra-x-43b,0.609,mmlu,helm_classic_240130,[] | |
82,falcon-40b,0.509,mmlu,helm_classic_240130,[] | |
83,falcon-instruct-40b,0.497,mmlu,helm_classic_240130,[] | |
84,mpt-instruct-30b,0.444,mmlu,helm_classic_240130,[] | |
85,mpt-30b,0.437,mmlu,helm_classic_240130,[] | |
86,j1-grande-v2-beta-17b,0.445,mmlu,helm_classic_240130,[] | |
87,vicuna-v1.3-13b,0.462,mmlu,helm_classic_240130,[] | |
88,cohere-command-beta-6.1b,0.406,mmlu,helm_classic_240130,[] | |
89,cohere-xlarge-v20221108-52.4b,0.382,mmlu,helm_classic_240130,[] | |
90,luminous-supreme-70b,0.38,mmlu,helm_classic_240130,[] | |
91,vicuna-v1.3-7b,0.434,mmlu,helm_classic_240130,[] | |
92,opt-175b,0.318,mmlu,helm_classic_240130,[] | |
93,llama-2-7b,0.431,mmlu,helm_classic_240130,[] | |
94,llama-13b,0.422,mmlu,helm_classic_240130,[] | |
95,instructpalmyra-30b,0.403,mmlu,helm_classic_240130,[] | |
96,cohere-xlarge-v20220609-52.4b,0.353,mmlu,helm_classic_240130,[] | |
97,jurassic-2-large-7.5b,0.339,mmlu,helm_classic_240130,[] | |
98,davinci-175b,0.422,mmlu,helm_classic_240130,[] | |
99,llama-7b,0.321,mmlu,helm_classic_240130,[] | |
100,redpajama-incite-instruct-7b,0.363,mmlu,helm_classic_240130,[] | |
101,j1-jumbo-v1-178b,0.259,mmlu,helm_classic_240130,[] | |
102,glm-130b,0.344,mmlu,helm_classic_240130,[] | |
103,luminous-extended-30b,0.321,mmlu,helm_classic_240130,[] | |
104,opt-66b,0.276,mmlu,helm_classic_240130,[] | |
105,bloom-176b,0.299,mmlu,helm_classic_240130,[] | |
106,j1-grande-v1-17b,0.27,mmlu,helm_classic_240130,[] | |
107,alpaca-7b,0.385,mmlu,helm_classic_240130,[] | |
108,falcon-7b,0.286,mmlu,helm_classic_240130,[] | |
109,redpajama-incite-base-7b,0.302,mmlu,helm_classic_240130,[] | |
110,cohere-large-v20220720-13.1b,0.324,mmlu,helm_classic_240130,[] | |
111,redpajama-incite-instruct-v1-3b,0.257,mmlu,helm_classic_240130,[] | |
112,text-curie-001,0.237,mmlu,helm_classic_240130,[] | |
113,gpt-neox-20b,0.276,mmlu,helm_classic_240130,[] | |
114,luminous-base-13b,0.27,mmlu,helm_classic_240130,[] | |
115,cohere-medium-v20221108-6.1b,0.254,mmlu,helm_classic_240130,[] | |
116,redpajama-incite-base-v1-3b,0.263,mmlu,helm_classic_240130,[] | |
117,tnlg-v2-6.7b,0.242,mmlu,helm_classic_240130,[] | |
118,j1-large-v1-7.5b,0.241,mmlu,helm_classic_240130,[] | |
119,gpt-j-6b,0.249,mmlu,helm_classic_240130,[] | |
120,pythia-12b,0.274,mmlu,helm_classic_240130,[] | |
121,curie-6.7b,0.243,mmlu,helm_classic_240130,[] | |
122,falcon-instruct-7b,0.275,mmlu,helm_classic_240130,[] | |
123,cohere-medium-v20220720-6.1b,0.279,mmlu,helm_classic_240130,[] | |
124,text-babbage-001,0.229,mmlu,helm_classic_240130,[] | |
125,t0pp-11b,0.407,mmlu,helm_classic_240130,[] | |
126,pythia-6.9b,0.236,mmlu,helm_classic_240130,[] | |
127,ul2-20b,0.291,mmlu,helm_classic_240130,[] | |
128,t5-11b,0.29,mmlu,helm_classic_240130,[] | |
129,babbage-1.3b,0.235,mmlu,helm_classic_240130,[] | |
130,cohere-small-v20220720-410m,0.264,mmlu,helm_classic_240130,[] | |
131,ada-350m,0.243,mmlu,helm_classic_240130,[] | |
132,text-ada-001,0.238,mmlu,helm_classic_240130,[] | |
133,yalm-100b,0.243,mmlu,helm_classic_240130,[] | |
0,gpt-4o-0513,35.7,wildbench-mix,wildbench_240612,[] | |
1,gpt-4-turbo-0409,34.6,wildbench-mix,wildbench_240612,[] | |
2,gpt-4-turbo-0125,29.9,wildbench-mix,wildbench_240612,[] | |
3,gemini-1.5-pro,27.8,wildbench-mix,wildbench_240612,[] | |
4,llama-3-70b-inst,21.0,wildbench-mix,wildbench_240612,[] | |
5,claude-3-opus,20.1,wildbench-mix,wildbench_240612,[] | |
6,gemini-1.5-flash,17.4,wildbench-mix,wildbench_240612,[] | |
7,yi-1.5-34b-chat,16.8,wildbench-mix,wildbench_240612,[] | |
8,llama3-inst-8b-simpo,14.0,wildbench-mix,wildbench_240612,[] | |
9,claude-3-sonnet,7.2,wildbench-mix,wildbench_240612,[] | |
10,qwen1.5-72b-chat,4.4,wildbench-mix,wildbench_240612,[] | |
11,command-r-plus,0.4,wildbench-mix,wildbench_240612,[] | |
12,claude-3-haiku,-8.5,wildbench-mix,wildbench_240612,[] | |
13,mistral-large,-10.5,wildbench-mix,wildbench_240612,[] | |
14,starlinglm-7b-beta,-11.9,wildbench-mix,wildbench_240612,[] | |
15,llama-3-8b-inst,-14.6,wildbench-mix,wildbench_240612,[] | |
16,command-r,-16.0,wildbench-mix,wildbench_240612,[] | |
17,mixtral-8x7b-inst,-18.8,wildbench-mix,wildbench_240612,[] | |
18,dbrx-instruct,-21.6,wildbench-mix,wildbench_240612,[] | |
19,yi-1.5-6b-chat,-24.3,wildbench-mix,wildbench_240612,[] | |
20,mistral-7b-inst-v0.2,-25.0,wildbench-mix,wildbench_240612,[] | |
21,tulu-2-dpo-70b,-25.4,wildbench-mix,wildbench_240612,[] | |
22,llama-2-70b-chat,-26.8,wildbench-mix,wildbench_240612,[] | |
23,qwen1.5-7b-chat,-27.0,wildbench-mix,wildbench_240612,[] | |
24,phi-3-medium-128k,-33.3,wildbench-mix,wildbench_240612,[] | |
25,gpt-3.5-turbo-0125,-33.5,wildbench-mix,wildbench_240612,[] | |
26,llama-2-7b-chat,-48.0,wildbench-mix,wildbench_240612,[] | |
27,gemma-7b-it,-57.0,wildbench-mix,wildbench_240612,[] | |
28,gemma-2b-it,-74.1,wildbench-mix,wildbench_240612,[] | |
13,flan-t5-xxl,0.2244897959183673,mmlu_pro,bluebench_v02,[] | |
30,granite-13b-chat-v2,0.2857142857142857,mmlu_pro,bluebench_v02,[] | |
41,granite-13b-instruct-v2,0.0408163265306122,mmlu_pro,bluebench_v02,[] | |
50,granite-7b-lab,0.2423469387755102,mmlu_pro,bluebench_v02,[] | |
60,llama-2-13b-chat,0.0943877551020408,mmlu_pro,bluebench_v02,[] | |
70,llama-2-70b,0.4081632653061224,mmlu_pro,bluebench_v02,[] | |
81,llama-3-70b-instruct,0.4285714285714285,mmlu_pro,bluebench_v02,[] | |
92,llama-3-8b,0.375,mmlu_pro,bluebench_v02,[] | |
103,llama-3-8b-instruct,0.0994897959183673,mmlu_pro,bluebench_v02,[] | |
112,llama-30b,0.3061224489795918,mmlu_pro,bluebench_v02,[] | |
121,llama-7b,0.1326530612244897,mmlu_pro,bluebench_v02,[] | |
132,mistral-v0.1-7b,0.2857142857142857,mmlu_pro,bluebench_v02,[] | |
143,mixtral-8x7b-instruct-v01,0.375,mmlu_pro,bluebench_v02,[] | |
153,vicuna-13b-v1.5-16k,0.2857142857142857,mmlu_pro,bluebench_v02,[] | |
162,vicuna-33b-v1.3,0.2653061224489796,mmlu_pro,bluebench_v02,[] | |
172,vicuna-v1.3-7b,0.1938775510204081,mmlu_pro,bluebench_v02,[] | |
182,vicuna-7b-v1.5,0.2857142857142857,mmlu_pro,bluebench_v02,[] | |
192,zephyr-7b-beta,0.2959183673469387,mmlu_pro,bluebench_v02,[] | |