benchbench / assets /combined_holistic.csv
Yotam Perlitz
build app
0f8e886
raw
history blame
89.3 kB
,model,score,scenario,source,aggragated_from
0,gpt-4-turbo-2024-04-09,82.6,arena-hard,arena_hard_2404,[]
1,gpt-4-0125-preview,78.0,arena-hard,arena_hard_2404,[]
2,gemini-1.5-pro-api-preview,72.0,arena-hard,arena_hard_2404,[]
3,yi-large,63.7,arena-hard,arena_hard_2404,[]
4,claude-3-opus-20240229,60.4,arena-hard,arena_hard_2404,[]
5,glm-4,55.7,arena-hard,arena_hard_2404,[]
6,gpt-4-0314,50.0,arena-hard,arena_hard_2404,[]
7,gemini-1.5-flash-api-preview,49.6,arena-hard,arena_hard_2404,[]
8,claude-3-sonnet-20240229,46.8,arena-hard,arena_hard_2404,[]
9,claude-3-haiku-20240307,41.5,arena-hard,arena_hard_2404,[]
10,llama-3-70b-chat-hf,41.1,arena-hard,arena_hard_2404,[]
11,gpt-4-0613,37.9,arena-hard,arena_hard_2404,[]
12,mistral-large-2402,37.7,arena-hard,arena_hard_2404,[]
13,mixtral-8x22b-instruct-v0.1,36.4,arena-hard,arena_hard_2404,[]
14,qwen1.5-72b-chat,36.1,arena-hard,arena_hard_2404,[]
15,command-r-plus,33.1,arena-hard,arena_hard_2404,[]
16,mistral-medium,31.9,arena-hard,arena_hard_2404,[]
17,mistral-next,27.4,arena-hard,arena_hard_2404,[]
18,gpt-3.5-turbo-0613,24.8,arena-hard,arena_hard_2404,[]
19,claude-2.0,24.0,arena-hard,arena_hard_2404,[]
20,dbrx-instructruct,23.9,arena-hard,arena_hard_2404,[]
21,mixtral-8x7b-instruct-v0.1,23.4,arena-hard,arena_hard_2404,[]
22,gpt-3.5-turbo-0125,23.3,arena-hard,arena_hard_2404,[]
23,yi-34b-chat,23.1,arena-hard,arena_hard_2404,[]
24,starling-lm-7b-beta,23.0,arena-hard,arena_hard_2404,[]
25,claude-2.1,22.8,arena-hard,arena_hard_2404,[]
26,snorkel-mistral-pairrm-dpo,20.7,arena-hard,arena_hard_2404,[]
27,llama-3-8b-chat-hf,20.6,arena-hard,arena_hard_2404,[]
28,gpt-3.5-turbo-1106,18.9,arena-hard,arena_hard_2404,[]
29,gpt-3.5-turbo-0301,18.1,arena-hard,arena_hard_2404,[]
30,gemini-1.0-pro,17.8,arena-hard,arena_hard_2404,[]
31,snowflake-arctic-instruct,17.6,arena-hard,arena_hard_2404,[]
32,command-r,17.0,arena-hard,arena_hard_2404,[]
33,phi-3-mini-128k-instruct,15.4,arena-hard,arena_hard_2404,[]
34,tulu-2-dpo-70b,15.0,arena-hard,arena_hard_2404,[]
35,starling-lm-7b-alpha,12.8,arena-hard,arena_hard_2404,[]
36,mistral-7b-instruct,12.6,arena-hard,arena_hard_2404,[]
37,gemma-1.1-7b-it,12.1,arena-hard,arena_hard_2404,[]
38,llama-2-70b-chat-hf,11.6,arena-hard,arena_hard_2404,[]
39,vicuna-33b-v1.3,8.6,arena-hard,arena_hard_2404,[]
40,gemma-7b-it,7.5,arena-hard,arena_hard_2404,[]
41,llama-2-7b-chat-hf,4.6,arena-hard,arena_hard_2404,[]
42,gemma-1.1-2b-it,3.4,arena-hard,arena_hard_2404,[]
43,gemma-2b-it,3.0,arena-hard,arena_hard_2404,[]
0,gpt-4o-2024-05-13,64.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
1,claude-3-opus,63.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
2,gpt-4-turbo-2024-04-09,62.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
3,gemini-1.5-pro-api-0409,58.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
4,yi-large-preview,56.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
5,llama-3-70b-instruct,55.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
6,qwen-max-0428,55.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
7,claude-3-sonnet,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
8,reka-core-20240415,52.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
9,mammoth2-8x7b-plus,51.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
10,deepseek-v2,51.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
11,command-r-plus,51.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
12,yi-1.5-34b-chat,51.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
13,mistral-large,50.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
14,qwen1.5-72b-chat,48.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
15,mistral-medium,47.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
16,gemini-1.0-pro,46.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
17,reka-flash-20240226,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
18,mistral-small,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
19,llama-3-8b-instruct,45.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
20,command-r,45.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
21,qwen1.5-32b-chat,43.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
22,gpt-3.5-turbo-0125,43.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
23,claude-3-haiku,42.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
24,yi-34b-chat,42.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
25,mixtral-8x7b-instruct-v0.1,42.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
26,starling-lm-7b-beta,41.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
27,yi-1.5-9b-chat,40.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
28,gemma-1.1-7b-it,39.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
29,vicuna-33b-v1.3,38.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
30,llama-2-70b-chat,38.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
31,map-neo-instruct-v0.1,37.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
32,mistral-7b-instruct-v0.2,36.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
33,qwen1.5-7b-chat,35.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
34,reka-edge-20240208,32.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
35,zephyr-7b-beta,31.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
36,llama-2-7b-chat,30.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
37,yi-6b-chat,30.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
38,qwen1.5-moe-a2.7b-chat,29.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
39,gemma-1.1-2b-it,28.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
40,vicuna-7b-v1.5,27.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
41,olmo-7b-instruct,26.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
42,qwen1.5-4b-chat,24.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
43,jetmoe-8b-chat,24.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
44,mpt-7b-chat,23.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
45,llama-3-70b,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
46,qwen1.5-72b,41.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
47,yi-34b,47.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
48,qwen1.5-32b,41.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
49,mixtral-8x7b,40.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
50,llama-2-70b,41.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
51,qwen1.5-moe-a2.7b,33.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
52,qwen1.5-7b,33.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
53,llama-3-8b,31.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
54,mistral-7b,27.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
55,gemma-7b,32.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
56,yi-6b,30.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
57,qwen1.5-4b,23.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
58,jetmoe-8b,27.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
59,deepseek-7b,21.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
60,phi-2,21.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
61,deepseekmoe-16b,24.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
62,llama-2-7b,22.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
63,gemma-2b,22.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
64,olmo-7b,21.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
65,mpt-7b,17.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
66,gpt-4o-2024-05-13,87.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
67,claude-3-opus,88.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
68,gpt-4-turbo-2024-04-09,88.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
69,gemini-1.5-pro-api-0409,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
70,yi-large-preview,84.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
71,llama-3-70b-instruct,84.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
72,qwen-max-0428,86.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
73,claude-3-sonnet,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
74,reka-core-20240415,83.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
75,mammoth2-8x7b-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
76,deepseek-v2,83.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
77,command-r-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
78,yi-1.5-34b-chat,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
79,mistral-large,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
80,qwen1.5-72b-chat,84.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
81,mistral-medium,81.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
82,gemini-1.0-pro,78.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
83,reka-flash-20240226,79.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
84,mistral-small,81.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
85,llama-3-8b-instruct,75.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
86,command-r,77.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
87,qwen1.5-32b-chat,81.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
88,gpt-3.5-turbo-0125,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
89,claude-3-haiku,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
90,yi-34b-chat,80.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
91,mixtral-8x7b-instruct-v0.1,76.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
92,starling-lm-7b-beta,74.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
93,yi-1.5-9b-chat,74.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
94,gemma-1.1-7b-it,69.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
95,vicuna-33b-v1.3,66.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
96,llama-2-70b-chat,74.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
97,map-neo-instruct-v0.1,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
98,mistral-7b-instruct-v0.2,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
99,qwen1.5-7b-chat,71.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
100,reka-edge-20240208,68.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
101,zephyr-7b-beta,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
102,llama-2-7b-chat,61.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
103,yi-6b-chat,65.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
104,qwen1.5-moe-a2.7b-chat,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
105,gemma-1.1-2b-it,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
106,vicuna-7b-v1.5,60.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
107,olmo-7b-instruct,55.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
108,qwen1.5-4b-chat,57.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
109,jetmoe-8b-chat,51.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
110,mpt-7b-chat,43.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
111,llama-3-70b,82.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
112,qwen1.5-72b,79.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
113,yi-34b,78.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
114,qwen1.5-32b,77.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
115,mixtral-8x7b,74.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
116,llama-2-70b,73.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
117,qwen1.5-moe-a2.7b,70.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
118,qwen1.5-7b,68.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
119,llama-3-8b,65.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
120,mistral-7b,64.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
121,gemma-7b,64.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
122,yi-6b,63.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
123,qwen1.5-4b,58.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
124,jetmoe-8b,57.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
125,deepseek-7b,52.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
126,phi-2,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
127,deepseekmoe-16b,51.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
128,llama-2-7b,43.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
129,gemma-2b,38.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
130,olmo-7b,31.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
131,mpt-7b,30.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
264,gpt-4o-2024-05-13,85.4,mmlu-mixed,mixeval_240601,[]
265,claude-3-opus,83.2,mmlu-mixed,mixeval_240601,[]
266,gpt-4-turbo-2024-04-09,82.8,mmlu-mixed,mixeval_240601,[]
267,gemini-1.5-pro-api-0409,79.2,mmlu-mixed,mixeval_240601,[]
268,yi-large-preview,80.9,mmlu-mixed,mixeval_240601,[]
269,llama-3-70b-instruct,80.5,mmlu-mixed,mixeval_240601,[]
270,qwen-max-0428,80.6,mmlu-mixed,mixeval_240601,[]
271,claude-3-sonnet,74.7,mmlu-mixed,mixeval_240601,[]
272,reka-core-20240415,79.3,mmlu-mixed,mixeval_240601,[]
273,mammoth2-8x7b-plus,74.5,mmlu-mixed,mixeval_240601,[]
274,deepseek-v2,77.3,mmlu-mixed,mixeval_240601,[]
275,command-r-plus,78.9,mmlu-mixed,mixeval_240601,[]
276,yi-1.5-34b-chat,76.4,mmlu-mixed,mixeval_240601,[]
277,mistral-large,80.2,mmlu-mixed,mixeval_240601,[]
278,qwen1.5-72b-chat,80.1,mmlu-mixed,mixeval_240601,[]
279,mistral-medium,76.3,mmlu-mixed,mixeval_240601,[]
280,gemini-1.0-pro,74.9,mmlu-mixed,mixeval_240601,[]
281,reka-flash-20240226,75.4,mmlu-mixed,mixeval_240601,[]
282,mistral-small,75.2,mmlu-mixed,mixeval_240601,[]
283,llama-3-8b-instruct,71.9,mmlu-mixed,mixeval_240601,[]
284,command-r,75.0,mmlu-mixed,mixeval_240601,[]
285,qwen1.5-32b-chat,78.0,mmlu-mixed,mixeval_240601,[]
286,gpt-3.5-turbo-0125,74.5,mmlu-mixed,mixeval_240601,[]
287,claude-3-haiku,76.1,mmlu-mixed,mixeval_240601,[]
288,yi-34b-chat,73.6,mmlu-mixed,mixeval_240601,[]
289,mixtral-8x7b-instruct-v0.1,72.0,mmlu-mixed,mixeval_240601,[]
290,starling-lm-7b-beta,69.0,mmlu-mixed,mixeval_240601,[]
291,yi-1.5-9b-chat,72.6,mmlu-mixed,mixeval_240601,[]
292,gemma-1.1-7b-it,66.9,mmlu-mixed,mixeval_240601,[]
293,vicuna-33b-v1.3,59.2,mmlu-mixed,mixeval_240601,[]
294,llama-2-70b-chat,69.8,mmlu-mixed,mixeval_240601,[]
295,map-neo-instruct-v0.1,66.7,mmlu-mixed,mixeval_240601,[]
296,mistral-7b-instruct-v0.2,67.3,mmlu-mixed,mixeval_240601,[]
297,qwen1.5-7b-chat,68.7,mmlu-mixed,mixeval_240601,[]
298,reka-edge-20240208,63.6,mmlu-mixed,mixeval_240601,[]
299,zephyr-7b-beta,64.9,mmlu-mixed,mixeval_240601,[]
300,llama-2-7b-chat,59.4,mmlu-mixed,mixeval_240601,[]
301,yi-6b-chat,65.4,mmlu-mixed,mixeval_240601,[]
302,qwen1.5-moe-a2.7b-chat,69.5,mmlu-mixed,mixeval_240601,[]
303,gemma-1.1-2b-it,51.5,mmlu-mixed,mixeval_240601,[]
304,vicuna-7b-v1.5,58.7,mmlu-mixed,mixeval_240601,[]
305,olmo-7b-instruct,57.1,mmlu-mixed,mixeval_240601,[]
306,qwen1.5-4b-chat,61.4,mmlu-mixed,mixeval_240601,[]
307,jetmoe-8b-chat,58.5,mmlu-mixed,mixeval_240601,[]
308,mpt-7b-chat,37.8,mmlu-mixed,mixeval_240601,[]
309,llama-3-70b,79.8,mmlu-mixed,mixeval_240601,[]
310,qwen1.5-72b,78.8,mmlu-mixed,mixeval_240601,[]
311,yi-34b,79.3,mmlu-mixed,mixeval_240601,[]
312,qwen1.5-32b,77.2,mmlu-mixed,mixeval_240601,[]
313,mixtral-8x7b,71.6,mmlu-mixed,mixeval_240601,[]
314,llama-2-70b,70.8,mmlu-mixed,mixeval_240601,[]
315,qwen1.5-moe-a2.7b,69.4,mmlu-mixed,mixeval_240601,[]
316,qwen1.5-7b,67.0,mmlu-mixed,mixeval_240601,[]
317,llama-3-8b,69.5,mmlu-mixed,mixeval_240601,[]
318,mistral-7b,68.5,mmlu-mixed,mixeval_240601,[]
319,gemma-7b,67.4,mmlu-mixed,mixeval_240601,[]
320,yi-6b,71.2,mmlu-mixed,mixeval_240601,[]
321,qwen1.5-4b,59.6,mmlu-mixed,mixeval_240601,[]
322,jetmoe-8b,55.3,mmlu-mixed,mixeval_240601,[]
323,deepseek-7b,53.3,mmlu-mixed,mixeval_240601,[]
324,phi-2,62.5,mmlu-mixed,mixeval_240601,[]
325,deepseekmoe-16b,49.9,mmlu-mixed,mixeval_240601,[]
326,llama-2-7b,40.8,mmlu-mixed,mixeval_240601,[]
327,gemma-2b,37.4,mmlu-mixed,mixeval_240601,[]
328,olmo-7b,29.7,mmlu-mixed,mixeval_240601,[]
329,mpt-7b,30.9,mmlu-mixed,mixeval_240601,[]
594,gpt-4o-2024-05-13,57.1,mmlu-hard-mixed,mixeval_240601,[]
595,claude-3-opus,55.0,mmlu-hard-mixed,mixeval_240601,[]
596,gpt-4-turbo-2024-04-09,45.5,mmlu-hard-mixed,mixeval_240601,[]
597,gemini-1.5-pro-api-0409,44.6,mmlu-hard-mixed,mixeval_240601,[]
598,yi-large-preview,48.5,mmlu-hard-mixed,mixeval_240601,[]
599,llama-3-70b-instruct,46.3,mmlu-hard-mixed,mixeval_240601,[]
600,qwen-max-0428,41.6,mmlu-hard-mixed,mixeval_240601,[]
601,claude-3-sonnet,40.7,mmlu-hard-mixed,mixeval_240601,[]
602,reka-core-20240415,46.3,mmlu-hard-mixed,mixeval_240601,[]
603,mammoth2-8x7b-plus,41.1,mmlu-hard-mixed,mixeval_240601,[]
604,deepseek-v2,42.0,mmlu-hard-mixed,mixeval_240601,[]
605,command-r-plus,42.0,mmlu-hard-mixed,mixeval_240601,[]
606,yi-1.5-34b-chat,38.1,mmlu-hard-mixed,mixeval_240601,[]
607,mistral-large,42.4,mmlu-hard-mixed,mixeval_240601,[]
608,qwen1.5-72b-chat,37.7,mmlu-hard-mixed,mixeval_240601,[]
609,mistral-medium,38.5,mmlu-hard-mixed,mixeval_240601,[]
610,gemini-1.0-pro,35.5,mmlu-hard-mixed,mixeval_240601,[]
611,reka-flash-20240226,34.6,mmlu-hard-mixed,mixeval_240601,[]
612,mistral-small,33.8,mmlu-hard-mixed,mixeval_240601,[]
613,llama-3-8b-instruct,40.7,mmlu-hard-mixed,mixeval_240601,[]
614,command-r,39.0,mmlu-hard-mixed,mixeval_240601,[]
615,qwen1.5-32b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
616,gpt-3.5-turbo-0125,35.1,mmlu-hard-mixed,mixeval_240601,[]
617,claude-3-haiku,30.7,mmlu-hard-mixed,mixeval_240601,[]
618,yi-34b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
619,mixtral-8x7b-instruct-v0.1,37.2,mmlu-hard-mixed,mixeval_240601,[]
620,starling-lm-7b-beta,34.2,mmlu-hard-mixed,mixeval_240601,[]
621,yi-1.5-9b-chat,36.8,mmlu-hard-mixed,mixeval_240601,[]
622,gemma-1.1-7b-it,39.0,mmlu-hard-mixed,mixeval_240601,[]
623,vicuna-33b-v1.3,39.4,mmlu-hard-mixed,mixeval_240601,[]
624,llama-2-70b-chat,27.7,mmlu-hard-mixed,mixeval_240601,[]
625,map-neo-instruct-v0.1,32.5,mmlu-hard-mixed,mixeval_240601,[]
626,mistral-7b-instruct-v0.2,29.4,mmlu-hard-mixed,mixeval_240601,[]
627,qwen1.5-7b-chat,29.0,mmlu-hard-mixed,mixeval_240601,[]
628,reka-edge-20240208,26.4,mmlu-hard-mixed,mixeval_240601,[]
629,zephyr-7b-beta,24.2,mmlu-hard-mixed,mixeval_240601,[]
630,llama-2-7b-chat,30.3,mmlu-hard-mixed,mixeval_240601,[]
631,yi-6b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
632,qwen1.5-moe-a2.7b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
633,gemma-1.1-2b-it,30.3,mmlu-hard-mixed,mixeval_240601,[]
634,vicuna-7b-v1.5,23.4,mmlu-hard-mixed,mixeval_240601,[]
635,olmo-7b-instruct,27.3,mmlu-hard-mixed,mixeval_240601,[]
636,qwen1.5-4b-chat,17.3,mmlu-hard-mixed,mixeval_240601,[]
637,jetmoe-8b-chat,25.5,mmlu-hard-mixed,mixeval_240601,[]
638,mpt-7b-chat,24.7,mmlu-hard-mixed,mixeval_240601,[]
639,llama-3-70b,39.8,mmlu-hard-mixed,mixeval_240601,[]
640,qwen1.5-72b,42.4,mmlu-hard-mixed,mixeval_240601,[]
641,yi-34b,42.4,mmlu-hard-mixed,mixeval_240601,[]
642,qwen1.5-32b,37.2,mmlu-hard-mixed,mixeval_240601,[]
643,mixtral-8x7b,34.6,mmlu-hard-mixed,mixeval_240601,[]
644,llama-2-70b,29.0,mmlu-hard-mixed,mixeval_240601,[]
645,qwen1.5-moe-a2.7b,30.7,mmlu-hard-mixed,mixeval_240601,[]
646,qwen1.5-7b,28.6,mmlu-hard-mixed,mixeval_240601,[]
647,llama-3-8b,38.5,mmlu-hard-mixed,mixeval_240601,[]
648,mistral-7b,27.7,mmlu-hard-mixed,mixeval_240601,[]
649,gemma-7b,28.1,mmlu-hard-mixed,mixeval_240601,[]
650,yi-6b,37.2,mmlu-hard-mixed,mixeval_240601,[]
651,qwen1.5-4b,22.9,mmlu-hard-mixed,mixeval_240601,[]
652,jetmoe-8b,27.3,mmlu-hard-mixed,mixeval_240601,[]
653,deepseek-7b,26.4,mmlu-hard-mixed,mixeval_240601,[]
654,phi-2,29.0,mmlu-hard-mixed,mixeval_240601,[]
655,deepseekmoe-16b,30.7,mmlu-hard-mixed,mixeval_240601,[]
656,llama-2-7b,24.7,mmlu-hard-mixed,mixeval_240601,[]
657,gemma-2b,27.3,mmlu-hard-mixed,mixeval_240601,[]
658,olmo-7b,25.1,mmlu-hard-mixed,mixeval_240601,[]
659,mpt-7b,24.2,mmlu-hard-mixed,mixeval_240601,[]
593,gpt-4-0314,0.57,agieval,BLZ_240312,[]
594,gpt-4-0613,0.57,agieval,BLZ_240312,[]
596,claude-1,0.49700000000000005,agieval,BLZ_240312,[]
601,mixtral-8x7b-instruct-v0.1,0.45299999999999996,agieval,BLZ_240312,[]
602,yi-34b-chat,0.508,agieval,BLZ_240312,[]
605,gpt-3.5-turbo-0314,0.43200000000000005,agieval,BLZ_240312,[]
608,vicuna-33b,0.373,agieval,BLZ_240312,[]
609,starling-lm-7b-alpha,0.401,agieval,BLZ_240312,[]
611,llama-2-70b-chat,0.45,agieval,BLZ_240312,[]
613,openhermes-2.5-mistral-7b,0.43,agieval,BLZ_240312,[]
614,openchat-3.5,0.42700000000000005,agieval,BLZ_240312,[]
617,solar-10.7b-instruct-v1.0,0.47600000000000003,agieval,BLZ_240312,[]
618,dolphin-2.2.1-mistral-7b,0.392,agieval,BLZ_240312,[]
620,zephyr-7b-beta,0.406,agieval,BLZ_240312,[]
623,llama-2-13b-chat,0.336,agieval,BLZ_240312,[]
624,vicuna-13b,0.368,agieval,BLZ_240312,[]
626,zephyr-7b-alpha,0.38,agieval,BLZ_240312,[]
627,qwen-14b-chat,0.396,agieval,BLZ_240312,[]
630,llama-2-7b-chat,0.29600000000000004,agieval,BLZ_240312,[]
632,mistral-7b-instruct-v0.1,0.335,agieval,BLZ_240312,[]
634,vicuna-7b,0.314,agieval,BLZ_240312,[]
636,chatglm3-6b,0.414,agieval,BLZ_240312,[]
643,chatglm-6b,0.325,agieval,BLZ_240312,[]
647,llama-13b,0.205,agieval,BLZ_240312,[]
886,gpt-4-1106-preview,0.977,alpacav1,BLZ_240312,[]
888,gpt-4-0314,0.9528,alpacav1,BLZ_240312,[]
889,gpt-4-0613,0.9528,alpacav1,BLZ_240312,[]
890,mistral-medium,0.9682999999999999,alpacav1,BLZ_240312,[]
891,claude-1,0.8839,alpacav1,BLZ_240312,[]
892,claude-2.0,0.9136,alpacav1,BLZ_240312,[]
893,gemini-pro-dev-api,0.7966,alpacav1,BLZ_240312,[]
894,claude-2.1,0.8708,alpacav1,BLZ_240312,[]
895,gpt-3.5-turbo-0613,0.8937,alpacav1,BLZ_240312,[]
896,mixtral-8x7b-instruct-v0.1,0.9478,alpacav1,BLZ_240312,[]
897,yi-34b-chat,0.9408,alpacav1,BLZ_240312,[]
898,gemini-pro,0.7966,alpacav1,BLZ_240312,[]
900,gpt-3.5-turbo-0314,0.8937,alpacav1,BLZ_240312,[]
902,tulu-2-dpo-70b,0.9503,alpacav1,BLZ_240312,[]
903,vicuna-33b,0.8898999999999999,alpacav1,BLZ_240312,[]
904,starling-lm-7b-alpha,0.9198999999999999,alpacav1,BLZ_240312,[]
906,llama-2-70b-chat,0.9266,alpacav1,BLZ_240312,[]
909,openchat-3.5,0.8851,alpacav1,BLZ_240312,[]
911,gpt-3.5-turbo-1106,0.8626,alpacav1,BLZ_240312,[]
914,wizardlm-13b-v1.2,0.8917,alpacav1,BLZ_240312,[]
915,zephyr-7b-beta,0.9059999999999999,alpacav1,BLZ_240312,[]
918,llama-2-13b-chat,0.8109000000000001,alpacav1,BLZ_240312,[]
921,zephyr-7b-alpha,0.8576,alpacav1,BLZ_240312,[]
924,guanaco-33b,0.6596,alpacav1,BLZ_240312,[]
925,llama-2-7b-chat,0.7137,alpacav1,BLZ_240312,[]
934,chatglm2-6b,0.47130000000000005,alpacav1,BLZ_240312,[]
937,openassistant-pythia-12b,0.2596,alpacav1,BLZ_240312,[]
827,gpt-4-1106-preview,0.5,alpacav2,BLZ_240312,[]
829,gpt-4-0314,0.221,alpacav2,BLZ_240312,[]
830,gpt-4-0613,0.158,alpacav2,BLZ_240312,[]
831,mistral-medium,0.21899999999999997,alpacav2,BLZ_240312,[]
832,claude-1,0.17,alpacav2,BLZ_240312,[]
833,claude-2.0,0.172,alpacav2,BLZ_240312,[]
834,gemini-pro-dev-api,0.16899999999999998,alpacav2,BLZ_240312,[]
835,claude-2.1,0.157,alpacav2,BLZ_240312,[]
836,gpt-3.5-turbo-0613,0.141,alpacav2,BLZ_240312,[]
837,mixtral-8x7b-instruct-v0.1,0.183,alpacav2,BLZ_240312,[]
838,yi-34b-chat,0.297,alpacav2,BLZ_240312,[]
839,gemini-pro,0.16899999999999998,alpacav2,BLZ_240312,[]
840,claude-instant-1,0.161,alpacav2,BLZ_240312,[]
841,gpt-3.5-turbo-0314,0.096,alpacav2,BLZ_240312,[]
842,wizardlm-70b-v1.0,0.14400000000000002,alpacav2,BLZ_240312,[]
843,tulu-2-dpo-70b,0.16,alpacav2,BLZ_240312,[]
844,vicuna-33b,0.127,alpacav2,BLZ_240312,[]
845,starling-lm-7b-alpha,0.142,alpacav2,BLZ_240312,[]
846,deepseek-llm-67b-chat,0.121,alpacav2,BLZ_240312,[]
847,llama-2-70b-chat,0.139,alpacav2,BLZ_240312,[]
849,openhermes-2.5-mistral-7b,0.10300000000000001,alpacav2,BLZ_240312,[]
852,gpt-3.5-turbo-1106,0.092,alpacav2,BLZ_240312,[]
854,dolphin-2.2.1-mistral-7b,0.09,alpacav2,BLZ_240312,[]
855,wizardlm-13b-v1.2,0.12,alpacav2,BLZ_240312,[]
856,zephyr-7b-beta,0.11,alpacav2,BLZ_240312,[]
859,llama-2-13b-chat,0.077,alpacav2,BLZ_240312,[]
860,vicuna-13b,0.067,alpacav2,BLZ_240312,[]
862,zephyr-7b-alpha,0.084,alpacav2,BLZ_240312,[]
863,qwen-14b-chat,0.075,alpacav2,BLZ_240312,[]
865,guanaco-33b,0.05,alpacav2,BLZ_240312,[]
866,llama-2-7b-chat,0.0496,alpacav2,BLZ_240312,[]
870,vicuna-7b,0.048,alpacav2,BLZ_240312,[]
875,chatglm2-6b,0.027999999999999997,alpacav2,BLZ_240312,[]
878,openassistant-pythia-12b,0.018000000000000002,alpacav2,BLZ_240312,[]
1299,gpt-4-1106-preview,0.32799999999999996,alpacaeval2-lc,BLZ_240312,[]
1301,gpt-4-0314,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
1302,gpt-4-0613,0.18600000000000003,alpacaeval2-lc,BLZ_240312,[]
1303,mistral-medium,0.196,alpacaeval2-lc,BLZ_240312,[]
1304,claude-1,0.21100000000000002,alpacaeval2-lc,BLZ_240312,[]
1305,claude-2.0,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
1306,gemini-pro-dev-api,0.172,alpacaeval2-lc,BLZ_240312,[]
1307,claude-2.1,0.193,alpacaeval2-lc,BLZ_240312,[]
1308,gpt-3.5-turbo-0613,0.14300000000000002,alpacaeval2-lc,BLZ_240312,[]
1309,mixtral-8x7b-instruct-v0.1,0.168,alpacaeval2-lc,BLZ_240312,[]
1310,yi-34b-chat,0.188,alpacaeval2-lc,BLZ_240312,[]
1312,claude-instant-1,0.195,alpacaeval2-lc,BLZ_240312,[]
1313,gpt-3.5-turbo-0314,0.156,alpacaeval2-lc,BLZ_240312,[]
1314,wizardlm-70b-v1.0,0.125,alpacaeval2-lc,BLZ_240312,[]
1315,tulu-2-dpo-70b,0.151,alpacaeval2-lc,BLZ_240312,[]
1316,vicuna-33b,0.115,alpacaeval2-lc,BLZ_240312,[]
1317,starling-lm-7b-alpha,0.10099999999999999,alpacaeval2-lc,BLZ_240312,[]
1318,deepseek-llm-67b-chat,0.141,alpacaeval2-lc,BLZ_240312,[]
1319,llama-2-70b-chat,0.10400000000000001,alpacaeval2-lc,BLZ_240312,[]
1321,openhermes-2.5-mistral-7b,0.126,alpacaeval2-lc,BLZ_240312,[]
1324,gpt-3.5-turbo-1106,0.155,alpacaeval2-lc,BLZ_240312,[]
1326,dolphin-2.2.1-mistral-7b,0.10800000000000001,alpacaeval2-lc,BLZ_240312,[]
1327,wizardlm-13b-v1.2,0.099,alpacaeval2-lc,BLZ_240312,[]
1328,zephyr-7b-beta,0.102,alpacaeval2-lc,BLZ_240312,[]
1331,llama-2-13b-chat,0.068,alpacaeval2-lc,BLZ_240312,[]
1332,vicuna-13b,0.085,alpacaeval2-lc,BLZ_240312,[]
1334,zephyr-7b-alpha,0.086,alpacaeval2-lc,BLZ_240312,[]
1335,qwen-14b-chat,0.1,alpacaeval2-lc,BLZ_240312,[]
1338,llama-2-7b-chat,0.045,alpacaeval2-lc,BLZ_240312,[]
1342,vicuna-7b,0.06,alpacaeval2-lc,BLZ_240312,[]
0,gpt-4-0125-preview,1.0,arena-elo,BLZ_240312,[]
1,gpt-4-1106-preview,0.9992019154030327,arena-elo,BLZ_240312,[]
2,bard-gemini-pro,0.9768555466879489,arena-elo,BLZ_240312,[]
3,gpt-4-0314,0.9497206703910615,arena-elo,BLZ_240312,[]
4,gpt-4-0613,0.9273743016759777,arena-elo,BLZ_240312,[]
5,mistral-medium,0.9177972865123704,arena-elo,BLZ_240312,[]
6,claude-1,0.9169992019154031,arena-elo,BLZ_240312,[]
7,claude-2.0,0.9034317637669593,arena-elo,BLZ_240312,[]
8,gemini-pro-dev-api,0.8938547486033519,arena-elo,BLZ_240312,[]
9,claude-2.1,0.8930566640063847,arena-elo,BLZ_240312,[]
10,gpt-3.5-turbo-0613,0.8922585794094174,arena-elo,BLZ_240312,[]
11,mixtral-8x7b-instruct-v0.1,0.8922585794094174,arena-elo,BLZ_240312,[]
12,yi-34b-chat,0.8898643256185156,arena-elo,BLZ_240312,[]
13,gemini-pro,0.8890662410215483,arena-elo,BLZ_240312,[]
14,claude-instant-1,0.8850758180367119,arena-elo,BLZ_240312,[]
15,gpt-3.5-turbo-0314,0.8818834796488427,arena-elo,BLZ_240312,[]
16,wizardlm-70b-v1.0,0.8818834796488427,arena-elo,BLZ_240312,[]
17,tulu-2-dpo-70b,0.8810853950518756,arena-elo,BLZ_240312,[]
18,vicuna-33b,0.8723064644852354,arena-elo,BLZ_240312,[]
19,starling-lm-7b-alpha,0.8699122106943336,arena-elo,BLZ_240312,[]
20,deepseek-llm-67b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
21,llama-2-70b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
22,nv-llama2-70b-steerlm-chat,0.8603351955307262,arena-elo,BLZ_240312,[]
23,openhermes-2.5-mistral-7b,0.8603351955307262,arena-elo,BLZ_240312,[]
24,openchat-3.5,0.8587390263367917,arena-elo,BLZ_240312,[]
25,pplx-70b-online,0.8587390263367917,arena-elo,BLZ_240312,[]
26,gpt-3.5-turbo-1106,0.8547486033519553,arena-elo,BLZ_240312,[]
27,solar-10.7b-instruct-v1.0,0.8499600957701516,arena-elo,BLZ_240312,[]
28,dolphin-2.2.1-mistral-7b,0.8499600957701516,arena-elo,BLZ_240312,[]
29,wizardlm-13b-v1.2,0.8443735035913806,arena-elo,BLZ_240312,[]
30,zephyr-7b-beta,0.8387869114126097,arena-elo,BLZ_240312,[]
31,mpt-30b-chat,0.8332003192338387,arena-elo,BLZ_240312,[]
32,codellama-34b-instruct,0.8324022346368715,arena-elo,BLZ_240312,[]
33,llama-2-13b-chat,0.8316041500399042,arena-elo,BLZ_240312,[]
34,vicuna-13b,0.8300079808459697,arena-elo,BLZ_240312,[]
35,pplx-7b-online,0.8284118116520351,arena-elo,BLZ_240312,[]
36,zephyr-7b-alpha,0.8276137270550679,arena-elo,BLZ_240312,[]
37,qwen-14b-chat,0.825219473264166,arena-elo,BLZ_240312,[]
38,falcon-180b-chat,0.8236233040702314,arena-elo,BLZ_240312,[]
39,guanaco-33b,0.8236233040702314,arena-elo,BLZ_240312,[]
40,llama-2-7b-chat,0.8172386272944933,arena-elo,BLZ_240312,[]
41,stripedhyena-nous-7b,0.8140462889066241,arena-elo,BLZ_240312,[]
42,mistral-7b-instruct-v0.1,0.8028731045490822,arena-elo,BLZ_240312,[]
43,palm-chat-bison-001,0.8028731045490822,arena-elo,BLZ_240312,[]
44,vicuna-7b,0.8020750199521149,arena-elo,BLZ_240312,[]
45,koala-13b,0.770949720670391,arena-elo,BLZ_240312,[]
46,chatglm3-6b,0.7661612130885874,arena-elo,BLZ_240312,[]
47,gpt4all-13b-snoozy,0.74780526735834,arena-elo,BLZ_240312,[]
48,mpt-7b-chat,0.7430167597765364,arena-elo,BLZ_240312,[]
49,chatglm2-6b,0.7422186751795691,arena-elo,BLZ_240312,[]
50,rwkv-4-raven-14b,0.7382282521947326,arena-elo,BLZ_240312,[]
51,alpaca-13b,0.7214684756584198,arena-elo,BLZ_240312,[]
52,openassistant-pythia-12b,0.7158818834796489,arena-elo,BLZ_240312,[]
53,chatglm-6b,0.704708699122107,arena-elo,BLZ_240312,[]
54,fastchat-t5-3b,0.6975259377494014,arena-elo,BLZ_240312,[]
55,stablelm-tuned-alpha-7b,0.6743814844373504,arena-elo,BLZ_240312,[]
56,dolly-v2-12b,0.6568236233040702,arena-elo,BLZ_240312,[]
57,llama-13b,0.6384676775738228,arena-elo,BLZ_240312,[]
542,mixtral-8x7b-instruct-v0.1,0.7641,gpt4all,BLZ_240312,[]
543,yi-34b-chat,0.7212999999999999,gpt4all,BLZ_240312,[]
550,starling-lm-7b-alpha,0.7272,gpt4all,BLZ_240312,[]
554,openhermes-2.5-mistral-7b,0.7312000000000001,gpt4all,BLZ_240312,[]
555,openchat-3.5,0.7292000000000001,gpt4all,BLZ_240312,[]
558,solar-10.7b-instruct-v1.0,0.7511,gpt4all,BLZ_240312,[]
559,dolphin-2.2.1-mistral-7b,0.7223999999999999,gpt4all,BLZ_240312,[]
561,zephyr-7b-beta,0.7182999999999999,gpt4all,BLZ_240312,[]
565,vicuna-13b,0.631,gpt4all,BLZ_240312,[]
567,zephyr-7b-alpha,0.7223999999999999,gpt4all,BLZ_240312,[]
573,mistral-7b-instruct-v0.1,0.6795,gpt4all,BLZ_240312,[]
575,vicuna-7b,0.61,gpt4all,BLZ_240312,[]
576,koala-13b,0.62,gpt4all,BLZ_240312,[]
578,gpt4all-13b-snoozy,0.653,gpt4all,BLZ_240312,[]
579,mpt-7b-chat,0.648,gpt4all,BLZ_240312,[]
583,openassistant-pythia-12b,0.61,gpt4all,BLZ_240312,[]
585,fastchat-t5-3b,0.537,gpt4all,BLZ_240312,[]
586,stablelm-tuned-alpha-7b,0.513,gpt4all,BLZ_240312,[]
588,llama-13b,0.63,gpt4all,BLZ_240312,[]
129,mixtral-8x7b-instruct-v0.1,0.7262000000000001,hugging-6,BLZ_240312,[]
130,yi-34b-chat,0.6531999999999999,hugging-6,BLZ_240312,[]
134,wizardlm-70b-v1.0,0.6125,hugging-6,BLZ_240312,[]
135,tulu-2-dpo-70b,0.7376999999999999,hugging-6,BLZ_240312,[]
136,vicuna-33b,0.585,hugging-6,BLZ_240312,[]
137,starling-lm-7b-alpha,0.6713,hugging-6,BLZ_240312,[]
139,llama-2-70b-chat,0.624,hugging-6,BLZ_240312,[]
141,openhermes-2.5-mistral-7b,0.6152000000000001,hugging-6,BLZ_240312,[]
142,openchat-3.5,0.6124,hugging-6,BLZ_240312,[]
145,solar-10.7b-instruct-v1.0,0.742,hugging-6,BLZ_240312,[]
146,dolphin-2.2.1-mistral-7b,0.6493000000000001,hugging-6,BLZ_240312,[]
147,wizardlm-13b-v1.2,0.5476,hugging-6,BLZ_240312,[]
148,zephyr-7b-beta,0.6195,hugging-6,BLZ_240312,[]
149,mpt-30b-chat,0.5538000000000001,hugging-6,BLZ_240312,[]
150,codellama-34b-instruct,0.5729,hugging-6,BLZ_240312,[]
151,llama-2-13b-chat,0.5490999999999999,hugging-6,BLZ_240312,[]
152,vicuna-13b,0.5539999999999999,hugging-6,BLZ_240312,[]
154,zephyr-7b-alpha,0.595,hugging-6,BLZ_240312,[]
156,falcon-180b-chat,0.6785,hugging-6,BLZ_240312,[]
158,llama-2-7b-chat,0.5074000000000001,hugging-6,BLZ_240312,[]
160,mistral-7b-instruct-v0.1,0.5496,hugging-6,BLZ_240312,[]
162,vicuna-7b,0.521,hugging-6,BLZ_240312,[]
176,yi-34bx2-moe-60b,0.7672,hugging-6,BLZ_240312,[]
947,gpt-4-0314,0.93,llmonitor,BLZ_240312,[]
948,gpt-4-0613,0.89,llmonitor,BLZ_240312,[]
950,claude-1,0.66,llmonitor,BLZ_240312,[]
951,claude-2.0,0.68,llmonitor,BLZ_240312,[]
954,gpt-3.5-turbo-0613,0.81,llmonitor,BLZ_240312,[]
958,claude-instant-1,0.6,llmonitor,BLZ_240312,[]
959,gpt-3.5-turbo-0314,0.79,llmonitor,BLZ_240312,[]
965,llama-2-70b-chat,0.6,llmonitor,BLZ_240312,[]
975,mpt-30b-chat,0.4,llmonitor,BLZ_240312,[]
976,codellama-34b-instruct,0.34,llmonitor,BLZ_240312,[]
977,llama-2-13b-chat,0.5,llmonitor,BLZ_240312,[]
978,vicuna-13b,0.5,llmonitor,BLZ_240312,[]
982,falcon-180b-chat,0.67,llmonitor,BLZ_240312,[]
983,guanaco-33b,0.43,llmonitor,BLZ_240312,[]
984,llama-2-7b-chat,0.5,llmonitor,BLZ_240312,[]
986,mistral-7b-instruct-v0.1,0.57,llmonitor,BLZ_240312,[]
987,palm-chat-bison-001,0.57,llmonitor,BLZ_240312,[]
988,vicuna-7b,0.41,llmonitor,BLZ_240312,[]
989,koala-13b,0.31,llmonitor,BLZ_240312,[]
992,mpt-7b-chat,0.43,llmonitor,BLZ_240312,[]
1000,dolly-v2-12b,0.23,llmonitor,BLZ_240312,[]
59,gpt-4-0125-preview,0.0929,mt-bench,BLZ_240312,[]
60,gpt-4-1106-preview,0.0932,mt-bench,BLZ_240312,[]
62,gpt-4-0314,0.08960000000000001,mt-bench,BLZ_240312,[]
63,gpt-4-0613,0.09179999999999999,mt-bench,BLZ_240312,[]
64,mistral-medium,0.0861,mt-bench,BLZ_240312,[]
65,claude-1,0.079,mt-bench,BLZ_240312,[]
66,claude-2.0,0.0806,mt-bench,BLZ_240312,[]
67,gemini-pro-dev-api,0.08039999999999999,mt-bench,BLZ_240312,[]
68,claude-2.1,0.0818,mt-bench,BLZ_240312,[]
69,gpt-3.5-turbo-0613,0.0839,mt-bench,BLZ_240312,[]
70,mixtral-8x7b-instruct-v0.1,0.083,mt-bench,BLZ_240312,[]
71,yi-34b-chat,0.07769999999999999,mt-bench,BLZ_240312,[]
72,gemini-pro,0.08039999999999999,mt-bench,BLZ_240312,[]
73,claude-instant-1,0.0785,mt-bench,BLZ_240312,[]
74,gpt-3.5-turbo-0314,0.0794,mt-bench,BLZ_240312,[]
75,wizardlm-70b-v1.0,0.0771,mt-bench,BLZ_240312,[]
76,tulu-2-dpo-70b,0.0789,mt-bench,BLZ_240312,[]
77,vicuna-33b,0.0712,mt-bench,BLZ_240312,[]
78,starling-lm-7b-alpha,0.0809,mt-bench,BLZ_240312,[]
79,deepseek-llm-67b-chat,0.08529999999999999,mt-bench,BLZ_240312,[]
80,llama-2-70b-chat,0.06860000000000001,mt-bench,BLZ_240312,[]
81,nv-llama2-70b-steerlm-chat,0.0754,mt-bench,BLZ_240312,[]
82,openhermes-2.5-mistral-7b,0.07690000000000001,mt-bench,BLZ_240312,[]
83,openchat-3.5,0.0781,mt-bench,BLZ_240312,[]
84,pplx-70b-online,0.0588,mt-bench,BLZ_240312,[]
85,gpt-3.5-turbo-1106,0.0832,mt-bench,BLZ_240312,[]
86,solar-10.7b-instruct-v1.0,0.0758,mt-bench,BLZ_240312,[]
88,wizardlm-13b-v1.2,0.07200000000000001,mt-bench,BLZ_240312,[]
89,zephyr-7b-beta,0.07339999999999999,mt-bench,BLZ_240312,[]
90,mpt-30b-chat,0.0639,mt-bench,BLZ_240312,[]
92,llama-2-13b-chat,0.0665,mt-bench,BLZ_240312,[]
93,vicuna-13b,0.06570000000000001,mt-bench,BLZ_240312,[]
95,zephyr-7b-alpha,0.0688,mt-bench,BLZ_240312,[]
96,qwen-14b-chat,0.0696,mt-bench,BLZ_240312,[]
98,guanaco-33b,0.0653,mt-bench,BLZ_240312,[]
99,llama-2-7b-chat,0.06269999999999999,mt-bench,BLZ_240312,[]
101,mistral-7b-instruct-v0.1,0.0684,mt-bench,BLZ_240312,[]
102,palm-chat-bison-001,0.064,mt-bench,BLZ_240312,[]
103,vicuna-7b,0.0617,mt-bench,BLZ_240312,[]
104,koala-13b,0.0535,mt-bench,BLZ_240312,[]
106,gpt4all-13b-snoozy,0.0541,mt-bench,BLZ_240312,[]
107,mpt-7b-chat,0.0542,mt-bench,BLZ_240312,[]
108,chatglm2-6b,0.0496,mt-bench,BLZ_240312,[]
109,rwkv-4-raven-14b,0.0398,mt-bench,BLZ_240312,[]
110,alpaca-13b,0.0453,mt-bench,BLZ_240312,[]
111,openassistant-pythia-12b,0.0432,mt-bench,BLZ_240312,[]
112,chatglm-6b,0.045,mt-bench,BLZ_240312,[]
113,fastchat-t5-3b,0.0304,mt-bench,BLZ_240312,[]
114,stablelm-tuned-alpha-7b,0.0275,mt-bench,BLZ_240312,[]
115,dolly-v2-12b,0.032799999999999996,mt-bench,BLZ_240312,[]
116,llama-13b,0.026099999999999998,mt-bench,BLZ_240312,[]
0,gpt-4-0613,0.957,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
1,llama-3-70b,0.902,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
2,mixtral-8x22b,0.855,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
3,palmyra-x-v3-72b,0.826,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
4,gpt-4-turbo-1106-preview,0.821,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
5,palm-2-unicorn,0.781,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
6,claude-3-opus-20240229,0.762,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
7,qwen1.5-72b,0.757,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
8,palmyra-x-v2-33b,0.736,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
9,yi-34b,0.723,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
10,qwen1.5-32b,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
11,claude-v1.3,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
12,mixtral-8x7b-32k-seqlen,0.679,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
13,palm-2-bison,0.655,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
14,claude-2.0,0.651,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
15,deepseek-llm-67b-chat,0.645,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
16,llama-2-70b,0.609,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
17,claude-2.1,0.594,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
18,gpt-3.5-text-davinci-003,0.577,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
19,qwen1.5-14b,0.574,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
20,claude-instant-1.2,0.551,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
21,llama-3-8b,0.519,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
22,gpt-3.5-turbo-0613,0.502,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
23,gemma-7b,0.47,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
24,claude-3-sonnet-20240229,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
25,gpt-3.5-text-davinci-002,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
26,llama-65b,0.466,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
27,mistral-large-2402,0.46,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
28,cohere-command,0.421,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
29,dbrx-instructruct,0.419,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
30,mistral-v0.1-7b,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
31,mistral-small-2402,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
32,mistral-medium-2312,0.383,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
33,qwen1.5-7b,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
34,claude-3-haiku-20240307,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
35,yi-6b,0.351,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
36,llama-2-13b,0.332,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
37,jurassic-2-jumbo-178b,0.317,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
38,falcon-40b,0.306,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
39,phi-2,0.26,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
40,jurassic-2-grande-17b,0.253,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
41,llama-2-7b,0.234,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
42,luminous-supreme-70b,0.213,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
43,cohere-command-light,0.166,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
44,luminous-extended-30b,0.119,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
45,falcon-7b,0.1,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
46,olmo-7b,0.083,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
47,luminous-base-13b,0.072,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
0,llama-2-70b,0.944,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
1,llama-65b,0.908,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
2,text-davinci-002,0.905,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
3,mistral-v0.1-7b,0.884,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
4,cohere-command-beta-52.4b,0.874,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
5,text-davinci-003,0.872,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
6,jurassic-2-jumbo-178b,0.824,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
7,llama-2-13b,0.823,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
8,tnlg-v2-530b,0.787,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
9,gpt-3.5-turbo-0613,0.783,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
10,llama-30b,0.781,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
11,anthropic-lm-v4-s3-52b,0.78,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
12,gpt-3.5-turbo-0301,0.76,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
13,jurassic-2-grande-17b,0.743,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
14,palmyra-x-43b,0.732,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
15,falcon-40b,0.729,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
16,falcon-instruct-40b,0.727,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
17,mpt-instruct-30b,0.716,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
18,mpt-30b,0.714,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
19,j1-grande-v2-beta-17b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
20,vicuna-v1.3-13b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
21,cohere-command-beta-6.1b,0.675,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
22,cohere-xlarge-v20221108-52.4b,0.664,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
23,luminous-supreme-70b,0.662,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
24,vicuna-v1.3-7b,0.625,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
25,opt-175b,0.609,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
26,llama-2-7b,0.607,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
27,llama-13b,0.595,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
28,instructpalmyra-30b,0.568,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
29,cohere-xlarge-v20220609-52.4b,0.56,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
30,jurassic-2-large-7.5b,0.553,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
31,davinci-175b,0.538,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
32,llama-7b,0.533,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
33,redpajama-incite-instruct-7b,0.524,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
34,j1-jumbo-v1-178b,0.517,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
35,glm-130b,0.512,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
36,luminous-extended-30b,0.485,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
37,opt-66b,0.448,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
38,bloom-176b,0.446,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
39,j1-grande-v1-17b,0.433,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
40,alpaca-7b,0.381,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
41,falcon-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
42,redpajama-incite-base-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
43,cohere-large-v20220720-13.1b,0.372,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
44,redpajama-incite-instruct-v1-3b,0.366,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
45,text-curie-001,0.36,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
46,gpt-neox-20b,0.351,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
47,luminous-base-13b,0.315,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
48,cohere-medium-v20221108-6.1b,0.312,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
49,redpajama-incite-base-v1-3b,0.311,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
50,tnlg-v2-6.7b,0.309,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
51,j1-large-v1-7.5b,0.285,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
52,gpt-j-6b,0.273,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
53,pythia-12b,0.257,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
54,curie-6.7b,0.247,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
55,falcon-instruct-7b,0.244,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
56,cohere-medium-v20220720-6.1b,0.23,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
57,text-babbage-001,0.229,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
58,t0pp-11b,0.197,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
59,pythia-6.9b,0.196,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
60,ul2-20b,0.167,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
61,t5-11b,0.131,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
62,babbage-1.3b,0.114,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
63,cohere-small-v20220720-410m,0.109,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
64,ada-350m,0.108,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
65,text-ada-001,0.107,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
66,yalm-100b,0.075,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
67,llama-2-70b,0.582,mmlu,helm_classic_240130,[]
68,llama-65b,0.584,mmlu,helm_classic_240130,[]
69,text-davinci-002,0.568,mmlu,helm_classic_240130,[]
70,mistral-v0.1-7b,0.572,mmlu,helm_classic_240130,[]
71,cohere-command-beta-52.4b,0.452,mmlu,helm_classic_240130,[]
72,text-davinci-003,0.569,mmlu,helm_classic_240130,[]
73,jurassic-2-jumbo-178b,0.48,mmlu,helm_classic_240130,[]
74,llama-2-13b,0.507,mmlu,helm_classic_240130,[]
75,tnlg-v2-530b,0.469,mmlu,helm_classic_240130,[]
76,gpt-3.5-turbo-0613,0.391,mmlu,helm_classic_240130,[]
77,llama-30b,0.531,mmlu,helm_classic_240130,[]
78,anthropic-lm-v4-s3-52b,0.481,mmlu,helm_classic_240130,[]
79,gpt-3.5-turbo-0301,0.59,mmlu,helm_classic_240130,[]
80,jurassic-2-grande-17b,0.475,mmlu,helm_classic_240130,[]
81,palmyra-x-43b,0.609,mmlu,helm_classic_240130,[]
82,falcon-40b,0.509,mmlu,helm_classic_240130,[]
83,falcon-instruct-40b,0.497,mmlu,helm_classic_240130,[]
84,mpt-instruct-30b,0.444,mmlu,helm_classic_240130,[]
85,mpt-30b,0.437,mmlu,helm_classic_240130,[]
86,j1-grande-v2-beta-17b,0.445,mmlu,helm_classic_240130,[]
87,vicuna-v1.3-13b,0.462,mmlu,helm_classic_240130,[]
88,cohere-command-beta-6.1b,0.406,mmlu,helm_classic_240130,[]
89,cohere-xlarge-v20221108-52.4b,0.382,mmlu,helm_classic_240130,[]
90,luminous-supreme-70b,0.38,mmlu,helm_classic_240130,[]
91,vicuna-v1.3-7b,0.434,mmlu,helm_classic_240130,[]
92,opt-175b,0.318,mmlu,helm_classic_240130,[]
93,llama-2-7b,0.431,mmlu,helm_classic_240130,[]
94,llama-13b,0.422,mmlu,helm_classic_240130,[]
95,instructpalmyra-30b,0.403,mmlu,helm_classic_240130,[]
96,cohere-xlarge-v20220609-52.4b,0.353,mmlu,helm_classic_240130,[]
97,jurassic-2-large-7.5b,0.339,mmlu,helm_classic_240130,[]
98,davinci-175b,0.422,mmlu,helm_classic_240130,[]
99,llama-7b,0.321,mmlu,helm_classic_240130,[]
100,redpajama-incite-instruct-7b,0.363,mmlu,helm_classic_240130,[]
101,j1-jumbo-v1-178b,0.259,mmlu,helm_classic_240130,[]
102,glm-130b,0.344,mmlu,helm_classic_240130,[]
103,luminous-extended-30b,0.321,mmlu,helm_classic_240130,[]
104,opt-66b,0.276,mmlu,helm_classic_240130,[]
105,bloom-176b,0.299,mmlu,helm_classic_240130,[]
106,j1-grande-v1-17b,0.27,mmlu,helm_classic_240130,[]
107,alpaca-7b,0.385,mmlu,helm_classic_240130,[]
108,falcon-7b,0.286,mmlu,helm_classic_240130,[]
109,redpajama-incite-base-7b,0.302,mmlu,helm_classic_240130,[]
110,cohere-large-v20220720-13.1b,0.324,mmlu,helm_classic_240130,[]
111,redpajama-incite-instruct-v1-3b,0.257,mmlu,helm_classic_240130,[]
112,text-curie-001,0.237,mmlu,helm_classic_240130,[]
113,gpt-neox-20b,0.276,mmlu,helm_classic_240130,[]
114,luminous-base-13b,0.27,mmlu,helm_classic_240130,[]
115,cohere-medium-v20221108-6.1b,0.254,mmlu,helm_classic_240130,[]
116,redpajama-incite-base-v1-3b,0.263,mmlu,helm_classic_240130,[]
117,tnlg-v2-6.7b,0.242,mmlu,helm_classic_240130,[]
118,j1-large-v1-7.5b,0.241,mmlu,helm_classic_240130,[]
119,gpt-j-6b,0.249,mmlu,helm_classic_240130,[]
120,pythia-12b,0.274,mmlu,helm_classic_240130,[]
121,curie-6.7b,0.243,mmlu,helm_classic_240130,[]
122,falcon-instruct-7b,0.275,mmlu,helm_classic_240130,[]
123,cohere-medium-v20220720-6.1b,0.279,mmlu,helm_classic_240130,[]
124,text-babbage-001,0.229,mmlu,helm_classic_240130,[]
125,t0pp-11b,0.407,mmlu,helm_classic_240130,[]
126,pythia-6.9b,0.236,mmlu,helm_classic_240130,[]
127,ul2-20b,0.291,mmlu,helm_classic_240130,[]
128,t5-11b,0.29,mmlu,helm_classic_240130,[]
129,babbage-1.3b,0.235,mmlu,helm_classic_240130,[]
130,cohere-small-v20220720-410m,0.264,mmlu,helm_classic_240130,[]
131,ada-350m,0.243,mmlu,helm_classic_240130,[]
132,text-ada-001,0.238,mmlu,helm_classic_240130,[]
133,yalm-100b,0.243,mmlu,helm_classic_240130,[]
0,gpt-4o-0513,35.7,wildbench-mix,wildbench_240612,[]
1,gpt-4-turbo-0409,34.6,wildbench-mix,wildbench_240612,[]
2,gpt-4-turbo-0125,29.9,wildbench-mix,wildbench_240612,[]
3,gemini-1.5-pro,27.8,wildbench-mix,wildbench_240612,[]
4,llama-3-70b-inst,21.0,wildbench-mix,wildbench_240612,[]
5,claude-3-opus,20.1,wildbench-mix,wildbench_240612,[]
6,gemini-1.5-flash,17.4,wildbench-mix,wildbench_240612,[]
7,yi-1.5-34b-chat,16.8,wildbench-mix,wildbench_240612,[]
8,llama3-inst-8b-simpo,14.0,wildbench-mix,wildbench_240612,[]
9,claude-3-sonnet,7.2,wildbench-mix,wildbench_240612,[]
10,qwen1.5-72b-chat,4.4,wildbench-mix,wildbench_240612,[]
11,command-r-plus,0.4,wildbench-mix,wildbench_240612,[]
12,claude-3-haiku,-8.5,wildbench-mix,wildbench_240612,[]
13,mistral-large,-10.5,wildbench-mix,wildbench_240612,[]
14,starlinglm-7b-beta,-11.9,wildbench-mix,wildbench_240612,[]
15,llama-3-8b-inst,-14.6,wildbench-mix,wildbench_240612,[]
16,command-r,-16.0,wildbench-mix,wildbench_240612,[]
17,mixtral-8x7b-inst,-18.8,wildbench-mix,wildbench_240612,[]
18,dbrx-instruct,-21.6,wildbench-mix,wildbench_240612,[]
19,yi-1.5-6b-chat,-24.3,wildbench-mix,wildbench_240612,[]
20,mistral-7b-inst-v0.2,-25.0,wildbench-mix,wildbench_240612,[]
21,tulu-2-dpo-70b,-25.4,wildbench-mix,wildbench_240612,[]
22,llama-2-70b-chat,-26.8,wildbench-mix,wildbench_240612,[]
23,qwen1.5-7b-chat,-27.0,wildbench-mix,wildbench_240612,[]
24,phi-3-medium-128k,-33.3,wildbench-mix,wildbench_240612,[]
25,gpt-3.5-turbo-0125,-33.5,wildbench-mix,wildbench_240612,[]
26,llama-2-7b-chat,-48.0,wildbench-mix,wildbench_240612,[]
27,gemma-7b-it,-57.0,wildbench-mix,wildbench_240612,[]
28,gemma-2b-it,-74.1,wildbench-mix,wildbench_240612,[]
13,flan-t5-xxl,0.2244897959183673,mmlu_pro,bluebench_v02,[]
30,granite-13b-chat-v2,0.2857142857142857,mmlu_pro,bluebench_v02,[]
41,granite-13b-instruct-v2,0.0408163265306122,mmlu_pro,bluebench_v02,[]
50,granite-7b-lab,0.2423469387755102,mmlu_pro,bluebench_v02,[]
60,llama-2-13b-chat,0.0943877551020408,mmlu_pro,bluebench_v02,[]
70,llama-2-70b,0.4081632653061224,mmlu_pro,bluebench_v02,[]
81,llama-3-70b-instruct,0.4285714285714285,mmlu_pro,bluebench_v02,[]
92,llama-3-8b,0.375,mmlu_pro,bluebench_v02,[]
103,llama-3-8b-instruct,0.0994897959183673,mmlu_pro,bluebench_v02,[]
112,llama-30b,0.3061224489795918,mmlu_pro,bluebench_v02,[]
121,llama-7b,0.1326530612244897,mmlu_pro,bluebench_v02,[]
132,mistral-v0.1-7b,0.2857142857142857,mmlu_pro,bluebench_v02,[]
143,mixtral-8x7b-instruct-v01,0.375,mmlu_pro,bluebench_v02,[]
153,vicuna-13b-v1.5-16k,0.2857142857142857,mmlu_pro,bluebench_v02,[]
162,vicuna-33b-v1.3,0.2653061224489796,mmlu_pro,bluebench_v02,[]
172,vicuna-v1.3-7b,0.1938775510204081,mmlu_pro,bluebench_v02,[]
182,vicuna-7b-v1.5,0.2857142857142857,mmlu_pro,bluebench_v02,[]
192,zephyr-7b-beta,0.2959183673469387,mmlu_pro,bluebench_v02,[]