diff --git "a/assets/combined_20240704.csv" "b/assets/combined_20240704.csv" new file mode 100644--- /dev/null +++ "b/assets/combined_20240704.csv" @@ -0,0 +1,4749 @@ +,model,score,scenario,source,aggragated_from,tag +0,gpt_4_turbo_2024_04_09,82.6,arena_hard,arena_hard_2404,[],holistic +1,gpt_4_0125_preview,78.0,arena_hard,arena_hard_2404,[],holistic +2,gemini_1.5_pro_api_preview,72.0,arena_hard,arena_hard_2404,[],holistic +3,yi_large,63.7,arena_hard,arena_hard_2404,[],holistic +4,claude_3_opus_20240229,60.4,arena_hard,arena_hard_2404,[],holistic +5,glm_4,55.7,arena_hard,arena_hard_2404,[],holistic +6,gpt_4_0314,50.0,arena_hard,arena_hard_2404,[],holistic +7,gemini_1.5_flash_api_preview,49.6,arena_hard,arena_hard_2404,[],holistic +8,claude_3_sonnet_20240229,46.8,arena_hard,arena_hard_2404,[],holistic +9,claude_3_haiku_20240307,41.5,arena_hard,arena_hard_2404,[],holistic +10,llama_3_70b_chat,41.1,arena_hard,arena_hard_2404,[],holistic +11,gpt_4_0613,37.9,arena_hard,arena_hard_2404,[],holistic +12,mistral_large_2402,37.7,arena_hard,arena_hard_2404,[],holistic +13,mixtral_8x22b_instruct_v0.1,36.4,arena_hard,arena_hard_2404,[],holistic +14,qwen1.5_72b_chat,36.1,arena_hard,arena_hard_2404,[],holistic +15,command_r_plus,33.1,arena_hard,arena_hard_2404,[],holistic +16,mistral_medium,31.9,arena_hard,arena_hard_2404,[],holistic +17,mistral_next,27.4,arena_hard,arena_hard_2404,[],holistic +18,gpt_3.5_turbo_0613,24.8,arena_hard,arena_hard_2404,[],holistic +19,claude_2.0,24.0,arena_hard,arena_hard_2404,[],holistic +20,dbrx_instructruct,23.9,arena_hard,arena_hard_2404,[],holistic +21,mixtral_8x7b_instruct_v0.1,23.4,arena_hard,arena_hard_2404,[],holistic +22,gpt_3.5_turbo_0125,23.3,arena_hard,arena_hard_2404,[],holistic +23,yi_34b_chat,23.1,arena_hard,arena_hard_2404,[],holistic +24,starling_lm_7b_beta,23.0,arena_hard,arena_hard_2404,[],holistic +25,claude_2.1,22.8,arena_hard,arena_hard_2404,[],holistic +26,snorkel_mistral_pairrm_dpo,20.7,arena_hard,arena_hard_2404,[],holistic +27,llama_3_8b_chat,20.6,arena_hard,arena_hard_2404,[],holistic +28,gpt_3.5_turbo_1106,18.9,arena_hard,arena_hard_2404,[],holistic +29,gpt_3.5_turbo_0301,18.1,arena_hard,arena_hard_2404,[],holistic +30,gemini_1.0_pro,17.8,arena_hard,arena_hard_2404,[],holistic +31,snowflake_arctic_instruct,17.6,arena_hard,arena_hard_2404,[],holistic +32,command_r,17.0,arena_hard,arena_hard_2404,[],holistic +33,phi_3_mini_128k_instruct,15.4,arena_hard,arena_hard_2404,[],holistic +34,tulu_2_dpo_70b,15.0,arena_hard,arena_hard_2404,[],holistic +35,starling_lm_7b_alpha,12.8,arena_hard,arena_hard_2404,[],holistic +36,mistral_7b_instruct,12.6,arena_hard,arena_hard_2404,[],holistic +37,gemma_1.1_7b_it,12.1,arena_hard,arena_hard_2404,[],holistic +38,llama_2_70b_chat,11.6,arena_hard,arena_hard_2404,[],holistic +39,vicuna_33b_v1.3,8.6,arena_hard,arena_hard_2404,[],holistic +40,gemma_7b_it,7.5,arena_hard,arena_hard_2404,[],holistic +41,llama_2_7b_chat,4.6,arena_hard,arena_hard_2404,[],holistic +42,gemma_1.1_2b_it,3.4,arena_hard,arena_hard_2404,[],holistic +43,gemma_2b_it,3.0,arena_hard,arena_hard_2404,[],holistic +0,gpt_4o_2024_05_13,64.7,mixeval_hard-mixed,mixeval_240601,[],holistic +1,claude_3_opus,63.5,mixeval_hard-mixed,mixeval_240601,[],holistic +2,gpt_4_turbo_2024_04_09,62.6,mixeval_hard-mixed,mixeval_240601,[],holistic +3,gemini_1.5_pro_api_0409,58.7,mixeval_hard-mixed,mixeval_240601,[],holistic +4,yi_large_preview,56.8,mixeval_hard-mixed,mixeval_240601,[],holistic +5,llama_3_70b_instruct,55.9,mixeval_hard-mixed,mixeval_240601,[],holistic +6,qwen_max_0428,55.8,mixeval_hard-mixed,mixeval_240601,[],holistic +7,claude_3_sonnet,54.0,mixeval_hard-mixed,mixeval_240601,[],holistic +8,reka_core_20240415,52.9,mixeval_hard-mixed,mixeval_240601,[],holistic +9,mammoth2_8x7b_plus,51.8,mixeval_hard-mixed,mixeval_240601,[],holistic +10,deepseek_v2,51.7,mixeval_hard-mixed,mixeval_240601,[],holistic +11,command_r_plus,51.4,mixeval_hard-mixed,mixeval_240601,[],holistic +12,yi_1.5_34b_chat,51.2,mixeval_hard-mixed,mixeval_240601,[],holistic +13,mistral_large,50.3,mixeval_hard-mixed,mixeval_240601,[],holistic +14,qwen1.5_72b_chat,48.3,mixeval_hard-mixed,mixeval_240601,[],holistic +15,mistral_medium,47.8,mixeval_hard-mixed,mixeval_240601,[],holistic +16,gemini_1.0_pro,46.4,mixeval_hard-mixed,mixeval_240601,[],holistic +17,reka_flash_20240226,46.2,mixeval_hard-mixed,mixeval_240601,[],holistic +18,mistral_small,46.2,mixeval_hard-mixed,mixeval_240601,[],holistic +19,llama_3_8b_instruct,45.6,mixeval_hard-mixed,mixeval_240601,[],holistic +20,command_r,45.2,mixeval_hard-mixed,mixeval_240601,[],holistic +21,qwen1.5_32b_chat,43.3,mixeval_hard-mixed,mixeval_240601,[],holistic +22,gpt_3.5_turbo_0125,43.0,mixeval_hard-mixed,mixeval_240601,[],holistic +23,claude_3_haiku,42.8,mixeval_hard-mixed,mixeval_240601,[],holistic +24,yi_34b_chat,42.6,mixeval_hard-mixed,mixeval_240601,[],holistic +25,mixtral_8x7b_instruct_v0.1,42.5,mixeval_hard-mixed,mixeval_240601,[],holistic +26,starling_lm_7b_beta,41.8,mixeval_hard-mixed,mixeval_240601,[],holistic +27,yi_1.5_9b_chat,40.9,mixeval_hard-mixed,mixeval_240601,[],holistic +28,gemma_1.1_7b_it,39.1,mixeval_hard-mixed,mixeval_240601,[],holistic +29,vicuna_33b_v1.3,38.7,mixeval_hard-mixed,mixeval_240601,[],holistic +30,llama_2_70b_chat,38.0,mixeval_hard-mixed,mixeval_240601,[],holistic +31,map_neo_instruct_v0.1,37.8,mixeval_hard-mixed,mixeval_240601,[],holistic +32,mistral_7b_instruct_v0.2,36.2,mixeval_hard-mixed,mixeval_240601,[],holistic +33,qwen1.5_7b_chat,35.5,mixeval_hard-mixed,mixeval_240601,[],holistic +34,reka_edge_20240208,32.2,mixeval_hard-mixed,mixeval_240601,[],holistic +35,zephyr_7b_beta,31.6,mixeval_hard-mixed,mixeval_240601,[],holistic +36,llama_2_7b_chat,30.8,mixeval_hard-mixed,mixeval_240601,[],holistic +37,yi_6b_chat,30.1,mixeval_hard-mixed,mixeval_240601,[],holistic +38,qwen1.5_moe_a2.7b_chat,29.1,mixeval_hard-mixed,mixeval_240601,[],holistic +39,gemma_1.1_2b_it,28.4,mixeval_hard-mixed,mixeval_240601,[],holistic +40,vicuna_7b_v1.5,27.8,mixeval_hard-mixed,mixeval_240601,[],holistic +41,olmo_7b_instruct,26.7,mixeval_hard-mixed,mixeval_240601,[],holistic +42,qwen1.5_4b_chat,24.6,mixeval_hard-mixed,mixeval_240601,[],holistic +43,jetmoe_8b_chat,24.3,mixeval_hard-mixed,mixeval_240601,[],holistic +44,mpt_7b_chat,23.8,mixeval_hard-mixed,mixeval_240601,[],holistic +45,llama_3_70b,54.0,mixeval_hard-mixed,mixeval_240601,[],holistic +46,qwen1.5_72b,41.9,mixeval_hard-mixed,mixeval_240601,[],holistic +47,yi_34b,47.2,mixeval_hard-mixed,mixeval_240601,[],holistic +48,qwen1.5_32b,41.0,mixeval_hard-mixed,mixeval_240601,[],holistic +49,mixtral_8x7b,40.7,mixeval_hard-mixed,mixeval_240601,[],holistic +50,llama_2_70b,41.6,mixeval_hard-mixed,mixeval_240601,[],holistic +51,qwen1.5_moe_a2.7b,33.5,mixeval_hard-mixed,mixeval_240601,[],holistic +52,qwen1.5_7b,33.7,mixeval_hard-mixed,mixeval_240601,[],holistic +53,llama_3_8b,31.7,mixeval_hard-mixed,mixeval_240601,[],holistic +54,mistral_7b,27.1,mixeval_hard-mixed,mixeval_240601,[],holistic +55,gemma_7b,32.7,mixeval_hard-mixed,mixeval_240601,[],holistic +56,yi_6b,30.4,mixeval_hard-mixed,mixeval_240601,[],holistic +57,qwen1.5_4b,23.5,mixeval_hard-mixed,mixeval_240601,[],holistic +58,jetmoe_8b,27.0,mixeval_hard-mixed,mixeval_240601,[],holistic +59,deepseek_7b,21.7,mixeval_hard-mixed,mixeval_240601,[],holistic +60,phi_2,21.9,mixeval_hard-mixed,mixeval_240601,[],holistic +61,deepseekmoe_16b,24.2,mixeval_hard-mixed,mixeval_240601,[],holistic +62,llama_2_7b,22.1,mixeval_hard-mixed,mixeval_240601,[],holistic +63,gemma_2b,22.6,mixeval_hard-mixed,mixeval_240601,[],holistic +64,olmo_7b,21.2,mixeval_hard-mixed,mixeval_240601,[],holistic +65,mpt_7b,17.4,mixeval_hard-mixed,mixeval_240601,[],holistic +66,gpt_4o_2024_05_13,87.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +67,claude_3_opus,88.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +68,gpt_4_turbo_2024_04_09,88.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +69,gemini_1.5_pro_api_0409,84.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +70,yi_large_preview,84.4,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +71,llama_3_70b_instruct,84.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +72,qwen_max_0428,86.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +73,claude_3_sonnet,81.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +74,reka_core_20240415,83.3,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +75,mammoth2_8x7b_plus,81.5,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +76,deepseek_v2,83.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +77,command_r_plus,81.5,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +78,yi_1.5_34b_chat,81.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +79,mistral_large,84.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +80,qwen1.5_72b_chat,84.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +81,mistral_medium,81.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +82,gemini_1.0_pro,78.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +83,reka_flash_20240226,79.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +84,mistral_small,81.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +85,llama_3_8b_instruct,75.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +86,command_r,77.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +87,qwen1.5_32b_chat,81.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +88,gpt_3.5_turbo_0125,79.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +89,claude_3_haiku,79.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +90,yi_34b_chat,80.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +91,mixtral_8x7b_instruct_v0.1,76.4,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +92,starling_lm_7b_beta,74.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +93,yi_1.5_9b_chat,74.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +94,gemma_1.1_7b_it,69.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +95,vicuna_33b_v1.3,66.3,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +96,llama_2_70b_chat,74.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +97,map_neo_instruct_v0.1,70.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +98,mistral_7b_instruct_v0.2,70.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +99,qwen1.5_7b_chat,71.4,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +100,reka_edge_20240208,68.5,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +101,zephyr_7b_beta,69.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +102,llama_2_7b_chat,61.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +103,yi_6b_chat,65.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +104,qwen1.5_moe_a2.7b_chat,69.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +105,gemma_1.1_2b_it,51.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +106,vicuna_7b_v1.5,60.3,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +107,olmo_7b_instruct,55.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +108,qwen1.5_4b_chat,57.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +109,jetmoe_8b_chat,51.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +110,mpt_7b_chat,43.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +111,llama_3_70b,82.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +112,qwen1.5_72b,79.5,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +113,yi_34b,78.3,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +114,qwen1.5_32b,77.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +115,mixtral_8x7b,74.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +116,llama_2_70b,73.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +117,qwen1.5_moe_a2.7b,70.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +118,qwen1.5_7b,68.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +119,llama_3_8b,65.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +120,mistral_7b,64.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +121,gemma_7b,64.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +122,yi_6b,63.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +123,qwen1.5_4b,58.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +124,jetmoe_8b,57.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +125,deepseek_7b,52.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +126,phi_2,51.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +127,deepseekmoe_16b,51.4,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +128,llama_2_7b,43.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +129,gemma_2b,38.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +130,olmo_7b,31.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +131,mpt_7b,30.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic +132,gpt_4o_2024_05_13,1287.0,arena_elo-mixed,mixeval_240601,[],holistic +133,claude_3_opus,1248.0,arena_elo-mixed,mixeval_240601,[],holistic +134,gpt_4_turbo_2024_04_09,1256.0,arena_elo-mixed,mixeval_240601,[],holistic +135,gemini_1.5_pro_api_0409,1258.0,arena_elo-mixed,mixeval_240601,[],holistic +136,yi_large_preview,1239.0,arena_elo-mixed,mixeval_240601,[],holistic +137,llama_3_70b_instruct,1208.0,arena_elo-mixed,mixeval_240601,[],holistic +138,qwen_max_0428,1184.0,arena_elo-mixed,mixeval_240601,[],holistic +139,claude_3_sonnet,1201.0,arena_elo-mixed,mixeval_240601,[],holistic +143,command_r_plus,1189.0,arena_elo-mixed,mixeval_240601,[],holistic +145,mistral_large,1156.0,arena_elo-mixed,mixeval_240601,[],holistic +146,qwen1.5_72b_chat,1147.0,arena_elo-mixed,mixeval_240601,[],holistic +147,mistral_medium,1148.0,arena_elo-mixed,mixeval_240601,[],holistic +148,gemini_1.0_pro,1131.0,arena_elo-mixed,mixeval_240601,[],holistic +149,reka_flash_20240226,1148.0,arena_elo-mixed,mixeval_240601,[],holistic +151,llama_3_8b_instruct,1153.0,arena_elo-mixed,mixeval_240601,[],holistic +152,command_r,1147.0,arena_elo-mixed,mixeval_240601,[],holistic +153,qwen1.5_32b_chat,1126.0,arena_elo-mixed,mixeval_240601,[],holistic +154,gpt_3.5_turbo_0125,1102.0,arena_elo-mixed,mixeval_240601,[],holistic +155,claude_3_haiku,1178.0,arena_elo-mixed,mixeval_240601,[],holistic +156,yi_34b_chat,1111.0,arena_elo-mixed,mixeval_240601,[],holistic +157,mixtral_8x7b_instruct_v0.1,1114.0,arena_elo-mixed,mixeval_240601,[],holistic +158,starling_lm_7b_beta,1119.0,arena_elo-mixed,mixeval_240601,[],holistic +160,gemma_1.1_7b_it,1084.0,arena_elo-mixed,mixeval_240601,[],holistic +161,vicuna_33b_v1.3,1090.0,arena_elo-mixed,mixeval_240601,[],holistic +162,llama_2_70b_chat,1093.0,arena_elo-mixed,mixeval_240601,[],holistic +164,mistral_7b_instruct_v0.2,1072.0,arena_elo-mixed,mixeval_240601,[],holistic +165,qwen1.5_7b_chat,1069.0,arena_elo-mixed,mixeval_240601,[],holistic +168,llama_2_7b_chat,1037.0,arena_elo-mixed,mixeval_240601,[],holistic +171,gemma_1.1_2b_it,1019.0,arena_elo-mixed,mixeval_240601,[],holistic +172,vicuna_7b_v1.5,1004.0,arena_elo-mixed,mixeval_240601,[],holistic +173,olmo_7b_instruct,1015.0,arena_elo-mixed,mixeval_240601,[],holistic +174,qwen1.5_4b_chat,988.0,arena_elo-mixed,mixeval_240601,[],holistic +176,mpt_7b_chat,927.0,arena_elo-mixed,mixeval_240601,[],holistic +198,gpt_4o_2024_05_13,88.0,triviaqa-mixed,mixeval_240601,[],knowledge +199,claude_3_opus,90.4,triviaqa-mixed,mixeval_240601,[],knowledge +200,gpt_4_turbo_2024_04_09,91.2,triviaqa-mixed,mixeval_240601,[],knowledge +201,gemini_1.5_pro_api_0409,85.3,triviaqa-mixed,mixeval_240601,[],knowledge +202,yi_large_preview,81.7,triviaqa-mixed,mixeval_240601,[],knowledge +203,llama_3_70b_instruct,83.1,triviaqa-mixed,mixeval_240601,[],knowledge +204,qwen_max_0428,86.7,triviaqa-mixed,mixeval_240601,[],knowledge +205,claude_3_sonnet,84.2,triviaqa-mixed,mixeval_240601,[],knowledge +206,reka_core_20240415,82.8,triviaqa-mixed,mixeval_240601,[],knowledge +207,mammoth2_8x7b_plus,83.0,triviaqa-mixed,mixeval_240601,[],knowledge +208,deepseek_v2,84.4,triviaqa-mixed,mixeval_240601,[],knowledge +209,command_r_plus,83.3,triviaqa-mixed,mixeval_240601,[],knowledge +210,yi_1.5_34b_chat,78.4,triviaqa-mixed,mixeval_240601,[],knowledge +211,mistral_large,88.3,triviaqa-mixed,mixeval_240601,[],knowledge +212,qwen1.5_72b_chat,83.9,triviaqa-mixed,mixeval_240601,[],knowledge +213,mistral_medium,86.8,triviaqa-mixed,mixeval_240601,[],knowledge +214,gemini_1.0_pro,81.0,triviaqa-mixed,mixeval_240601,[],knowledge +215,reka_flash_20240226,76.4,triviaqa-mixed,mixeval_240601,[],knowledge +216,mistral_small,85.1,triviaqa-mixed,mixeval_240601,[],knowledge +217,llama_3_8b_instruct,71.7,triviaqa-mixed,mixeval_240601,[],knowledge +218,command_r,80.9,triviaqa-mixed,mixeval_240601,[],knowledge +219,qwen1.5_32b_chat,75.7,triviaqa-mixed,mixeval_240601,[],knowledge +220,gpt_3.5_turbo_0125,85.2,triviaqa-mixed,mixeval_240601,[],knowledge +221,claude_3_haiku,79.9,triviaqa-mixed,mixeval_240601,[],knowledge +222,yi_34b_chat,82.7,triviaqa-mixed,mixeval_240601,[],knowledge +223,mixtral_8x7b_instruct_v0.1,82.5,triviaqa-mixed,mixeval_240601,[],knowledge +224,starling_lm_7b_beta,75.1,triviaqa-mixed,mixeval_240601,[],knowledge +225,yi_1.5_9b_chat,61.3,triviaqa-mixed,mixeval_240601,[],knowledge +226,gemma_1.1_7b_it,64.3,triviaqa-mixed,mixeval_240601,[],knowledge +227,vicuna_33b_v1.3,79.2,triviaqa-mixed,mixeval_240601,[],knowledge +228,llama_2_70b_chat,80.0,triviaqa-mixed,mixeval_240601,[],knowledge +229,map_neo_instruct_v0.1,62.1,triviaqa-mixed,mixeval_240601,[],knowledge +230,mistral_7b_instruct_v0.2,73.7,triviaqa-mixed,mixeval_240601,[],knowledge +231,qwen1.5_7b_chat,64.1,triviaqa-mixed,mixeval_240601,[],knowledge +232,reka_edge_20240208,60.0,triviaqa-mixed,mixeval_240601,[],knowledge +233,zephyr_7b_beta,74.7,triviaqa-mixed,mixeval_240601,[],knowledge +234,llama_2_7b_chat,68.8,triviaqa-mixed,mixeval_240601,[],knowledge +235,yi_6b_chat,66.1,triviaqa-mixed,mixeval_240601,[],knowledge +236,qwen1.5_moe_a2.7b_chat,65.9,triviaqa-mixed,mixeval_240601,[],knowledge +237,gemma_1.1_2b_it,53.7,triviaqa-mixed,mixeval_240601,[],knowledge +238,vicuna_7b_v1.5,66.4,triviaqa-mixed,mixeval_240601,[],knowledge +239,olmo_7b_instruct,51.7,triviaqa-mixed,mixeval_240601,[],knowledge +240,qwen1.5_4b_chat,46.0,triviaqa-mixed,mixeval_240601,[],knowledge +241,jetmoe_8b_chat,46.8,triviaqa-mixed,mixeval_240601,[],knowledge +242,mpt_7b_chat,50.2,triviaqa-mixed,mixeval_240601,[],knowledge +243,llama_3_70b,83.1,triviaqa-mixed,mixeval_240601,[],knowledge +244,qwen1.5_72b,78.4,triviaqa-mixed,mixeval_240601,[],knowledge +245,yi_34b,72.1,triviaqa-mixed,mixeval_240601,[],knowledge +246,qwen1.5_32b,71.9,triviaqa-mixed,mixeval_240601,[],knowledge +247,mixtral_8x7b,77.3,triviaqa-mixed,mixeval_240601,[],knowledge +248,llama_2_70b,78.7,triviaqa-mixed,mixeval_240601,[],knowledge +249,qwen1.5_moe_a2.7b,71.3,triviaqa-mixed,mixeval_240601,[],knowledge +250,qwen1.5_7b,61.4,triviaqa-mixed,mixeval_240601,[],knowledge +251,llama_3_8b,65.2,triviaqa-mixed,mixeval_240601,[],knowledge +252,mistral_7b,67.2,triviaqa-mixed,mixeval_240601,[],knowledge +253,gemma_7b,66.0,triviaqa-mixed,mixeval_240601,[],knowledge +254,yi_6b,54.7,triviaqa-mixed,mixeval_240601,[],knowledge +255,qwen1.5_4b,47.8,triviaqa-mixed,mixeval_240601,[],knowledge +256,jetmoe_8b,53.4,triviaqa-mixed,mixeval_240601,[],knowledge +257,deepseek_7b,58.7,triviaqa-mixed,mixeval_240601,[],knowledge +258,phi_2,37.0,triviaqa-mixed,mixeval_240601,[],knowledge +259,deepseekmoe_16b,64.2,triviaqa-mixed,mixeval_240601,[],knowledge +260,llama_2_7b,55.5,triviaqa-mixed,mixeval_240601,[],knowledge +261,gemma_2b,41.5,triviaqa-mixed,mixeval_240601,[],knowledge +262,olmo_7b,38.4,triviaqa-mixed,mixeval_240601,[],knowledge +263,mpt_7b,33.5,triviaqa-mixed,mixeval_240601,[],knowledge +264,gpt_4o_2024_05_13,85.4,mmlu-mixed,mixeval_240601,[],knowledge +265,claude_3_opus,83.2,mmlu-mixed,mixeval_240601,[],knowledge +266,gpt_4_turbo_2024_04_09,82.8,mmlu-mixed,mixeval_240601,[],knowledge +267,gemini_1.5_pro_api_0409,79.2,mmlu-mixed,mixeval_240601,[],knowledge +268,yi_large_preview,80.9,mmlu-mixed,mixeval_240601,[],knowledge +269,llama_3_70b_instruct,80.5,mmlu-mixed,mixeval_240601,[],knowledge +270,qwen_max_0428,80.6,mmlu-mixed,mixeval_240601,[],knowledge +271,claude_3_sonnet,74.7,mmlu-mixed,mixeval_240601,[],knowledge +272,reka_core_20240415,79.3,mmlu-mixed,mixeval_240601,[],knowledge +273,mammoth2_8x7b_plus,74.5,mmlu-mixed,mixeval_240601,[],knowledge +274,deepseek_v2,77.3,mmlu-mixed,mixeval_240601,[],knowledge +275,command_r_plus,78.9,mmlu-mixed,mixeval_240601,[],knowledge +276,yi_1.5_34b_chat,76.4,mmlu-mixed,mixeval_240601,[],knowledge +277,mistral_large,80.2,mmlu-mixed,mixeval_240601,[],knowledge +278,qwen1.5_72b_chat,80.1,mmlu-mixed,mixeval_240601,[],knowledge +279,mistral_medium,76.3,mmlu-mixed,mixeval_240601,[],knowledge +280,gemini_1.0_pro,74.9,mmlu-mixed,mixeval_240601,[],knowledge +281,reka_flash_20240226,75.4,mmlu-mixed,mixeval_240601,[],knowledge +282,mistral_small,75.2,mmlu-mixed,mixeval_240601,[],knowledge +283,llama_3_8b_instruct,71.9,mmlu-mixed,mixeval_240601,[],knowledge +284,command_r,75.0,mmlu-mixed,mixeval_240601,[],knowledge +285,qwen1.5_32b_chat,78.0,mmlu-mixed,mixeval_240601,[],knowledge +286,gpt_3.5_turbo_0125,74.5,mmlu-mixed,mixeval_240601,[],knowledge +287,claude_3_haiku,76.1,mmlu-mixed,mixeval_240601,[],knowledge +288,yi_34b_chat,73.6,mmlu-mixed,mixeval_240601,[],knowledge +289,mixtral_8x7b_instruct_v0.1,72.0,mmlu-mixed,mixeval_240601,[],knowledge +290,starling_lm_7b_beta,69.0,mmlu-mixed,mixeval_240601,[],knowledge +291,yi_1.5_9b_chat,72.6,mmlu-mixed,mixeval_240601,[],knowledge +292,gemma_1.1_7b_it,66.9,mmlu-mixed,mixeval_240601,[],knowledge +293,vicuna_33b_v1.3,59.2,mmlu-mixed,mixeval_240601,[],knowledge +294,llama_2_70b_chat,69.8,mmlu-mixed,mixeval_240601,[],knowledge +295,map_neo_instruct_v0.1,66.7,mmlu-mixed,mixeval_240601,[],knowledge +296,mistral_7b_instruct_v0.2,67.3,mmlu-mixed,mixeval_240601,[],knowledge +297,qwen1.5_7b_chat,68.7,mmlu-mixed,mixeval_240601,[],knowledge +298,reka_edge_20240208,63.6,mmlu-mixed,mixeval_240601,[],knowledge +299,zephyr_7b_beta,64.9,mmlu-mixed,mixeval_240601,[],knowledge +300,llama_2_7b_chat,59.4,mmlu-mixed,mixeval_240601,[],knowledge +301,yi_6b_chat,65.4,mmlu-mixed,mixeval_240601,[],knowledge +302,qwen1.5_moe_a2.7b_chat,69.5,mmlu-mixed,mixeval_240601,[],knowledge +303,gemma_1.1_2b_it,51.5,mmlu-mixed,mixeval_240601,[],knowledge +304,vicuna_7b_v1.5,58.7,mmlu-mixed,mixeval_240601,[],knowledge +305,olmo_7b_instruct,57.1,mmlu-mixed,mixeval_240601,[],knowledge +306,qwen1.5_4b_chat,61.4,mmlu-mixed,mixeval_240601,[],knowledge +307,jetmoe_8b_chat,58.5,mmlu-mixed,mixeval_240601,[],knowledge +308,mpt_7b_chat,37.8,mmlu-mixed,mixeval_240601,[],knowledge +309,llama_3_70b,79.8,mmlu-mixed,mixeval_240601,[],knowledge +310,qwen1.5_72b,78.8,mmlu-mixed,mixeval_240601,[],knowledge +311,yi_34b,79.3,mmlu-mixed,mixeval_240601,[],knowledge +312,qwen1.5_32b,77.2,mmlu-mixed,mixeval_240601,[],knowledge +313,mixtral_8x7b,71.6,mmlu-mixed,mixeval_240601,[],knowledge +314,llama_2_70b,70.8,mmlu-mixed,mixeval_240601,[],knowledge +315,qwen1.5_moe_a2.7b,69.4,mmlu-mixed,mixeval_240601,[],knowledge +316,qwen1.5_7b,67.0,mmlu-mixed,mixeval_240601,[],knowledge +317,llama_3_8b,69.5,mmlu-mixed,mixeval_240601,[],knowledge +318,mistral_7b,68.5,mmlu-mixed,mixeval_240601,[],knowledge +319,gemma_7b,67.4,mmlu-mixed,mixeval_240601,[],knowledge +320,yi_6b,71.2,mmlu-mixed,mixeval_240601,[],knowledge +321,qwen1.5_4b,59.6,mmlu-mixed,mixeval_240601,[],knowledge +322,jetmoe_8b,55.3,mmlu-mixed,mixeval_240601,[],knowledge +323,deepseek_7b,53.3,mmlu-mixed,mixeval_240601,[],knowledge +324,phi_2,62.5,mmlu-mixed,mixeval_240601,[],knowledge +325,deepseekmoe_16b,49.9,mmlu-mixed,mixeval_240601,[],knowledge +326,llama_2_7b,40.8,mmlu-mixed,mixeval_240601,[],knowledge +327,gemma_2b,37.4,mmlu-mixed,mixeval_240601,[],knowledge +328,olmo_7b,29.7,mmlu-mixed,mixeval_240601,[],knowledge +329,mpt_7b,30.9,mmlu-mixed,mixeval_240601,[],knowledge +330,gpt_4o_2024_05_13,87.9,drop-mixed,mixeval_240601,[],reasoning +331,claude_3_opus,91.5,drop-mixed,mixeval_240601,[],reasoning +332,gpt_4_turbo_2024_04_09,91.0,drop-mixed,mixeval_240601,[],reasoning +333,gemini_1.5_pro_api_0409,84.2,drop-mixed,mixeval_240601,[],reasoning +334,yi_large_preview,87.0,drop-mixed,mixeval_240601,[],reasoning +335,llama_3_70b_instruct,90.1,drop-mixed,mixeval_240601,[],reasoning +336,qwen_max_0428,85.4,drop-mixed,mixeval_240601,[],reasoning +337,claude_3_sonnet,87.7,drop-mixed,mixeval_240601,[],reasoning +338,reka_core_20240415,88.1,drop-mixed,mixeval_240601,[],reasoning +339,mammoth2_8x7b_plus,85.7,drop-mixed,mixeval_240601,[],reasoning +340,deepseek_v2,85.3,drop-mixed,mixeval_240601,[],reasoning +341,command_r_plus,80.4,drop-mixed,mixeval_240601,[],reasoning +342,yi_1.5_34b_chat,87.0,drop-mixed,mixeval_240601,[],reasoning +343,mistral_large,88.6,drop-mixed,mixeval_240601,[],reasoning +344,qwen1.5_72b_chat,85.1,drop-mixed,mixeval_240601,[],reasoning +345,mistral_medium,83.2,drop-mixed,mixeval_240601,[],reasoning +346,gemini_1.0_pro,82.6,drop-mixed,mixeval_240601,[],reasoning +347,reka_flash_20240226,86.7,drop-mixed,mixeval_240601,[],reasoning +348,mistral_small,86.1,drop-mixed,mixeval_240601,[],reasoning +349,llama_3_8b_instruct,86.4,drop-mixed,mixeval_240601,[],reasoning +350,command_r,72.0,drop-mixed,mixeval_240601,[],reasoning +351,qwen1.5_32b_chat,82.9,drop-mixed,mixeval_240601,[],reasoning +352,gpt_3.5_turbo_0125,84.8,drop-mixed,mixeval_240601,[],reasoning +353,claude_3_haiku,85.0,drop-mixed,mixeval_240601,[],reasoning +354,yi_34b_chat,86.1,drop-mixed,mixeval_240601,[],reasoning +355,mixtral_8x7b_instruct_v0.1,79.5,drop-mixed,mixeval_240601,[],reasoning +356,starling_lm_7b_beta,86.4,drop-mixed,mixeval_240601,[],reasoning +357,yi_1.5_9b_chat,83.9,drop-mixed,mixeval_240601,[],reasoning +358,gemma_1.1_7b_it,80.6,drop-mixed,mixeval_240601,[],reasoning +359,vicuna_33b_v1.3,71.4,drop-mixed,mixeval_240601,[],reasoning +360,llama_2_70b_chat,79.8,drop-mixed,mixeval_240601,[],reasoning +361,map_neo_instruct_v0.1,75.5,drop-mixed,mixeval_240601,[],reasoning +362,mistral_7b_instruct_v0.2,72.8,drop-mixed,mixeval_240601,[],reasoning +363,qwen1.5_7b_chat,76.4,drop-mixed,mixeval_240601,[],reasoning +364,reka_edge_20240208,80.0,drop-mixed,mixeval_240601,[],reasoning +365,zephyr_7b_beta,77.3,drop-mixed,mixeval_240601,[],reasoning +366,llama_2_7b_chat,69.3,drop-mixed,mixeval_240601,[],reasoning +367,yi_6b_chat,70.5,drop-mixed,mixeval_240601,[],reasoning +368,qwen1.5_moe_a2.7b_chat,64.6,drop-mixed,mixeval_240601,[],reasoning +369,gemma_1.1_2b_it,59.8,drop-mixed,mixeval_240601,[],reasoning +370,vicuna_7b_v1.5,68.3,drop-mixed,mixeval_240601,[],reasoning +371,olmo_7b_instruct,53.1,drop-mixed,mixeval_240601,[],reasoning +372,qwen1.5_4b_chat,57.2,drop-mixed,mixeval_240601,[],reasoning +373,jetmoe_8b_chat,27.0,drop-mixed,mixeval_240601,[],reasoning +374,mpt_7b_chat,50.0,drop-mixed,mixeval_240601,[],reasoning +375,llama_3_70b,81.5,drop-mixed,mixeval_240601,[],reasoning +376,qwen1.5_72b,64.5,drop-mixed,mixeval_240601,[],reasoning +377,yi_34b,78.2,drop-mixed,mixeval_240601,[],reasoning +378,qwen1.5_32b,68.7,drop-mixed,mixeval_240601,[],reasoning +379,mixtral_8x7b,69.8,drop-mixed,mixeval_240601,[],reasoning +380,llama_2_70b,73.2,drop-mixed,mixeval_240601,[],reasoning +381,qwen1.5_moe_a2.7b,59.9,drop-mixed,mixeval_240601,[],reasoning +382,qwen1.5_7b,63.6,drop-mixed,mixeval_240601,[],reasoning +383,llama_3_8b,63.8,drop-mixed,mixeval_240601,[],reasoning +384,mistral_7b,61.3,drop-mixed,mixeval_240601,[],reasoning +385,gemma_7b,63.8,drop-mixed,mixeval_240601,[],reasoning +386,yi_6b,51.4,drop-mixed,mixeval_240601,[],reasoning +387,qwen1.5_4b,51.0,drop-mixed,mixeval_240601,[],reasoning +388,jetmoe_8b,44.1,drop-mixed,mixeval_240601,[],reasoning +389,deepseek_7b,43.5,drop-mixed,mixeval_240601,[],reasoning +390,phi_2,50.4,drop-mixed,mixeval_240601,[],reasoning +391,deepseekmoe_16b,41.1,drop-mixed,mixeval_240601,[],reasoning +392,llama_2_7b,37.6,drop-mixed,mixeval_240601,[],reasoning +393,gemma_2b,32.6,drop-mixed,mixeval_240601,[],reasoning +394,olmo_7b,24.0,drop-mixed,mixeval_240601,[],reasoning +395,mpt_7b,26.8,drop-mixed,mixeval_240601,[],reasoning +396,gpt_4o_2024_05_13,94.3,hellaswag-mixed,mixeval_240601,[],reasoning +397,claude_3_opus,93.3,hellaswag-mixed,mixeval_240601,[],reasoning +398,gpt_4_turbo_2024_04_09,92.6,hellaswag-mixed,mixeval_240601,[],reasoning +399,gemini_1.5_pro_api_0409,89.2,hellaswag-mixed,mixeval_240601,[],reasoning +400,yi_large_preview,92.6,hellaswag-mixed,mixeval_240601,[],reasoning +401,llama_3_70b_instruct,81.8,hellaswag-mixed,mixeval_240601,[],reasoning +402,qwen_max_0428,93.6,hellaswag-mixed,mixeval_240601,[],reasoning +403,claude_3_sonnet,85.9,hellaswag-mixed,mixeval_240601,[],reasoning +404,reka_core_20240415,88.6,hellaswag-mixed,mixeval_240601,[],reasoning +405,mammoth2_8x7b_plus,82.2,hellaswag-mixed,mixeval_240601,[],reasoning +406,deepseek_v2,88.2,hellaswag-mixed,mixeval_240601,[],reasoning +407,command_r_plus,83.5,hellaswag-mixed,mixeval_240601,[],reasoning +408,yi_1.5_34b_chat,90.2,hellaswag-mixed,mixeval_240601,[],reasoning +409,mistral_large,65.0,hellaswag-mixed,mixeval_240601,[],reasoning +410,qwen1.5_72b_chat,87.9,hellaswag-mixed,mixeval_240601,[],reasoning +411,mistral_medium,72.4,hellaswag-mixed,mixeval_240601,[],reasoning +412,gemini_1.0_pro,74.7,hellaswag-mixed,mixeval_240601,[],reasoning +413,reka_flash_20240226,90.6,hellaswag-mixed,mixeval_240601,[],reasoning +414,mistral_small,73.4,hellaswag-mixed,mixeval_240601,[],reasoning +415,llama_3_8b_instruct,65.7,hellaswag-mixed,mixeval_240601,[],reasoning +416,command_r,75.8,hellaswag-mixed,mixeval_240601,[],reasoning +417,qwen1.5_32b_chat,85.9,hellaswag-mixed,mixeval_240601,[],reasoning +418,gpt_3.5_turbo_0125,63.0,hellaswag-mixed,mixeval_240601,[],reasoning +419,claude_3_haiku,75.8,hellaswag-mixed,mixeval_240601,[],reasoning +420,yi_34b_chat,86.9,hellaswag-mixed,mixeval_240601,[],reasoning +421,mixtral_8x7b_instruct_v0.1,54.2,hellaswag-mixed,mixeval_240601,[],reasoning +422,starling_lm_7b_beta,48.5,hellaswag-mixed,mixeval_240601,[],reasoning +423,yi_1.5_9b_chat,86.5,hellaswag-mixed,mixeval_240601,[],reasoning +424,gemma_1.1_7b_it,66.3,hellaswag-mixed,mixeval_240601,[],reasoning +425,vicuna_33b_v1.3,30.3,hellaswag-mixed,mixeval_240601,[],reasoning +426,llama_2_70b_chat,67.3,hellaswag-mixed,mixeval_240601,[],reasoning +427,map_neo_instruct_v0.1,74.4,hellaswag-mixed,mixeval_240601,[],reasoning +428,mistral_7b_instruct_v0.2,54.2,hellaswag-mixed,mixeval_240601,[],reasoning +429,qwen1.5_7b_chat,76.1,hellaswag-mixed,mixeval_240601,[],reasoning +430,reka_edge_20240208,74.7,hellaswag-mixed,mixeval_240601,[],reasoning +431,zephyr_7b_beta,39.1,hellaswag-mixed,mixeval_240601,[],reasoning +432,llama_2_7b_chat,35.7,hellaswag-mixed,mixeval_240601,[],reasoning +433,yi_6b_chat,52.5,hellaswag-mixed,mixeval_240601,[],reasoning +434,qwen1.5_moe_a2.7b_chat,72.7,hellaswag-mixed,mixeval_240601,[],reasoning +435,gemma_1.1_2b_it,26.6,hellaswag-mixed,mixeval_240601,[],reasoning +436,vicuna_7b_v1.5,24.9,hellaswag-mixed,mixeval_240601,[],reasoning +437,olmo_7b_instruct,55.9,hellaswag-mixed,mixeval_240601,[],reasoning +438,qwen1.5_4b_chat,54.9,hellaswag-mixed,mixeval_240601,[],reasoning +439,jetmoe_8b_chat,86.2,hellaswag-mixed,mixeval_240601,[],reasoning +440,mpt_7b_chat,25.6,hellaswag-mixed,mixeval_240601,[],reasoning +441,llama_3_70b,90.9,hellaswag-mixed,mixeval_240601,[],reasoning +442,qwen1.5_72b,91.9,hellaswag-mixed,mixeval_240601,[],reasoning +443,yi_34b,98.0,hellaswag-mixed,mixeval_240601,[],reasoning +444,qwen1.5_32b,93.3,hellaswag-mixed,mixeval_240601,[],reasoning +445,mixtral_8x7b,73.7,hellaswag-mixed,mixeval_240601,[],reasoning +446,llama_2_70b,63.0,hellaswag-mixed,mixeval_240601,[],reasoning +447,qwen1.5_moe_a2.7b,80.1,hellaswag-mixed,mixeval_240601,[],reasoning +448,qwen1.5_7b,83.8,hellaswag-mixed,mixeval_240601,[],reasoning +449,llama_3_8b,51.5,hellaswag-mixed,mixeval_240601,[],reasoning +450,mistral_7b,54.5,hellaswag-mixed,mixeval_240601,[],reasoning +451,gemma_7b,36.0,hellaswag-mixed,mixeval_240601,[],reasoning +452,yi_6b,77.4,hellaswag-mixed,mixeval_240601,[],reasoning +453,qwen1.5_4b,65.7,hellaswag-mixed,mixeval_240601,[],reasoning +454,jetmoe_8b,89.2,hellaswag-mixed,mixeval_240601,[],reasoning +455,deepseek_7b,35.0,hellaswag-mixed,mixeval_240601,[],reasoning +456,phi_2,20.2,hellaswag-mixed,mixeval_240601,[],reasoning +457,deepseekmoe_16b,28.6,hellaswag-mixed,mixeval_240601,[],reasoning +458,llama_2_7b,24.9,hellaswag-mixed,mixeval_240601,[],reasoning +459,gemma_2b,33.3,hellaswag-mixed,mixeval_240601,[],reasoning +460,olmo_7b,26.9,hellaswag-mixed,mixeval_240601,[],reasoning +461,mpt_7b,19.2,hellaswag-mixed,mixeval_240601,[],reasoning +462,gpt_4o_2024_05_13,86.8,commonsenseqa-mixed,mixeval_240601,[],reasoning +463,claude_3_opus,87.7,commonsenseqa-mixed,mixeval_240601,[],reasoning +464,gpt_4_turbo_2024_04_09,85.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +465,gemini_1.5_pro_api_0409,84.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +466,yi_large_preview,90.1,commonsenseqa-mixed,mixeval_240601,[],reasoning +467,llama_3_70b_instruct,83.0,commonsenseqa-mixed,mixeval_240601,[],reasoning +468,qwen_max_0428,88.2,commonsenseqa-mixed,mixeval_240601,[],reasoning +469,claude_3_sonnet,82.5,commonsenseqa-mixed,mixeval_240601,[],reasoning +470,reka_core_20240415,81.6,commonsenseqa-mixed,mixeval_240601,[],reasoning +471,mammoth2_8x7b_plus,82.5,commonsenseqa-mixed,mixeval_240601,[],reasoning +472,deepseek_v2,84.0,commonsenseqa-mixed,mixeval_240601,[],reasoning +473,command_r_plus,82.1,commonsenseqa-mixed,mixeval_240601,[],reasoning +474,yi_1.5_34b_chat,86.8,commonsenseqa-mixed,mixeval_240601,[],reasoning +475,mistral_large,83.5,commonsenseqa-mixed,mixeval_240601,[],reasoning +476,qwen1.5_72b_chat,86.3,commonsenseqa-mixed,mixeval_240601,[],reasoning +477,mistral_medium,82.5,commonsenseqa-mixed,mixeval_240601,[],reasoning +478,gemini_1.0_pro,80.2,commonsenseqa-mixed,mixeval_240601,[],reasoning +479,reka_flash_20240226,80.7,commonsenseqa-mixed,mixeval_240601,[],reasoning +480,mistral_small,77.8,commonsenseqa-mixed,mixeval_240601,[],reasoning +481,llama_3_8b_instruct,78.3,commonsenseqa-mixed,mixeval_240601,[],reasoning +482,command_r,77.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +483,qwen1.5_32b_chat,88.2,commonsenseqa-mixed,mixeval_240601,[],reasoning +484,gpt_3.5_turbo_0125,81.6,commonsenseqa-mixed,mixeval_240601,[],reasoning +485,claude_3_haiku,78.8,commonsenseqa-mixed,mixeval_240601,[],reasoning +486,yi_34b_chat,78.8,commonsenseqa-mixed,mixeval_240601,[],reasoning +487,mixtral_8x7b_instruct_v0.1,77.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +488,starling_lm_7b_beta,84.9,commonsenseqa-mixed,mixeval_240601,[],reasoning +489,yi_1.5_9b_chat,82.5,commonsenseqa-mixed,mixeval_240601,[],reasoning +490,gemma_1.1_7b_it,73.6,commonsenseqa-mixed,mixeval_240601,[],reasoning +491,vicuna_33b_v1.3,61.8,commonsenseqa-mixed,mixeval_240601,[],reasoning +492,llama_2_70b_chat,74.1,commonsenseqa-mixed,mixeval_240601,[],reasoning +493,map_neo_instruct_v0.1,82.1,commonsenseqa-mixed,mixeval_240601,[],reasoning +494,mistral_7b_instruct_v0.2,66.0,commonsenseqa-mixed,mixeval_240601,[],reasoning +495,qwen1.5_7b_chat,82.1,commonsenseqa-mixed,mixeval_240601,[],reasoning +496,reka_edge_20240208,80.7,commonsenseqa-mixed,mixeval_240601,[],reasoning +497,zephyr_7b_beta,69.3,commonsenseqa-mixed,mixeval_240601,[],reasoning +498,llama_2_7b_chat,61.3,commonsenseqa-mixed,mixeval_240601,[],reasoning +499,yi_6b_chat,69.8,commonsenseqa-mixed,mixeval_240601,[],reasoning +500,qwen1.5_moe_a2.7b_chat,81.1,commonsenseqa-mixed,mixeval_240601,[],reasoning +501,gemma_1.1_2b_it,57.1,commonsenseqa-mixed,mixeval_240601,[],reasoning +502,vicuna_7b_v1.5,62.7,commonsenseqa-mixed,mixeval_240601,[],reasoning +503,olmo_7b_instruct,64.6,commonsenseqa-mixed,mixeval_240601,[],reasoning +504,qwen1.5_4b_chat,74.1,commonsenseqa-mixed,mixeval_240601,[],reasoning +505,jetmoe_8b_chat,68.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +506,mpt_7b_chat,36.3,commonsenseqa-mixed,mixeval_240601,[],reasoning +507,llama_3_70b,85.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +508,qwen1.5_72b,87.3,commonsenseqa-mixed,mixeval_240601,[],reasoning +509,yi_34b,81.1,commonsenseqa-mixed,mixeval_240601,[],reasoning +510,qwen1.5_32b,89.2,commonsenseqa-mixed,mixeval_240601,[],reasoning +511,mixtral_8x7b,77.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +512,llama_2_70b,77.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +513,qwen1.5_moe_a2.7b,80.2,commonsenseqa-mixed,mixeval_240601,[],reasoning +514,qwen1.5_7b,84.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +515,llama_3_8b,69.8,commonsenseqa-mixed,mixeval_240601,[],reasoning +516,mistral_7b,67.9,commonsenseqa-mixed,mixeval_240601,[],reasoning +517,gemma_7b,68.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +518,yi_6b,76.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +519,qwen1.5_4b,79.2,commonsenseqa-mixed,mixeval_240601,[],reasoning +520,jetmoe_8b,60.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +521,deepseek_7b,51.4,commonsenseqa-mixed,mixeval_240601,[],reasoning +522,phi_2,68.9,commonsenseqa-mixed,mixeval_240601,[],reasoning +523,deepseekmoe_16b,48.6,commonsenseqa-mixed,mixeval_240601,[],reasoning +524,llama_2_7b,30.7,commonsenseqa-mixed,mixeval_240601,[],reasoning +525,gemma_2b,31.6,commonsenseqa-mixed,mixeval_240601,[],reasoning +526,olmo_7b,25.5,commonsenseqa-mixed,mixeval_240601,[],reasoning +527,mpt_7b,28.8,commonsenseqa-mixed,mixeval_240601,[],reasoning +528,gpt_4o_2024_05_13,70.3,triviaqa_hard-mixed,mixeval_240601,[],knowledge +529,claude_3_opus,71.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge +530,gpt_4_turbo_2024_04_09,73.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge +531,gemini_1.5_pro_api_0409,67.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge +532,yi_large_preview,55.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge +533,llama_3_70b_instruct,60.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +534,qwen_max_0428,61.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +535,claude_3_sonnet,59.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge +536,reka_core_20240415,51.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge +537,mammoth2_8x7b_plus,52.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge +538,deepseek_v2,51.7,triviaqa_hard-mixed,mixeval_240601,[],knowledge +539,command_r_plus,57.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +540,yi_1.5_34b_chat,44.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge +541,mistral_large,55.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +542,qwen1.5_72b_chat,49.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge +543,mistral_medium,59.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge +544,gemini_1.0_pro,58.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge +545,reka_flash_20240226,42.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge +546,mistral_small,56.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge +547,llama_3_8b_instruct,40.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge +548,command_r,57.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge +549,qwen1.5_32b_chat,39.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge +550,gpt_3.5_turbo_0125,46.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge +551,claude_3_haiku,42.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge +552,yi_34b_chat,41.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +553,mixtral_8x7b_instruct_v0.1,48.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +554,starling_lm_7b_beta,33.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge +555,yi_1.5_9b_chat,23.3,triviaqa_hard-mixed,mixeval_240601,[],knowledge +556,gemma_1.1_7b_it,30.3,triviaqa_hard-mixed,mixeval_240601,[],knowledge +557,vicuna_33b_v1.3,42.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +558,llama_2_70b_chat,42.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge +559,map_neo_instruct_v0.1,26.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +560,mistral_7b_instruct_v0.2,33.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +561,qwen1.5_7b_chat,29.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge +562,reka_edge_20240208,18.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge +563,zephyr_7b_beta,30.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge +564,llama_2_7b_chat,24.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge +565,yi_6b_chat,18.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge +566,qwen1.5_moe_a2.7b_chat,21.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge +567,gemma_1.1_2b_it,31.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge +568,vicuna_7b_v1.5,25.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge +569,olmo_7b_instruct,24.7,triviaqa_hard-mixed,mixeval_240601,[],knowledge +570,qwen1.5_4b_chat,16.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +571,jetmoe_8b_chat,19.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge +572,mpt_7b_chat,17.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +573,llama_3_70b,59.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge +574,qwen1.5_72b,41.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge +575,yi_34b,39.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge +576,qwen1.5_32b,28.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge +577,mixtral_8x7b,44.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge +578,llama_2_70b,53.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge +579,qwen1.5_moe_a2.7b,36.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge +580,qwen1.5_7b,31.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge +581,llama_3_8b,22.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge +582,mistral_7b,24.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge +583,gemma_7b,31.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge +584,yi_6b,17.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge +585,qwen1.5_4b,14.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge +586,jetmoe_8b,22.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge +587,deepseek_7b,21.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge +588,phi_2,7.3,triviaqa_hard-mixed,mixeval_240601,[],knowledge +589,deepseekmoe_16b,24.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge +590,llama_2_7b,19.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge +591,gemma_2b,12.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge +592,olmo_7b,16.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge +593,mpt_7b,6.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge +594,gpt_4o_2024_05_13,57.1,mmlu_hard-mixed,mixeval_240601,[],knowledge +595,claude_3_opus,55.0,mmlu_hard-mixed,mixeval_240601,[],knowledge +596,gpt_4_turbo_2024_04_09,45.5,mmlu_hard-mixed,mixeval_240601,[],knowledge +597,gemini_1.5_pro_api_0409,44.6,mmlu_hard-mixed,mixeval_240601,[],knowledge +598,yi_large_preview,48.5,mmlu_hard-mixed,mixeval_240601,[],knowledge +599,llama_3_70b_instruct,46.3,mmlu_hard-mixed,mixeval_240601,[],knowledge +600,qwen_max_0428,41.6,mmlu_hard-mixed,mixeval_240601,[],knowledge +601,claude_3_sonnet,40.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +602,reka_core_20240415,46.3,mmlu_hard-mixed,mixeval_240601,[],knowledge +603,mammoth2_8x7b_plus,41.1,mmlu_hard-mixed,mixeval_240601,[],knowledge +604,deepseek_v2,42.0,mmlu_hard-mixed,mixeval_240601,[],knowledge +605,command_r_plus,42.0,mmlu_hard-mixed,mixeval_240601,[],knowledge +606,yi_1.5_34b_chat,38.1,mmlu_hard-mixed,mixeval_240601,[],knowledge +607,mistral_large,42.4,mmlu_hard-mixed,mixeval_240601,[],knowledge +608,qwen1.5_72b_chat,37.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +609,mistral_medium,38.5,mmlu_hard-mixed,mixeval_240601,[],knowledge +610,gemini_1.0_pro,35.5,mmlu_hard-mixed,mixeval_240601,[],knowledge +611,reka_flash_20240226,34.6,mmlu_hard-mixed,mixeval_240601,[],knowledge +612,mistral_small,33.8,mmlu_hard-mixed,mixeval_240601,[],knowledge +613,llama_3_8b_instruct,40.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +614,command_r,39.0,mmlu_hard-mixed,mixeval_240601,[],knowledge +615,qwen1.5_32b_chat,29.9,mmlu_hard-mixed,mixeval_240601,[],knowledge +616,gpt_3.5_turbo_0125,35.1,mmlu_hard-mixed,mixeval_240601,[],knowledge +617,claude_3_haiku,30.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +618,yi_34b_chat,29.9,mmlu_hard-mixed,mixeval_240601,[],knowledge +619,mixtral_8x7b_instruct_v0.1,37.2,mmlu_hard-mixed,mixeval_240601,[],knowledge +620,starling_lm_7b_beta,34.2,mmlu_hard-mixed,mixeval_240601,[],knowledge +621,yi_1.5_9b_chat,36.8,mmlu_hard-mixed,mixeval_240601,[],knowledge +622,gemma_1.1_7b_it,39.0,mmlu_hard-mixed,mixeval_240601,[],knowledge +623,vicuna_33b_v1.3,39.4,mmlu_hard-mixed,mixeval_240601,[],knowledge +624,llama_2_70b_chat,27.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +625,map_neo_instruct_v0.1,32.5,mmlu_hard-mixed,mixeval_240601,[],knowledge +626,mistral_7b_instruct_v0.2,29.4,mmlu_hard-mixed,mixeval_240601,[],knowledge +627,qwen1.5_7b_chat,29.0,mmlu_hard-mixed,mixeval_240601,[],knowledge +628,reka_edge_20240208,26.4,mmlu_hard-mixed,mixeval_240601,[],knowledge +629,zephyr_7b_beta,24.2,mmlu_hard-mixed,mixeval_240601,[],knowledge +630,llama_2_7b_chat,30.3,mmlu_hard-mixed,mixeval_240601,[],knowledge +631,yi_6b_chat,26.8,mmlu_hard-mixed,mixeval_240601,[],knowledge +632,qwen1.5_moe_a2.7b_chat,26.8,mmlu_hard-mixed,mixeval_240601,[],knowledge +633,gemma_1.1_2b_it,30.3,mmlu_hard-mixed,mixeval_240601,[],knowledge +634,vicuna_7b_v1.5,23.4,mmlu_hard-mixed,mixeval_240601,[],knowledge +635,olmo_7b_instruct,27.3,mmlu_hard-mixed,mixeval_240601,[],knowledge +636,qwen1.5_4b_chat,17.3,mmlu_hard-mixed,mixeval_240601,[],knowledge +637,jetmoe_8b_chat,25.5,mmlu_hard-mixed,mixeval_240601,[],knowledge +638,mpt_7b_chat,24.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +639,llama_3_70b,39.8,mmlu_hard-mixed,mixeval_240601,[],knowledge +640,qwen1.5_72b,42.4,mmlu_hard-mixed,mixeval_240601,[],knowledge +641,yi_34b,42.4,mmlu_hard-mixed,mixeval_240601,[],knowledge +642,qwen1.5_32b,37.2,mmlu_hard-mixed,mixeval_240601,[],knowledge +643,mixtral_8x7b,34.6,mmlu_hard-mixed,mixeval_240601,[],knowledge +644,llama_2_70b,29.0,mmlu_hard-mixed,mixeval_240601,[],knowledge +645,qwen1.5_moe_a2.7b,30.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +646,qwen1.5_7b,28.6,mmlu_hard-mixed,mixeval_240601,[],knowledge +647,llama_3_8b,38.5,mmlu_hard-mixed,mixeval_240601,[],knowledge +648,mistral_7b,27.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +649,gemma_7b,28.1,mmlu_hard-mixed,mixeval_240601,[],knowledge +650,yi_6b,37.2,mmlu_hard-mixed,mixeval_240601,[],knowledge +651,qwen1.5_4b,22.9,mmlu_hard-mixed,mixeval_240601,[],knowledge +652,jetmoe_8b,27.3,mmlu_hard-mixed,mixeval_240601,[],knowledge +653,deepseek_7b,26.4,mmlu_hard-mixed,mixeval_240601,[],knowledge +654,phi_2,29.0,mmlu_hard-mixed,mixeval_240601,[],knowledge +655,deepseekmoe_16b,30.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +656,llama_2_7b,24.7,mmlu_hard-mixed,mixeval_240601,[],knowledge +657,gemma_2b,27.3,mmlu_hard-mixed,mixeval_240601,[],knowledge +658,olmo_7b,25.1,mmlu_hard-mixed,mixeval_240601,[],knowledge +659,mpt_7b,24.2,mmlu_hard-mixed,mixeval_240601,[],knowledge +660,gpt_4o_2024_05_13,67.5,drop_hard-mixed,mixeval_240601,[],reasoning +661,claude_3_opus,75.2,drop_hard-mixed,mixeval_240601,[],reasoning +662,gpt_4_turbo_2024_04_09,71.0,drop_hard-mixed,mixeval_240601,[],reasoning +663,gemini_1.5_pro_api_0409,64.8,drop_hard-mixed,mixeval_240601,[],reasoning +664,yi_large_preview,63.1,drop_hard-mixed,mixeval_240601,[],reasoning +665,llama_3_70b_instruct,74.5,drop_hard-mixed,mixeval_240601,[],reasoning +666,qwen_max_0428,53.5,drop_hard-mixed,mixeval_240601,[],reasoning +667,claude_3_sonnet,66.9,drop_hard-mixed,mixeval_240601,[],reasoning +668,reka_core_20240415,66.6,drop_hard-mixed,mixeval_240601,[],reasoning +669,mammoth2_8x7b_plus,65.1,drop_hard-mixed,mixeval_240601,[],reasoning +670,deepseek_v2,62.8,drop_hard-mixed,mixeval_240601,[],reasoning +671,command_r_plus,65.0,drop_hard-mixed,mixeval_240601,[],reasoning +672,yi_1.5_34b_chat,67.4,drop_hard-mixed,mixeval_240601,[],reasoning +673,mistral_large,61.6,drop_hard-mixed,mixeval_240601,[],reasoning +674,qwen1.5_72b_chat,56.5,drop_hard-mixed,mixeval_240601,[],reasoning +675,mistral_medium,47.1,drop_hard-mixed,mixeval_240601,[],reasoning +676,gemini_1.0_pro,54.1,drop_hard-mixed,mixeval_240601,[],reasoning +677,reka_flash_20240226,65.0,drop_hard-mixed,mixeval_240601,[],reasoning +678,mistral_small,52.6,drop_hard-mixed,mixeval_240601,[],reasoning +679,llama_3_8b_instruct,67.6,drop_hard-mixed,mixeval_240601,[],reasoning +680,command_r,42.0,drop_hard-mixed,mixeval_240601,[],reasoning +681,qwen1.5_32b_chat,54.4,drop_hard-mixed,mixeval_240601,[],reasoning +682,gpt_3.5_turbo_0125,55.4,drop_hard-mixed,mixeval_240601,[],reasoning +683,claude_3_haiku,51.5,drop_hard-mixed,mixeval_240601,[],reasoning +684,yi_34b_chat,57.1,drop_hard-mixed,mixeval_240601,[],reasoning +685,mixtral_8x7b_instruct_v0.1,47.7,drop_hard-mixed,mixeval_240601,[],reasoning +686,starling_lm_7b_beta,62.9,drop_hard-mixed,mixeval_240601,[],reasoning +687,yi_1.5_9b_chat,61.3,drop_hard-mixed,mixeval_240601,[],reasoning +688,gemma_1.1_7b_it,55.1,drop_hard-mixed,mixeval_240601,[],reasoning +689,vicuna_33b_v1.3,36.6,drop_hard-mixed,mixeval_240601,[],reasoning +690,llama_2_70b_chat,42.2,drop_hard-mixed,mixeval_240601,[],reasoning +691,map_neo_instruct_v0.1,42.4,drop_hard-mixed,mixeval_240601,[],reasoning +692,mistral_7b_instruct_v0.2,44.3,drop_hard-mixed,mixeval_240601,[],reasoning +693,qwen1.5_7b_chat,50.0,drop_hard-mixed,mixeval_240601,[],reasoning +694,reka_edge_20240208,56.9,drop_hard-mixed,mixeval_240601,[],reasoning +695,zephyr_7b_beta,45.3,drop_hard-mixed,mixeval_240601,[],reasoning +696,llama_2_7b_chat,44.3,drop_hard-mixed,mixeval_240601,[],reasoning +697,yi_6b_chat,43.7,drop_hard-mixed,mixeval_240601,[],reasoning +698,qwen1.5_moe_a2.7b_chat,39.5,drop_hard-mixed,mixeval_240601,[],reasoning +699,gemma_1.1_2b_it,27.8,drop_hard-mixed,mixeval_240601,[],reasoning +700,vicuna_7b_v1.5,33.2,drop_hard-mixed,mixeval_240601,[],reasoning +701,olmo_7b_instruct,22.9,drop_hard-mixed,mixeval_240601,[],reasoning +702,qwen1.5_4b_chat,28.6,drop_hard-mixed,mixeval_240601,[],reasoning +703,jetmoe_8b_chat,11.5,drop_hard-mixed,mixeval_240601,[],reasoning +704,mpt_7b_chat,31.0,drop_hard-mixed,mixeval_240601,[],reasoning +705,llama_3_70b,59.5,drop_hard-mixed,mixeval_240601,[],reasoning +706,qwen1.5_72b,26.2,drop_hard-mixed,mixeval_240601,[],reasoning +707,yi_34b,56.5,drop_hard-mixed,mixeval_240601,[],reasoning +708,qwen1.5_32b,36.9,drop_hard-mixed,mixeval_240601,[],reasoning +709,mixtral_8x7b,42.0,drop_hard-mixed,mixeval_240601,[],reasoning +710,llama_2_70b,46.1,drop_hard-mixed,mixeval_240601,[],reasoning +711,qwen1.5_moe_a2.7b,31.0,drop_hard-mixed,mixeval_240601,[],reasoning +712,qwen1.5_7b,29.8,drop_hard-mixed,mixeval_240601,[],reasoning +713,llama_3_8b,37.1,drop_hard-mixed,mixeval_240601,[],reasoning +714,mistral_7b,34.5,drop_hard-mixed,mixeval_240601,[],reasoning +715,gemma_7b,31.4,drop_hard-mixed,mixeval_240601,[],reasoning +716,yi_6b,19.4,drop_hard-mixed,mixeval_240601,[],reasoning +717,qwen1.5_4b,24.7,drop_hard-mixed,mixeval_240601,[],reasoning +718,jetmoe_8b,19.2,drop_hard-mixed,mixeval_240601,[],reasoning +719,deepseek_7b,21.4,drop_hard-mixed,mixeval_240601,[],reasoning +720,phi_2,27.1,drop_hard-mixed,mixeval_240601,[],reasoning +721,deepseekmoe_16b,12.2,drop_hard-mixed,mixeval_240601,[],reasoning +722,llama_2_7b,14.9,drop_hard-mixed,mixeval_240601,[],reasoning +723,gemma_2b,13.2,drop_hard-mixed,mixeval_240601,[],reasoning +724,olmo_7b,11.1,drop_hard-mixed,mixeval_240601,[],reasoning +725,mpt_7b,9.2,drop_hard-mixed,mixeval_240601,[],reasoning +771,llama_3_70b,81.7,boolq-mixed,mixeval_240601,[],knowledge +772,qwen1.5_72b,86.9,boolq-mixed,mixeval_240601,[],knowledge +773,yi_34b,79.4,boolq-mixed,mixeval_240601,[],knowledge +774,qwen1.5_32b,83.4,boolq-mixed,mixeval_240601,[],knowledge +775,mixtral_8x7b,77.7,boolq-mixed,mixeval_240601,[],knowledge +776,llama_2_70b,74.3,boolq-mixed,mixeval_240601,[],knowledge +777,qwen1.5_moe_a2.7b,70.9,boolq-mixed,mixeval_240601,[],knowledge +778,qwen1.5_7b,77.7,boolq-mixed,mixeval_240601,[],knowledge +779,llama_3_8b,64.0,boolq-mixed,mixeval_240601,[],knowledge +780,mistral_7b,68.0,boolq-mixed,mixeval_240601,[],knowledge +781,gemma_7b,74.3,boolq-mixed,mixeval_240601,[],knowledge +782,yi_6b,65.1,boolq-mixed,mixeval_240601,[],knowledge +783,qwen1.5_4b,72.0,boolq-mixed,mixeval_240601,[],knowledge +784,jetmoe_8b,64.6,boolq-mixed,mixeval_240601,[],knowledge +785,deepseek_7b,62.9,boolq-mixed,mixeval_240601,[],knowledge +786,phi_2,73.1,boolq-mixed,mixeval_240601,[],knowledge +787,deepseekmoe_16b,62.9,boolq-mixed,mixeval_240601,[],knowledge +788,llama_2_7b,61.7,boolq-mixed,mixeval_240601,[],knowledge +789,gemma_2b,58.9,boolq-mixed,mixeval_240601,[],knowledge +790,olmo_7b,49.1,boolq-mixed,mixeval_240601,[],knowledge +791,mpt_7b,44.0,boolq-mixed,mixeval_240601,[],knowledge +593,gpt_4_0314,0.57,agieval,BLZ_240312,[],holistic +594,gpt_4_0613,0.57,agieval,BLZ_240312,[],holistic +596,claude_1,0.49700000000000005,agieval,BLZ_240312,[],holistic +601,mixtral_8x7b_instruct_v0.1,0.45299999999999996,agieval,BLZ_240312,[],holistic +602,yi_34b_chat,0.508,agieval,BLZ_240312,[],holistic +605,gpt_3.5_turbo_0314,0.43200000000000005,agieval,BLZ_240312,[],holistic +608,vicuna_33b,0.373,agieval,BLZ_240312,[],holistic +609,starling_lm_7b_alpha,0.401,agieval,BLZ_240312,[],holistic +611,llama_2_70b_chat,0.45,agieval,BLZ_240312,[],holistic +613,openhermes_2.5_mistral_7b,0.43,agieval,BLZ_240312,[],holistic +614,openchat_3.5,0.42700000000000005,agieval,BLZ_240312,[],holistic +617,solar_10.7b_instruct_v1.0,0.47600000000000003,agieval,BLZ_240312,[],holistic +618,dolphin_2.2.1_mistral_7b,0.392,agieval,BLZ_240312,[],holistic +620,zephyr_7b_beta,0.406,agieval,BLZ_240312,[],holistic +623,llama_2_13b_chat,0.336,agieval,BLZ_240312,[],holistic +624,vicuna_13b,0.368,agieval,BLZ_240312,[],holistic +626,zephyr_7b_alpha,0.38,agieval,BLZ_240312,[],holistic +627,qwen_14b_chat,0.396,agieval,BLZ_240312,[],holistic +630,llama_2_7b_chat,0.29600000000000004,agieval,BLZ_240312,[],holistic +632,mistral_7b_instruct_v0.1,0.335,agieval,BLZ_240312,[],holistic +634,vicuna_7b,0.314,agieval,BLZ_240312,[],holistic +636,chatglm3_6b,0.414,agieval,BLZ_240312,[],holistic +643,chatglm_6b,0.325,agieval,BLZ_240312,[],holistic +647,llama_13b,0.205,agieval,BLZ_240312,[],holistic +180,gpt_4_0314,0.963,arc_c,BLZ_240312,[],reasoning +182,mistral_medium,0.899,arc_c,BLZ_240312,[],reasoning +188,mixtral_8x7b_instruct_v0.1,0.7021999999999999,arc_c,BLZ_240312,[],reasoning +189,yi_34b_chat,0.6544,arc_c,BLZ_240312,[],reasoning +192,gpt_3.5_turbo_0314,0.855,arc_c,BLZ_240312,[],reasoning +193,wizardlm_70b_v1.0,0.6544,arc_c,BLZ_240312,[],reasoning +194,tulu_2_dpo_70b,0.721,arc_c,BLZ_240312,[],reasoning +195,vicuna_33b,0.6212,arc_c,BLZ_240312,[],reasoning +196,starling_lm_7b_alpha,0.6382,arc_c,BLZ_240312,[],reasoning +198,llama_2_70b_chat,0.6459,arc_c,BLZ_240312,[],reasoning +200,openhermes_2.5_mistral_7b,0.6493000000000001,arc_c,BLZ_240312,[],reasoning +201,openchat_3.5,0.6391,arc_c,BLZ_240312,[],reasoning +204,solar_10.7b_instruct_v1.0,0.7108,arc_c,BLZ_240312,[],reasoning +205,dolphin_2.2.1_mistral_7b,0.6331,arc_c,BLZ_240312,[],reasoning +206,wizardlm_13b_v1.2,0.5904,arc_c,BLZ_240312,[],reasoning +207,zephyr_7b_beta,0.6203,arc_c,BLZ_240312,[],reasoning +208,mpt_30b_chat,0.5870000000000001,arc_c,BLZ_240312,[],reasoning +209,codellama_34b_instruct,0.5427000000000001,arc_c,BLZ_240312,[],reasoning +210,llama_2_13b_chat,0.5904,arc_c,BLZ_240312,[],reasoning +211,vicuna_13b,0.5708,arc_c,BLZ_240312,[],reasoning +213,zephyr_7b_alpha,0.6101,arc_c,BLZ_240312,[],reasoning +215,falcon_180b_chat,0.6945,arc_c,BLZ_240312,[],reasoning +217,llama_2_7b_chat,0.529,arc_c,BLZ_240312,[],reasoning +219,mistral_7b_instruct_v0.1,0.5452,arc_c,BLZ_240312,[],reasoning +221,vicuna_7b,0.5324,arc_c,BLZ_240312,[],reasoning +235,yi_34bx2_moe_60b,0.7108,arc_c,BLZ_240312,[],reasoning +886,gpt_4_1106_preview,0.977,alpacav1,BLZ_240312,[],holistic +888,gpt_4_0314,0.9528,alpacav1,BLZ_240312,[],holistic +889,gpt_4_0613,0.9528,alpacav1,BLZ_240312,[],holistic +890,mistral_medium,0.9682999999999999,alpacav1,BLZ_240312,[],holistic +891,claude_1,0.8839,alpacav1,BLZ_240312,[],holistic +892,claude_2.0,0.9136,alpacav1,BLZ_240312,[],holistic +893,gemini_pro_dev_api,0.7966,alpacav1,BLZ_240312,[],holistic +894,claude_2.1,0.8708,alpacav1,BLZ_240312,[],holistic +895,gpt_3.5_turbo_0613,0.8937,alpacav1,BLZ_240312,[],holistic +896,mixtral_8x7b_instruct_v0.1,0.9478,alpacav1,BLZ_240312,[],holistic +897,yi_34b_chat,0.9408,alpacav1,BLZ_240312,[],holistic +898,gemini_pro,0.7966,alpacav1,BLZ_240312,[],holistic +900,gpt_3.5_turbo_0314,0.8937,alpacav1,BLZ_240312,[],holistic +902,tulu_2_dpo_70b,0.9503,alpacav1,BLZ_240312,[],holistic +903,vicuna_33b,0.8898999999999999,alpacav1,BLZ_240312,[],holistic +904,starling_lm_7b_alpha,0.9198999999999999,alpacav1,BLZ_240312,[],holistic +906,llama_2_70b_chat,0.9266,alpacav1,BLZ_240312,[],holistic +909,openchat_3.5,0.8851,alpacav1,BLZ_240312,[],holistic +911,gpt_3.5_turbo_1106,0.8626,alpacav1,BLZ_240312,[],holistic +914,wizardlm_13b_v1.2,0.8917,alpacav1,BLZ_240312,[],holistic +915,zephyr_7b_beta,0.9059999999999999,alpacav1,BLZ_240312,[],holistic +918,llama_2_13b_chat,0.8109000000000001,alpacav1,BLZ_240312,[],holistic +921,zephyr_7b_alpha,0.8576,alpacav1,BLZ_240312,[],holistic +924,guanaco_33b,0.6596,alpacav1,BLZ_240312,[],holistic +925,llama_2_7b_chat,0.7137,alpacav1,BLZ_240312,[],holistic +934,chatglm2_6b,0.47130000000000005,alpacav1,BLZ_240312,[],holistic +937,openassistant_pythia_12b,0.2596,alpacav1,BLZ_240312,[],holistic +827,gpt_4_1106_preview,0.5,alpacav2,BLZ_240312,[],holistic +829,gpt_4_0314,0.221,alpacav2,BLZ_240312,[],holistic +830,gpt_4_0613,0.158,alpacav2,BLZ_240312,[],holistic +831,mistral_medium,0.21899999999999997,alpacav2,BLZ_240312,[],holistic +832,claude_1,0.17,alpacav2,BLZ_240312,[],holistic +833,claude_2.0,0.172,alpacav2,BLZ_240312,[],holistic +834,gemini_pro_dev_api,0.16899999999999998,alpacav2,BLZ_240312,[],holistic +835,claude_2.1,0.157,alpacav2,BLZ_240312,[],holistic +836,gpt_3.5_turbo_0613,0.141,alpacav2,BLZ_240312,[],holistic +837,mixtral_8x7b_instruct_v0.1,0.183,alpacav2,BLZ_240312,[],holistic +838,yi_34b_chat,0.297,alpacav2,BLZ_240312,[],holistic +839,gemini_pro,0.16899999999999998,alpacav2,BLZ_240312,[],holistic +840,claude_instant_1,0.161,alpacav2,BLZ_240312,[],holistic +841,gpt_3.5_turbo_0314,0.096,alpacav2,BLZ_240312,[],holistic +842,wizardlm_70b_v1.0,0.14400000000000002,alpacav2,BLZ_240312,[],holistic +843,tulu_2_dpo_70b,0.16,alpacav2,BLZ_240312,[],holistic +844,vicuna_33b,0.127,alpacav2,BLZ_240312,[],holistic +845,starling_lm_7b_alpha,0.142,alpacav2,BLZ_240312,[],holistic +846,deepseek_llm_67b_chat,0.121,alpacav2,BLZ_240312,[],holistic +847,llama_2_70b_chat,0.139,alpacav2,BLZ_240312,[],holistic +849,openhermes_2.5_mistral_7b,0.10300000000000001,alpacav2,BLZ_240312,[],holistic +852,gpt_3.5_turbo_1106,0.092,alpacav2,BLZ_240312,[],holistic +854,dolphin_2.2.1_mistral_7b,0.09,alpacav2,BLZ_240312,[],holistic +855,wizardlm_13b_v1.2,0.12,alpacav2,BLZ_240312,[],holistic +856,zephyr_7b_beta,0.11,alpacav2,BLZ_240312,[],holistic +859,llama_2_13b_chat,0.077,alpacav2,BLZ_240312,[],holistic +860,vicuna_13b,0.067,alpacav2,BLZ_240312,[],holistic +862,zephyr_7b_alpha,0.084,alpacav2,BLZ_240312,[],holistic +863,qwen_14b_chat,0.075,alpacav2,BLZ_240312,[],holistic +865,guanaco_33b,0.05,alpacav2,BLZ_240312,[],holistic +866,llama_2_7b_chat,0.0496,alpacav2,BLZ_240312,[],holistic +870,vicuna_7b,0.048,alpacav2,BLZ_240312,[],holistic +875,chatglm2_6b,0.027999999999999997,alpacav2,BLZ_240312,[],holistic +878,openassistant_pythia_12b,0.018000000000000002,alpacav2,BLZ_240312,[],holistic +1299,gpt_4_1106_preview,0.32799999999999996,alpacaeval2_lc,BLZ_240312,[],holistic +1301,gpt_4_0314,0.21600000000000003,alpacaeval2_lc,BLZ_240312,[],holistic +1302,gpt_4_0613,0.18600000000000003,alpacaeval2_lc,BLZ_240312,[],holistic +1303,mistral_medium,0.196,alpacaeval2_lc,BLZ_240312,[],holistic +1304,claude_1,0.21100000000000002,alpacaeval2_lc,BLZ_240312,[],holistic +1305,claude_2.0,0.21600000000000003,alpacaeval2_lc,BLZ_240312,[],holistic +1306,gemini_pro_dev_api,0.172,alpacaeval2_lc,BLZ_240312,[],holistic +1307,claude_2.1,0.193,alpacaeval2_lc,BLZ_240312,[],holistic +1308,gpt_3.5_turbo_0613,0.14300000000000002,alpacaeval2_lc,BLZ_240312,[],holistic +1309,mixtral_8x7b_instruct_v0.1,0.168,alpacaeval2_lc,BLZ_240312,[],holistic +1310,yi_34b_chat,0.188,alpacaeval2_lc,BLZ_240312,[],holistic +1312,claude_instant_1,0.195,alpacaeval2_lc,BLZ_240312,[],holistic +1313,gpt_3.5_turbo_0314,0.156,alpacaeval2_lc,BLZ_240312,[],holistic +1314,wizardlm_70b_v1.0,0.125,alpacaeval2_lc,BLZ_240312,[],holistic +1315,tulu_2_dpo_70b,0.151,alpacaeval2_lc,BLZ_240312,[],holistic +1316,vicuna_33b,0.115,alpacaeval2_lc,BLZ_240312,[],holistic +1317,starling_lm_7b_alpha,0.10099999999999999,alpacaeval2_lc,BLZ_240312,[],holistic +1318,deepseek_llm_67b_chat,0.141,alpacaeval2_lc,BLZ_240312,[],holistic +1319,llama_2_70b_chat,0.10400000000000001,alpacaeval2_lc,BLZ_240312,[],holistic +1321,openhermes_2.5_mistral_7b,0.126,alpacaeval2_lc,BLZ_240312,[],holistic +1324,gpt_3.5_turbo_1106,0.155,alpacaeval2_lc,BLZ_240312,[],holistic +1326,dolphin_2.2.1_mistral_7b,0.10800000000000001,alpacaeval2_lc,BLZ_240312,[],holistic +1327,wizardlm_13b_v1.2,0.099,alpacaeval2_lc,BLZ_240312,[],holistic +1328,zephyr_7b_beta,0.102,alpacaeval2_lc,BLZ_240312,[],holistic +1331,llama_2_13b_chat,0.068,alpacaeval2_lc,BLZ_240312,[],holistic +1332,vicuna_13b,0.085,alpacaeval2_lc,BLZ_240312,[],holistic +1334,zephyr_7b_alpha,0.086,alpacaeval2_lc,BLZ_240312,[],holistic +1335,qwen_14b_chat,0.1,alpacaeval2_lc,BLZ_240312,[],holistic +1338,llama_2_7b_chat,0.045,alpacaeval2_lc,BLZ_240312,[],holistic +1342,vicuna_7b,0.06,alpacaeval2_lc,BLZ_240312,[],holistic +0,gpt_4_0125_preview,1.0,arena_elo,BLZ_240312,[],holistic +1,gpt_4_1106_preview,0.9992019154030327,arena_elo,BLZ_240312,[],holistic +2,bard_gemini_pro,0.9768555466879489,arena_elo,BLZ_240312,[],holistic +3,gpt_4_0314,0.9497206703910615,arena_elo,BLZ_240312,[],holistic +4,gpt_4_0613,0.9273743016759777,arena_elo,BLZ_240312,[],holistic +5,mistral_medium,0.9177972865123704,arena_elo,BLZ_240312,[],holistic +6,claude_1,0.9169992019154031,arena_elo,BLZ_240312,[],holistic +7,claude_2.0,0.9034317637669593,arena_elo,BLZ_240312,[],holistic +8,gemini_pro_dev_api,0.8938547486033519,arena_elo,BLZ_240312,[],holistic +9,claude_2.1,0.8930566640063847,arena_elo,BLZ_240312,[],holistic +10,gpt_3.5_turbo_0613,0.8922585794094174,arena_elo,BLZ_240312,[],holistic +11,mixtral_8x7b_instruct_v0.1,0.8922585794094174,arena_elo,BLZ_240312,[],holistic +12,yi_34b_chat,0.8898643256185156,arena_elo,BLZ_240312,[],holistic +13,gemini_pro,0.8890662410215483,arena_elo,BLZ_240312,[],holistic +14,claude_instant_1,0.8850758180367119,arena_elo,BLZ_240312,[],holistic +15,gpt_3.5_turbo_0314,0.8818834796488427,arena_elo,BLZ_240312,[],holistic +16,wizardlm_70b_v1.0,0.8818834796488427,arena_elo,BLZ_240312,[],holistic +17,tulu_2_dpo_70b,0.8810853950518756,arena_elo,BLZ_240312,[],holistic +18,vicuna_33b,0.8723064644852354,arena_elo,BLZ_240312,[],holistic +19,starling_lm_7b_alpha,0.8699122106943336,arena_elo,BLZ_240312,[],holistic +20,deepseek_llm_67b_chat,0.8635275339185954,arena_elo,BLZ_240312,[],holistic +21,llama_2_70b_chat,0.8635275339185954,arena_elo,BLZ_240312,[],holistic +22,nv_llama2_70b_steerlm_chat,0.8603351955307262,arena_elo,BLZ_240312,[],holistic +23,openhermes_2.5_mistral_7b,0.8603351955307262,arena_elo,BLZ_240312,[],holistic +24,openchat_3.5,0.8587390263367917,arena_elo,BLZ_240312,[],holistic +25,pplx_70b_online,0.8587390263367917,arena_elo,BLZ_240312,[],holistic +26,gpt_3.5_turbo_1106,0.8547486033519553,arena_elo,BLZ_240312,[],holistic +27,solar_10.7b_instruct_v1.0,0.8499600957701516,arena_elo,BLZ_240312,[],holistic +28,dolphin_2.2.1_mistral_7b,0.8499600957701516,arena_elo,BLZ_240312,[],holistic +29,wizardlm_13b_v1.2,0.8443735035913806,arena_elo,BLZ_240312,[],holistic +30,zephyr_7b_beta,0.8387869114126097,arena_elo,BLZ_240312,[],holistic +31,mpt_30b_chat,0.8332003192338387,arena_elo,BLZ_240312,[],holistic +32,codellama_34b_instruct,0.8324022346368715,arena_elo,BLZ_240312,[],holistic +33,llama_2_13b_chat,0.8316041500399042,arena_elo,BLZ_240312,[],holistic +34,vicuna_13b,0.8300079808459697,arena_elo,BLZ_240312,[],holistic +35,pplx_7b_online,0.8284118116520351,arena_elo,BLZ_240312,[],holistic +36,zephyr_7b_alpha,0.8276137270550679,arena_elo,BLZ_240312,[],holistic +37,qwen_14b_chat,0.825219473264166,arena_elo,BLZ_240312,[],holistic +38,falcon_180b_chat,0.8236233040702314,arena_elo,BLZ_240312,[],holistic +39,guanaco_33b,0.8236233040702314,arena_elo,BLZ_240312,[],holistic +40,llama_2_7b_chat,0.8172386272944933,arena_elo,BLZ_240312,[],holistic +41,stripedhyena_nous_7b,0.8140462889066241,arena_elo,BLZ_240312,[],holistic +42,mistral_7b_instruct_v0.1,0.8028731045490822,arena_elo,BLZ_240312,[],holistic +43,palm_chat_bison_001,0.8028731045490822,arena_elo,BLZ_240312,[],holistic +44,vicuna_7b,0.8020750199521149,arena_elo,BLZ_240312,[],holistic +45,koala_13b,0.770949720670391,arena_elo,BLZ_240312,[],holistic +46,chatglm3_6b,0.7661612130885874,arena_elo,BLZ_240312,[],holistic +47,gpt4all_13b_snoozy,0.74780526735834,arena_elo,BLZ_240312,[],holistic +48,mpt_7b_chat,0.7430167597765364,arena_elo,BLZ_240312,[],holistic +49,chatglm2_6b,0.7422186751795691,arena_elo,BLZ_240312,[],holistic +50,rwkv_4_raven_14b,0.7382282521947326,arena_elo,BLZ_240312,[],holistic +51,alpaca_13b,0.7214684756584198,arena_elo,BLZ_240312,[],holistic +52,openassistant_pythia_12b,0.7158818834796489,arena_elo,BLZ_240312,[],holistic +53,chatglm_6b,0.704708699122107,arena_elo,BLZ_240312,[],holistic +54,fastchat_t5_3b,0.6975259377494014,arena_elo,BLZ_240312,[],holistic +55,stablelm_tuned_alpha_7b,0.6743814844373504,arena_elo,BLZ_240312,[],holistic +56,dolly_v2_12b,0.6568236233040702,arena_elo,BLZ_240312,[],holistic +57,llama_13b,0.6384676775738228,arena_elo,BLZ_240312,[],holistic +709,gpt_4_1106_preview,0.8390000000000001,bbh,BLZ_240312,[],holistic +711,gpt_4_0314,0.867,bbh,BLZ_240312,[],holistic +712,gpt_4_0613,0.867,bbh,BLZ_240312,[],holistic +714,claude_1,0.6729999999999999,bbh,BLZ_240312,[],holistic +716,gemini_pro_dev_api,0.6559999999999999,bbh,BLZ_240312,[],holistic +718,gpt_3.5_turbo_0613,0.71,bbh,BLZ_240312,[],holistic +719,mixtral_8x7b_instruct_v0.1,0.67,bbh,BLZ_240312,[],holistic +720,yi_34b_chat,0.7170000000000001,bbh,BLZ_240312,[],holistic +721,gemini_pro,0.6559999999999999,bbh,BLZ_240312,[],holistic +725,tulu_2_dpo_70b,0.66,bbh,BLZ_240312,[],holistic +726,vicuna_33b,0.52,bbh,BLZ_240312,[],holistic +729,llama_2_70b_chat,0.608,bbh,BLZ_240312,[],holistic +734,gpt_3.5_turbo_1106,0.71,bbh,BLZ_240312,[],holistic +736,dolphin_2.2.1_mistral_7b,0.598,bbh,BLZ_240312,[],holistic +741,llama_2_13b_chat,0.5820000000000001,bbh,BLZ_240312,[],holistic +742,vicuna_13b,0.515,bbh,BLZ_240312,[],holistic +745,qwen_14b_chat,0.537,bbh,BLZ_240312,[],holistic +748,llama_2_7b_chat,0.35600000000000004,bbh,BLZ_240312,[],holistic +750,mistral_7b_instruct_v0.1,0.5670000000000001,bbh,BLZ_240312,[],holistic +752,vicuna_7b,0.434,bbh,BLZ_240312,[],holistic +765,llama_13b,0.379,bbh,BLZ_240312,[],holistic +1122,gpt_4_1106_preview,0.8604999999999999,eq_benchv2,BLZ_240312,[],holistic +1124,gpt_4_0314,0.8573000000000001,eq_benchv2,BLZ_240312,[],holistic +1125,gpt_4_0613,0.8479000000000001,eq_benchv2,BLZ_240312,[],holistic +1126,mistral_medium,0.8256999999999999,eq_benchv2,BLZ_240312,[],holistic +1127,claude_1,0.7683,eq_benchv2,BLZ_240312,[],holistic +1128,claude_2.0,0.7289,eq_benchv2,BLZ_240312,[],holistic +1129,gemini_pro_dev_api,0.7508,eq_benchv2,BLZ_240312,[],holistic +1130,claude_2.1,0.7395999999999999,eq_benchv2,BLZ_240312,[],holistic +1131,gpt_3.5_turbo_0613,0.6934999999999999,eq_benchv2,BLZ_240312,[],holistic +1132,mixtral_8x7b_instruct_v0.1,0.7237,eq_benchv2,BLZ_240312,[],holistic +1133,yi_34b_chat,0.7162000000000001,eq_benchv2,BLZ_240312,[],holistic +1135,claude_instant_1,0.6904,eq_benchv2,BLZ_240312,[],holistic +1136,gpt_3.5_turbo_0314,0.7067,eq_benchv2,BLZ_240312,[],holistic +1137,wizardlm_70b_v1.0,0.7128,eq_benchv2,BLZ_240312,[],holistic +1138,tulu_2_dpo_70b,0.7663,eq_benchv2,BLZ_240312,[],holistic +1139,vicuna_33b,0.6707,eq_benchv2,BLZ_240312,[],holistic +1140,starling_lm_7b_alpha,0.7390000000000001,eq_benchv2,BLZ_240312,[],holistic +1141,deepseek_llm_67b_chat,0.7753,eq_benchv2,BLZ_240312,[],holistic +1142,llama_2_70b_chat,0.7359,eq_benchv2,BLZ_240312,[],holistic +1144,openhermes_2.5_mistral_7b,0.6689,eq_benchv2,BLZ_240312,[],holistic +1145,openchat_3.5,0.7218000000000001,eq_benchv2,BLZ_240312,[],holistic +1146,pplx_70b_online,0.6279,eq_benchv2,BLZ_240312,[],holistic +1147,gpt_3.5_turbo_1106,0.7173999999999999,eq_benchv2,BLZ_240312,[],holistic +1148,solar_10.7b_instruct_v1.0,0.7353000000000001,eq_benchv2,BLZ_240312,[],holistic +1149,dolphin_2.2.1_mistral_7b,0.6992,eq_benchv2,BLZ_240312,[],holistic +1150,wizardlm_13b_v1.2,0.6371,eq_benchv2,BLZ_240312,[],holistic +1151,zephyr_7b_beta,0.5832999999999999,eq_benchv2,BLZ_240312,[],holistic +1153,codellama_34b_instruct,0.4915,eq_benchv2,BLZ_240312,[],holistic +1154,llama_2_13b_chat,0.49119999999999997,eq_benchv2,BLZ_240312,[],holistic +1155,vicuna_13b,0.6739,eq_benchv2,BLZ_240312,[],holistic +1156,pplx_7b_online,0.4891,eq_benchv2,BLZ_240312,[],holistic +1157,zephyr_7b_alpha,0.5682,eq_benchv2,BLZ_240312,[],holistic +1158,qwen_14b_chat,0.6347,eq_benchv2,BLZ_240312,[],holistic +1159,falcon_180b_chat,0.5682,eq_benchv2,BLZ_240312,[],holistic +1160,guanaco_33b,0.3611,eq_benchv2,BLZ_240312,[],holistic +1161,llama_2_7b_chat,0.3632,eq_benchv2,BLZ_240312,[],holistic +1162,stripedhyena_nous_7b,0.5458,eq_benchv2,BLZ_240312,[],holistic +1163,mistral_7b_instruct_v0.1,0.5215,eq_benchv2,BLZ_240312,[],holistic +1179,yi_34bx2_moe_60b,0.7269,eq_benchv2,BLZ_240312,[],holistic +542,mixtral_8x7b_instruct_v0.1,0.7641,gpt4all,BLZ_240312,[],holistic +543,yi_34b_chat,0.7212999999999999,gpt4all,BLZ_240312,[],holistic +550,starling_lm_7b_alpha,0.7272,gpt4all,BLZ_240312,[],holistic +554,openhermes_2.5_mistral_7b,0.7312000000000001,gpt4all,BLZ_240312,[],holistic +555,openchat_3.5,0.7292000000000001,gpt4all,BLZ_240312,[],holistic +558,solar_10.7b_instruct_v1.0,0.7511,gpt4all,BLZ_240312,[],holistic +559,dolphin_2.2.1_mistral_7b,0.7223999999999999,gpt4all,BLZ_240312,[],holistic +561,zephyr_7b_beta,0.7182999999999999,gpt4all,BLZ_240312,[],holistic +565,vicuna_13b,0.631,gpt4all,BLZ_240312,[],holistic +567,zephyr_7b_alpha,0.7223999999999999,gpt4all,BLZ_240312,[],holistic +573,mistral_7b_instruct_v0.1,0.6795,gpt4all,BLZ_240312,[],holistic +575,vicuna_7b,0.61,gpt4all,BLZ_240312,[],holistic +576,koala_13b,0.62,gpt4all,BLZ_240312,[],holistic +578,gpt4all_13b_snoozy,0.653,gpt4all,BLZ_240312,[],holistic +579,mpt_7b_chat,0.648,gpt4all,BLZ_240312,[],holistic +583,openassistant_pythia_12b,0.61,gpt4all,BLZ_240312,[],holistic +585,fastchat_t5_3b,0.537,gpt4all,BLZ_240312,[],holistic +586,stablelm_tuned_alpha_7b,0.513,gpt4all,BLZ_240312,[],holistic +588,llama_13b,0.63,gpt4all,BLZ_240312,[],holistic +477,mistral_medium,0.667,gsm8k,BLZ_240312,[],math +483,mixtral_8x7b_instruct_v0.1,0.6073,gsm8k,BLZ_240312,[],math +484,yi_34b_chat,0.31920000000000004,gsm8k,BLZ_240312,[],math +487,gpt_3.5_turbo_0314,0.5710000000000001,gsm8k,BLZ_240312,[],math +488,wizardlm_70b_v1.0,0.1797,gsm8k,BLZ_240312,[],math +489,tulu_2_dpo_70b,0.6262,gsm8k,BLZ_240312,[],math +490,vicuna_33b,0.13720000000000002,gsm8k,BLZ_240312,[],math +491,starling_lm_7b_alpha,0.624,gsm8k,BLZ_240312,[],math +493,llama_2_70b_chat,0.2669,gsm8k,BLZ_240312,[],math +495,openhermes_2.5_mistral_7b,0.2608,gsm8k,BLZ_240312,[],math +496,openchat_3.5,0.26839999999999997,gsm8k,BLZ_240312,[],math +499,solar_10.7b_instruct_v1.0,0.6475,gsm8k,BLZ_240312,[],math +500,dolphin_2.2.1_mistral_7b,0.4807,gsm8k,BLZ_240312,[],math +501,wizardlm_13b_v1.2,0.135,gsm8k,BLZ_240312,[],math +502,zephyr_7b_beta,0.2904,gsm8k,BLZ_240312,[],math +503,mpt_30b_chat,0.1213,gsm8k,BLZ_240312,[],math +504,codellama_34b_instruct,0.37979999999999997,gsm8k,BLZ_240312,[],math +505,llama_2_13b_chat,0.1524,gsm8k,BLZ_240312,[],math +506,vicuna_13b,0.113,gsm8k,BLZ_240312,[],math +508,zephyr_7b_alpha,0.14029999999999998,gsm8k,BLZ_240312,[],math +509,qwen_14b_chat,0.597,gsm8k,BLZ_240312,[],math +510,falcon_180b_chat,0.4594,gsm8k,BLZ_240312,[],math +512,llama_2_7b_chat,0.0735,gsm8k,BLZ_240312,[],math +514,mistral_7b_instruct_v0.1,0.1425,gsm8k,BLZ_240312,[],math +516,vicuna_7b,0.0819,gsm8k,BLZ_240312,[],math +530,yi_34bx2_moe_60b,0.7551000000000001,gsm8k,BLZ_240312,[],math +239,gpt_4_0314,0.953,hellaswag,BLZ_240312,[],reasoning +241,mistral_medium,0.88,hellaswag,BLZ_240312,[],reasoning +247,mixtral_8x7b_instruct_v0.1,0.8763,hellaswag,BLZ_240312,[],reasoning +248,yi_34b_chat,0.8416,hellaswag,BLZ_240312,[],reasoning +251,gpt_3.5_turbo_0314,0.706,hellaswag,BLZ_240312,[],reasoning +252,wizardlm_70b_v1.0,0.8441,hellaswag,BLZ_240312,[],reasoning +253,tulu_2_dpo_70b,0.8898999999999999,hellaswag,BLZ_240312,[],reasoning +254,vicuna_33b,0.83,hellaswag,BLZ_240312,[],reasoning +255,starling_lm_7b_alpha,0.8490000000000001,hellaswag,BLZ_240312,[],reasoning +257,llama_2_70b_chat,0.8588,hellaswag,BLZ_240312,[],reasoning +259,openhermes_2.5_mistral_7b,0.8418000000000001,hellaswag,BLZ_240312,[],reasoning +260,openchat_3.5,0.8479000000000001,hellaswag,BLZ_240312,[],reasoning +263,solar_10.7b_instruct_v1.0,0.8815999999999999,hellaswag,BLZ_240312,[],reasoning +264,dolphin_2.2.1_mistral_7b,0.8376,hellaswag,BLZ_240312,[],reasoning +265,wizardlm_13b_v1.2,0.8220999999999999,hellaswag,BLZ_240312,[],reasoning +266,zephyr_7b_beta,0.8436,hellaswag,BLZ_240312,[],reasoning +267,mpt_30b_chat,0.8254,hellaswag,BLZ_240312,[],reasoning +268,codellama_34b_instruct,0.7692,hellaswag,BLZ_240312,[],reasoning +269,llama_2_13b_chat,0.8194,hellaswag,BLZ_240312,[],reasoning +270,vicuna_13b,0.8123999999999999,hellaswag,BLZ_240312,[],reasoning +272,zephyr_7b_alpha,0.8404,hellaswag,BLZ_240312,[],reasoning +274,falcon_180b_chat,0.8886,hellaswag,BLZ_240312,[],reasoning +276,llama_2_7b_chat,0.7855,hellaswag,BLZ_240312,[],reasoning +278,mistral_7b_instruct_v0.1,0.7563,hellaswag,BLZ_240312,[],reasoning +280,vicuna_7b,0.7739,hellaswag,BLZ_240312,[],reasoning +294,yi_34bx2_moe_60b,0.8523000000000001,hellaswag,BLZ_240312,[],reasoning +129,mixtral_8x7b_instruct_v0.1,0.7262000000000001,hugging_6,BLZ_240312,[],holistic +130,yi_34b_chat,0.6531999999999999,hugging_6,BLZ_240312,[],holistic +134,wizardlm_70b_v1.0,0.6125,hugging_6,BLZ_240312,[],holistic +135,tulu_2_dpo_70b,0.7376999999999999,hugging_6,BLZ_240312,[],holistic +136,vicuna_33b,0.585,hugging_6,BLZ_240312,[],holistic +137,starling_lm_7b_alpha,0.6713,hugging_6,BLZ_240312,[],holistic +139,llama_2_70b_chat,0.624,hugging_6,BLZ_240312,[],holistic +141,openhermes_2.5_mistral_7b,0.6152000000000001,hugging_6,BLZ_240312,[],holistic +142,openchat_3.5,0.6124,hugging_6,BLZ_240312,[],holistic +145,solar_10.7b_instruct_v1.0,0.742,hugging_6,BLZ_240312,[],holistic +146,dolphin_2.2.1_mistral_7b,0.6493000000000001,hugging_6,BLZ_240312,[],holistic +147,wizardlm_13b_v1.2,0.5476,hugging_6,BLZ_240312,[],holistic +148,zephyr_7b_beta,0.6195,hugging_6,BLZ_240312,[],holistic +149,mpt_30b_chat,0.5538000000000001,hugging_6,BLZ_240312,[],holistic +150,codellama_34b_instruct,0.5729,hugging_6,BLZ_240312,[],holistic +151,llama_2_13b_chat,0.5490999999999999,hugging_6,BLZ_240312,[],holistic +152,vicuna_13b,0.5539999999999999,hugging_6,BLZ_240312,[],holistic +154,zephyr_7b_alpha,0.595,hugging_6,BLZ_240312,[],holistic +156,falcon_180b_chat,0.6785,hugging_6,BLZ_240312,[],holistic +158,llama_2_7b_chat,0.5074000000000001,hugging_6,BLZ_240312,[],holistic +160,mistral_7b_instruct_v0.1,0.5496,hugging_6,BLZ_240312,[],holistic +162,vicuna_7b,0.521,hugging_6,BLZ_240312,[],holistic +176,yi_34bx2_moe_60b,0.7672,hugging_6,BLZ_240312,[],holistic +768,gpt_4_1106_preview,0.8540000000000001,humaneval,BLZ_240312,[],code +770,gpt_4_0314,0.884,humaneval,BLZ_240312,[],code +771,gpt_4_0613,0.884,humaneval,BLZ_240312,[],code +773,claude_1,0.56,humaneval,BLZ_240312,[],code +774,claude_2.0,0.7120000000000001,humaneval,BLZ_240312,[],code +775,gemini_pro_dev_api,0.634,humaneval,BLZ_240312,[],code +777,gpt_3.5_turbo_0613,0.726,humaneval,BLZ_240312,[],code +778,mixtral_8x7b_instruct_v0.1,0.5489999999999999,humaneval,BLZ_240312,[],code +780,gemini_pro,0.634,humaneval,BLZ_240312,[],code +781,claude_instant_1,0.528,humaneval,BLZ_240312,[],code +782,gpt_3.5_turbo_0314,0.732,humaneval,BLZ_240312,[],code +790,openhermes_2.5_mistral_7b,0.48200000000000004,humaneval,BLZ_240312,[],code +791,openchat_3.5,0.555,humaneval,BLZ_240312,[],code +793,gpt_3.5_turbo_1106,0.726,humaneval,BLZ_240312,[],code +797,zephyr_7b_beta,0.3,humaneval,BLZ_240312,[],code +799,codellama_34b_instruct,0.518,humaneval,BLZ_240312,[],code +801,vicuna_13b,0.171,humaneval,BLZ_240312,[],code +804,qwen_14b_chat,0.439,humaneval,BLZ_240312,[],code +809,mistral_7b_instruct_v0.1,0.287,humaneval,BLZ_240312,[],code +811,vicuna_7b,0.11599999999999999,humaneval,BLZ_240312,[],code +947,gpt_4_0314,0.93,llmonitor,BLZ_240312,[],holistic +948,gpt_4_0613,0.89,llmonitor,BLZ_240312,[],holistic +950,claude_1,0.66,llmonitor,BLZ_240312,[],holistic +951,claude_2.0,0.68,llmonitor,BLZ_240312,[],holistic +954,gpt_3.5_turbo_0613,0.81,llmonitor,BLZ_240312,[],holistic +958,claude_instant_1,0.6,llmonitor,BLZ_240312,[],holistic +959,gpt_3.5_turbo_0314,0.79,llmonitor,BLZ_240312,[],holistic +965,llama_2_70b_chat,0.6,llmonitor,BLZ_240312,[],holistic +975,mpt_30b_chat,0.4,llmonitor,BLZ_240312,[],holistic +976,codellama_34b_instruct,0.34,llmonitor,BLZ_240312,[],holistic +977,llama_2_13b_chat,0.5,llmonitor,BLZ_240312,[],holistic +978,vicuna_13b,0.5,llmonitor,BLZ_240312,[],holistic +982,falcon_180b_chat,0.67,llmonitor,BLZ_240312,[],holistic +983,guanaco_33b,0.43,llmonitor,BLZ_240312,[],holistic +984,llama_2_7b_chat,0.5,llmonitor,BLZ_240312,[],holistic +986,mistral_7b_instruct_v0.1,0.57,llmonitor,BLZ_240312,[],holistic +987,palm_chat_bison_001,0.57,llmonitor,BLZ_240312,[],holistic +988,vicuna_7b,0.41,llmonitor,BLZ_240312,[],holistic +989,koala_13b,0.31,llmonitor,BLZ_240312,[],holistic +992,mpt_7b_chat,0.43,llmonitor,BLZ_240312,[],holistic +1000,dolly_v2_12b,0.23,llmonitor,BLZ_240312,[],holistic +1185,mistral_medium,0.654,magi,BLZ_240312,[],holistic +1188,gemini_pro_dev_api,0.528,magi,BLZ_240312,[],holistic +1190,gpt_3.5_turbo_0613,0.455,magi,BLZ_240312,[],holistic +1191,mixtral_8x7b_instruct_v0.1,0.49560000000000004,magi,BLZ_240312,[],holistic +1192,yi_34b_chat,0.5821999999999999,magi,BLZ_240312,[],holistic +1195,gpt_3.5_turbo_0314,0.512,magi,BLZ_240312,[],holistic +1196,wizardlm_70b_v1.0,0.4476,magi,BLZ_240312,[],holistic +1197,tulu_2_dpo_70b,0.5212,magi,BLZ_240312,[],holistic +1198,vicuna_33b,0.3837,magi,BLZ_240312,[],holistic +1199,starling_lm_7b_alpha,0.4304,magi,BLZ_240312,[],holistic +1200,deepseek_llm_67b_chat,0.5946,magi,BLZ_240312,[],holistic +1201,llama_2_70b_chat,0.39899999999999997,magi,BLZ_240312,[],holistic +1203,openhermes_2.5_mistral_7b,0.4236,magi,BLZ_240312,[],holistic +1204,openchat_3.5,0.42200000000000004,magi,BLZ_240312,[],holistic +1206,gpt_3.5_turbo_1106,0.462,magi,BLZ_240312,[],holistic +1207,solar_10.7b_instruct_v1.0,0.4693,magi,BLZ_240312,[],holistic +1208,dolphin_2.2.1_mistral_7b,0.3782,magi,BLZ_240312,[],holistic +1209,wizardlm_13b_v1.2,0.3678,magi,BLZ_240312,[],holistic +1210,zephyr_7b_beta,0.4042,magi,BLZ_240312,[],holistic +1213,llama_2_13b_chat,0.37170000000000003,magi,BLZ_240312,[],holistic +1214,vicuna_13b,0.36560000000000004,magi,BLZ_240312,[],holistic +1216,zephyr_7b_alpha,0.39899999999999997,magi,BLZ_240312,[],holistic +1217,qwen_14b_chat,0.4535,magi,BLZ_240312,[],holistic +1219,guanaco_33b,0.38659999999999994,magi,BLZ_240312,[],holistic +1220,llama_2_7b_chat,0.35969999999999996,magi,BLZ_240312,[],holistic +1222,mistral_7b_instruct_v0.1,0.3704,magi,BLZ_240312,[],holistic +1063,gpt_4_1106_preview,0.83,mbpp,BLZ_240312,[],code +1067,mistral_medium,0.623,mbpp,BLZ_240312,[],code +1070,gemini_pro_dev_api,0.7290000000000001,mbpp,BLZ_240312,[],code +1073,mixtral_8x7b_instruct_v0.1,0.607,mbpp,BLZ_240312,[],code +1075,gemini_pro,0.7290000000000001,mbpp,BLZ_240312,[],code +1077,gpt_3.5_turbo_0314,0.816,mbpp,BLZ_240312,[],code +1089,solar_10.7b_instruct_v1.0,0.429,mbpp,BLZ_240312,[],code +1092,zephyr_7b_beta,0.41100000000000003,mbpp,BLZ_240312,[],code +296,gpt_4_1106_preview,0.805,mmlu,BLZ_240312,[],knowledge +298,gpt_4_0314,0.8640000000000001,mmlu,BLZ_240312,[],knowledge +300,mistral_medium,0.753,mmlu,BLZ_240312,[],knowledge +301,claude_1,0.77,mmlu,BLZ_240312,[],knowledge +302,claude_2.0,0.785,mmlu,BLZ_240312,[],knowledge +303,gemini_pro_dev_api,0.718,mmlu,BLZ_240312,[],knowledge +306,mixtral_8x7b_instruct_v0.1,0.706,mmlu,BLZ_240312,[],knowledge +307,yi_34b_chat,0.735,mmlu,BLZ_240312,[],knowledge +308,gemini_pro,0.718,mmlu,BLZ_240312,[],knowledge +309,claude_instant_1,0.7340000000000001,mmlu,BLZ_240312,[],knowledge +310,gpt_3.5_turbo_0314,0.7,mmlu,BLZ_240312,[],knowledge +311,wizardlm_70b_v1.0,0.637,mmlu,BLZ_240312,[],knowledge +312,tulu_2_dpo_70b,0.698,mmlu,BLZ_240312,[],knowledge +313,vicuna_33b,0.5920000000000001,mmlu,BLZ_240312,[],knowledge +314,starling_lm_7b_alpha,0.639,mmlu,BLZ_240312,[],knowledge +315,deepseek_llm_67b_chat,0.713,mmlu,BLZ_240312,[],knowledge +316,llama_2_70b_chat,0.63,mmlu,BLZ_240312,[],knowledge +317,nv_llama2_70b_steerlm_chat,0.685,mmlu,BLZ_240312,[],knowledge +318,openhermes_2.5_mistral_7b,0.638,mmlu,BLZ_240312,[],knowledge +319,openchat_3.5,0.643,mmlu,BLZ_240312,[],knowledge +321,gpt_3.5_turbo_1106,0.6779999999999999,mmlu,BLZ_240312,[],knowledge +322,solar_10.7b_instruct_v1.0,0.662,mmlu,BLZ_240312,[],knowledge +323,dolphin_2.2.1_mistral_7b,0.632,mmlu,BLZ_240312,[],knowledge +324,wizardlm_13b_v1.2,0.527,mmlu,BLZ_240312,[],knowledge +325,zephyr_7b_beta,0.614,mmlu,BLZ_240312,[],knowledge +326,mpt_30b_chat,0.504,mmlu,BLZ_240312,[],knowledge +327,codellama_34b_instruct,0.537,mmlu,BLZ_240312,[],knowledge +328,llama_2_13b_chat,0.536,mmlu,BLZ_240312,[],knowledge +329,vicuna_13b,0.5579999999999999,mmlu,BLZ_240312,[],knowledge +331,zephyr_7b_alpha,0.614,mmlu,BLZ_240312,[],knowledge +332,qwen_14b_chat,0.665,mmlu,BLZ_240312,[],knowledge +333,falcon_180b_chat,0.68,mmlu,BLZ_240312,[],knowledge +334,guanaco_33b,0.5760000000000001,mmlu,BLZ_240312,[],knowledge +335,llama_2_7b_chat,0.45799999999999996,mmlu,BLZ_240312,[],knowledge +337,mistral_7b_instruct_v0.1,0.5539999999999999,mmlu,BLZ_240312,[],knowledge +339,vicuna_7b,0.51,mmlu,BLZ_240312,[],knowledge +340,koala_13b,0.447,mmlu,BLZ_240312,[],knowledge +342,gpt4all_13b_snoozy,0.43,mmlu,BLZ_240312,[],knowledge +343,mpt_7b_chat,0.32,mmlu,BLZ_240312,[],knowledge +344,chatglm2_6b,0.455,mmlu,BLZ_240312,[],knowledge +345,rwkv_4_raven_14b,0.256,mmlu,BLZ_240312,[],knowledge +346,alpaca_13b,0.48100000000000004,mmlu,BLZ_240312,[],knowledge +347,openassistant_pythia_12b,0.27,mmlu,BLZ_240312,[],knowledge +348,chatglm_6b,0.361,mmlu,BLZ_240312,[],knowledge +349,fastchat_t5_3b,0.47700000000000004,mmlu,BLZ_240312,[],knowledge +350,stablelm_tuned_alpha_7b,0.244,mmlu,BLZ_240312,[],knowledge +351,dolly_v2_12b,0.257,mmlu,BLZ_240312,[],knowledge +352,llama_13b,0.47,mmlu,BLZ_240312,[],knowledge +353,yi_34bx2_moe_60b,0.775,mmlu,BLZ_240312,[],knowledge +59,gpt_4_0125_preview,0.0929,mt_bench,BLZ_240312,[],holistic +60,gpt_4_1106_preview,0.0932,mt_bench,BLZ_240312,[],holistic +62,gpt_4_0314,0.08960000000000001,mt_bench,BLZ_240312,[],holistic +63,gpt_4_0613,0.09179999999999999,mt_bench,BLZ_240312,[],holistic +64,mistral_medium,0.0861,mt_bench,BLZ_240312,[],holistic +65,claude_1,0.079,mt_bench,BLZ_240312,[],holistic +66,claude_2.0,0.0806,mt_bench,BLZ_240312,[],holistic +67,gemini_pro_dev_api,0.08039999999999999,mt_bench,BLZ_240312,[],holistic +68,claude_2.1,0.0818,mt_bench,BLZ_240312,[],holistic +69,gpt_3.5_turbo_0613,0.0839,mt_bench,BLZ_240312,[],holistic +70,mixtral_8x7b_instruct_v0.1,0.083,mt_bench,BLZ_240312,[],holistic +71,yi_34b_chat,0.07769999999999999,mt_bench,BLZ_240312,[],holistic +72,gemini_pro,0.08039999999999999,mt_bench,BLZ_240312,[],holistic +73,claude_instant_1,0.0785,mt_bench,BLZ_240312,[],holistic +74,gpt_3.5_turbo_0314,0.0794,mt_bench,BLZ_240312,[],holistic +75,wizardlm_70b_v1.0,0.0771,mt_bench,BLZ_240312,[],holistic +76,tulu_2_dpo_70b,0.0789,mt_bench,BLZ_240312,[],holistic +77,vicuna_33b,0.0712,mt_bench,BLZ_240312,[],holistic +78,starling_lm_7b_alpha,0.0809,mt_bench,BLZ_240312,[],holistic +79,deepseek_llm_67b_chat,0.08529999999999999,mt_bench,BLZ_240312,[],holistic +80,llama_2_70b_chat,0.06860000000000001,mt_bench,BLZ_240312,[],holistic +81,nv_llama2_70b_steerlm_chat,0.0754,mt_bench,BLZ_240312,[],holistic +82,openhermes_2.5_mistral_7b,0.07690000000000001,mt_bench,BLZ_240312,[],holistic +83,openchat_3.5,0.0781,mt_bench,BLZ_240312,[],holistic +84,pplx_70b_online,0.0588,mt_bench,BLZ_240312,[],holistic +85,gpt_3.5_turbo_1106,0.0832,mt_bench,BLZ_240312,[],holistic +86,solar_10.7b_instruct_v1.0,0.0758,mt_bench,BLZ_240312,[],holistic +88,wizardlm_13b_v1.2,0.07200000000000001,mt_bench,BLZ_240312,[],holistic +89,zephyr_7b_beta,0.07339999999999999,mt_bench,BLZ_240312,[],holistic +90,mpt_30b_chat,0.0639,mt_bench,BLZ_240312,[],holistic +92,llama_2_13b_chat,0.0665,mt_bench,BLZ_240312,[],holistic +93,vicuna_13b,0.06570000000000001,mt_bench,BLZ_240312,[],holistic +95,zephyr_7b_alpha,0.0688,mt_bench,BLZ_240312,[],holistic +96,qwen_14b_chat,0.0696,mt_bench,BLZ_240312,[],holistic +98,guanaco_33b,0.0653,mt_bench,BLZ_240312,[],holistic +99,llama_2_7b_chat,0.06269999999999999,mt_bench,BLZ_240312,[],holistic +101,mistral_7b_instruct_v0.1,0.0684,mt_bench,BLZ_240312,[],holistic +102,palm_chat_bison_001,0.064,mt_bench,BLZ_240312,[],holistic +103,vicuna_7b,0.0617,mt_bench,BLZ_240312,[],holistic +104,koala_13b,0.0535,mt_bench,BLZ_240312,[],holistic +106,gpt4all_13b_snoozy,0.0541,mt_bench,BLZ_240312,[],holistic +107,mpt_7b_chat,0.0542,mt_bench,BLZ_240312,[],holistic +108,chatglm2_6b,0.0496,mt_bench,BLZ_240312,[],holistic +109,rwkv_4_raven_14b,0.0398,mt_bench,BLZ_240312,[],holistic +110,alpaca_13b,0.0453,mt_bench,BLZ_240312,[],holistic +111,openassistant_pythia_12b,0.0432,mt_bench,BLZ_240312,[],holistic +112,chatglm_6b,0.045,mt_bench,BLZ_240312,[],holistic +113,fastchat_t5_3b,0.0304,mt_bench,BLZ_240312,[],holistic +114,stablelm_tuned_alpha_7b,0.0275,mt_bench,BLZ_240312,[],holistic +115,dolly_v2_12b,0.032799999999999996,mt_bench,BLZ_240312,[],holistic +116,llama_13b,0.026099999999999998,mt_bench,BLZ_240312,[],holistic +357,gpt_4_0314,0.59,truthfulqa,BLZ_240312,[],knowledge +365,mixtral_8x7b_instruct_v0.1,0.6457999999999999,truthfulqa,BLZ_240312,[],knowledge +366,yi_34b_chat,0.5537,truthfulqa,BLZ_240312,[],knowledge +370,wizardlm_70b_v1.0,0.5481,truthfulqa,BLZ_240312,[],knowledge +371,tulu_2_dpo_70b,0.6578,truthfulqa,BLZ_240312,[],knowledge +372,vicuna_33b,0.5616,truthfulqa,BLZ_240312,[],knowledge +373,starling_lm_7b_alpha,0.4639,truthfulqa,BLZ_240312,[],knowledge +375,llama_2_70b_chat,0.528,truthfulqa,BLZ_240312,[],knowledge +377,openhermes_2.5_mistral_7b,0.5224,truthfulqa,BLZ_240312,[],knowledge +378,openchat_3.5,0.46380000000000005,truthfulqa,BLZ_240312,[],knowledge +381,solar_10.7b_instruct_v1.0,0.7143,truthfulqa,BLZ_240312,[],knowledge +382,dolphin_2.2.1_mistral_7b,0.5311,truthfulqa,BLZ_240312,[],knowledge +383,wizardlm_13b_v1.2,0.4727,truthfulqa,BLZ_240312,[],knowledge +384,zephyr_7b_beta,0.5745,truthfulqa,BLZ_240312,[],knowledge +385,mpt_30b_chat,0.5242,truthfulqa,BLZ_240312,[],knowledge +386,codellama_34b_instruct,0.44439999999999996,truthfulqa,BLZ_240312,[],knowledge +387,llama_2_13b_chat,0.4412,truthfulqa,BLZ_240312,[],knowledge +388,vicuna_13b,0.5151,truthfulqa,BLZ_240312,[],knowledge +390,zephyr_7b_alpha,0.579,truthfulqa,BLZ_240312,[],knowledge +392,falcon_180b_chat,0.4547,truthfulqa,BLZ_240312,[],knowledge +394,llama_2_7b_chat,0.4557,truthfulqa,BLZ_240312,[],knowledge +396,mistral_7b_instruct_v0.1,0.5628,truthfulqa,BLZ_240312,[],knowledge +398,vicuna_7b,0.5034000000000001,truthfulqa,BLZ_240312,[],knowledge +412,yi_34bx2_moe_60b,0.6618999999999999,truthfulqa,BLZ_240312,[],knowledge +418,mistral_medium,0.88,winogrande,BLZ_240312,[],reasoning +424,mixtral_8x7b_instruct_v0.1,0.8137000000000001,winogrande,BLZ_240312,[],reasoning +425,yi_34b_chat,0.8011,winogrande,BLZ_240312,[],reasoning +428,gpt_3.5_turbo_0314,0.852,winogrande,BLZ_240312,[],reasoning +429,wizardlm_70b_v1.0,0.8081999999999999,winogrande,BLZ_240312,[],reasoning +430,tulu_2_dpo_70b,0.8327,winogrande,BLZ_240312,[],reasoning +431,vicuna_33b,0.7703,winogrande,BLZ_240312,[],reasoning +432,starling_lm_7b_alpha,0.8058,winogrande,BLZ_240312,[],reasoning +434,llama_2_70b_chat,0.8051,winogrande,BLZ_240312,[],reasoning +436,openhermes_2.5_mistral_7b,0.7806000000000001,winogrande,BLZ_240312,[],reasoning +437,openchat_3.5,0.8058,winogrande,BLZ_240312,[],reasoning +440,solar_10.7b_instruct_v1.0,0.8358,winogrande,BLZ_240312,[],reasoning +441,dolphin_2.2.1_mistral_7b,0.7814,winogrande,BLZ_240312,[],reasoning +442,wizardlm_13b_v1.2,0.7190000000000001,winogrande,BLZ_240312,[],reasoning +443,zephyr_7b_beta,0.7774,winogrande,BLZ_240312,[],reasoning +444,mpt_30b_chat,0.753,winogrande,BLZ_240312,[],reasoning +445,codellama_34b_instruct,0.7459,winogrande,BLZ_240312,[],reasoning +446,llama_2_13b_chat,0.7451000000000001,winogrande,BLZ_240312,[],reasoning +447,vicuna_13b,0.7465999999999999,winogrande,BLZ_240312,[],reasoning +449,zephyr_7b_alpha,0.7861,winogrande,BLZ_240312,[],reasoning +451,falcon_180b_chat,0.8690000000000001,winogrande,BLZ_240312,[],reasoning +453,llama_2_7b_chat,0.7173999999999999,winogrande,BLZ_240312,[],reasoning +455,mistral_7b_instruct_v0.1,0.7372,winogrande,BLZ_240312,[],reasoning +457,vicuna_7b,0.7214,winogrande,BLZ_240312,[],reasoning +471,yi_34bx2_moe_60b,0.8484999999999999,winogrande,BLZ_240312,[],reasoning +0,gpt_4_0613,0.957,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +1,llama_3_70b,0.902,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +2,mixtral_8x22b,0.855,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +3,palmyra_x_v3_72b,0.826,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +4,gpt_4_turbo_1106_preview,0.821,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +5,palm_2_unicorn,0.781,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +6,claude_3_opus_20240229,0.762,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +7,qwen1.5_72b,0.757,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +8,palmyra_x_v2_33b,0.736,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +9,yi_34b,0.723,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +10,qwen1.5_32b,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +11,claude_v1.3,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +12,mixtral_8x7b_32k_seqlen,0.679,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +13,palm_2_bison,0.655,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +14,claude_2.0,0.651,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +15,deepseek_llm_chat_67b,0.645,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +16,llama_2_70b,0.609,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +17,claude_2.1,0.594,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +18,gpt_3.5_text_davinci_003,0.577,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +19,qwen1.5_14b,0.574,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +20,claude_instant_1.2,0.551,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +21,llama_3_8b,0.519,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +22,gpt_3.5_turbo_0613,0.502,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +23,gemma_7b,0.47,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +24,claude_3_sonnet_20240229,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +25,gpt_3.5_text_davinci_002,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +26,llama_65b,0.466,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +27,mistral_large_2402,0.46,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +28,cohere_command,0.421,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +29,dbrx_instructruct,0.419,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +30,mistral_v0.1_7b,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +31,mistral_small_2402,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +32,mistral_medium_2312,0.383,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +33,qwen1.5_7b,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +34,claude_3_haiku_20240307,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +35,yi_6b,0.351,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +36,llama_2_13b,0.332,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +37,jurassic_2_jumbo_178b,0.317,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +38,falcon_40b,0.306,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +39,phi_2,0.26,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +40,jurassic_2_grande_17b,0.253,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +41,llama_2_7b,0.234,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +42,luminous_supreme_70b,0.213,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +43,cohere_command_light,0.166,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +44,luminous_extended_30b,0.119,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +45,falcon_7b,0.1,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +46,olmo_7b,0.083,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +47,luminous_base_13b,0.072,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic +48,gpt_4_0613,0.768,narrativeqa,helm_lite_240610,[],knowledge +49,llama_3_70b,0.798,narrativeqa,helm_lite_240610,[],knowledge +50,mixtral_8x22b,0.779,narrativeqa,helm_lite_240610,[],knowledge +51,palmyra_x_v3_72b,0.706,narrativeqa,helm_lite_240610,[],knowledge +52,gpt_4_turbo_1106_preview,0.727,narrativeqa,helm_lite_240610,[],knowledge +53,palm_2_unicorn,0.583,narrativeqa,helm_lite_240610,[],knowledge +54,claude_3_opus_20240229,0.351,narrativeqa,helm_lite_240610,[],knowledge +55,qwen1.5_72b,0.601,narrativeqa,helm_lite_240610,[],knowledge +56,palmyra_x_v2_33b,0.752,narrativeqa,helm_lite_240610,[],knowledge +57,yi_34b,0.782,narrativeqa,helm_lite_240610,[],knowledge +58,qwen1.5_32b,0.589,narrativeqa,helm_lite_240610,[],knowledge +59,claude_v1.3,0.723,narrativeqa,helm_lite_240610,[],knowledge +60,mixtral_8x7b_32k_seqlen,0.767,narrativeqa,helm_lite_240610,[],knowledge +61,palm_2_bison,0.718,narrativeqa,helm_lite_240610,[],knowledge +62,claude_2.0,0.718,narrativeqa,helm_lite_240610,[],knowledge +63,deepseek_llm_chat_67b,0.581,narrativeqa,helm_lite_240610,[],knowledge +64,llama_2_70b,0.763,narrativeqa,helm_lite_240610,[],knowledge +65,claude_2.1,0.677,narrativeqa,helm_lite_240610,[],knowledge +66,gpt_3.5_text_davinci_003,0.731,narrativeqa,helm_lite_240610,[],knowledge +67,qwen1.5_14b,0.711,narrativeqa,helm_lite_240610,[],knowledge +68,claude_instant_1.2,0.616,narrativeqa,helm_lite_240610,[],knowledge +69,llama_3_8b,0.754,narrativeqa,helm_lite_240610,[],knowledge +70,gpt_3.5_turbo_0613,0.655,narrativeqa,helm_lite_240610,[],knowledge +71,gemma_7b,0.752,narrativeqa,helm_lite_240610,[],knowledge +72,claude_3_sonnet_20240229,0.111,narrativeqa,helm_lite_240610,[],knowledge +73,gpt_3.5_text_davinci_002,0.719,narrativeqa,helm_lite_240610,[],knowledge +74,llama_65b,0.755,narrativeqa,helm_lite_240610,[],knowledge +75,mistral_large_2402,0.454,narrativeqa,helm_lite_240610,[],knowledge +76,cohere_command,0.749,narrativeqa,helm_lite_240610,[],knowledge +77,dbrx_instructruct,0.488,narrativeqa,helm_lite_240610,[],knowledge +78,mistral_v0.1_7b,0.716,narrativeqa,helm_lite_240610,[],knowledge +79,mistral_small_2402,0.519,narrativeqa,helm_lite_240610,[],knowledge +80,mistral_medium_2312,0.449,narrativeqa,helm_lite_240610,[],knowledge +81,qwen1.5_7b,0.448,narrativeqa,helm_lite_240610,[],knowledge +82,claude_3_haiku_20240307,0.244,narrativeqa,helm_lite_240610,[],knowledge +83,yi_6b,0.702,narrativeqa,helm_lite_240610,[],knowledge +84,llama_2_13b,0.741,narrativeqa,helm_lite_240610,[],knowledge +85,jurassic_2_jumbo_178b,0.728,narrativeqa,helm_lite_240610,[],knowledge +86,falcon_40b,0.671,narrativeqa,helm_lite_240610,[],knowledge +87,phi_2,0.703,narrativeqa,helm_lite_240610,[],knowledge +88,jurassic_2_grande_17b,0.744,narrativeqa,helm_lite_240610,[],knowledge +89,llama_2_7b,0.686,narrativeqa,helm_lite_240610,[],knowledge +90,luminous_supreme_70b,0.743,narrativeqa,helm_lite_240610,[],knowledge +91,cohere_command_light,0.629,narrativeqa,helm_lite_240610,[],knowledge +92,luminous_extended_30b,0.684,narrativeqa,helm_lite_240610,[],knowledge +93,falcon_7b,0.621,narrativeqa,helm_lite_240610,[],knowledge +94,olmo_7b,0.597,narrativeqa,helm_lite_240610,[],knowledge +95,luminous_base_13b,0.633,narrativeqa,helm_lite_240610,[],knowledge +96,gpt_4_0613,0.79,naturalquestions_open,helm_lite_240610,[],knowledge +97,llama_3_70b,0.743,naturalquestions_open,helm_lite_240610,[],knowledge +98,mixtral_8x22b,0.726,naturalquestions_open,helm_lite_240610,[],knowledge +99,palmyra_x_v3_72b,0.685,naturalquestions_open,helm_lite_240610,[],knowledge +100,gpt_4_turbo_1106_preview,0.763,naturalquestions_open,helm_lite_240610,[],knowledge +101,palm_2_unicorn,0.674,naturalquestions_open,helm_lite_240610,[],knowledge +102,claude_3_opus_20240229,0.264,naturalquestions_open,helm_lite_240610,[],knowledge +103,qwen1.5_72b,0.758,naturalquestions_open,helm_lite_240610,[],knowledge +104,palmyra_x_v2_33b,0.752,naturalquestions_open,helm_lite_240610,[],knowledge +105,yi_34b,0.775,naturalquestions_open,helm_lite_240610,[],knowledge +106,qwen1.5_32b,0.777,naturalquestions_open,helm_lite_240610,[],knowledge +107,claude_v1.3,0.699,naturalquestions_open,helm_lite_240610,[],knowledge +108,mixtral_8x7b_32k_seqlen,0.699,naturalquestions_open,helm_lite_240610,[],knowledge +109,palm_2_bison,0.813,naturalquestions_open,helm_lite_240610,[],knowledge +110,claude_2.0,0.67,naturalquestions_open,helm_lite_240610,[],knowledge +111,deepseek_llm_chat_67b,0.733,naturalquestions_open,helm_lite_240610,[],knowledge +112,llama_2_70b,0.674,naturalquestions_open,helm_lite_240610,[],knowledge +113,claude_2.1,0.611,naturalquestions_open,helm_lite_240610,[],knowledge +114,gpt_3.5_text_davinci_003,0.77,naturalquestions_open,helm_lite_240610,[],knowledge +115,qwen1.5_14b,0.772,naturalquestions_open,helm_lite_240610,[],knowledge +116,claude_instant_1.2,0.731,naturalquestions_open,helm_lite_240610,[],knowledge +117,llama_3_8b,0.681,naturalquestions_open,helm_lite_240610,[],knowledge +118,gpt_3.5_turbo_0613,0.678,naturalquestions_open,helm_lite_240610,[],knowledge +119,gemma_7b,0.665,naturalquestions_open,helm_lite_240610,[],knowledge +120,claude_3_sonnet_20240229,0.072,naturalquestions_open,helm_lite_240610,[],knowledge +121,gpt_3.5_text_davinci_002,0.71,naturalquestions_open,helm_lite_240610,[],knowledge +122,llama_65b,0.672,naturalquestions_open,helm_lite_240610,[],knowledge +123,mistral_large_2402,0.485,naturalquestions_open,helm_lite_240610,[],knowledge +124,cohere_command,0.777,naturalquestions_open,helm_lite_240610,[],knowledge +125,dbrx_instructruct,0.55,naturalquestions_open,helm_lite_240610,[],knowledge +126,mistral_v0.1_7b,0.687,naturalquestions_open,helm_lite_240610,[],knowledge +127,mistral_small_2402,0.587,naturalquestions_open,helm_lite_240610,[],knowledge +128,mistral_medium_2312,0.468,naturalquestions_open,helm_lite_240610,[],knowledge +129,qwen1.5_7b,0.749,naturalquestions_open,helm_lite_240610,[],knowledge +130,claude_3_haiku_20240307,0.252,naturalquestions_open,helm_lite_240610,[],knowledge +131,yi_6b,0.748,naturalquestions_open,helm_lite_240610,[],knowledge +132,llama_2_13b,0.64,naturalquestions_open,helm_lite_240610,[],knowledge +133,jurassic_2_jumbo_178b,0.65,naturalquestions_open,helm_lite_240610,[],knowledge +134,falcon_40b,0.676,naturalquestions_open,helm_lite_240610,[],knowledge +135,phi_2,0.68,naturalquestions_open,helm_lite_240610,[],knowledge +136,jurassic_2_grande_17b,0.627,naturalquestions_open,helm_lite_240610,[],knowledge +137,llama_2_7b,0.612,naturalquestions_open,helm_lite_240610,[],knowledge +138,luminous_supreme_70b,0.656,naturalquestions_open,helm_lite_240610,[],knowledge +139,cohere_command_light,0.686,naturalquestions_open,helm_lite_240610,[],knowledge +140,luminous_extended_30b,0.611,naturalquestions_open,helm_lite_240610,[],knowledge +141,falcon_7b,0.58,naturalquestions_open,helm_lite_240610,[],knowledge +142,olmo_7b,0.603,naturalquestions_open,helm_lite_240610,[],knowledge +143,luminous_base_13b,0.577,naturalquestions_open,helm_lite_240610,[],knowledge +144,gpt_4_0613,0.457,naturalquestions_closed,helm_lite_240610,[],knowledge +145,llama_3_70b,0.475,naturalquestions_closed,helm_lite_240610,[],knowledge +146,mixtral_8x22b,0.478,naturalquestions_closed,helm_lite_240610,[],knowledge +147,palmyra_x_v3_72b,0.407,naturalquestions_closed,helm_lite_240610,[],knowledge +148,gpt_4_turbo_1106_preview,0.435,naturalquestions_closed,helm_lite_240610,[],knowledge +149,palm_2_unicorn,0.435,naturalquestions_closed,helm_lite_240610,[],knowledge +150,claude_3_opus_20240229,0.441,naturalquestions_closed,helm_lite_240610,[],knowledge +151,qwen1.5_72b,0.417,naturalquestions_closed,helm_lite_240610,[],knowledge +152,palmyra_x_v2_33b,0.428,naturalquestions_closed,helm_lite_240610,[],knowledge +153,yi_34b,0.443,naturalquestions_closed,helm_lite_240610,[],knowledge +154,qwen1.5_32b,0.353,naturalquestions_closed,helm_lite_240610,[],knowledge +155,claude_v1.3,0.409,naturalquestions_closed,helm_lite_240610,[],knowledge +156,mixtral_8x7b_32k_seqlen,0.427,naturalquestions_closed,helm_lite_240610,[],knowledge +157,palm_2_bison,0.39,naturalquestions_closed,helm_lite_240610,[],knowledge +158,claude_2.0,0.428,naturalquestions_closed,helm_lite_240610,[],knowledge +159,deepseek_llm_chat_67b,0.412,naturalquestions_closed,helm_lite_240610,[],knowledge +160,llama_2_70b,0.46,naturalquestions_closed,helm_lite_240610,[],knowledge +161,claude_2.1,0.375,naturalquestions_closed,helm_lite_240610,[],knowledge +162,gpt_3.5_text_davinci_003,0.413,naturalquestions_closed,helm_lite_240610,[],knowledge +163,qwen1.5_14b,0.3,naturalquestions_closed,helm_lite_240610,[],knowledge +164,claude_instant_1.2,0.343,naturalquestions_closed,helm_lite_240610,[],knowledge +165,llama_3_8b,0.378,naturalquestions_closed,helm_lite_240610,[],knowledge +166,gpt_3.5_turbo_0613,0.335,naturalquestions_closed,helm_lite_240610,[],knowledge +167,gemma_7b,0.336,naturalquestions_closed,helm_lite_240610,[],knowledge +168,claude_3_sonnet_20240229,0.028,naturalquestions_closed,helm_lite_240610,[],knowledge +169,gpt_3.5_text_davinci_002,0.394,naturalquestions_closed,helm_lite_240610,[],knowledge +170,llama_65b,0.433,naturalquestions_closed,helm_lite_240610,[],knowledge +171,mistral_large_2402,0.311,naturalquestions_closed,helm_lite_240610,[],knowledge +172,cohere_command,0.391,naturalquestions_closed,helm_lite_240610,[],knowledge +173,dbrx_instructruct,0.284,naturalquestions_closed,helm_lite_240610,[],knowledge +174,mistral_v0.1_7b,0.367,naturalquestions_closed,helm_lite_240610,[],knowledge +175,mistral_small_2402,0.304,naturalquestions_closed,helm_lite_240610,[],knowledge +176,mistral_medium_2312,0.29,naturalquestions_closed,helm_lite_240610,[],knowledge +177,qwen1.5_7b,0.27,naturalquestions_closed,helm_lite_240610,[],knowledge +178,claude_3_haiku_20240307,0.144,naturalquestions_closed,helm_lite_240610,[],knowledge +179,yi_6b,0.31,naturalquestions_closed,helm_lite_240610,[],knowledge +180,llama_2_13b,0.371,naturalquestions_closed,helm_lite_240610,[],knowledge +181,jurassic_2_jumbo_178b,0.385,naturalquestions_closed,helm_lite_240610,[],knowledge +182,falcon_40b,0.392,naturalquestions_closed,helm_lite_240610,[],knowledge +183,phi_2,0.155,naturalquestions_closed,helm_lite_240610,[],knowledge +184,jurassic_2_grande_17b,0.35,naturalquestions_closed,helm_lite_240610,[],knowledge +185,llama_2_7b,0.333,naturalquestions_closed,helm_lite_240610,[],knowledge +186,luminous_supreme_70b,0.299,naturalquestions_closed,helm_lite_240610,[],knowledge +187,cohere_command_light,0.195,naturalquestions_closed,helm_lite_240610,[],knowledge +188,luminous_extended_30b,0.253,naturalquestions_closed,helm_lite_240610,[],knowledge +189,falcon_7b,0.285,naturalquestions_closed,helm_lite_240610,[],knowledge +190,olmo_7b,0.259,naturalquestions_closed,helm_lite_240610,[],knowledge +191,luminous_base_13b,0.197,naturalquestions_closed,helm_lite_240610,[],knowledge +192,gpt_4_0613,0.96,openbookqa,helm_lite_240610,[],knowledge +193,llama_3_70b,0.934,openbookqa,helm_lite_240610,[],knowledge +194,mixtral_8x22b,0.882,openbookqa,helm_lite_240610,[],knowledge +195,palmyra_x_v3_72b,0.938,openbookqa,helm_lite_240610,[],knowledge +196,gpt_4_turbo_1106_preview,0.95,openbookqa,helm_lite_240610,[],knowledge +197,palm_2_unicorn,0.938,openbookqa,helm_lite_240610,[],knowledge +198,claude_3_opus_20240229,0.956,openbookqa,helm_lite_240610,[],knowledge +199,qwen1.5_72b,0.93,openbookqa,helm_lite_240610,[],knowledge +200,palmyra_x_v2_33b,0.878,openbookqa,helm_lite_240610,[],knowledge +201,yi_34b,0.92,openbookqa,helm_lite_240610,[],knowledge +202,qwen1.5_32b,0.932,openbookqa,helm_lite_240610,[],knowledge +203,claude_v1.3,0.908,openbookqa,helm_lite_240610,[],knowledge +204,mixtral_8x7b_32k_seqlen,0.868,openbookqa,helm_lite_240610,[],knowledge +205,palm_2_bison,0.878,openbookqa,helm_lite_240610,[],knowledge +206,claude_2.0,0.862,openbookqa,helm_lite_240610,[],knowledge +207,deepseek_llm_chat_67b,0.88,openbookqa,helm_lite_240610,[],knowledge +208,llama_2_70b,0.838,openbookqa,helm_lite_240610,[],knowledge +209,claude_2.1,0.872,openbookqa,helm_lite_240610,[],knowledge +210,gpt_3.5_text_davinci_003,0.828,openbookqa,helm_lite_240610,[],knowledge +211,qwen1.5_14b,0.862,openbookqa,helm_lite_240610,[],knowledge +212,claude_instant_1.2,0.844,openbookqa,helm_lite_240610,[],knowledge +213,llama_3_8b,0.766,openbookqa,helm_lite_240610,[],knowledge +214,gpt_3.5_turbo_0613,0.838,openbookqa,helm_lite_240610,[],knowledge +215,gemma_7b,0.808,openbookqa,helm_lite_240610,[],knowledge +216,claude_3_sonnet_20240229,0.918,openbookqa,helm_lite_240610,[],knowledge +217,gpt_3.5_text_davinci_002,0.796,openbookqa,helm_lite_240610,[],knowledge +218,llama_65b,0.754,openbookqa,helm_lite_240610,[],knowledge +219,mistral_large_2402,0.894,openbookqa,helm_lite_240610,[],knowledge +220,cohere_command,0.774,openbookqa,helm_lite_240610,[],knowledge +221,dbrx_instructruct,0.91,openbookqa,helm_lite_240610,[],knowledge +222,mistral_v0.1_7b,0.776,openbookqa,helm_lite_240610,[],knowledge +223,mistral_small_2402,0.862,openbookqa,helm_lite_240610,[],knowledge +224,mistral_medium_2312,0.83,openbookqa,helm_lite_240610,[],knowledge +225,qwen1.5_7b,0.806,openbookqa,helm_lite_240610,[],knowledge +226,claude_3_haiku_20240307,0.838,openbookqa,helm_lite_240610,[],knowledge +227,yi_6b,0.8,openbookqa,helm_lite_240610,[],knowledge +228,llama_2_13b,0.634,openbookqa,helm_lite_240610,[],knowledge +229,jurassic_2_jumbo_178b,0.688,openbookqa,helm_lite_240610,[],knowledge +230,falcon_40b,0.662,openbookqa,helm_lite_240610,[],knowledge +231,phi_2,0.798,openbookqa,helm_lite_240610,[],knowledge +232,jurassic_2_grande_17b,0.614,openbookqa,helm_lite_240610,[],knowledge +233,llama_2_7b,0.544,openbookqa,helm_lite_240610,[],knowledge +234,luminous_supreme_70b,0.284,openbookqa,helm_lite_240610,[],knowledge +235,cohere_command_light,0.398,openbookqa,helm_lite_240610,[],knowledge +236,luminous_extended_30b,0.272,openbookqa,helm_lite_240610,[],knowledge +237,falcon_7b,0.26,openbookqa,helm_lite_240610,[],knowledge +238,olmo_7b,0.222,openbookqa,helm_lite_240610,[],knowledge +239,luminous_base_13b,0.286,openbookqa,helm_lite_240610,[],knowledge +240,gpt_4_0613,0.735,mmlu,helm_lite_240610,[],knowledge +241,llama_3_70b,0.695,mmlu,helm_lite_240610,[],knowledge +242,mixtral_8x22b,0.701,mmlu,helm_lite_240610,[],knowledge +243,palmyra_x_v3_72b,0.702,mmlu,helm_lite_240610,[],knowledge +244,gpt_4_turbo_1106_preview,0.699,mmlu,helm_lite_240610,[],knowledge +245,palm_2_unicorn,0.702,mmlu,helm_lite_240610,[],knowledge +246,claude_3_opus_20240229,0.768,mmlu,helm_lite_240610,[],knowledge +247,qwen1.5_72b,0.647,mmlu,helm_lite_240610,[],knowledge +248,palmyra_x_v2_33b,0.621,mmlu,helm_lite_240610,[],knowledge +249,yi_34b,0.65,mmlu,helm_lite_240610,[],knowledge +250,qwen1.5_32b,0.628,mmlu,helm_lite_240610,[],knowledge +251,claude_v1.3,0.631,mmlu,helm_lite_240610,[],knowledge +252,mixtral_8x7b_32k_seqlen,0.649,mmlu,helm_lite_240610,[],knowledge +253,palm_2_bison,0.608,mmlu,helm_lite_240610,[],knowledge +254,claude_2.0,0.639,mmlu,helm_lite_240610,[],knowledge +255,deepseek_llm_chat_67b,0.641,mmlu,helm_lite_240610,[],knowledge +256,llama_2_70b,0.58,mmlu,helm_lite_240610,[],knowledge +257,claude_2.1,0.643,mmlu,helm_lite_240610,[],knowledge +258,gpt_3.5_text_davinci_003,0.555,mmlu,helm_lite_240610,[],knowledge +259,qwen1.5_14b,0.626,mmlu,helm_lite_240610,[],knowledge +260,claude_instant_1.2,0.631,mmlu,helm_lite_240610,[],knowledge +261,llama_3_8b,0.602,mmlu,helm_lite_240610,[],knowledge +262,gpt_3.5_turbo_0613,0.614,mmlu,helm_lite_240610,[],knowledge +263,gemma_7b,0.571,mmlu,helm_lite_240610,[],knowledge +264,claude_3_sonnet_20240229,0.652,mmlu,helm_lite_240610,[],knowledge +265,gpt_3.5_text_davinci_002,0.568,mmlu,helm_lite_240610,[],knowledge +266,llama_65b,0.584,mmlu,helm_lite_240610,[],knowledge +267,mistral_large_2402,0.638,mmlu,helm_lite_240610,[],knowledge +268,cohere_command,0.525,mmlu,helm_lite_240610,[],knowledge +269,dbrx_instructruct,0.643,mmlu,helm_lite_240610,[],knowledge +270,mistral_v0.1_7b,0.584,mmlu,helm_lite_240610,[],knowledge +271,mistral_small_2402,0.593,mmlu,helm_lite_240610,[],knowledge +272,mistral_medium_2312,0.618,mmlu,helm_lite_240610,[],knowledge +273,qwen1.5_7b,0.569,mmlu,helm_lite_240610,[],knowledge +274,claude_3_haiku_20240307,0.662,mmlu,helm_lite_240610,[],knowledge +275,yi_6b,0.53,mmlu,helm_lite_240610,[],knowledge +276,llama_2_13b,0.505,mmlu,helm_lite_240610,[],knowledge +277,jurassic_2_jumbo_178b,0.483,mmlu,helm_lite_240610,[],knowledge +278,falcon_40b,0.507,mmlu,helm_lite_240610,[],knowledge +279,phi_2,0.518,mmlu,helm_lite_240610,[],knowledge +280,jurassic_2_grande_17b,0.471,mmlu,helm_lite_240610,[],knowledge +281,llama_2_7b,0.425,mmlu,helm_lite_240610,[],knowledge +282,luminous_supreme_70b,0.316,mmlu,helm_lite_240610,[],knowledge +283,cohere_command_light,0.386,mmlu,helm_lite_240610,[],knowledge +284,luminous_extended_30b,0.248,mmlu,helm_lite_240610,[],knowledge +285,falcon_7b,0.288,mmlu,helm_lite_240610,[],knowledge +286,olmo_7b,0.305,mmlu,helm_lite_240610,[],knowledge +287,luminous_base_13b,0.243,mmlu,helm_lite_240610,[],knowledge +288,gpt_4_0613,0.802,math,helm_lite_240610,[],math +289,llama_3_70b,0.663,math,helm_lite_240610,[],math +290,mixtral_8x22b,0.656,math,helm_lite_240610,[],math +291,palmyra_x_v3_72b,0.723,math,helm_lite_240610,[],math +292,gpt_4_turbo_1106_preview,0.857,math,helm_lite_240610,[],math +293,palm_2_unicorn,0.674,math,helm_lite_240610,[],math +294,claude_3_opus_20240229,0.76,math,helm_lite_240610,[],math +295,qwen1.5_72b,0.683,math,helm_lite_240610,[],math +296,palmyra_x_v2_33b,0.58,math,helm_lite_240610,[],math +297,yi_34b,0.375,math,helm_lite_240610,[],math +298,qwen1.5_32b,0.733,math,helm_lite_240610,[],math +299,claude_v1.3,0.54,math,helm_lite_240610,[],math +300,mixtral_8x7b_32k_seqlen,0.494,math,helm_lite_240610,[],math +301,palm_2_bison,0.421,math,helm_lite_240610,[],math +302,claude_2.0,0.603,math,helm_lite_240610,[],math +303,deepseek_llm_chat_67b,0.615,math,helm_lite_240610,[],math +304,llama_2_70b,0.323,math,helm_lite_240610,[],math +305,claude_2.1,0.632,math,helm_lite_240610,[],math +306,gpt_3.5_text_davinci_003,0.449,math,helm_lite_240610,[],math +307,qwen1.5_14b,0.686,math,helm_lite_240610,[],math +308,claude_instant_1.2,0.499,math,helm_lite_240610,[],math +309,llama_3_8b,0.391,math,helm_lite_240610,[],math +310,gpt_3.5_turbo_0613,0.667,math,helm_lite_240610,[],math +311,gemma_7b,0.5,math,helm_lite_240610,[],math +312,claude_3_sonnet_20240229,0.084,math,helm_lite_240610,[],math +313,gpt_3.5_text_davinci_002,0.428,math,helm_lite_240610,[],math +314,llama_65b,0.257,math,helm_lite_240610,[],math +315,mistral_large_2402,0.75,math,helm_lite_240610,[],math +316,cohere_command,0.236,math,helm_lite_240610,[],math +317,dbrx_instructruct,0.358,math,helm_lite_240610,[],math +318,mistral_v0.1_7b,0.297,math,helm_lite_240610,[],math +319,mistral_small_2402,0.621,math,helm_lite_240610,[],math +320,mistral_medium_2312,0.565,math,helm_lite_240610,[],math +321,qwen1.5_7b,0.561,math,helm_lite_240610,[],math +322,claude_3_haiku_20240307,0.131,math,helm_lite_240610,[],math +323,yi_6b,0.126,math,helm_lite_240610,[],math +324,llama_2_13b,0.102,math,helm_lite_240610,[],math +325,jurassic_2_jumbo_178b,0.103,math,helm_lite_240610,[],math +326,falcon_40b,0.128,math,helm_lite_240610,[],math +327,phi_2,0.255,math,helm_lite_240610,[],math +328,jurassic_2_grande_17b,0.064,math,helm_lite_240610,[],math +329,llama_2_7b,0.097,math,helm_lite_240610,[],math +330,luminous_supreme_70b,0.078,math,helm_lite_240610,[],math +331,cohere_command_light,0.098,math,helm_lite_240610,[],math +332,luminous_extended_30b,0.04,math,helm_lite_240610,[],math +333,falcon_7b,0.044,math,helm_lite_240610,[],math +334,olmo_7b,0.029,math,helm_lite_240610,[],math +335,luminous_base_13b,0.026,math,helm_lite_240610,[],math +336,gpt_4_0613,0.932,gsm8k,helm_lite_240610,[],math +337,llama_3_70b,0.805,gsm8k,helm_lite_240610,[],math +338,mixtral_8x22b,0.8,gsm8k,helm_lite_240610,[],math +339,palmyra_x_v3_72b,0.831,gsm8k,helm_lite_240610,[],math +340,gpt_4_turbo_1106_preview,0.668,gsm8k,helm_lite_240610,[],math +341,palm_2_unicorn,0.831,gsm8k,helm_lite_240610,[],math +342,claude_3_opus_20240229,0.924,gsm8k,helm_lite_240610,[],math +343,qwen1.5_72b,0.799,gsm8k,helm_lite_240610,[],math +344,palmyra_x_v2_33b,0.735,gsm8k,helm_lite_240610,[],math +345,yi_34b,0.648,gsm8k,helm_lite_240610,[],math +346,qwen1.5_32b,0.773,gsm8k,helm_lite_240610,[],math +347,claude_v1.3,0.784,gsm8k,helm_lite_240610,[],math +348,mixtral_8x7b_32k_seqlen,0.622,gsm8k,helm_lite_240610,[],math +349,palm_2_bison,0.61,gsm8k,helm_lite_240610,[],math +350,claude_2.0,0.583,gsm8k,helm_lite_240610,[],math +351,deepseek_llm_chat_67b,0.795,gsm8k,helm_lite_240610,[],math +352,llama_2_70b,0.567,gsm8k,helm_lite_240610,[],math +353,claude_2.1,0.604,gsm8k,helm_lite_240610,[],math +354,gpt_3.5_text_davinci_003,0.615,gsm8k,helm_lite_240610,[],math +355,qwen1.5_14b,0.693,gsm8k,helm_lite_240610,[],math +356,claude_instant_1.2,0.721,gsm8k,helm_lite_240610,[],math +357,llama_3_8b,0.499,gsm8k,helm_lite_240610,[],math +358,gpt_3.5_turbo_0613,0.501,gsm8k,helm_lite_240610,[],math +359,gemma_7b,0.559,gsm8k,helm_lite_240610,[],math +360,claude_3_sonnet_20240229,0.907,gsm8k,helm_lite_240610,[],math +361,gpt_3.5_text_davinci_002,0.479,gsm8k,helm_lite_240610,[],math +362,llama_65b,0.489,gsm8k,helm_lite_240610,[],math +363,mistral_large_2402,0.694,gsm8k,helm_lite_240610,[],math +364,cohere_command,0.452,gsm8k,helm_lite_240610,[],math +365,dbrx_instructruct,0.671,gsm8k,helm_lite_240610,[],math +366,mistral_v0.1_7b,0.377,gsm8k,helm_lite_240610,[],math +367,mistral_small_2402,0.734,gsm8k,helm_lite_240610,[],math +368,mistral_medium_2312,0.706,gsm8k,helm_lite_240610,[],math +369,qwen1.5_7b,0.6,gsm8k,helm_lite_240610,[],math +370,claude_3_haiku_20240307,0.699,gsm8k,helm_lite_240610,[],math +371,yi_6b,0.375,gsm8k,helm_lite_240610,[],math +372,llama_2_13b,0.266,gsm8k,helm_lite_240610,[],math +373,jurassic_2_jumbo_178b,0.239,gsm8k,helm_lite_240610,[],math +374,falcon_40b,0.267,gsm8k,helm_lite_240610,[],math +375,phi_2,0.581,gsm8k,helm_lite_240610,[],math +376,jurassic_2_grande_17b,0.159,gsm8k,helm_lite_240610,[],math +377,llama_2_7b,0.154,gsm8k,helm_lite_240610,[],math +378,luminous_supreme_70b,0.137,gsm8k,helm_lite_240610,[],math +379,cohere_command_light,0.149,gsm8k,helm_lite_240610,[],math +380,luminous_extended_30b,0.075,gsm8k,helm_lite_240610,[],math +381,falcon_7b,0.055,gsm8k,helm_lite_240610,[],math +382,olmo_7b,0.044,gsm8k,helm_lite_240610,[],math +383,luminous_base_13b,0.028,gsm8k,helm_lite_240610,[],math +384,gpt_4_0613,0.713,legalbench,helm_lite_240610,[],knowledge +385,llama_3_70b,0.733,legalbench,helm_lite_240610,[],knowledge +386,mixtral_8x22b,0.708,legalbench,helm_lite_240610,[],knowledge +387,palmyra_x_v3_72b,0.709,legalbench,helm_lite_240610,[],knowledge +388,gpt_4_turbo_1106_preview,0.626,legalbench,helm_lite_240610,[],knowledge +389,palm_2_unicorn,0.677,legalbench,helm_lite_240610,[],knowledge +390,claude_3_opus_20240229,0.662,legalbench,helm_lite_240610,[],knowledge +391,qwen1.5_72b,0.694,legalbench,helm_lite_240610,[],knowledge +392,palmyra_x_v2_33b,0.644,legalbench,helm_lite_240610,[],knowledge +393,yi_34b,0.618,legalbench,helm_lite_240610,[],knowledge +394,qwen1.5_32b,0.636,legalbench,helm_lite_240610,[],knowledge +395,claude_v1.3,0.629,legalbench,helm_lite_240610,[],knowledge +396,mixtral_8x7b_32k_seqlen,0.63,legalbench,helm_lite_240610,[],knowledge +397,palm_2_bison,0.645,legalbench,helm_lite_240610,[],knowledge +398,claude_2.0,0.643,legalbench,helm_lite_240610,[],knowledge +399,deepseek_llm_chat_67b,0.637,legalbench,helm_lite_240610,[],knowledge +400,llama_2_70b,0.673,legalbench,helm_lite_240610,[],knowledge +401,claude_2.1,0.643,legalbench,helm_lite_240610,[],knowledge +402,gpt_3.5_text_davinci_003,0.622,legalbench,helm_lite_240610,[],knowledge +403,qwen1.5_14b,0.593,legalbench,helm_lite_240610,[],knowledge +404,claude_instant_1.2,0.586,legalbench,helm_lite_240610,[],knowledge +405,llama_3_8b,0.637,legalbench,helm_lite_240610,[],knowledge +406,gpt_3.5_turbo_0613,0.528,legalbench,helm_lite_240610,[],knowledge +407,gemma_7b,0.581,legalbench,helm_lite_240610,[],knowledge +408,claude_3_sonnet_20240229,0.49,legalbench,helm_lite_240610,[],knowledge +409,gpt_3.5_text_davinci_002,0.58,legalbench,helm_lite_240610,[],knowledge +410,llama_65b,0.48,legalbench,helm_lite_240610,[],knowledge +411,mistral_large_2402,0.479,legalbench,helm_lite_240610,[],knowledge +412,cohere_command,0.578,legalbench,helm_lite_240610,[],knowledge +413,dbrx_instructruct,0.426,legalbench,helm_lite_240610,[],knowledge +414,mistral_v0.1_7b,0.58,legalbench,helm_lite_240610,[],knowledge +415,mistral_small_2402,0.389,legalbench,helm_lite_240610,[],knowledge +416,mistral_medium_2312,0.452,legalbench,helm_lite_240610,[],knowledge +417,qwen1.5_7b,0.523,legalbench,helm_lite_240610,[],knowledge +418,claude_3_haiku_20240307,0.46,legalbench,helm_lite_240610,[],knowledge +419,yi_6b,0.519,legalbench,helm_lite_240610,[],knowledge +420,llama_2_13b,0.591,legalbench,helm_lite_240610,[],knowledge +421,jurassic_2_jumbo_178b,0.533,legalbench,helm_lite_240610,[],knowledge +422,falcon_40b,0.442,legalbench,helm_lite_240610,[],knowledge +423,phi_2,0.334,legalbench,helm_lite_240610,[],knowledge +424,jurassic_2_grande_17b,0.468,legalbench,helm_lite_240610,[],knowledge +425,llama_2_7b,0.502,legalbench,helm_lite_240610,[],knowledge +426,luminous_supreme_70b,0.452,legalbench,helm_lite_240610,[],knowledge +427,cohere_command_light,0.397,legalbench,helm_lite_240610,[],knowledge +428,luminous_extended_30b,0.421,legalbench,helm_lite_240610,[],knowledge +429,falcon_7b,0.346,legalbench,helm_lite_240610,[],knowledge +430,olmo_7b,0.341,legalbench,helm_lite_240610,[],knowledge +431,luminous_base_13b,0.332,legalbench,helm_lite_240610,[],knowledge +432,gpt_4_0613,0.815,medqa,helm_lite_240610,[],knowledge +433,llama_3_70b,0.777,medqa,helm_lite_240610,[],knowledge +434,mixtral_8x22b,0.704,medqa,helm_lite_240610,[],knowledge +435,palmyra_x_v3_72b,0.684,medqa,helm_lite_240610,[],knowledge +436,gpt_4_turbo_1106_preview,0.817,medqa,helm_lite_240610,[],knowledge +437,palm_2_unicorn,0.684,medqa,helm_lite_240610,[],knowledge +438,claude_3_opus_20240229,0.775,medqa,helm_lite_240610,[],knowledge +439,qwen1.5_72b,0.67,medqa,helm_lite_240610,[],knowledge +440,palmyra_x_v2_33b,0.598,medqa,helm_lite_240610,[],knowledge +441,yi_34b,0.656,medqa,helm_lite_240610,[],knowledge +442,qwen1.5_32b,0.656,medqa,helm_lite_240610,[],knowledge +443,claude_v1.3,0.618,medqa,helm_lite_240610,[],knowledge +444,mixtral_8x7b_32k_seqlen,0.652,medqa,helm_lite_240610,[],knowledge +445,palm_2_bison,0.547,medqa,helm_lite_240610,[],knowledge +446,claude_2.0,0.652,medqa,helm_lite_240610,[],knowledge +447,deepseek_llm_chat_67b,0.628,medqa,helm_lite_240610,[],knowledge +448,llama_2_70b,0.618,medqa,helm_lite_240610,[],knowledge +449,claude_2.1,0.644,medqa,helm_lite_240610,[],knowledge +450,gpt_3.5_text_davinci_003,0.531,medqa,helm_lite_240610,[],knowledge +451,qwen1.5_14b,0.515,medqa,helm_lite_240610,[],knowledge +452,claude_instant_1.2,0.559,medqa,helm_lite_240610,[],knowledge +453,llama_3_8b,0.581,medqa,helm_lite_240610,[],knowledge +454,gpt_3.5_turbo_0613,0.622,medqa,helm_lite_240610,[],knowledge +455,gemma_7b,0.513,medqa,helm_lite_240610,[],knowledge +456,claude_3_sonnet_20240229,0.684,medqa,helm_lite_240610,[],knowledge +457,gpt_3.5_text_davinci_002,0.525,medqa,helm_lite_240610,[],knowledge +458,llama_65b,0.507,medqa,helm_lite_240610,[],knowledge +459,mistral_large_2402,0.499,medqa,helm_lite_240610,[],knowledge +460,cohere_command,0.445,medqa,helm_lite_240610,[],knowledge +461,dbrx_instructruct,0.694,medqa,helm_lite_240610,[],knowledge +462,mistral_v0.1_7b,0.525,medqa,helm_lite_240610,[],knowledge +463,mistral_small_2402,0.616,medqa,helm_lite_240610,[],knowledge +464,mistral_medium_2312,0.61,medqa,helm_lite_240610,[],knowledge +465,qwen1.5_7b,0.479,medqa,helm_lite_240610,[],knowledge +466,claude_3_haiku_20240307,0.702,medqa,helm_lite_240610,[],knowledge +467,yi_6b,0.497,medqa,helm_lite_240610,[],knowledge +468,llama_2_13b,0.392,medqa,helm_lite_240610,[],knowledge +469,jurassic_2_jumbo_178b,0.431,medqa,helm_lite_240610,[],knowledge +470,falcon_40b,0.419,medqa,helm_lite_240610,[],knowledge +471,phi_2,0.41,medqa,helm_lite_240610,[],knowledge +472,jurassic_2_grande_17b,0.39,medqa,helm_lite_240610,[],knowledge +473,llama_2_7b,0.392,medqa,helm_lite_240610,[],knowledge +474,luminous_supreme_70b,0.276,medqa,helm_lite_240610,[],knowledge +475,cohere_command_light,0.312,medqa,helm_lite_240610,[],knowledge +476,luminous_extended_30b,0.276,medqa,helm_lite_240610,[],knowledge +477,falcon_7b,0.254,medqa,helm_lite_240610,[],knowledge +478,olmo_7b,0.229,medqa,helm_lite_240610,[],knowledge +479,luminous_base_13b,0.26,medqa,helm_lite_240610,[],knowledge +480,gpt_4_0613,0.211,wmt_2014,helm_lite_240610,[],mt +481,llama_3_70b,0.225,wmt_2014,helm_lite_240610,[],mt +482,mixtral_8x22b,0.209,wmt_2014,helm_lite_240610,[],mt +483,palmyra_x_v3_72b,0.262,wmt_2014,helm_lite_240610,[],mt +484,gpt_4_turbo_1106_preview,0.205,wmt_2014,helm_lite_240610,[],mt +485,palm_2_unicorn,0.26,wmt_2014,helm_lite_240610,[],mt +486,claude_3_opus_20240229,0.24,wmt_2014,helm_lite_240610,[],mt +487,qwen1.5_72b,0.201,wmt_2014,helm_lite_240610,[],mt +488,palmyra_x_v2_33b,0.239,wmt_2014,helm_lite_240610,[],mt +489,yi_34b,0.172,wmt_2014,helm_lite_240610,[],mt +490,qwen1.5_32b,0.193,wmt_2014,helm_lite_240610,[],mt +491,claude_v1.3,0.219,wmt_2014,helm_lite_240610,[],mt +492,mixtral_8x7b_32k_seqlen,0.19,wmt_2014,helm_lite_240610,[],mt +493,palm_2_bison,0.241,wmt_2014,helm_lite_240610,[],mt +494,claude_2.0,0.219,wmt_2014,helm_lite_240610,[],mt +495,deepseek_llm_chat_67b,0.186,wmt_2014,helm_lite_240610,[],mt +496,llama_2_70b,0.196,wmt_2014,helm_lite_240610,[],mt +497,claude_2.1,0.204,wmt_2014,helm_lite_240610,[],mt +498,gpt_3.5_text_davinci_003,0.191,wmt_2014,helm_lite_240610,[],mt +499,qwen1.5_14b,0.178,wmt_2014,helm_lite_240610,[],mt +500,claude_instant_1.2,0.194,wmt_2014,helm_lite_240610,[],mt +501,llama_3_8b,0.183,wmt_2014,helm_lite_240610,[],mt +502,gpt_3.5_turbo_0613,0.187,wmt_2014,helm_lite_240610,[],mt +503,gemma_7b,0.187,wmt_2014,helm_lite_240610,[],mt +504,claude_3_sonnet_20240229,0.218,wmt_2014,helm_lite_240610,[],mt +505,gpt_3.5_text_davinci_002,0.174,wmt_2014,helm_lite_240610,[],mt +506,llama_65b,0.189,wmt_2014,helm_lite_240610,[],mt +507,mistral_large_2402,0.182,wmt_2014,helm_lite_240610,[],mt +508,cohere_command,0.088,wmt_2014,helm_lite_240610,[],mt +509,dbrx_instructruct,0.131,wmt_2014,helm_lite_240610,[],mt +510,mistral_v0.1_7b,0.16,wmt_2014,helm_lite_240610,[],mt +511,mistral_small_2402,0.169,wmt_2014,helm_lite_240610,[],mt +512,mistral_medium_2312,0.169,wmt_2014,helm_lite_240610,[],mt +513,qwen1.5_7b,0.153,wmt_2014,helm_lite_240610,[],mt +514,claude_3_haiku_20240307,0.148,wmt_2014,helm_lite_240610,[],mt +515,yi_6b,0.117,wmt_2014,helm_lite_240610,[],mt +516,llama_2_13b,0.167,wmt_2014,helm_lite_240610,[],mt +517,jurassic_2_jumbo_178b,0.114,wmt_2014,helm_lite_240610,[],mt +518,falcon_40b,0.162,wmt_2014,helm_lite_240610,[],mt +519,phi_2,0.038,wmt_2014,helm_lite_240610,[],mt +520,jurassic_2_grande_17b,0.102,wmt_2014,helm_lite_240610,[],mt +521,llama_2_7b,0.144,wmt_2014,helm_lite_240610,[],mt +522,luminous_supreme_70b,0.102,wmt_2014,helm_lite_240610,[],mt +523,cohere_command_light,0.023,wmt_2014,helm_lite_240610,[],mt +524,luminous_extended_30b,0.083,wmt_2014,helm_lite_240610,[],mt +525,falcon_7b,0.094,wmt_2014,helm_lite_240610,[],mt +526,olmo_7b,0.097,wmt_2014,helm_lite_240610,[],mt +527,luminous_base_13b,0.066,wmt_2014,helm_lite_240610,[],mt +0,llama_2_70b,0.944,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +1,llama_65b,0.908,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +2,text_davinci_002,0.905,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +3,mistral_v0.1_7b,0.884,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +4,cohere_command_beta_52.4b,0.874,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +5,text_davinci_003,0.872,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +6,jurassic_2_jumbo_178b,0.824,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +7,llama_2_13b,0.823,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +8,tnlg_v2_530b,0.787,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +9,gpt_3.5_turbo_0613,0.783,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +10,llama_30b,0.781,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +11,anthropic_lm_v4_s3_52b,0.78,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +12,gpt_3.5_turbo_0301,0.76,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +13,jurassic_2_grande_17b,0.743,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +14,palmyra_x_43b,0.732,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +15,falcon_40b,0.729,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +16,falcon_instruct_40b,0.727,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +17,mpt_instruct_30b,0.716,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +18,mpt_30b,0.714,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +19,j1_grande_v2_beta_17b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +20,vicuna_v1.3_13b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +21,cohere_command_beta_6.1b,0.675,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +22,cohere_xlarge_v20221108_52.4b,0.664,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +23,luminous_supreme_70b,0.662,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +24,vicuna_v1.3_7b,0.625,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +25,opt_175b,0.609,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +26,llama_2_7b,0.607,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +27,llama_13b,0.595,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +28,instructpalmyra_30b,0.568,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +29,cohere_xlarge_v20220609_52.4b,0.56,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +30,jurassic_2_large_7.5b,0.553,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +31,davinci_175b,0.538,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +32,llama_7b,0.533,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +33,redpajama_incite_instruct_7b,0.524,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +34,j1_jumbo_v1_178b,0.517,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +35,glm_130b,0.512,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +36,luminous_extended_30b,0.485,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +37,opt_66b,0.448,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +38,bloom_176b,0.446,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +39,j1_grande_v1_17b,0.433,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +40,alpaca_7b,0.381,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +41,falcon_7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +42,redpajama_incite_base_7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +43,cohere_large_v20220720_13.1b,0.372,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +44,redpajama_incite_instruct_v1_3b,0.366,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +45,text_curie_001,0.36,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +46,gpt_neox_20b,0.351,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +47,luminous_base_13b,0.315,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +48,cohere_medium_v20221108_6.1b,0.312,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +49,redpajama_incite_base_v1_3b,0.311,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +50,tnlg_v2_6.7b,0.309,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +51,j1_large_v1_7.5b,0.285,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +52,gpt_j_6b,0.273,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +53,pythia_12b,0.257,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +54,curie_6.7b,0.247,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +55,falcon_instruct_7b,0.244,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +56,cohere_medium_v20220720_6.1b,0.23,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +57,text_babbage_001,0.229,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +58,t0pp_11b,0.197,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +59,pythia_6.9b,0.196,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +60,ul2_20b,0.167,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +61,t5_11b,0.131,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +62,babbage_1.3b,0.114,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +63,cohere_small_v20220720_410m,0.109,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +64,ada_350m,0.108,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +65,text_ada_001,0.107,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +66,yalm_100b,0.075,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic +67,llama_2_70b,0.582,mmlu,helm_classic_240130,[],knowledge +68,llama_65b,0.584,mmlu,helm_classic_240130,[],knowledge +69,text_davinci_002,0.568,mmlu,helm_classic_240130,[],knowledge +70,mistral_v0.1_7b,0.572,mmlu,helm_classic_240130,[],knowledge +71,cohere_command_beta_52.4b,0.452,mmlu,helm_classic_240130,[],knowledge +72,text_davinci_003,0.569,mmlu,helm_classic_240130,[],knowledge +73,jurassic_2_jumbo_178b,0.48,mmlu,helm_classic_240130,[],knowledge +74,llama_2_13b,0.507,mmlu,helm_classic_240130,[],knowledge +75,tnlg_v2_530b,0.469,mmlu,helm_classic_240130,[],knowledge +76,gpt_3.5_turbo_0613,0.391,mmlu,helm_classic_240130,[],knowledge +77,llama_30b,0.531,mmlu,helm_classic_240130,[],knowledge +78,anthropic_lm_v4_s3_52b,0.481,mmlu,helm_classic_240130,[],knowledge +79,gpt_3.5_turbo_0301,0.59,mmlu,helm_classic_240130,[],knowledge +80,jurassic_2_grande_17b,0.475,mmlu,helm_classic_240130,[],knowledge +81,palmyra_x_43b,0.609,mmlu,helm_classic_240130,[],knowledge +82,falcon_40b,0.509,mmlu,helm_classic_240130,[],knowledge +83,falcon_instruct_40b,0.497,mmlu,helm_classic_240130,[],knowledge +84,mpt_instruct_30b,0.444,mmlu,helm_classic_240130,[],knowledge +85,mpt_30b,0.437,mmlu,helm_classic_240130,[],knowledge +86,j1_grande_v2_beta_17b,0.445,mmlu,helm_classic_240130,[],knowledge +87,vicuna_v1.3_13b,0.462,mmlu,helm_classic_240130,[],knowledge +88,cohere_command_beta_6.1b,0.406,mmlu,helm_classic_240130,[],knowledge +89,cohere_xlarge_v20221108_52.4b,0.382,mmlu,helm_classic_240130,[],knowledge +90,luminous_supreme_70b,0.38,mmlu,helm_classic_240130,[],knowledge +91,vicuna_v1.3_7b,0.434,mmlu,helm_classic_240130,[],knowledge +92,opt_175b,0.318,mmlu,helm_classic_240130,[],knowledge +93,llama_2_7b,0.431,mmlu,helm_classic_240130,[],knowledge +94,llama_13b,0.422,mmlu,helm_classic_240130,[],knowledge +95,instructpalmyra_30b,0.403,mmlu,helm_classic_240130,[],knowledge +96,cohere_xlarge_v20220609_52.4b,0.353,mmlu,helm_classic_240130,[],knowledge +97,jurassic_2_large_7.5b,0.339,mmlu,helm_classic_240130,[],knowledge +98,davinci_175b,0.422,mmlu,helm_classic_240130,[],knowledge +99,llama_7b,0.321,mmlu,helm_classic_240130,[],knowledge +100,redpajama_incite_instruct_7b,0.363,mmlu,helm_classic_240130,[],knowledge +101,j1_jumbo_v1_178b,0.259,mmlu,helm_classic_240130,[],knowledge +102,glm_130b,0.344,mmlu,helm_classic_240130,[],knowledge +103,luminous_extended_30b,0.321,mmlu,helm_classic_240130,[],knowledge +104,opt_66b,0.276,mmlu,helm_classic_240130,[],knowledge +105,bloom_176b,0.299,mmlu,helm_classic_240130,[],knowledge +106,j1_grande_v1_17b,0.27,mmlu,helm_classic_240130,[],knowledge +107,alpaca_7b,0.385,mmlu,helm_classic_240130,[],knowledge +108,falcon_7b,0.286,mmlu,helm_classic_240130,[],knowledge +109,redpajama_incite_base_7b,0.302,mmlu,helm_classic_240130,[],knowledge +110,cohere_large_v20220720_13.1b,0.324,mmlu,helm_classic_240130,[],knowledge +111,redpajama_incite_instruct_v1_3b,0.257,mmlu,helm_classic_240130,[],knowledge +112,text_curie_001,0.237,mmlu,helm_classic_240130,[],knowledge +113,gpt_neox_20b,0.276,mmlu,helm_classic_240130,[],knowledge +114,luminous_base_13b,0.27,mmlu,helm_classic_240130,[],knowledge +115,cohere_medium_v20221108_6.1b,0.254,mmlu,helm_classic_240130,[],knowledge +116,redpajama_incite_base_v1_3b,0.263,mmlu,helm_classic_240130,[],knowledge +117,tnlg_v2_6.7b,0.242,mmlu,helm_classic_240130,[],knowledge +118,j1_large_v1_7.5b,0.241,mmlu,helm_classic_240130,[],knowledge +119,gpt_j_6b,0.249,mmlu,helm_classic_240130,[],knowledge +120,pythia_12b,0.274,mmlu,helm_classic_240130,[],knowledge +121,curie_6.7b,0.243,mmlu,helm_classic_240130,[],knowledge +122,falcon_instruct_7b,0.275,mmlu,helm_classic_240130,[],knowledge +123,cohere_medium_v20220720_6.1b,0.279,mmlu,helm_classic_240130,[],knowledge +124,text_babbage_001,0.229,mmlu,helm_classic_240130,[],knowledge +125,t0pp_11b,0.407,mmlu,helm_classic_240130,[],knowledge +126,pythia_6.9b,0.236,mmlu,helm_classic_240130,[],knowledge +127,ul2_20b,0.291,mmlu,helm_classic_240130,[],knowledge +128,t5_11b,0.29,mmlu,helm_classic_240130,[],knowledge +129,babbage_1.3b,0.235,mmlu,helm_classic_240130,[],knowledge +130,cohere_small_v20220720_410m,0.264,mmlu,helm_classic_240130,[],knowledge +131,ada_350m,0.243,mmlu,helm_classic_240130,[],knowledge +132,text_ada_001,0.238,mmlu,helm_classic_240130,[],knowledge +133,yalm_100b,0.243,mmlu,helm_classic_240130,[],knowledge +134,llama_2_70b,0.886,boolq,helm_classic_240130,[],knowledge +135,llama_65b,0.871,boolq,helm_classic_240130,[],knowledge +136,text_davinci_002,0.877,boolq,helm_classic_240130,[],knowledge +137,mistral_v0.1_7b,0.874,boolq,helm_classic_240130,[],knowledge +138,cohere_command_beta_52.4b,0.856,boolq,helm_classic_240130,[],knowledge +139,text_davinci_003,0.881,boolq,helm_classic_240130,[],knowledge +140,jurassic_2_jumbo_178b,0.829,boolq,helm_classic_240130,[],knowledge +141,llama_2_13b,0.811,boolq,helm_classic_240130,[],knowledge +142,tnlg_v2_530b,0.809,boolq,helm_classic_240130,[],knowledge +143,gpt_3.5_turbo_0613,0.87,boolq,helm_classic_240130,[],knowledge +144,llama_30b,0.861,boolq,helm_classic_240130,[],knowledge +145,anthropic_lm_v4_s3_52b,0.815,boolq,helm_classic_240130,[],knowledge +146,gpt_3.5_turbo_0301,0.74,boolq,helm_classic_240130,[],knowledge +147,jurassic_2_grande_17b,0.826,boolq,helm_classic_240130,[],knowledge +148,palmyra_x_43b,0.896,boolq,helm_classic_240130,[],knowledge +149,falcon_40b,0.819,boolq,helm_classic_240130,[],knowledge +150,falcon_instruct_40b,0.829,boolq,helm_classic_240130,[],knowledge +151,mpt_instruct_30b,0.85,boolq,helm_classic_240130,[],knowledge +152,mpt_30b,0.704,boolq,helm_classic_240130,[],knowledge +153,j1_grande_v2_beta_17b,0.812,boolq,helm_classic_240130,[],knowledge +154,vicuna_v1.3_13b,0.808,boolq,helm_classic_240130,[],knowledge +155,cohere_command_beta_6.1b,0.798,boolq,helm_classic_240130,[],knowledge +156,cohere_xlarge_v20221108_52.4b,0.762,boolq,helm_classic_240130,[],knowledge +157,luminous_supreme_70b,0.775,boolq,helm_classic_240130,[],knowledge +158,vicuna_v1.3_7b,0.76,boolq,helm_classic_240130,[],knowledge +159,opt_175b,0.793,boolq,helm_classic_240130,[],knowledge +160,llama_2_7b,0.762,boolq,helm_classic_240130,[],knowledge +161,llama_13b,0.714,boolq,helm_classic_240130,[],knowledge +162,instructpalmyra_30b,0.751,boolq,helm_classic_240130,[],knowledge +163,cohere_xlarge_v20220609_52.4b,0.718,boolq,helm_classic_240130,[],knowledge +164,jurassic_2_large_7.5b,0.742,boolq,helm_classic_240130,[],knowledge +165,davinci_175b,0.722,boolq,helm_classic_240130,[],knowledge +166,llama_7b,0.756,boolq,helm_classic_240130,[],knowledge +167,redpajama_incite_instruct_7b,0.705,boolq,helm_classic_240130,[],knowledge +168,j1_jumbo_v1_178b,0.776,boolq,helm_classic_240130,[],knowledge +169,glm_130b,0.784,boolq,helm_classic_240130,[],knowledge +170,luminous_extended_30b,0.767,boolq,helm_classic_240130,[],knowledge +171,opt_66b,0.76,boolq,helm_classic_240130,[],knowledge +172,bloom_176b,0.704,boolq,helm_classic_240130,[],knowledge +173,j1_grande_v1_17b,0.722,boolq,helm_classic_240130,[],knowledge +174,alpaca_7b,0.778,boolq,helm_classic_240130,[],knowledge +175,falcon_7b,0.753,boolq,helm_classic_240130,[],knowledge +176,redpajama_incite_base_7b,0.713,boolq,helm_classic_240130,[],knowledge +177,cohere_large_v20220720_13.1b,0.725,boolq,helm_classic_240130,[],knowledge +178,redpajama_incite_instruct_v1_3b,0.677,boolq,helm_classic_240130,[],knowledge +179,text_curie_001,0.62,boolq,helm_classic_240130,[],knowledge +180,gpt_neox_20b,0.683,boolq,helm_classic_240130,[],knowledge +181,luminous_base_13b,0.719,boolq,helm_classic_240130,[],knowledge +182,cohere_medium_v20221108_6.1b,0.7,boolq,helm_classic_240130,[],knowledge +183,redpajama_incite_base_v1_3b,0.685,boolq,helm_classic_240130,[],knowledge +184,tnlg_v2_6.7b,0.698,boolq,helm_classic_240130,[],knowledge +185,j1_large_v1_7.5b,0.683,boolq,helm_classic_240130,[],knowledge +186,gpt_j_6b,0.649,boolq,helm_classic_240130,[],knowledge +187,pythia_12b,0.662,boolq,helm_classic_240130,[],knowledge +188,curie_6.7b,0.656,boolq,helm_classic_240130,[],knowledge +189,falcon_instruct_7b,0.72,boolq,helm_classic_240130,[],knowledge +190,cohere_medium_v20220720_6.1b,0.659,boolq,helm_classic_240130,[],knowledge +191,text_babbage_001,0.451,boolq,helm_classic_240130,[],knowledge +192,t0pp_11b,0.0,boolq,helm_classic_240130,[],knowledge +193,pythia_6.9b,0.631,boolq,helm_classic_240130,[],knowledge +194,ul2_20b,0.746,boolq,helm_classic_240130,[],knowledge +195,t5_11b,0.761,boolq,helm_classic_240130,[],knowledge +196,babbage_1.3b,0.574,boolq,helm_classic_240130,[],knowledge +197,cohere_small_v20220720_410m,0.457,boolq,helm_classic_240130,[],knowledge +198,ada_350m,0.581,boolq,helm_classic_240130,[],knowledge +199,text_ada_001,0.464,boolq,helm_classic_240130,[],knowledge +200,yalm_100b,0.634,boolq,helm_classic_240130,[],knowledge +201,llama_2_70b,0.77,narrativeqa,helm_classic_240130,[],knowledge +202,llama_65b,0.755,narrativeqa,helm_classic_240130,[],knowledge +203,text_davinci_002,0.727,narrativeqa,helm_classic_240130,[],knowledge +204,mistral_v0.1_7b,0.716,narrativeqa,helm_classic_240130,[],knowledge +205,cohere_command_beta_52.4b,0.752,narrativeqa,helm_classic_240130,[],knowledge +206,text_davinci_003,0.727,narrativeqa,helm_classic_240130,[],knowledge +207,jurassic_2_jumbo_178b,0.733,narrativeqa,helm_classic_240130,[],knowledge +208,llama_2_13b,0.744,narrativeqa,helm_classic_240130,[],knowledge +209,tnlg_v2_530b,0.722,narrativeqa,helm_classic_240130,[],knowledge +210,gpt_3.5_turbo_0613,0.625,narrativeqa,helm_classic_240130,[],knowledge +211,llama_30b,0.752,narrativeqa,helm_classic_240130,[],knowledge +212,anthropic_lm_v4_s3_52b,0.728,narrativeqa,helm_classic_240130,[],knowledge +213,gpt_3.5_turbo_0301,0.663,narrativeqa,helm_classic_240130,[],knowledge +214,jurassic_2_grande_17b,0.737,narrativeqa,helm_classic_240130,[],knowledge +215,palmyra_x_43b,0.742,narrativeqa,helm_classic_240130,[],knowledge +216,falcon_40b,0.673,narrativeqa,helm_classic_240130,[],knowledge +217,falcon_instruct_40b,0.625,narrativeqa,helm_classic_240130,[],knowledge +218,mpt_instruct_30b,0.733,narrativeqa,helm_classic_240130,[],knowledge +219,mpt_30b,0.732,narrativeqa,helm_classic_240130,[],knowledge +220,j1_grande_v2_beta_17b,0.725,narrativeqa,helm_classic_240130,[],knowledge +221,vicuna_v1.3_13b,0.691,narrativeqa,helm_classic_240130,[],knowledge +222,cohere_command_beta_6.1b,0.709,narrativeqa,helm_classic_240130,[],knowledge +223,cohere_xlarge_v20221108_52.4b,0.672,narrativeqa,helm_classic_240130,[],knowledge +224,luminous_supreme_70b,0.711,narrativeqa,helm_classic_240130,[],knowledge +225,vicuna_v1.3_7b,0.643,narrativeqa,helm_classic_240130,[],knowledge +226,opt_175b,0.671,narrativeqa,helm_classic_240130,[],knowledge +227,llama_2_7b,0.691,narrativeqa,helm_classic_240130,[],knowledge +228,llama_13b,0.711,narrativeqa,helm_classic_240130,[],knowledge +229,instructpalmyra_30b,0.496,narrativeqa,helm_classic_240130,[],knowledge +230,cohere_xlarge_v20220609_52.4b,0.65,narrativeqa,helm_classic_240130,[],knowledge +232,davinci_175b,0.687,narrativeqa,helm_classic_240130,[],knowledge +233,llama_7b,0.669,narrativeqa,helm_classic_240130,[],knowledge +234,redpajama_incite_instruct_7b,0.638,narrativeqa,helm_classic_240130,[],knowledge +235,j1_jumbo_v1_178b,0.695,narrativeqa,helm_classic_240130,[],knowledge +236,glm_130b,0.706,narrativeqa,helm_classic_240130,[],knowledge +237,luminous_extended_30b,0.665,narrativeqa,helm_classic_240130,[],knowledge +238,opt_66b,0.638,narrativeqa,helm_classic_240130,[],knowledge +239,bloom_176b,0.662,narrativeqa,helm_classic_240130,[],knowledge +240,j1_grande_v1_17b,0.672,narrativeqa,helm_classic_240130,[],knowledge +241,alpaca_7b,0.396,narrativeqa,helm_classic_240130,[],knowledge +242,falcon_7b,0.621,narrativeqa,helm_classic_240130,[],knowledge +243,redpajama_incite_base_7b,0.617,narrativeqa,helm_classic_240130,[],knowledge +244,cohere_large_v20220720_13.1b,0.625,narrativeqa,helm_classic_240130,[],knowledge +245,redpajama_incite_instruct_v1_3b,0.638,narrativeqa,helm_classic_240130,[],knowledge +246,text_curie_001,0.582,narrativeqa,helm_classic_240130,[],knowledge +247,gpt_neox_20b,0.599,narrativeqa,helm_classic_240130,[],knowledge +248,luminous_base_13b,0.605,narrativeqa,helm_classic_240130,[],knowledge +249,cohere_medium_v20221108_6.1b,0.61,narrativeqa,helm_classic_240130,[],knowledge +250,redpajama_incite_base_v1_3b,0.555,narrativeqa,helm_classic_240130,[],knowledge +251,tnlg_v2_6.7b,0.631,narrativeqa,helm_classic_240130,[],knowledge +252,j1_large_v1_7.5b,0.623,narrativeqa,helm_classic_240130,[],knowledge +253,gpt_j_6b,0.545,narrativeqa,helm_classic_240130,[],knowledge +254,pythia_12b,0.596,narrativeqa,helm_classic_240130,[],knowledge +255,curie_6.7b,0.604,narrativeqa,helm_classic_240130,[],knowledge +256,falcon_instruct_7b,0.476,narrativeqa,helm_classic_240130,[],knowledge +257,cohere_medium_v20220720_6.1b,0.559,narrativeqa,helm_classic_240130,[],knowledge +258,text_babbage_001,0.429,narrativeqa,helm_classic_240130,[],knowledge +259,t0pp_11b,0.151,narrativeqa,helm_classic_240130,[],knowledge +260,pythia_6.9b,0.528,narrativeqa,helm_classic_240130,[],knowledge +261,ul2_20b,0.083,narrativeqa,helm_classic_240130,[],knowledge +262,t5_11b,0.086,narrativeqa,helm_classic_240130,[],knowledge +263,babbage_1.3b,0.491,narrativeqa,helm_classic_240130,[],knowledge +264,cohere_small_v20220720_410m,0.294,narrativeqa,helm_classic_240130,[],knowledge +265,ada_350m,0.326,narrativeqa,helm_classic_240130,[],knowledge +266,text_ada_001,0.238,narrativeqa,helm_classic_240130,[],knowledge +267,yalm_100b,0.252,narrativeqa,helm_classic_240130,[],knowledge +268,llama_2_70b,0.458,naturalquestions_closed,helm_classic_240130,[],knowledge +269,llama_65b,0.431,naturalquestions_closed,helm_classic_240130,[],knowledge +270,text_davinci_002,0.383,naturalquestions_closed,helm_classic_240130,[],knowledge +271,mistral_v0.1_7b,0.365,naturalquestions_closed,helm_classic_240130,[],knowledge +272,cohere_command_beta_52.4b,0.372,naturalquestions_closed,helm_classic_240130,[],knowledge +273,text_davinci_003,0.406,naturalquestions_closed,helm_classic_240130,[],knowledge +274,jurassic_2_jumbo_178b,0.385,naturalquestions_closed,helm_classic_240130,[],knowledge +275,llama_2_13b,0.376,naturalquestions_closed,helm_classic_240130,[],knowledge +276,tnlg_v2_530b,0.384,naturalquestions_closed,helm_classic_240130,[],knowledge +277,gpt_3.5_turbo_0613,0.348,naturalquestions_closed,helm_classic_240130,[],knowledge +278,llama_30b,0.408,naturalquestions_closed,helm_classic_240130,[],knowledge +279,anthropic_lm_v4_s3_52b,0.288,naturalquestions_closed,helm_classic_240130,[],knowledge +280,gpt_3.5_turbo_0301,0.39,naturalquestions_closed,helm_classic_240130,[],knowledge +281,jurassic_2_grande_17b,0.356,naturalquestions_closed,helm_classic_240130,[],knowledge +282,palmyra_x_43b,0.413,naturalquestions_closed,helm_classic_240130,[],knowledge +283,falcon_40b,0.392,naturalquestions_closed,helm_classic_240130,[],knowledge +284,falcon_instruct_40b,0.377,naturalquestions_closed,helm_classic_240130,[],knowledge +285,mpt_instruct_30b,0.304,naturalquestions_closed,helm_classic_240130,[],knowledge +286,mpt_30b,0.347,naturalquestions_closed,helm_classic_240130,[],knowledge +287,j1_grande_v2_beta_17b,0.337,naturalquestions_closed,helm_classic_240130,[],knowledge +288,vicuna_v1.3_13b,0.346,naturalquestions_closed,helm_classic_240130,[],knowledge +289,cohere_command_beta_6.1b,0.229,naturalquestions_closed,helm_classic_240130,[],knowledge +290,cohere_xlarge_v20221108_52.4b,0.361,naturalquestions_closed,helm_classic_240130,[],knowledge +291,luminous_supreme_70b,0.293,naturalquestions_closed,helm_classic_240130,[],knowledge +292,vicuna_v1.3_7b,0.287,naturalquestions_closed,helm_classic_240130,[],knowledge +293,opt_175b,0.297,naturalquestions_closed,helm_classic_240130,[],knowledge +294,llama_2_7b,0.337,naturalquestions_closed,helm_classic_240130,[],knowledge +295,llama_13b,0.346,naturalquestions_closed,helm_classic_240130,[],knowledge +296,instructpalmyra_30b,0.33,naturalquestions_closed,helm_classic_240130,[],knowledge +297,cohere_xlarge_v20220609_52.4b,0.312,naturalquestions_closed,helm_classic_240130,[],knowledge +298,jurassic_2_large_7.5b,0.274,naturalquestions_closed,helm_classic_240130,[],knowledge +299,davinci_175b,0.329,naturalquestions_closed,helm_classic_240130,[],knowledge +300,llama_7b,0.297,naturalquestions_closed,helm_classic_240130,[],knowledge +301,redpajama_incite_instruct_7b,0.232,naturalquestions_closed,helm_classic_240130,[],knowledge +302,j1_jumbo_v1_178b,0.293,naturalquestions_closed,helm_classic_240130,[],knowledge +303,glm_130b,0.148,naturalquestions_closed,helm_classic_240130,[],knowledge +304,luminous_extended_30b,0.254,naturalquestions_closed,helm_classic_240130,[],knowledge +305,opt_66b,0.258,naturalquestions_closed,helm_classic_240130,[],knowledge +306,bloom_176b,0.216,naturalquestions_closed,helm_classic_240130,[],knowledge +307,j1_grande_v1_17b,0.233,naturalquestions_closed,helm_classic_240130,[],knowledge +308,alpaca_7b,0.266,naturalquestions_closed,helm_classic_240130,[],knowledge +309,falcon_7b,0.285,naturalquestions_closed,helm_classic_240130,[],knowledge +310,redpajama_incite_base_7b,0.25,naturalquestions_closed,helm_classic_240130,[],knowledge +311,cohere_large_v20220720_13.1b,0.232,naturalquestions_closed,helm_classic_240130,[],knowledge +312,redpajama_incite_instruct_v1_3b,0.203,naturalquestions_closed,helm_classic_240130,[],knowledge +313,text_curie_001,0.175,naturalquestions_closed,helm_classic_240130,[],knowledge +314,gpt_neox_20b,0.193,naturalquestions_closed,helm_classic_240130,[],knowledge +315,luminous_base_13b,0.202,naturalquestions_closed,helm_classic_240130,[],knowledge +316,cohere_medium_v20221108_6.1b,0.199,naturalquestions_closed,helm_classic_240130,[],knowledge +317,redpajama_incite_base_v1_3b,0.207,naturalquestions_closed,helm_classic_240130,[],knowledge +318,tnlg_v2_6.7b,0.21,naturalquestions_closed,helm_classic_240130,[],knowledge +319,j1_large_v1_7.5b,0.19,naturalquestions_closed,helm_classic_240130,[],knowledge +320,gpt_j_6b,0.156,naturalquestions_closed,helm_classic_240130,[],knowledge +321,pythia_12b,0.175,naturalquestions_closed,helm_classic_240130,[],knowledge +322,curie_6.7b,0.199,naturalquestions_closed,helm_classic_240130,[],knowledge +323,falcon_instruct_7b,0.194,naturalquestions_closed,helm_classic_240130,[],knowledge +324,cohere_medium_v20220720_6.1b,0.177,naturalquestions_closed,helm_classic_240130,[],knowledge +325,text_babbage_001,0.07,naturalquestions_closed,helm_classic_240130,[],knowledge +326,t0pp_11b,0.039,naturalquestions_closed,helm_classic_240130,[],knowledge +327,pythia_6.9b,0.142,naturalquestions_closed,helm_classic_240130,[],knowledge +328,ul2_20b,0.204,naturalquestions_closed,helm_classic_240130,[],knowledge +329,t5_11b,0.194,naturalquestions_closed,helm_classic_240130,[],knowledge +330,babbage_1.3b,0.119,naturalquestions_closed,helm_classic_240130,[],knowledge +331,cohere_small_v20220720_410m,0.078,naturalquestions_closed,helm_classic_240130,[],knowledge +332,ada_350m,0.082,naturalquestions_closed,helm_classic_240130,[],knowledge +333,text_ada_001,0.025,naturalquestions_closed,helm_classic_240130,[],knowledge +334,yalm_100b,0.068,naturalquestions_closed,helm_classic_240130,[],knowledge +335,llama_2_70b,0.674,naturalquestions_open,helm_classic_240130,[],knowledge +336,llama_65b,0.672,naturalquestions_open,helm_classic_240130,[],knowledge +337,text_davinci_002,0.713,naturalquestions_open,helm_classic_240130,[],knowledge +338,mistral_v0.1_7b,0.687,naturalquestions_open,helm_classic_240130,[],knowledge +339,cohere_command_beta_52.4b,0.76,naturalquestions_open,helm_classic_240130,[],knowledge +340,text_davinci_003,0.77,naturalquestions_open,helm_classic_240130,[],knowledge +341,jurassic_2_jumbo_178b,0.669,naturalquestions_open,helm_classic_240130,[],knowledge +342,llama_2_13b,0.637,naturalquestions_open,helm_classic_240130,[],knowledge +343,tnlg_v2_530b,0.642,naturalquestions_open,helm_classic_240130,[],knowledge +344,gpt_3.5_turbo_0613,0.675,naturalquestions_open,helm_classic_240130,[],knowledge +345,llama_30b,0.666,naturalquestions_open,helm_classic_240130,[],knowledge +346,anthropic_lm_v4_s3_52b,0.686,naturalquestions_open,helm_classic_240130,[],knowledge +347,gpt_3.5_turbo_0301,0.624,naturalquestions_open,helm_classic_240130,[],knowledge +348,jurassic_2_grande_17b,0.639,naturalquestions_open,helm_classic_240130,[],knowledge +350,falcon_40b,0.675,naturalquestions_open,helm_classic_240130,[],knowledge +351,falcon_instruct_40b,0.666,naturalquestions_open,helm_classic_240130,[],knowledge +352,mpt_instruct_30b,0.697,naturalquestions_open,helm_classic_240130,[],knowledge +353,mpt_30b,0.673,naturalquestions_open,helm_classic_240130,[],knowledge +354,j1_grande_v2_beta_17b,0.625,naturalquestions_open,helm_classic_240130,[],knowledge +355,vicuna_v1.3_13b,0.686,naturalquestions_open,helm_classic_240130,[],knowledge +356,cohere_command_beta_6.1b,0.717,naturalquestions_open,helm_classic_240130,[],knowledge +357,cohere_xlarge_v20221108_52.4b,0.628,naturalquestions_open,helm_classic_240130,[],knowledge +358,luminous_supreme_70b,0.649,naturalquestions_open,helm_classic_240130,[],knowledge +359,vicuna_v1.3_7b,0.634,naturalquestions_open,helm_classic_240130,[],knowledge +360,opt_175b,0.615,naturalquestions_open,helm_classic_240130,[],knowledge +361,llama_2_7b,0.611,naturalquestions_open,helm_classic_240130,[],knowledge +362,llama_13b,0.614,naturalquestions_open,helm_classic_240130,[],knowledge +363,instructpalmyra_30b,0.682,naturalquestions_open,helm_classic_240130,[],knowledge +364,cohere_xlarge_v20220609_52.4b,0.595,naturalquestions_open,helm_classic_240130,[],knowledge +365,jurassic_2_large_7.5b,0.589,naturalquestions_open,helm_classic_240130,[],knowledge +366,davinci_175b,0.625,naturalquestions_open,helm_classic_240130,[],knowledge +367,llama_7b,0.589,naturalquestions_open,helm_classic_240130,[],knowledge +368,redpajama_incite_instruct_7b,0.659,naturalquestions_open,helm_classic_240130,[],knowledge +369,j1_jumbo_v1_178b,0.595,naturalquestions_open,helm_classic_240130,[],knowledge +370,glm_130b,0.642,naturalquestions_open,helm_classic_240130,[],knowledge +371,luminous_extended_30b,0.609,naturalquestions_open,helm_classic_240130,[],knowledge +372,opt_66b,0.596,naturalquestions_open,helm_classic_240130,[],knowledge +373,bloom_176b,0.621,naturalquestions_open,helm_classic_240130,[],knowledge +374,j1_grande_v1_17b,0.578,naturalquestions_open,helm_classic_240130,[],knowledge +375,alpaca_7b,0.592,naturalquestions_open,helm_classic_240130,[],knowledge +376,falcon_7b,0.579,naturalquestions_open,helm_classic_240130,[],knowledge +377,redpajama_incite_base_7b,0.586,naturalquestions_open,helm_classic_240130,[],knowledge +378,cohere_large_v20220720_13.1b,0.573,naturalquestions_open,helm_classic_240130,[],knowledge +379,redpajama_incite_instruct_v1_3b,0.637,naturalquestions_open,helm_classic_240130,[],knowledge +380,text_curie_001,0.571,naturalquestions_open,helm_classic_240130,[],knowledge +381,gpt_neox_20b,0.596,naturalquestions_open,helm_classic_240130,[],knowledge +382,luminous_base_13b,0.568,naturalquestions_open,helm_classic_240130,[],knowledge +383,cohere_medium_v20221108_6.1b,0.517,naturalquestions_open,helm_classic_240130,[],knowledge +384,redpajama_incite_base_v1_3b,0.52,naturalquestions_open,helm_classic_240130,[],knowledge +385,tnlg_v2_6.7b,0.561,naturalquestions_open,helm_classic_240130,[],knowledge +386,j1_large_v1_7.5b,0.532,naturalquestions_open,helm_classic_240130,[],knowledge +387,gpt_j_6b,0.559,naturalquestions_open,helm_classic_240130,[],knowledge +388,pythia_12b,0.581,naturalquestions_open,helm_classic_240130,[],knowledge +389,curie_6.7b,0.552,naturalquestions_open,helm_classic_240130,[],knowledge +390,falcon_instruct_7b,0.449,naturalquestions_open,helm_classic_240130,[],knowledge +391,cohere_medium_v20220720_6.1b,0.504,naturalquestions_open,helm_classic_240130,[],knowledge +392,text_babbage_001,0.33,naturalquestions_open,helm_classic_240130,[],knowledge +393,t0pp_11b,0.19,naturalquestions_open,helm_classic_240130,[],knowledge +394,pythia_6.9b,0.539,naturalquestions_open,helm_classic_240130,[],knowledge +395,ul2_20b,0.349,naturalquestions_open,helm_classic_240130,[],knowledge +396,t5_11b,0.477,naturalquestions_open,helm_classic_240130,[],knowledge +397,babbage_1.3b,0.451,naturalquestions_open,helm_classic_240130,[],knowledge +398,cohere_small_v20220720_410m,0.309,naturalquestions_open,helm_classic_240130,[],knowledge +399,ada_350m,0.365,naturalquestions_open,helm_classic_240130,[],knowledge +400,text_ada_001,0.149,naturalquestions_open,helm_classic_240130,[],knowledge +401,yalm_100b,0.227,naturalquestions_open,helm_classic_240130,[],knowledge +402,llama_2_70b,0.484,quac,helm_classic_240130,[],other +403,llama_65b,0.401,quac,helm_classic_240130,[],other +404,text_davinci_002,0.445,quac,helm_classic_240130,[],other +405,mistral_v0.1_7b,0.423,quac,helm_classic_240130,[],other +406,cohere_command_beta_52.4b,0.432,quac,helm_classic_240130,[],other +407,text_davinci_003,0.525,quac,helm_classic_240130,[],other +408,jurassic_2_jumbo_178b,0.435,quac,helm_classic_240130,[],other +409,llama_2_13b,0.424,quac,helm_classic_240130,[],other +410,tnlg_v2_530b,0.39,quac,helm_classic_240130,[],other +411,gpt_3.5_turbo_0613,0.485,quac,helm_classic_240130,[],other +412,llama_30b,0.39,quac,helm_classic_240130,[],other +413,anthropic_lm_v4_s3_52b,0.431,quac,helm_classic_240130,[],other +414,gpt_3.5_turbo_0301,0.512,quac,helm_classic_240130,[],other +415,jurassic_2_grande_17b,0.418,quac,helm_classic_240130,[],other +416,palmyra_x_43b,0.473,quac,helm_classic_240130,[],other +417,falcon_40b,0.307,quac,helm_classic_240130,[],other +418,falcon_instruct_40b,0.371,quac,helm_classic_240130,[],other +419,mpt_instruct_30b,0.327,quac,helm_classic_240130,[],other +420,mpt_30b,0.393,quac,helm_classic_240130,[],other +421,j1_grande_v2_beta_17b,0.392,quac,helm_classic_240130,[],other +422,vicuna_v1.3_13b,0.403,quac,helm_classic_240130,[],other +423,cohere_command_beta_6.1b,0.375,quac,helm_classic_240130,[],other +424,cohere_xlarge_v20221108_52.4b,0.374,quac,helm_classic_240130,[],other +425,luminous_supreme_70b,0.37,quac,helm_classic_240130,[],other +426,vicuna_v1.3_7b,0.392,quac,helm_classic_240130,[],other +427,opt_175b,0.36,quac,helm_classic_240130,[],other +428,llama_2_7b,0.406,quac,helm_classic_240130,[],other +429,llama_13b,0.347,quac,helm_classic_240130,[],other +430,instructpalmyra_30b,0.433,quac,helm_classic_240130,[],other +431,cohere_xlarge_v20220609_52.4b,0.361,quac,helm_classic_240130,[],other +433,davinci_175b,0.36,quac,helm_classic_240130,[],other +434,llama_7b,0.338,quac,helm_classic_240130,[],other +435,redpajama_incite_instruct_7b,0.26,quac,helm_classic_240130,[],other +436,j1_jumbo_v1_178b,0.358,quac,helm_classic_240130,[],other +437,glm_130b,0.272,quac,helm_classic_240130,[],other +438,luminous_extended_30b,0.349,quac,helm_classic_240130,[],other +439,opt_66b,0.357,quac,helm_classic_240130,[],other +440,bloom_176b,0.361,quac,helm_classic_240130,[],other +441,j1_grande_v1_17b,0.362,quac,helm_classic_240130,[],other +442,alpaca_7b,0.27,quac,helm_classic_240130,[],other +443,falcon_7b,0.332,quac,helm_classic_240130,[],other +444,redpajama_incite_base_7b,0.336,quac,helm_classic_240130,[],other +445,cohere_large_v20220720_13.1b,0.338,quac,helm_classic_240130,[],other +446,redpajama_incite_instruct_v1_3b,0.259,quac,helm_classic_240130,[],other +447,text_curie_001,0.358,quac,helm_classic_240130,[],other +448,gpt_neox_20b,0.326,quac,helm_classic_240130,[],other +449,luminous_base_13b,0.334,quac,helm_classic_240130,[],other +450,cohere_medium_v20221108_6.1b,0.314,quac,helm_classic_240130,[],other +451,redpajama_incite_base_v1_3b,0.309,quac,helm_classic_240130,[],other +452,tnlg_v2_6.7b,0.345,quac,helm_classic_240130,[],other +453,j1_large_v1_7.5b,0.328,quac,helm_classic_240130,[],other +454,gpt_j_6b,0.33,quac,helm_classic_240130,[],other +455,pythia_12b,0.313,quac,helm_classic_240130,[],other +456,curie_6.7b,0.321,quac,helm_classic_240130,[],other +457,falcon_instruct_7b,0.311,quac,helm_classic_240130,[],other +458,cohere_medium_v20220720_6.1b,0.279,quac,helm_classic_240130,[],other +459,text_babbage_001,0.284,quac,helm_classic_240130,[],other +460,t0pp_11b,0.121,quac,helm_classic_240130,[],other +461,pythia_6.9b,0.296,quac,helm_classic_240130,[],other +462,ul2_20b,0.144,quac,helm_classic_240130,[],other +463,t5_11b,0.116,quac,helm_classic_240130,[],other +464,babbage_1.3b,0.273,quac,helm_classic_240130,[],other +465,cohere_small_v20220720_410m,0.219,quac,helm_classic_240130,[],other +466,ada_350m,0.242,quac,helm_classic_240130,[],other +467,text_ada_001,0.176,quac,helm_classic_240130,[],other +468,yalm_100b,0.162,quac,helm_classic_240130,[],other +471,text_davinci_002,0.815,hellaswag,helm_classic_240130,[],reasoning +473,cohere_command_beta_52.4b,0.811,hellaswag,helm_classic_240130,[],reasoning +474,text_davinci_003,0.822,hellaswag,helm_classic_240130,[],reasoning +475,jurassic_2_jumbo_178b,0.788,hellaswag,helm_classic_240130,[],reasoning +477,tnlg_v2_530b,0.799,hellaswag,helm_classic_240130,[],reasoning +480,anthropic_lm_v4_s3_52b,0.807,hellaswag,helm_classic_240130,[],reasoning +482,jurassic_2_grande_17b,0.781,hellaswag,helm_classic_240130,[],reasoning +488,j1_grande_v2_beta_17b,0.764,hellaswag,helm_classic_240130,[],reasoning +490,cohere_command_beta_6.1b,0.752,hellaswag,helm_classic_240130,[],reasoning +491,cohere_xlarge_v20221108_52.4b,0.81,hellaswag,helm_classic_240130,[],reasoning +494,opt_175b,0.791,hellaswag,helm_classic_240130,[],reasoning +498,cohere_xlarge_v20220609_52.4b,0.811,hellaswag,helm_classic_240130,[],reasoning +499,jurassic_2_large_7.5b,0.729,hellaswag,helm_classic_240130,[],reasoning +500,davinci_175b,0.775,hellaswag,helm_classic_240130,[],reasoning +503,j1_jumbo_v1_178b,0.765,hellaswag,helm_classic_240130,[],reasoning +506,opt_66b,0.745,hellaswag,helm_classic_240130,[],reasoning +507,bloom_176b,0.744,hellaswag,helm_classic_240130,[],reasoning +508,j1_grande_v1_17b,0.739,hellaswag,helm_classic_240130,[],reasoning +512,cohere_large_v20220720_13.1b,0.736,hellaswag,helm_classic_240130,[],reasoning +514,text_curie_001,0.676,hellaswag,helm_classic_240130,[],reasoning +515,gpt_neox_20b,0.718,hellaswag,helm_classic_240130,[],reasoning +517,cohere_medium_v20221108_6.1b,0.726,hellaswag,helm_classic_240130,[],reasoning +519,tnlg_v2_6.7b,0.704,hellaswag,helm_classic_240130,[],reasoning +520,j1_large_v1_7.5b,0.7,hellaswag,helm_classic_240130,[],reasoning +521,gpt_j_6b,0.663,hellaswag,helm_classic_240130,[],reasoning +523,curie_6.7b,0.682,hellaswag,helm_classic_240130,[],reasoning +525,cohere_medium_v20220720_6.1b,0.706,hellaswag,helm_classic_240130,[],reasoning +526,text_babbage_001,0.561,hellaswag,helm_classic_240130,[],reasoning +531,babbage_1.3b,0.555,hellaswag,helm_classic_240130,[],reasoning +532,cohere_small_v20220720_410m,0.483,hellaswag,helm_classic_240130,[],reasoning +533,ada_350m,0.435,hellaswag,helm_classic_240130,[],reasoning +534,text_ada_001,0.429,hellaswag,helm_classic_240130,[],reasoning +538,text_davinci_002,0.594,openbookqa,helm_classic_240130,[],knowledge +540,cohere_command_beta_52.4b,0.582,openbookqa,helm_classic_240130,[],knowledge +541,text_davinci_003,0.646,openbookqa,helm_classic_240130,[],knowledge +542,jurassic_2_jumbo_178b,0.558,openbookqa,helm_classic_240130,[],knowledge +544,tnlg_v2_530b,0.562,openbookqa,helm_classic_240130,[],knowledge +547,anthropic_lm_v4_s3_52b,0.558,openbookqa,helm_classic_240130,[],knowledge +549,jurassic_2_grande_17b,0.542,openbookqa,helm_classic_240130,[],knowledge +555,j1_grande_v2_beta_17b,0.56,openbookqa,helm_classic_240130,[],knowledge +557,cohere_command_beta_6.1b,0.55,openbookqa,helm_classic_240130,[],knowledge +558,cohere_xlarge_v20221108_52.4b,0.588,openbookqa,helm_classic_240130,[],knowledge +561,opt_175b,0.586,openbookqa,helm_classic_240130,[],knowledge +565,cohere_xlarge_v20220609_52.4b,0.55,openbookqa,helm_classic_240130,[],knowledge +566,jurassic_2_large_7.5b,0.53,openbookqa,helm_classic_240130,[],knowledge +567,davinci_175b,0.586,openbookqa,helm_classic_240130,[],knowledge +570,j1_jumbo_v1_178b,0.534,openbookqa,helm_classic_240130,[],knowledge +573,opt_66b,0.534,openbookqa,helm_classic_240130,[],knowledge +574,bloom_176b,0.534,openbookqa,helm_classic_240130,[],knowledge +575,j1_grande_v1_17b,0.52,openbookqa,helm_classic_240130,[],knowledge +579,cohere_large_v20220720_13.1b,0.542,openbookqa,helm_classic_240130,[],knowledge +581,text_curie_001,0.514,openbookqa,helm_classic_240130,[],knowledge +582,gpt_neox_20b,0.524,openbookqa,helm_classic_240130,[],knowledge +584,cohere_medium_v20221108_6.1b,0.538,openbookqa,helm_classic_240130,[],knowledge +586,tnlg_v2_6.7b,0.478,openbookqa,helm_classic_240130,[],knowledge +587,j1_large_v1_7.5b,0.514,openbookqa,helm_classic_240130,[],knowledge +588,gpt_j_6b,0.514,openbookqa,helm_classic_240130,[],knowledge +590,curie_6.7b,0.502,openbookqa,helm_classic_240130,[],knowledge +592,cohere_medium_v20220720_6.1b,0.496,openbookqa,helm_classic_240130,[],knowledge +593,text_babbage_001,0.452,openbookqa,helm_classic_240130,[],knowledge +598,babbage_1.3b,0.438,openbookqa,helm_classic_240130,[],knowledge +599,cohere_small_v20220720_410m,0.348,openbookqa,helm_classic_240130,[],knowledge +600,ada_350m,0.38,openbookqa,helm_classic_240130,[],knowledge +601,text_ada_001,0.346,openbookqa,helm_classic_240130,[],knowledge +603,llama_2_70b,0.554,truthfulqa,helm_classic_240130,[],knowledge +604,llama_65b,0.508,truthfulqa,helm_classic_240130,[],knowledge +605,text_davinci_002,0.61,truthfulqa,helm_classic_240130,[],knowledge +606,mistral_v0.1_7b,0.422,truthfulqa,helm_classic_240130,[],knowledge +607,cohere_command_beta_52.4b,0.269,truthfulqa,helm_classic_240130,[],knowledge +608,text_davinci_003,0.593,truthfulqa,helm_classic_240130,[],knowledge +609,jurassic_2_jumbo_178b,0.437,truthfulqa,helm_classic_240130,[],knowledge +610,llama_2_13b,0.33,truthfulqa,helm_classic_240130,[],knowledge +611,tnlg_v2_530b,0.251,truthfulqa,helm_classic_240130,[],knowledge +612,gpt_3.5_turbo_0613,0.339,truthfulqa,helm_classic_240130,[],knowledge +613,llama_30b,0.344,truthfulqa,helm_classic_240130,[],knowledge +614,anthropic_lm_v4_s3_52b,0.368,truthfulqa,helm_classic_240130,[],knowledge +615,gpt_3.5_turbo_0301,0.609,truthfulqa,helm_classic_240130,[],knowledge +616,jurassic_2_grande_17b,0.348,truthfulqa,helm_classic_240130,[],knowledge +617,palmyra_x_43b,0.616,truthfulqa,helm_classic_240130,[],knowledge +618,falcon_40b,0.353,truthfulqa,helm_classic_240130,[],knowledge +619,falcon_instruct_40b,0.384,truthfulqa,helm_classic_240130,[],knowledge +620,mpt_instruct_30b,0.234,truthfulqa,helm_classic_240130,[],knowledge +621,mpt_30b,0.231,truthfulqa,helm_classic_240130,[],knowledge +622,j1_grande_v2_beta_17b,0.306,truthfulqa,helm_classic_240130,[],knowledge +623,vicuna_v1.3_13b,0.385,truthfulqa,helm_classic_240130,[],knowledge +624,cohere_command_beta_6.1b,0.203,truthfulqa,helm_classic_240130,[],knowledge +625,cohere_xlarge_v20221108_52.4b,0.169,truthfulqa,helm_classic_240130,[],knowledge +626,luminous_supreme_70b,0.222,truthfulqa,helm_classic_240130,[],knowledge +627,vicuna_v1.3_7b,0.292,truthfulqa,helm_classic_240130,[],knowledge +628,opt_175b,0.25,truthfulqa,helm_classic_240130,[],knowledge +629,llama_2_7b,0.272,truthfulqa,helm_classic_240130,[],knowledge +630,llama_13b,0.324,truthfulqa,helm_classic_240130,[],knowledge +631,instructpalmyra_30b,0.185,truthfulqa,helm_classic_240130,[],knowledge +632,cohere_xlarge_v20220609_52.4b,0.198,truthfulqa,helm_classic_240130,[],knowledge +633,jurassic_2_large_7.5b,0.245,truthfulqa,helm_classic_240130,[],knowledge +634,davinci_175b,0.194,truthfulqa,helm_classic_240130,[],knowledge +635,llama_7b,0.28,truthfulqa,helm_classic_240130,[],knowledge +636,redpajama_incite_instruct_7b,0.243,truthfulqa,helm_classic_240130,[],knowledge +637,j1_jumbo_v1_178b,0.175,truthfulqa,helm_classic_240130,[],knowledge +638,glm_130b,0.218,truthfulqa,helm_classic_240130,[],knowledge +639,luminous_extended_30b,0.221,truthfulqa,helm_classic_240130,[],knowledge +640,opt_66b,0.201,truthfulqa,helm_classic_240130,[],knowledge +641,bloom_176b,0.205,truthfulqa,helm_classic_240130,[],knowledge +642,j1_grande_v1_17b,0.193,truthfulqa,helm_classic_240130,[],knowledge +643,alpaca_7b,0.243,truthfulqa,helm_classic_240130,[],knowledge +644,falcon_7b,0.234,truthfulqa,helm_classic_240130,[],knowledge +645,redpajama_incite_base_7b,0.205,truthfulqa,helm_classic_240130,[],knowledge +646,cohere_large_v20220720_13.1b,0.181,truthfulqa,helm_classic_240130,[],knowledge +647,redpajama_incite_instruct_v1_3b,0.208,truthfulqa,helm_classic_240130,[],knowledge +648,text_curie_001,0.257,truthfulqa,helm_classic_240130,[],knowledge +649,gpt_neox_20b,0.216,truthfulqa,helm_classic_240130,[],knowledge +650,luminous_base_13b,0.182,truthfulqa,helm_classic_240130,[],knowledge +651,cohere_medium_v20221108_6.1b,0.215,truthfulqa,helm_classic_240130,[],knowledge +652,redpajama_incite_base_v1_3b,0.277,truthfulqa,helm_classic_240130,[],knowledge +653,tnlg_v2_6.7b,0.167,truthfulqa,helm_classic_240130,[],knowledge +654,j1_large_v1_7.5b,0.197,truthfulqa,helm_classic_240130,[],knowledge +655,gpt_j_6b,0.199,truthfulqa,helm_classic_240130,[],knowledge +656,pythia_12b,0.177,truthfulqa,helm_classic_240130,[],knowledge +657,curie_6.7b,0.232,truthfulqa,helm_classic_240130,[],knowledge +658,falcon_instruct_7b,0.213,truthfulqa,helm_classic_240130,[],knowledge +659,cohere_medium_v20220720_6.1b,0.19,truthfulqa,helm_classic_240130,[],knowledge +660,text_babbage_001,0.233,truthfulqa,helm_classic_240130,[],knowledge +661,t0pp_11b,0.377,truthfulqa,helm_classic_240130,[],knowledge +662,pythia_6.9b,0.213,truthfulqa,helm_classic_240130,[],knowledge +663,ul2_20b,0.193,truthfulqa,helm_classic_240130,[],knowledge +664,t5_11b,0.133,truthfulqa,helm_classic_240130,[],knowledge +665,babbage_1.3b,0.188,truthfulqa,helm_classic_240130,[],knowledge +666,cohere_small_v20220720_410m,0.217,truthfulqa,helm_classic_240130,[],knowledge +667,ada_350m,0.215,truthfulqa,helm_classic_240130,[],knowledge +668,text_ada_001,0.232,truthfulqa,helm_classic_240130,[],knowledge +669,yalm_100b,0.202,truthfulqa,helm_classic_240130,[],knowledge +672,text_davinci_002,0.421,ms_marco_regular,helm_classic_240130,[],other +674,cohere_command_beta_52.4b,0.472,ms_marco_regular,helm_classic_240130,[],other +675,text_davinci_003,0.368,ms_marco_regular,helm_classic_240130,[],other +676,jurassic_2_jumbo_178b,0.398,ms_marco_regular,helm_classic_240130,[],other +678,tnlg_v2_530b,0.377,ms_marco_regular,helm_classic_240130,[],other +683,jurassic_2_grande_17b,0.293,ms_marco_regular,helm_classic_240130,[],other +689,j1_grande_v2_beta_17b,0.285,ms_marco_regular,helm_classic_240130,[],other +691,cohere_command_beta_6.1b,0.434,ms_marco_regular,helm_classic_240130,[],other +692,cohere_xlarge_v20221108_52.4b,0.315,ms_marco_regular,helm_classic_240130,[],other +695,opt_175b,0.288,ms_marco_regular,helm_classic_240130,[],other +699,cohere_xlarge_v20220609_52.4b,0.273,ms_marco_regular,helm_classic_240130,[],other +700,jurassic_2_large_7.5b,0.247,ms_marco_regular,helm_classic_240130,[],other +701,davinci_175b,0.211,ms_marco_regular,helm_classic_240130,[],other +704,j1_jumbo_v1_178b,0.21,ms_marco_regular,helm_classic_240130,[],other +707,opt_66b,0.237,ms_marco_regular,helm_classic_240130,[],other +708,bloom_176b,0.236,ms_marco_regular,helm_classic_240130,[],other +709,j1_grande_v1_17b,0.161,ms_marco_regular,helm_classic_240130,[],other +713,cohere_large_v20220720_13.1b,0.19,ms_marco_regular,helm_classic_240130,[],other +715,text_curie_001,0.271,ms_marco_regular,helm_classic_240130,[],other +716,gpt_neox_20b,0.184,ms_marco_regular,helm_classic_240130,[],other +718,cohere_medium_v20221108_6.1b,0.175,ms_marco_regular,helm_classic_240130,[],other +720,tnlg_v2_6.7b,0.158,ms_marco_regular,helm_classic_240130,[],other +721,j1_large_v1_7.5b,0.147,ms_marco_regular,helm_classic_240130,[],other +722,gpt_j_6b,0.152,ms_marco_regular,helm_classic_240130,[],other +724,curie_6.7b,0.162,ms_marco_regular,helm_classic_240130,[],other +726,cohere_medium_v20220720_6.1b,0.152,ms_marco_regular,helm_classic_240130,[],other +727,text_babbage_001,0.208,ms_marco_regular,helm_classic_240130,[],other +732,babbage_1.3b,0.122,ms_marco_regular,helm_classic_240130,[],other +734,ada_350m,0.102,ms_marco_regular,helm_classic_240130,[],other +735,text_ada_001,0.134,ms_marco_regular,helm_classic_240130,[],other +739,text_davinci_002,0.664,ms_marco_trec,helm_classic_240130,[],other +741,cohere_command_beta_52.4b,0.762,ms_marco_trec,helm_classic_240130,[],other +742,text_davinci_003,0.644,ms_marco_trec,helm_classic_240130,[],other +743,jurassic_2_jumbo_178b,0.661,ms_marco_trec,helm_classic_240130,[],other +745,tnlg_v2_530b,0.643,ms_marco_trec,helm_classic_240130,[],other +750,jurassic_2_grande_17b,0.514,ms_marco_trec,helm_classic_240130,[],other +756,j1_grande_v2_beta_17b,0.46,ms_marco_trec,helm_classic_240130,[],other +758,cohere_command_beta_6.1b,0.709,ms_marco_trec,helm_classic_240130,[],other +759,cohere_xlarge_v20221108_52.4b,0.55,ms_marco_trec,helm_classic_240130,[],other +762,opt_175b,0.448,ms_marco_trec,helm_classic_240130,[],other +766,cohere_xlarge_v20220609_52.4b,0.459,ms_marco_trec,helm_classic_240130,[],other +767,jurassic_2_large_7.5b,0.464,ms_marco_trec,helm_classic_240130,[],other +768,davinci_175b,0.378,ms_marco_trec,helm_classic_240130,[],other +771,j1_jumbo_v1_178b,0.363,ms_marco_trec,helm_classic_240130,[],other +774,opt_66b,0.482,ms_marco_trec,helm_classic_240130,[],other +775,bloom_176b,0.386,ms_marco_trec,helm_classic_240130,[],other +776,j1_grande_v1_17b,0.341,ms_marco_trec,helm_classic_240130,[],other +780,cohere_large_v20220720_13.1b,0.33,ms_marco_trec,helm_classic_240130,[],other +782,text_curie_001,0.507,ms_marco_trec,helm_classic_240130,[],other +783,gpt_neox_20b,0.398,ms_marco_trec,helm_classic_240130,[],other +785,cohere_medium_v20221108_6.1b,0.373,ms_marco_trec,helm_classic_240130,[],other +787,tnlg_v2_6.7b,0.332,ms_marco_trec,helm_classic_240130,[],other +788,j1_large_v1_7.5b,0.292,ms_marco_trec,helm_classic_240130,[],other +789,gpt_j_6b,0.345,ms_marco_trec,helm_classic_240130,[],other +791,curie_6.7b,0.3,ms_marco_trec,helm_classic_240130,[],other +793,cohere_medium_v20220720_6.1b,0.374,ms_marco_trec,helm_classic_240130,[],other +794,text_babbage_001,0.449,ms_marco_trec,helm_classic_240130,[],other +799,babbage_1.3b,0.317,ms_marco_trec,helm_classic_240130,[],other +800,cohere_small_v20220720_410m,0.304,ms_marco_trec,helm_classic_240130,[],other +801,ada_350m,0.29,ms_marco_trec,helm_classic_240130,[],other +802,text_ada_001,0.302,ms_marco_trec,helm_classic_240130,[],other +806,text_davinci_002,0.153,cnn/dailymail,helm_classic_240130,[],other +808,cohere_command_beta_52.4b,0.161,cnn/dailymail,helm_classic_240130,[],other +809,text_davinci_003,0.156,cnn/dailymail,helm_classic_240130,[],other +810,jurassic_2_jumbo_178b,0.149,cnn/dailymail,helm_classic_240130,[],other +812,tnlg_v2_530b,0.161,cnn/dailymail,helm_classic_240130,[],other +815,anthropic_lm_v4_s3_52b,0.154,cnn/dailymail,helm_classic_240130,[],other +817,jurassic_2_grande_17b,0.144,cnn/dailymail,helm_classic_240130,[],other +818,palmyra_x_43b,0.049,cnn/dailymail,helm_classic_240130,[],other +823,j1_grande_v2_beta_17b,0.146,cnn/dailymail,helm_classic_240130,[],other +825,cohere_command_beta_6.1b,0.153,cnn/dailymail,helm_classic_240130,[],other +826,cohere_xlarge_v20221108_52.4b,0.153,cnn/dailymail,helm_classic_240130,[],other +827,luminous_supreme_70b,0.15,cnn/dailymail,helm_classic_240130,[],other +829,opt_175b,0.146,cnn/dailymail,helm_classic_240130,[],other +832,instructpalmyra_30b,0.152,cnn/dailymail,helm_classic_240130,[],other +833,cohere_xlarge_v20220609_52.4b,0.144,cnn/dailymail,helm_classic_240130,[],other +834,jurassic_2_large_7.5b,0.136,cnn/dailymail,helm_classic_240130,[],other +835,davinci_175b,0.127,cnn/dailymail,helm_classic_240130,[],other +838,j1_jumbo_v1_178b,0.144,cnn/dailymail,helm_classic_240130,[],other +839,glm_130b,0.154,cnn/dailymail,helm_classic_240130,[],other +840,luminous_extended_30b,0.139,cnn/dailymail,helm_classic_240130,[],other +841,opt_66b,0.136,cnn/dailymail,helm_classic_240130,[],other +842,bloom_176b,0.08,cnn/dailymail,helm_classic_240130,[],other +843,j1_grande_v1_17b,0.143,cnn/dailymail,helm_classic_240130,[],other +847,cohere_large_v20220720_13.1b,0.126,cnn/dailymail,helm_classic_240130,[],other +849,text_curie_001,0.152,cnn/dailymail,helm_classic_240130,[],other +850,gpt_neox_20b,0.123,cnn/dailymail,helm_classic_240130,[],other +851,luminous_base_13b,0.11,cnn/dailymail,helm_classic_240130,[],other +852,cohere_medium_v20221108_6.1b,0.121,cnn/dailymail,helm_classic_240130,[],other +854,tnlg_v2_6.7b,0.146,cnn/dailymail,helm_classic_240130,[],other +855,j1_large_v1_7.5b,0.134,cnn/dailymail,helm_classic_240130,[],other +856,gpt_j_6b,0.131,cnn/dailymail,helm_classic_240130,[],other +858,curie_6.7b,0.113,cnn/dailymail,helm_classic_240130,[],other +860,cohere_medium_v20220720_6.1b,0.077,cnn/dailymail,helm_classic_240130,[],other +861,text_babbage_001,0.151,cnn/dailymail,helm_classic_240130,[],other +862,t0pp_11b,0.122,cnn/dailymail,helm_classic_240130,[],other +864,ul2_20b,0.03,cnn/dailymail,helm_classic_240130,[],other +865,t5_11b,0.043,cnn/dailymail,helm_classic_240130,[],other +866,babbage_1.3b,0.079,cnn/dailymail,helm_classic_240130,[],other +867,cohere_small_v20220720_410m,0.063,cnn/dailymail,helm_classic_240130,[],other +868,ada_350m,0.09,cnn/dailymail,helm_classic_240130,[],other +869,text_ada_001,0.136,cnn/dailymail,helm_classic_240130,[],other +870,yalm_100b,0.017,cnn/dailymail,helm_classic_240130,[],other +873,text_davinci_002,0.144,xsum,helm_classic_240130,[],other +875,cohere_command_beta_52.4b,0.152,xsum,helm_classic_240130,[],other +876,text_davinci_003,0.124,xsum,helm_classic_240130,[],other +877,jurassic_2_jumbo_178b,0.182,xsum,helm_classic_240130,[],other +879,tnlg_v2_530b,0.169,xsum,helm_classic_240130,[],other +882,anthropic_lm_v4_s3_52b,0.134,xsum,helm_classic_240130,[],other +884,jurassic_2_grande_17b,0.167,xsum,helm_classic_240130,[],other +885,palmyra_x_43b,0.149,xsum,helm_classic_240130,[],other +890,j1_grande_v2_beta_17b,0.152,xsum,helm_classic_240130,[],other +892,cohere_command_beta_6.1b,0.122,xsum,helm_classic_240130,[],other +893,cohere_xlarge_v20221108_52.4b,0.153,xsum,helm_classic_240130,[],other +894,luminous_supreme_70b,0.136,xsum,helm_classic_240130,[],other +896,opt_175b,0.155,xsum,helm_classic_240130,[],other +899,instructpalmyra_30b,0.104,xsum,helm_classic_240130,[],other +900,cohere_xlarge_v20220609_52.4b,0.129,xsum,helm_classic_240130,[],other +901,jurassic_2_large_7.5b,0.142,xsum,helm_classic_240130,[],other +902,davinci_175b,0.126,xsum,helm_classic_240130,[],other +905,j1_jumbo_v1_178b,0.129,xsum,helm_classic_240130,[],other +906,glm_130b,0.132,xsum,helm_classic_240130,[],other +907,luminous_extended_30b,0.124,xsum,helm_classic_240130,[],other +908,opt_66b,0.126,xsum,helm_classic_240130,[],other +909,bloom_176b,0.03,xsum,helm_classic_240130,[],other +910,j1_grande_v1_17b,0.122,xsum,helm_classic_240130,[],other +914,cohere_large_v20220720_13.1b,0.108,xsum,helm_classic_240130,[],other +916,text_curie_001,0.076,xsum,helm_classic_240130,[],other +917,gpt_neox_20b,0.102,xsum,helm_classic_240130,[],other +918,luminous_base_13b,0.105,xsum,helm_classic_240130,[],other +919,cohere_medium_v20221108_6.1b,0.099,xsum,helm_classic_240130,[],other +921,tnlg_v2_6.7b,0.11,xsum,helm_classic_240130,[],other +922,j1_large_v1_7.5b,0.102,xsum,helm_classic_240130,[],other +923,gpt_j_6b,0.096,xsum,helm_classic_240130,[],other +925,curie_6.7b,0.091,xsum,helm_classic_240130,[],other +927,cohere_medium_v20220720_6.1b,0.087,xsum,helm_classic_240130,[],other +928,text_babbage_001,0.046,xsum,helm_classic_240130,[],other +929,t0pp_11b,0.09,xsum,helm_classic_240130,[],other +931,ul2_20b,0.058,xsum,helm_classic_240130,[],other +932,t5_11b,0.015,xsum,helm_classic_240130,[],other +933,babbage_1.3b,0.045,xsum,helm_classic_240130,[],other +934,cohere_small_v20220720_410m,0.033,xsum,helm_classic_240130,[],other +935,ada_350m,0.022,xsum,helm_classic_240130,[],other +936,text_ada_001,0.034,xsum,helm_classic_240130,[],other +937,yalm_100b,0.021,xsum,helm_classic_240130,[],other +938,llama_2_70b,0.961,imdb,helm_classic_240130,[],other +939,llama_65b,0.962,imdb,helm_classic_240130,[],other +940,text_davinci_002,0.948,imdb,helm_classic_240130,[],other +941,mistral_v0.1_7b,0.962,imdb,helm_classic_240130,[],other +942,cohere_command_beta_52.4b,0.96,imdb,helm_classic_240130,[],other +943,text_davinci_003,0.848,imdb,helm_classic_240130,[],other +944,jurassic_2_jumbo_178b,0.938,imdb,helm_classic_240130,[],other +945,llama_2_13b,0.962,imdb,helm_classic_240130,[],other +946,tnlg_v2_530b,0.941,imdb,helm_classic_240130,[],other +947,gpt_3.5_turbo_0613,0.943,imdb,helm_classic_240130,[],other +948,llama_30b,0.927,imdb,helm_classic_240130,[],other +949,anthropic_lm_v4_s3_52b,0.934,imdb,helm_classic_240130,[],other +950,gpt_3.5_turbo_0301,0.899,imdb,helm_classic_240130,[],other +951,jurassic_2_grande_17b,0.938,imdb,helm_classic_240130,[],other +952,palmyra_x_43b,0.935,imdb,helm_classic_240130,[],other +953,falcon_40b,0.959,imdb,helm_classic_240130,[],other +954,falcon_instruct_40b,0.959,imdb,helm_classic_240130,[],other +955,mpt_instruct_30b,0.956,imdb,helm_classic_240130,[],other +956,mpt_30b,0.959,imdb,helm_classic_240130,[],other +957,j1_grande_v2_beta_17b,0.957,imdb,helm_classic_240130,[],other +958,vicuna_v1.3_13b,0.762,imdb,helm_classic_240130,[],other +959,cohere_command_beta_6.1b,0.961,imdb,helm_classic_240130,[],other +960,cohere_xlarge_v20221108_52.4b,0.956,imdb,helm_classic_240130,[],other +961,luminous_supreme_70b,0.959,imdb,helm_classic_240130,[],other +962,vicuna_v1.3_7b,0.916,imdb,helm_classic_240130,[],other +963,opt_175b,0.947,imdb,helm_classic_240130,[],other +964,llama_2_7b,0.907,imdb,helm_classic_240130,[],other +965,llama_13b,0.928,imdb,helm_classic_240130,[],other +966,instructpalmyra_30b,0.94,imdb,helm_classic_240130,[],other +967,cohere_xlarge_v20220609_52.4b,0.956,imdb,helm_classic_240130,[],other +968,jurassic_2_large_7.5b,0.956,imdb,helm_classic_240130,[],other +969,davinci_175b,0.933,imdb,helm_classic_240130,[],other +970,llama_7b,0.947,imdb,helm_classic_240130,[],other +971,redpajama_incite_instruct_7b,0.927,imdb,helm_classic_240130,[],other +972,j1_jumbo_v1_178b,0.943,imdb,helm_classic_240130,[],other +973,glm_130b,0.955,imdb,helm_classic_240130,[],other +974,luminous_extended_30b,0.947,imdb,helm_classic_240130,[],other +975,opt_66b,0.917,imdb,helm_classic_240130,[],other +976,bloom_176b,0.945,imdb,helm_classic_240130,[],other +977,j1_grande_v1_17b,0.953,imdb,helm_classic_240130,[],other +978,alpaca_7b,0.738,imdb,helm_classic_240130,[],other +979,falcon_7b,0.836,imdb,helm_classic_240130,[],other +980,redpajama_incite_base_7b,0.752,imdb,helm_classic_240130,[],other +981,cohere_large_v20220720_13.1b,0.933,imdb,helm_classic_240130,[],other +982,redpajama_incite_instruct_v1_3b,0.894,imdb,helm_classic_240130,[],other +983,text_curie_001,0.923,imdb,helm_classic_240130,[],other +984,gpt_neox_20b,0.948,imdb,helm_classic_240130,[],other +985,luminous_base_13b,0.939,imdb,helm_classic_240130,[],other +986,cohere_medium_v20221108_6.1b,0.935,imdb,helm_classic_240130,[],other +987,redpajama_incite_base_v1_3b,0.907,imdb,helm_classic_240130,[],other +988,tnlg_v2_6.7b,0.927,imdb,helm_classic_240130,[],other +989,j1_large_v1_7.5b,0.956,imdb,helm_classic_240130,[],other +990,gpt_j_6b,0.939,imdb,helm_classic_240130,[],other +991,pythia_12b,0.931,imdb,helm_classic_240130,[],other +992,curie_6.7b,0.889,imdb,helm_classic_240130,[],other +993,falcon_instruct_7b,0.852,imdb,helm_classic_240130,[],other +994,cohere_medium_v20220720_6.1b,0.935,imdb,helm_classic_240130,[],other +995,text_babbage_001,0.913,imdb,helm_classic_240130,[],other +996,t0pp_11b,0.207,imdb,helm_classic_240130,[],other +997,pythia_6.9b,0.928,imdb,helm_classic_240130,[],other +998,ul2_20b,0.337,imdb,helm_classic_240130,[],other +999,t5_11b,0.379,imdb,helm_classic_240130,[],other +1000,babbage_1.3b,0.597,imdb,helm_classic_240130,[],other +1001,cohere_small_v20220720_410m,0.578,imdb,helm_classic_240130,[],other +1002,ada_350m,0.849,imdb,helm_classic_240130,[],other +1003,text_ada_001,0.822,imdb,helm_classic_240130,[],other +1004,yalm_100b,0.836,imdb,helm_classic_240130,[],other +1005,llama_2_70b,0.652,civilcomments,helm_classic_240130,[],other +1006,llama_65b,0.655,civilcomments,helm_classic_240130,[],other +1007,text_davinci_002,0.668,civilcomments,helm_classic_240130,[],other +1008,mistral_v0.1_7b,0.624,civilcomments,helm_classic_240130,[],other +1009,cohere_command_beta_52.4b,0.601,civilcomments,helm_classic_240130,[],other +1010,text_davinci_003,0.684,civilcomments,helm_classic_240130,[],other +1011,jurassic_2_jumbo_178b,0.57,civilcomments,helm_classic_240130,[],other +1012,llama_2_13b,0.588,civilcomments,helm_classic_240130,[],other +1013,tnlg_v2_530b,0.601,civilcomments,helm_classic_240130,[],other +1014,gpt_3.5_turbo_0613,0.696,civilcomments,helm_classic_240130,[],other +1015,llama_30b,0.549,civilcomments,helm_classic_240130,[],other +1016,anthropic_lm_v4_s3_52b,0.61,civilcomments,helm_classic_240130,[],other +1017,gpt_3.5_turbo_0301,0.674,civilcomments,helm_classic_240130,[],other +1018,jurassic_2_grande_17b,0.547,civilcomments,helm_classic_240130,[],other +1019,palmyra_x_43b,0.008,civilcomments,helm_classic_240130,[],other +1020,falcon_40b,0.552,civilcomments,helm_classic_240130,[],other +1021,falcon_instruct_40b,0.603,civilcomments,helm_classic_240130,[],other +1022,mpt_instruct_30b,0.573,civilcomments,helm_classic_240130,[],other +1023,mpt_30b,0.599,civilcomments,helm_classic_240130,[],other +1024,j1_grande_v2_beta_17b,0.546,civilcomments,helm_classic_240130,[],other +1025,vicuna_v1.3_13b,0.645,civilcomments,helm_classic_240130,[],other +1026,cohere_command_beta_6.1b,0.54,civilcomments,helm_classic_240130,[],other +1027,cohere_xlarge_v20221108_52.4b,0.524,civilcomments,helm_classic_240130,[],other +1028,luminous_supreme_70b,0.562,civilcomments,helm_classic_240130,[],other +1029,vicuna_v1.3_7b,0.62,civilcomments,helm_classic_240130,[],other +1030,opt_175b,0.505,civilcomments,helm_classic_240130,[],other +1031,llama_2_7b,0.562,civilcomments,helm_classic_240130,[],other +1032,llama_13b,0.6,civilcomments,helm_classic_240130,[],other +1033,instructpalmyra_30b,0.555,civilcomments,helm_classic_240130,[],other +1034,cohere_xlarge_v20220609_52.4b,0.532,civilcomments,helm_classic_240130,[],other +1035,jurassic_2_large_7.5b,0.57,civilcomments,helm_classic_240130,[],other +1036,davinci_175b,0.532,civilcomments,helm_classic_240130,[],other +1037,llama_7b,0.563,civilcomments,helm_classic_240130,[],other +1038,redpajama_incite_instruct_7b,0.664,civilcomments,helm_classic_240130,[],other +1039,j1_jumbo_v1_178b,0.553,civilcomments,helm_classic_240130,[],other +1040,glm_130b,0.5,civilcomments,helm_classic_240130,[],other +1041,luminous_extended_30b,0.524,civilcomments,helm_classic_240130,[],other +1042,opt_66b,0.506,civilcomments,helm_classic_240130,[],other +1043,bloom_176b,0.62,civilcomments,helm_classic_240130,[],other +1044,j1_grande_v1_17b,0.529,civilcomments,helm_classic_240130,[],other +1045,alpaca_7b,0.566,civilcomments,helm_classic_240130,[],other +1046,falcon_7b,0.514,civilcomments,helm_classic_240130,[],other +1047,redpajama_incite_base_7b,0.547,civilcomments,helm_classic_240130,[],other +1048,cohere_large_v20220720_13.1b,0.507,civilcomments,helm_classic_240130,[],other +1049,redpajama_incite_instruct_v1_3b,0.549,civilcomments,helm_classic_240130,[],other +1050,text_curie_001,0.537,civilcomments,helm_classic_240130,[],other +1051,gpt_neox_20b,0.516,civilcomments,helm_classic_240130,[],other +1052,luminous_base_13b,0.544,civilcomments,helm_classic_240130,[],other +1053,cohere_medium_v20221108_6.1b,0.5,civilcomments,helm_classic_240130,[],other +1054,redpajama_incite_base_v1_3b,0.549,civilcomments,helm_classic_240130,[],other +1055,tnlg_v2_6.7b,0.532,civilcomments,helm_classic_240130,[],other +1056,j1_large_v1_7.5b,0.532,civilcomments,helm_classic_240130,[],other +1057,gpt_j_6b,0.52,civilcomments,helm_classic_240130,[],other +1058,pythia_12b,0.531,civilcomments,helm_classic_240130,[],other +1059,curie_6.7b,0.539,civilcomments,helm_classic_240130,[],other +1060,falcon_instruct_7b,0.511,civilcomments,helm_classic_240130,[],other +1061,cohere_medium_v20220720_6.1b,0.504,civilcomments,helm_classic_240130,[],other +1062,text_babbage_001,0.499,civilcomments,helm_classic_240130,[],other +1063,t0pp_11b,0.234,civilcomments,helm_classic_240130,[],other +1064,pythia_6.9b,0.511,civilcomments,helm_classic_240130,[],other +1065,ul2_20b,0.521,civilcomments,helm_classic_240130,[],other +1066,t5_11b,0.509,civilcomments,helm_classic_240130,[],other +1067,babbage_1.3b,0.519,civilcomments,helm_classic_240130,[],other +1068,cohere_small_v20220720_410m,0.501,civilcomments,helm_classic_240130,[],other +1069,ada_350m,0.517,civilcomments,helm_classic_240130,[],other +1070,text_ada_001,0.503,civilcomments,helm_classic_240130,[],other +1071,yalm_100b,0.49,civilcomments,helm_classic_240130,[],other +1072,llama_2_70b,0.727,raft,helm_classic_240130,[],other +1073,llama_65b,0.702,raft,helm_classic_240130,[],other +1074,text_davinci_002,0.733,raft,helm_classic_240130,[],other +1075,mistral_v0.1_7b,0.707,raft,helm_classic_240130,[],other +1076,cohere_command_beta_52.4b,0.667,raft,helm_classic_240130,[],other +1077,text_davinci_003,0.759,raft,helm_classic_240130,[],other +1078,jurassic_2_jumbo_178b,0.746,raft,helm_classic_240130,[],other +1079,llama_2_13b,0.707,raft,helm_classic_240130,[],other +1080,tnlg_v2_530b,0.679,raft,helm_classic_240130,[],other +1081,gpt_3.5_turbo_0613,0.748,raft,helm_classic_240130,[],other +1082,llama_30b,0.752,raft,helm_classic_240130,[],other +1083,anthropic_lm_v4_s3_52b,0.699,raft,helm_classic_240130,[],other +1084,gpt_3.5_turbo_0301,0.768,raft,helm_classic_240130,[],other +1085,jurassic_2_grande_17b,0.712,raft,helm_classic_240130,[],other +1086,palmyra_x_43b,0.701,raft,helm_classic_240130,[],other +1087,falcon_40b,0.661,raft,helm_classic_240130,[],other +1088,falcon_instruct_40b,0.586,raft,helm_classic_240130,[],other +1089,mpt_instruct_30b,0.68,raft,helm_classic_240130,[],other +1090,mpt_30b,0.723,raft,helm_classic_240130,[],other +1091,j1_grande_v2_beta_17b,0.679,raft,helm_classic_240130,[],other +1092,vicuna_v1.3_13b,0.657,raft,helm_classic_240130,[],other +1093,cohere_command_beta_6.1b,0.634,raft,helm_classic_240130,[],other +1094,cohere_xlarge_v20221108_52.4b,0.624,raft,helm_classic_240130,[],other +1095,luminous_supreme_70b,0.653,raft,helm_classic_240130,[],other +1096,vicuna_v1.3_7b,0.693,raft,helm_classic_240130,[],other +1097,opt_175b,0.606,raft,helm_classic_240130,[],other +1098,llama_2_7b,0.643,raft,helm_classic_240130,[],other +1099,llama_13b,0.643,raft,helm_classic_240130,[],other +1100,instructpalmyra_30b,0.652,raft,helm_classic_240130,[],other +1101,cohere_xlarge_v20220609_52.4b,0.633,raft,helm_classic_240130,[],other +1102,jurassic_2_large_7.5b,0.622,raft,helm_classic_240130,[],other +1103,davinci_175b,0.642,raft,helm_classic_240130,[],other +1104,llama_7b,0.573,raft,helm_classic_240130,[],other +1105,redpajama_incite_instruct_7b,0.695,raft,helm_classic_240130,[],other +1106,j1_jumbo_v1_178b,0.681,raft,helm_classic_240130,[],other +1107,glm_130b,0.598,raft,helm_classic_240130,[],other +1108,luminous_extended_30b,0.523,raft,helm_classic_240130,[],other +1109,opt_66b,0.557,raft,helm_classic_240130,[],other +1110,bloom_176b,0.592,raft,helm_classic_240130,[],other +1111,j1_grande_v1_17b,0.658,raft,helm_classic_240130,[],other +1112,alpaca_7b,0.486,raft,helm_classic_240130,[],other +1113,falcon_7b,0.602,raft,helm_classic_240130,[],other +1114,redpajama_incite_base_7b,0.648,raft,helm_classic_240130,[],other +1115,cohere_large_v20220720_13.1b,0.596,raft,helm_classic_240130,[],other +1116,redpajama_incite_instruct_v1_3b,0.661,raft,helm_classic_240130,[],other +1117,text_curie_001,0.489,raft,helm_classic_240130,[],other +1118,gpt_neox_20b,0.505,raft,helm_classic_240130,[],other +1119,luminous_base_13b,0.473,raft,helm_classic_240130,[],other +1120,cohere_medium_v20221108_6.1b,0.591,raft,helm_classic_240130,[],other +1121,redpajama_incite_base_v1_3b,0.502,raft,helm_classic_240130,[],other +1122,tnlg_v2_6.7b,0.525,raft,helm_classic_240130,[],other +1123,j1_large_v1_7.5b,0.545,raft,helm_classic_240130,[],other +1124,gpt_j_6b,0.619,raft,helm_classic_240130,[],other +1125,pythia_12b,0.514,raft,helm_classic_240130,[],other +1126,curie_6.7b,0.49,raft,helm_classic_240130,[],other +1127,falcon_instruct_7b,0.523,raft,helm_classic_240130,[],other +1128,cohere_medium_v20220720_6.1b,0.52,raft,helm_classic_240130,[],other +1129,text_babbage_001,0.509,raft,helm_classic_240130,[],other +1130,t0pp_11b,0.118,raft,helm_classic_240130,[],other +1131,pythia_6.9b,0.502,raft,helm_classic_240130,[],other +1132,ul2_20b,0.404,raft,helm_classic_240130,[],other +1133,t5_11b,0.37,raft,helm_classic_240130,[],other +1134,babbage_1.3b,0.455,raft,helm_classic_240130,[],other +1135,cohere_small_v20220720_410m,0.492,raft,helm_classic_240130,[],other +1136,ada_350m,0.423,raft,helm_classic_240130,[],other +1137,text_ada_001,0.406,raft,helm_classic_240130,[],other +1138,yalm_100b,0.395,raft,helm_classic_240130,[],other +0,phi_1,1.1,grounding,biggen_240612,[],other +1,phi_1_5,2.425,grounding,biggen_240612,[],other +2,phi_2,3.05,grounding,biggen_240612,[],other +3,qwen1.5_0.5b,1.85,grounding,biggen_240612,[],other +4,qwen1.5_1.8b,2.425,grounding,biggen_240612,[],other +5,qwen1.5_4b,2.85,grounding,biggen_240612,[],other +6,gemma_2b,2.163,grounding,biggen_240612,[],other +7,olmo_1b,1.675,grounding,biggen_240612,[],other +8,qwen1.5_0.5b_chat,2.075,grounding,biggen_240612,[],other +9,qwen1.5_1.8b_chat,2.75,grounding,biggen_240612,[],other +10,qwen1.5_4b_chat,2.862,grounding,biggen_240612,[],other +11,phi_3_mini_4k_instruct,3.675,grounding,biggen_240612,[],other +12,phi_3_mini_128k_instruct,3.5,grounding,biggen_240612,[],other +13,gemma_2b_it,2.825,grounding,biggen_240612,[],other +14,gemma_1.1_2b_it,2.812,grounding,biggen_240612,[],other +15,gemma_7b,1.288,grounding,biggen_240612,[],other +16,mistral_7b_v0.1,3.15,grounding,biggen_240612,[],other +17,mistral_7b_v0.2,3.038,grounding,biggen_240612,[],other +18,qwen1.5_7b,2.9,grounding,biggen_240612,[],other +19,yi_6b,2.688,grounding,biggen_240612,[],other +20,llama_2_7b,2.325,grounding,biggen_240612,[],other +21,codellama_7b,1.875,grounding,biggen_240612,[],other +22,meta_llama_3_8b,3.025,grounding,biggen_240612,[],other +23,llemma_7b,2.237,grounding,biggen_240612,[],other +24,olmo_7b,2.075,grounding,biggen_240612,[],other +25,gemma_7b_it,3.212,grounding,biggen_240612,[],other +26,gemma_1.1_7b_it,3.5,grounding,biggen_240612,[],other +27,mistral_7b_instruct_v0.2,3.612,grounding,biggen_240612,[],other +28,qwen1.5_7b_chat,3.575,grounding,biggen_240612,[],other +29,yi_6b_chat,3.062,grounding,biggen_240612,[],other +30,llama_2_7b_chat,3.25,grounding,biggen_240612,[],other +31,codellama_7b_instruct,3.1,grounding,biggen_240612,[],other +32,meta_llama_3_8b_instruct,3.975,grounding,biggen_240612,[],other +33,olmo_7b_sft,2.825,grounding,biggen_240612,[],other +34,olmo_7b_instruct,2.925,grounding,biggen_240612,[],other +35,tulu_2_7b,2.788,grounding,biggen_240612,[],other +36,tulu_2_dpo_7b,3.2,grounding,biggen_240612,[],other +37,codetulu_2_7b,2.862,grounding,biggen_240612,[],other +38,orca_2_7b,2.3,grounding,biggen_240612,[],other +39,openchat_3.5_0106,3.575,grounding,biggen_240612,[],other +40,openhermes_2_mistral_7b,3.388,grounding,biggen_240612,[],other +41,openhermes_2.5_mistral_7b,3.3,grounding,biggen_240612,[],other +42,nous_hermes_2_mistral_7b_dpo,3.525,grounding,biggen_240612,[],other +43,starling_lm_7b_alpha,3.638,grounding,biggen_240612,[],other +44,starling_lm_7b_beta,3.737,grounding,biggen_240612,[],other +45,mistral_orpo_alpha,3.35,grounding,biggen_240612,[],other +46,mistral_orpo_beta,3.487,grounding,biggen_240612,[],other +47,zephyr_7b_beta,3.362,grounding,biggen_240612,[],other +48,qwen1.5_14b,3.413,grounding,biggen_240612,[],other +49,llama_2_13b,2.763,grounding,biggen_240612,[],other +50,codellama_13b,2.2,grounding,biggen_240612,[],other +51,solar_10.7b_v1.0,3.212,grounding,biggen_240612,[],other +52,qwen1.5_14b_chat,3.612,grounding,biggen_240612,[],other +53,solar_10.7b_instruct_v1.0,3.663,grounding,biggen_240612,[],other +54,aya_101,1.25,grounding,biggen_240612,[],other +55,llama_2_13b_chat,3.538,grounding,biggen_240612,[],other +56,codellama_13b_instruct,3.075,grounding,biggen_240612,[],other +57,tulu_2_13b,2.975,grounding,biggen_240612,[],other +58,tulu_2_dpo_13b,3.487,grounding,biggen_240612,[],other +59,codetulu_2_13b,3.1,grounding,biggen_240612,[],other +60,orca_2_13b,2.825,grounding,biggen_240612,[],other +61,yi_34b,3.388,grounding,biggen_240612,[],other +62,llemma_34b,2.812,grounding,biggen_240612,[],other +63,qwen1.5_32b,3.3,grounding,biggen_240612,[],other +64,codellama_34b,2.65,grounding,biggen_240612,[],other +65,mixtral_8x7b_v0.1,3.663,grounding,biggen_240612,[],other +66,yi_34b_chat,3.7,grounding,biggen_240612,[],other +67,nous_hermes_2_yi_34b,3.175,grounding,biggen_240612,[],other +68,codellama_34b_instruct,3.337,grounding,biggen_240612,[],other +69,codetulu_2_34b,3.275,grounding,biggen_240612,[],other +70,qwen1.5_32b_chat,3.712,grounding,biggen_240612,[],other +71,mixtral_8x7b_instruct_v0.1,3.862,grounding,biggen_240612,[],other +72,nous_hermes_2_mixtral_8x7b_sft,3.587,grounding,biggen_240612,[],other +73,nous_hermes_2_mixtral_8x7b_dpo,3.612,grounding,biggen_240612,[],other +74,c4ai_command_r_v01,3.688,grounding,biggen_240612,[],other +75,llama_2_70b,3.288,grounding,biggen_240612,[],other +76,codellama_70b,2.812,grounding,biggen_240612,[],other +77,mixtral_8x22b_v0.1_awq,3.475,grounding,biggen_240612,[],other +78,meta_llama_3_70b,3.263,grounding,biggen_240612,[],other +79,qwen1.5_72b,3.362,grounding,biggen_240612,[],other +80,llama_2_70b_chat,3.612,grounding,biggen_240612,[],other +81,codellama_70b_instruct,2.913,grounding,biggen_240612,[],other +82,tulu_2_dpo_70b,3.7,grounding,biggen_240612,[],other +83,c4ai_command_r_plus_gptq,3.788,grounding,biggen_240612,[],other +84,meta_llama_3_70b_instruct,4.013,grounding,biggen_240612,[],other +85,mixtral_8x22b_instruct_v0.1_awq,3.812,grounding,biggen_240612,[],other +86,zephyr_orpo_141b_a35b_v0.1_awq,3.425,grounding,biggen_240612,[],other +87,qwen1.5_72b_chat,3.938,grounding,biggen_240612,[],other +88,qwen_110b_chat,4.025,grounding,biggen_240612,[],other +89,gpt_3.5_turbo_1106,3.875,grounding,biggen_240612,[],other +90,gpt_3.5_turbo_0125,3.737,grounding,biggen_240612,[],other +91,gpt_4_1106_preview,4.237,grounding,biggen_240612,[],other +92,gpt_4_0125_preview,4.2,grounding,biggen_240612,[],other +93,gpt_4_turbo_2024_04_09,4.188,grounding,biggen_240612,[],other +94,gpt_4o_2024_05_13,4.088,grounding,biggen_240612,[],other +95,mistral_medium_hjpark,3.938,grounding,biggen_240612,[],other +96,mistral_large_hjpark,3.913,grounding,biggen_240612,[],other +97,gemini_1.0_pro,3.6,grounding,biggen_240612,[],other +98,gemini_pro_1.5,3.938,grounding,biggen_240612,[],other +99,gemini_flash_1.5,4.112,grounding,biggen_240612,[],other +100,claude_3_haiku_20240307,4.1,grounding,biggen_240612,[],other +101,claude_3_sonnet_20240229,4.05,grounding,biggen_240612,[],other +102,claude_3_opus_20240229,4.088,grounding,biggen_240612,[],other +103,phi_1,1.0,instruction_following,biggen_240612,[],other +104,phi_1_5,2.77,instruction_following,biggen_240612,[],other +105,phi_2,2.86,instruction_following,biggen_240612,[],other +106,qwen1.5_0.5b,2.06,instruction_following,biggen_240612,[],other +107,qwen1.5_1.8b,2.79,instruction_following,biggen_240612,[],other +108,qwen1.5_4b,2.82,instruction_following,biggen_240612,[],other +109,gemma_2b,2.61,instruction_following,biggen_240612,[],other +110,olmo_1b,1.7,instruction_following,biggen_240612,[],other +111,qwen1.5_0.5b_chat,2.36,instruction_following,biggen_240612,[],other +112,qwen1.5_1.8b_chat,3.09,instruction_following,biggen_240612,[],other +113,qwen1.5_4b_chat,2.99,instruction_following,biggen_240612,[],other +114,phi_3_mini_4k_instruct,3.82,instruction_following,biggen_240612,[],other +115,phi_3_mini_128k_instruct,3.66,instruction_following,biggen_240612,[],other +116,gemma_2b_it,3.12,instruction_following,biggen_240612,[],other +117,gemma_1.1_2b_it,3.21,instruction_following,biggen_240612,[],other +118,gemma_7b,1.53,instruction_following,biggen_240612,[],other +119,mistral_7b_v0.1,3.22,instruction_following,biggen_240612,[],other +120,mistral_7b_v0.2,3.31,instruction_following,biggen_240612,[],other +121,qwen1.5_7b,3.03,instruction_following,biggen_240612,[],other +122,yi_6b,2.77,instruction_following,biggen_240612,[],other +123,llama_2_7b,2.73,instruction_following,biggen_240612,[],other +124,codellama_7b,2.01,instruction_following,biggen_240612,[],other +125,meta_llama_3_8b,2.84,instruction_following,biggen_240612,[],other +126,llemma_7b,2.44,instruction_following,biggen_240612,[],other +127,olmo_7b,2.23,instruction_following,biggen_240612,[],other +128,gemma_7b_it,3.31,instruction_following,biggen_240612,[],other +129,gemma_1.1_7b_it,3.47,instruction_following,biggen_240612,[],other +130,mistral_7b_instruct_v0.2,3.74,instruction_following,biggen_240612,[],other +131,qwen1.5_7b_chat,3.83,instruction_following,biggen_240612,[],other +132,yi_6b_chat,3.5,instruction_following,biggen_240612,[],other +133,llama_2_7b_chat,3.55,instruction_following,biggen_240612,[],other +134,codellama_7b_instruct,3.26,instruction_following,biggen_240612,[],other +135,meta_llama_3_8b_instruct,3.75,instruction_following,biggen_240612,[],other +136,olmo_7b_sft,3.18,instruction_following,biggen_240612,[],other +137,olmo_7b_instruct,3.29,instruction_following,biggen_240612,[],other +138,tulu_2_7b,3.35,instruction_following,biggen_240612,[],other +139,tulu_2_dpo_7b,3.64,instruction_following,biggen_240612,[],other +140,codetulu_2_7b,3.11,instruction_following,biggen_240612,[],other +141,orca_2_7b,2.23,instruction_following,biggen_240612,[],other +142,openchat_3.5_0106,3.73,instruction_following,biggen_240612,[],other +143,openhermes_2_mistral_7b,3.53,instruction_following,biggen_240612,[],other +144,openhermes_2.5_mistral_7b,3.34,instruction_following,biggen_240612,[],other +145,nous_hermes_2_mistral_7b_dpo,3.61,instruction_following,biggen_240612,[],other +146,starling_lm_7b_alpha,3.62,instruction_following,biggen_240612,[],other +147,starling_lm_7b_beta,3.82,instruction_following,biggen_240612,[],other +148,mistral_orpo_alpha,3.53,instruction_following,biggen_240612,[],other +149,mistral_orpo_beta,3.76,instruction_following,biggen_240612,[],other +150,zephyr_7b_beta,3.69,instruction_following,biggen_240612,[],other +151,qwen1.5_14b,3.41,instruction_following,biggen_240612,[],other +152,llama_2_13b,2.99,instruction_following,biggen_240612,[],other +153,codellama_13b,2.08,instruction_following,biggen_240612,[],other +154,solar_10.7b_v1.0,3.53,instruction_following,biggen_240612,[],other +155,qwen1.5_14b_chat,3.84,instruction_following,biggen_240612,[],other +156,solar_10.7b_instruct_v1.0,3.73,instruction_following,biggen_240612,[],other +157,aya_101,1.33,instruction_following,biggen_240612,[],other +158,llama_2_13b_chat,3.72,instruction_following,biggen_240612,[],other +159,codellama_13b_instruct,3.13,instruction_following,biggen_240612,[],other +160,tulu_2_13b,3.4,instruction_following,biggen_240612,[],other +161,tulu_2_dpo_13b,3.65,instruction_following,biggen_240612,[],other +162,codetulu_2_13b,3.33,instruction_following,biggen_240612,[],other +163,orca_2_13b,2.45,instruction_following,biggen_240612,[],other +164,yi_34b,3.47,instruction_following,biggen_240612,[],other +165,llemma_34b,2.74,instruction_following,biggen_240612,[],other +166,qwen1.5_32b,3.63,instruction_following,biggen_240612,[],other +167,codellama_34b,2.49,instruction_following,biggen_240612,[],other +168,mixtral_8x7b_v0.1,3.45,instruction_following,biggen_240612,[],other +169,yi_34b_chat,3.79,instruction_following,biggen_240612,[],other +170,nous_hermes_2_yi_34b,3.65,instruction_following,biggen_240612,[],other +171,codellama_34b_instruct,3.5,instruction_following,biggen_240612,[],other +172,codetulu_2_34b,3.44,instruction_following,biggen_240612,[],other +173,qwen1.5_32b_chat,3.92,instruction_following,biggen_240612,[],other +174,mixtral_8x7b_instruct_v0.1,3.95,instruction_following,biggen_240612,[],other +175,nous_hermes_2_mixtral_8x7b_sft,3.7,instruction_following,biggen_240612,[],other +176,nous_hermes_2_mixtral_8x7b_dpo,3.83,instruction_following,biggen_240612,[],other +177,c4ai_command_r_v01,3.67,instruction_following,biggen_240612,[],other +178,llama_2_70b,3.4,instruction_following,biggen_240612,[],other +179,codellama_70b,2.46,instruction_following,biggen_240612,[],other +180,mixtral_8x22b_v0.1_awq,3.59,instruction_following,biggen_240612,[],other +181,meta_llama_3_70b,3.26,instruction_following,biggen_240612,[],other +182,qwen1.5_72b,3.5,instruction_following,biggen_240612,[],other +183,llama_2_70b_chat,3.71,instruction_following,biggen_240612,[],other +184,codellama_70b_instruct,2.53,instruction_following,biggen_240612,[],other +185,tulu_2_dpo_70b,3.79,instruction_following,biggen_240612,[],other +186,c4ai_command_r_plus_gptq,3.89,instruction_following,biggen_240612,[],other +187,meta_llama_3_70b_instruct,4.02,instruction_following,biggen_240612,[],other +188,mixtral_8x22b_instruct_v0.1_awq,3.91,instruction_following,biggen_240612,[],other +189,zephyr_orpo_141b_a35b_v0.1_awq,3.57,instruction_following,biggen_240612,[],other +190,qwen1.5_72b_chat,4.0,instruction_following,biggen_240612,[],other +191,qwen_110b_chat,3.89,instruction_following,biggen_240612,[],other +192,gpt_3.5_turbo_1106,3.73,instruction_following,biggen_240612,[],other +193,gpt_3.5_turbo_0125,3.74,instruction_following,biggen_240612,[],other +194,gpt_4_1106_preview,4.23,instruction_following,biggen_240612,[],other +195,gpt_4_0125_preview,4.12,instruction_following,biggen_240612,[],other +196,gpt_4_turbo_2024_04_09,4.04,instruction_following,biggen_240612,[],other +197,gpt_4o_2024_05_13,4.1,instruction_following,biggen_240612,[],other +198,mistral_medium_hjpark,3.88,instruction_following,biggen_240612,[],other +199,mistral_large_hjpark,3.82,instruction_following,biggen_240612,[],other +200,gemini_1.0_pro,3.67,instruction_following,biggen_240612,[],other +201,gemini_pro_1.5,3.91,instruction_following,biggen_240612,[],other +202,gemini_flash_1.5,3.78,instruction_following,biggen_240612,[],other +203,claude_3_haiku_20240307,4.0,instruction_following,biggen_240612,[],other +204,claude_3_sonnet_20240229,3.84,instruction_following,biggen_240612,[],other +205,claude_3_opus_20240229,4.0,instruction_following,biggen_240612,[],other +206,phi_1,1.0,planning,biggen_240612,[],other +207,phi_1_5,2.314,planning,biggen_240612,[],other +208,phi_2,2.6,planning,biggen_240612,[],other +209,qwen1.5_0.5b,1.471,planning,biggen_240612,[],other +210,qwen1.5_1.8b,2.214,planning,biggen_240612,[],other +211,qwen1.5_4b,2.557,planning,biggen_240612,[],other +212,gemma_2b,2.129,planning,biggen_240612,[],other +213,olmo_1b,1.343,planning,biggen_240612,[],other +214,qwen1.5_0.5b_chat,1.957,planning,biggen_240612,[],other +215,qwen1.5_1.8b_chat,2.629,planning,biggen_240612,[],other +216,qwen1.5_4b_chat,2.914,planning,biggen_240612,[],other +217,phi_3_mini_4k_instruct,3.486,planning,biggen_240612,[],other +218,phi_3_mini_128k_instruct,3.5,planning,biggen_240612,[],other +219,gemma_2b_it,3.0,planning,biggen_240612,[],other +220,gemma_1.1_2b_it,3.0,planning,biggen_240612,[],other +221,gemma_7b,1.171,planning,biggen_240612,[],other +222,mistral_7b_v0.1,3.029,planning,biggen_240612,[],other +223,mistral_7b_v0.2,2.871,planning,biggen_240612,[],other +224,qwen1.5_7b,2.814,planning,biggen_240612,[],other +225,yi_6b,2.271,planning,biggen_240612,[],other +226,llama_2_7b,2.4,planning,biggen_240612,[],other +227,codellama_7b,1.586,planning,biggen_240612,[],other +228,meta_llama_3_8b,2.414,planning,biggen_240612,[],other +229,llemma_7b,1.971,planning,biggen_240612,[],other +230,olmo_7b,1.757,planning,biggen_240612,[],other +231,gemma_7b_it,2.857,planning,biggen_240612,[],other +232,gemma_1.1_7b_it,3.143,planning,biggen_240612,[],other +233,mistral_7b_instruct_v0.2,3.7,planning,biggen_240612,[],other +234,qwen1.5_7b_chat,3.471,planning,biggen_240612,[],other +235,yi_6b_chat,3.171,planning,biggen_240612,[],other +236,llama_2_7b_chat,3.286,planning,biggen_240612,[],other +237,codellama_7b_instruct,2.914,planning,biggen_240612,[],other +238,meta_llama_3_8b_instruct,3.714,planning,biggen_240612,[],other +239,olmo_7b_sft,2.843,planning,biggen_240612,[],other +240,olmo_7b_instruct,2.986,planning,biggen_240612,[],other +241,tulu_2_7b,3.129,planning,biggen_240612,[],other +242,tulu_2_dpo_7b,3.229,planning,biggen_240612,[],other +243,codetulu_2_7b,2.929,planning,biggen_240612,[],other +244,orca_2_7b,1.3,planning,biggen_240612,[],other +245,openchat_3.5_0106,3.643,planning,biggen_240612,[],other +246,openhermes_2_mistral_7b,3.529,planning,biggen_240612,[],other +247,openhermes_2.5_mistral_7b,3.457,planning,biggen_240612,[],other +248,nous_hermes_2_mistral_7b_dpo,3.514,planning,biggen_240612,[],other +249,starling_lm_7b_alpha,3.557,planning,biggen_240612,[],other +250,starling_lm_7b_beta,3.671,planning,biggen_240612,[],other +251,mistral_orpo_alpha,3.329,planning,biggen_240612,[],other +252,mistral_orpo_beta,3.3,planning,biggen_240612,[],other +253,zephyr_7b_beta,3.571,planning,biggen_240612,[],other +254,qwen1.5_14b,2.9,planning,biggen_240612,[],other +255,llama_2_13b,2.629,planning,biggen_240612,[],other +256,codellama_13b,1.814,planning,biggen_240612,[],other +257,solar_10.7b_v1.0,3.057,planning,biggen_240612,[],other +258,qwen1.5_14b_chat,3.657,planning,biggen_240612,[],other +259,solar_10.7b_instruct_v1.0,3.614,planning,biggen_240612,[],other +260,aya_101,1.357,planning,biggen_240612,[],other +261,llama_2_13b_chat,3.4,planning,biggen_240612,[],other +262,codellama_13b_instruct,3.086,planning,biggen_240612,[],other +263,tulu_2_13b,3.371,planning,biggen_240612,[],other +264,tulu_2_dpo_13b,3.371,planning,biggen_240612,[],other +265,codetulu_2_13b,3.1,planning,biggen_240612,[],other +266,orca_2_13b,1.6,planning,biggen_240612,[],other +267,yi_34b,3.243,planning,biggen_240612,[],other +268,llemma_34b,2.529,planning,biggen_240612,[],other +269,qwen1.5_32b,3.229,planning,biggen_240612,[],other +270,codellama_34b,2.257,planning,biggen_240612,[],other +271,mixtral_8x7b_v0.1,3.286,planning,biggen_240612,[],other +272,yi_34b_chat,3.729,planning,biggen_240612,[],other +273,nous_hermes_2_yi_34b,3.543,planning,biggen_240612,[],other +274,codellama_34b_instruct,3.171,planning,biggen_240612,[],other +275,codetulu_2_34b,3.5,planning,biggen_240612,[],other +276,qwen1.5_32b_chat,3.829,planning,biggen_240612,[],other +277,mixtral_8x7b_instruct_v0.1,3.457,planning,biggen_240612,[],other +278,nous_hermes_2_mixtral_8x7b_sft,3.586,planning,biggen_240612,[],other +279,nous_hermes_2_mixtral_8x7b_dpo,3.657,planning,biggen_240612,[],other +280,c4ai_command_r_v01,3.643,planning,biggen_240612,[],other +281,llama_2_70b,3.2,planning,biggen_240612,[],other +282,codellama_70b,2.357,planning,biggen_240612,[],other +283,mixtral_8x22b_v0.1_awq,3.457,planning,biggen_240612,[],other +284,meta_llama_3_70b,2.8,planning,biggen_240612,[],other +285,qwen1.5_72b,3.186,planning,biggen_240612,[],other +286,llama_2_70b_chat,3.671,planning,biggen_240612,[],other +287,codellama_70b_instruct,2.5,planning,biggen_240612,[],other +288,tulu_2_dpo_70b,3.886,planning,biggen_240612,[],other +289,c4ai_command_r_plus_gptq,3.914,planning,biggen_240612,[],other +290,meta_llama_3_70b_instruct,3.929,planning,biggen_240612,[],other +291,mixtral_8x22b_instruct_v0.1_awq,3.729,planning,biggen_240612,[],other +292,zephyr_orpo_141b_a35b_v0.1_awq,3.8,planning,biggen_240612,[],other +293,qwen1.5_72b_chat,3.814,planning,biggen_240612,[],other +294,qwen_110b_chat,3.957,planning,biggen_240612,[],other +295,gpt_3.5_turbo_1106,3.871,planning,biggen_240612,[],other +296,gpt_3.5_turbo_0125,3.871,planning,biggen_240612,[],other +297,gpt_4_1106_preview,4.157,planning,biggen_240612,[],other +298,gpt_4_0125_preview,4.243,planning,biggen_240612,[],other +299,gpt_4_turbo_2024_04_09,4.029,planning,biggen_240612,[],other +300,gpt_4o_2024_05_13,4.086,planning,biggen_240612,[],other +301,mistral_medium_hjpark,3.914,planning,biggen_240612,[],other +302,mistral_large_hjpark,3.9,planning,biggen_240612,[],other +303,gemini_1.0_pro,3.714,planning,biggen_240612,[],other +304,gemini_pro_1.5,3.929,planning,biggen_240612,[],other +305,gemini_flash_1.5,3.771,planning,biggen_240612,[],other +306,claude_3_haiku_20240307,4.043,planning,biggen_240612,[],other +307,claude_3_sonnet_20240229,4.057,planning,biggen_240612,[],other +308,claude_3_opus_20240229,4.1,planning,biggen_240612,[],other +309,phi_1,1.0,reasoning,biggen_240612,[],reasoning +310,phi_1_5,2.13,reasoning,biggen_240612,[],reasoning +311,phi_2,2.7,reasoning,biggen_240612,[],reasoning +312,qwen1.5_0.5b,1.5,reasoning,biggen_240612,[],reasoning +313,qwen1.5_1.8b,1.83,reasoning,biggen_240612,[],reasoning +314,qwen1.5_4b,2.3,reasoning,biggen_240612,[],reasoning +315,gemma_2b,1.99,reasoning,biggen_240612,[],reasoning +316,olmo_1b,1.33,reasoning,biggen_240612,[],reasoning +317,qwen1.5_0.5b_chat,1.68,reasoning,biggen_240612,[],reasoning +318,qwen1.5_1.8b_chat,2.28,reasoning,biggen_240612,[],reasoning +319,qwen1.5_4b_chat,2.69,reasoning,biggen_240612,[],reasoning +320,phi_3_mini_4k_instruct,3.59,reasoning,biggen_240612,[],reasoning +321,phi_3_mini_128k_instruct,3.61,reasoning,biggen_240612,[],reasoning +322,gemma_2b_it,2.39,reasoning,biggen_240612,[],reasoning +323,gemma_1.1_2b_it,2.49,reasoning,biggen_240612,[],reasoning +324,gemma_7b,1.28,reasoning,biggen_240612,[],reasoning +325,mistral_7b_v0.1,2.75,reasoning,biggen_240612,[],reasoning +326,mistral_7b_v0.2,2.65,reasoning,biggen_240612,[],reasoning +327,qwen1.5_7b,2.37,reasoning,biggen_240612,[],reasoning +328,yi_6b,2.25,reasoning,biggen_240612,[],reasoning +329,llama_2_7b,2.03,reasoning,biggen_240612,[],reasoning +330,codellama_7b,1.57,reasoning,biggen_240612,[],reasoning +331,meta_llama_3_8b,2.32,reasoning,biggen_240612,[],reasoning +332,llemma_7b,2.07,reasoning,biggen_240612,[],reasoning +333,olmo_7b,1.76,reasoning,biggen_240612,[],reasoning +334,gemma_7b_it,2.88,reasoning,biggen_240612,[],reasoning +335,gemma_1.1_7b_it,3.05,reasoning,biggen_240612,[],reasoning +336,mistral_7b_instruct_v0.2,3.06,reasoning,biggen_240612,[],reasoning +337,qwen1.5_7b_chat,3.02,reasoning,biggen_240612,[],reasoning +338,yi_6b_chat,2.61,reasoning,biggen_240612,[],reasoning +339,llama_2_7b_chat,2.72,reasoning,biggen_240612,[],reasoning +340,codellama_7b_instruct,2.52,reasoning,biggen_240612,[],reasoning +341,meta_llama_3_8b_instruct,3.32,reasoning,biggen_240612,[],reasoning +342,olmo_7b_sft,2.37,reasoning,biggen_240612,[],reasoning +343,olmo_7b_instruct,2.38,reasoning,biggen_240612,[],reasoning +344,tulu_2_7b,2.57,reasoning,biggen_240612,[],reasoning +345,tulu_2_dpo_7b,2.68,reasoning,biggen_240612,[],reasoning +346,codetulu_2_7b,2.56,reasoning,biggen_240612,[],reasoning +347,orca_2_7b,1.75,reasoning,biggen_240612,[],reasoning +348,openchat_3.5_0106,3.23,reasoning,biggen_240612,[],reasoning +349,openhermes_2_mistral_7b,3.09,reasoning,biggen_240612,[],reasoning +350,openhermes_2.5_mistral_7b,3.12,reasoning,biggen_240612,[],reasoning +351,nous_hermes_2_mistral_7b_dpo,3.11,reasoning,biggen_240612,[],reasoning +352,starling_lm_7b_alpha,3.24,reasoning,biggen_240612,[],reasoning +353,starling_lm_7b_beta,3.46,reasoning,biggen_240612,[],reasoning +354,mistral_orpo_alpha,2.93,reasoning,biggen_240612,[],reasoning +355,mistral_orpo_beta,2.96,reasoning,biggen_240612,[],reasoning +356,zephyr_7b_beta,3.08,reasoning,biggen_240612,[],reasoning +357,qwen1.5_14b,2.77,reasoning,biggen_240612,[],reasoning +358,llama_2_13b,2.17,reasoning,biggen_240612,[],reasoning +359,codellama_13b,1.89,reasoning,biggen_240612,[],reasoning +360,solar_10.7b_v1.0,2.72,reasoning,biggen_240612,[],reasoning +361,qwen1.5_14b_chat,3.38,reasoning,biggen_240612,[],reasoning +362,solar_10.7b_instruct_v1.0,3.23,reasoning,biggen_240612,[],reasoning +363,aya_101,1.34,reasoning,biggen_240612,[],reasoning +364,llama_2_13b_chat,2.61,reasoning,biggen_240612,[],reasoning +365,codellama_13b_instruct,2.78,reasoning,biggen_240612,[],reasoning +366,tulu_2_13b,2.7,reasoning,biggen_240612,[],reasoning +367,tulu_2_dpo_13b,2.8,reasoning,biggen_240612,[],reasoning +368,codetulu_2_13b,2.62,reasoning,biggen_240612,[],reasoning +369,orca_2_13b,2.22,reasoning,biggen_240612,[],reasoning +370,yi_34b,3.06,reasoning,biggen_240612,[],reasoning +371,llemma_34b,2.56,reasoning,biggen_240612,[],reasoning +372,qwen1.5_32b,3.07,reasoning,biggen_240612,[],reasoning +373,codellama_34b,2.0,reasoning,biggen_240612,[],reasoning +374,mixtral_8x7b_v0.1,3.13,reasoning,biggen_240612,[],reasoning +375,yi_34b_chat,3.25,reasoning,biggen_240612,[],reasoning +376,nous_hermes_2_yi_34b,3.3,reasoning,biggen_240612,[],reasoning +377,codellama_34b_instruct,2.95,reasoning,biggen_240612,[],reasoning +378,codetulu_2_34b,2.97,reasoning,biggen_240612,[],reasoning +379,qwen1.5_32b_chat,3.47,reasoning,biggen_240612,[],reasoning +380,mixtral_8x7b_instruct_v0.1,3.58,reasoning,biggen_240612,[],reasoning +381,nous_hermes_2_mixtral_8x7b_sft,3.29,reasoning,biggen_240612,[],reasoning +382,nous_hermes_2_mixtral_8x7b_dpo,3.42,reasoning,biggen_240612,[],reasoning +383,c4ai_command_r_v01,3.25,reasoning,biggen_240612,[],reasoning +384,llama_2_70b,2.86,reasoning,biggen_240612,[],reasoning +385,codellama_70b,2.35,reasoning,biggen_240612,[],reasoning +386,mixtral_8x22b_v0.1_awq,3.48,reasoning,biggen_240612,[],reasoning +387,meta_llama_3_70b,2.88,reasoning,biggen_240612,[],reasoning +388,qwen1.5_72b,3.2,reasoning,biggen_240612,[],reasoning +389,llama_2_70b_chat,3.1,reasoning,biggen_240612,[],reasoning +390,codellama_70b_instruct,2.56,reasoning,biggen_240612,[],reasoning +391,tulu_2_dpo_70b,3.12,reasoning,biggen_240612,[],reasoning +392,c4ai_command_r_plus_gptq,3.48,reasoning,biggen_240612,[],reasoning +393,meta_llama_3_70b_instruct,3.77,reasoning,biggen_240612,[],reasoning +394,mixtral_8x22b_instruct_v0.1_awq,3.76,reasoning,biggen_240612,[],reasoning +395,zephyr_orpo_141b_a35b_v0.1_awq,3.42,reasoning,biggen_240612,[],reasoning +396,qwen1.5_72b_chat,3.65,reasoning,biggen_240612,[],reasoning +397,qwen_110b_chat,3.8,reasoning,biggen_240612,[],reasoning +398,gpt_3.5_turbo_1106,3.37,reasoning,biggen_240612,[],reasoning +399,gpt_3.5_turbo_0125,3.58,reasoning,biggen_240612,[],reasoning +400,gpt_4_1106_preview,4.15,reasoning,biggen_240612,[],reasoning +401,gpt_4_0125_preview,4.2,reasoning,biggen_240612,[],reasoning +402,gpt_4_turbo_2024_04_09,4.13,reasoning,biggen_240612,[],reasoning +403,gpt_4o_2024_05_13,4.03,reasoning,biggen_240612,[],reasoning +404,mistral_medium_hjpark,3.89,reasoning,biggen_240612,[],reasoning +405,mistral_large_hjpark,3.78,reasoning,biggen_240612,[],reasoning +406,gemini_1.0_pro,3.61,reasoning,biggen_240612,[],reasoning +407,gemini_pro_1.5,3.89,reasoning,biggen_240612,[],reasoning +408,gemini_flash_1.5,3.85,reasoning,biggen_240612,[],reasoning +409,claude_3_haiku_20240307,3.55,reasoning,biggen_240612,[],reasoning +410,claude_3_sonnet_20240229,3.82,reasoning,biggen_240612,[],reasoning +411,claude_3_opus_20240229,3.9,reasoning,biggen_240612,[],reasoning +412,phi_1,1.303,refinement,biggen_240612,[],other +413,phi_1_5,2.329,refinement,biggen_240612,[],other +414,phi_2,2.789,refinement,biggen_240612,[],other +415,qwen1.5_0.5b,1.934,refinement,biggen_240612,[],other +416,qwen1.5_1.8b,2.408,refinement,biggen_240612,[],other +417,qwen1.5_4b,2.447,refinement,biggen_240612,[],other +418,gemma_2b,1.934,refinement,biggen_240612,[],other +419,olmo_1b,1.737,refinement,biggen_240612,[],other +420,qwen1.5_0.5b_chat,1.776,refinement,biggen_240612,[],other +421,qwen1.5_1.8b_chat,2.553,refinement,biggen_240612,[],other +422,qwen1.5_4b_chat,2.579,refinement,biggen_240612,[],other +423,phi_3_mini_4k_instruct,3.763,refinement,biggen_240612,[],other +424,phi_3_mini_128k_instruct,3.539,refinement,biggen_240612,[],other +425,gemma_2b_it,2.724,refinement,biggen_240612,[],other +426,gemma_1.1_2b_it,2.947,refinement,biggen_240612,[],other +427,gemma_7b,1.474,refinement,biggen_240612,[],other +428,mistral_7b_v0.1,2.566,refinement,biggen_240612,[],other +429,mistral_7b_v0.2,2.579,refinement,biggen_240612,[],other +430,qwen1.5_7b,2.579,refinement,biggen_240612,[],other +431,yi_6b,2.434,refinement,biggen_240612,[],other +432,llama_2_7b,2.092,refinement,biggen_240612,[],other +433,codellama_7b,1.776,refinement,biggen_240612,[],other +434,meta_llama_3_8b,2.829,refinement,biggen_240612,[],other +435,llemma_7b,2.158,refinement,biggen_240612,[],other +436,olmo_7b,1.868,refinement,biggen_240612,[],other +437,gemma_7b_it,3.039,refinement,biggen_240612,[],other +438,gemma_1.1_7b_it,3.158,refinement,biggen_240612,[],other +439,mistral_7b_instruct_v0.2,3.355,refinement,biggen_240612,[],other +440,qwen1.5_7b_chat,3.132,refinement,biggen_240612,[],other +441,yi_6b_chat,2.803,refinement,biggen_240612,[],other +442,llama_2_7b_chat,2.987,refinement,biggen_240612,[],other +443,codellama_7b_instruct,2.671,refinement,biggen_240612,[],other +444,meta_llama_3_8b_instruct,3.408,refinement,biggen_240612,[],other +445,olmo_7b_sft,2.224,refinement,biggen_240612,[],other +446,olmo_7b_instruct,2.539,refinement,biggen_240612,[],other +447,tulu_2_7b,2.789,refinement,biggen_240612,[],other +448,tulu_2_dpo_7b,2.868,refinement,biggen_240612,[],other +449,codetulu_2_7b,2.763,refinement,biggen_240612,[],other +450,orca_2_7b,2.066,refinement,biggen_240612,[],other +451,openchat_3.5_0106,3.408,refinement,biggen_240612,[],other +452,openhermes_2_mistral_7b,3.079,refinement,biggen_240612,[],other +453,openhermes_2.5_mistral_7b,2.855,refinement,biggen_240612,[],other +454,nous_hermes_2_mistral_7b_dpo,3.158,refinement,biggen_240612,[],other +455,starling_lm_7b_alpha,3.092,refinement,biggen_240612,[],other +456,starling_lm_7b_beta,3.421,refinement,biggen_240612,[],other +457,mistral_orpo_alpha,3.184,refinement,biggen_240612,[],other +458,mistral_orpo_beta,2.987,refinement,biggen_240612,[],other +459,zephyr_7b_beta,3.158,refinement,biggen_240612,[],other +460,qwen1.5_14b,2.974,refinement,biggen_240612,[],other +461,llama_2_13b,2.382,refinement,biggen_240612,[],other +462,codellama_13b,1.697,refinement,biggen_240612,[],other +463,solar_10.7b_v1.0,3.092,refinement,biggen_240612,[],other +464,qwen1.5_14b_chat,3.25,refinement,biggen_240612,[],other +465,solar_10.7b_instruct_v1.0,3.289,refinement,biggen_240612,[],other +466,aya_101,1.882,refinement,biggen_240612,[],other +467,llama_2_13b_chat,3.066,refinement,biggen_240612,[],other +468,codellama_13b_instruct,2.526,refinement,biggen_240612,[],other +469,tulu_2_13b,2.803,refinement,biggen_240612,[],other +470,tulu_2_dpo_13b,3.118,refinement,biggen_240612,[],other +471,codetulu_2_13b,2.961,refinement,biggen_240612,[],other +472,orca_2_13b,2.092,refinement,biggen_240612,[],other +473,yi_34b,2.921,refinement,biggen_240612,[],other +474,llemma_34b,2.566,refinement,biggen_240612,[],other +475,qwen1.5_32b,2.921,refinement,biggen_240612,[],other +476,codellama_34b,2.289,refinement,biggen_240612,[],other +477,mixtral_8x7b_v0.1,3.013,refinement,biggen_240612,[],other +478,yi_34b_chat,3.342,refinement,biggen_240612,[],other +479,nous_hermes_2_yi_34b,3.342,refinement,biggen_240612,[],other +480,codellama_34b_instruct,2.776,refinement,biggen_240612,[],other +481,codetulu_2_34b,3.039,refinement,biggen_240612,[],other +482,qwen1.5_32b_chat,3.145,refinement,biggen_240612,[],other +483,mixtral_8x7b_instruct_v0.1,3.329,refinement,biggen_240612,[],other +484,nous_hermes_2_mixtral_8x7b_sft,3.039,refinement,biggen_240612,[],other +485,nous_hermes_2_mixtral_8x7b_dpo,3.303,refinement,biggen_240612,[],other +486,c4ai_command_r_v01,3.316,refinement,biggen_240612,[],other +487,llama_2_70b,2.895,refinement,biggen_240612,[],other +488,codellama_70b,2.408,refinement,biggen_240612,[],other +489,mixtral_8x22b_v0.1_awq,3.237,refinement,biggen_240612,[],other +490,meta_llama_3_70b,3.066,refinement,biggen_240612,[],other +491,qwen1.5_72b,3.013,refinement,biggen_240612,[],other +492,llama_2_70b_chat,3.303,refinement,biggen_240612,[],other +493,codellama_70b_instruct,2.25,refinement,biggen_240612,[],other +494,tulu_2_dpo_70b,3.382,refinement,biggen_240612,[],other +495,c4ai_command_r_plus_gptq,3.447,refinement,biggen_240612,[],other +496,meta_llama_3_70b_instruct,3.776,refinement,biggen_240612,[],other +497,mixtral_8x22b_instruct_v0.1_awq,3.684,refinement,biggen_240612,[],other +498,zephyr_orpo_141b_a35b_v0.1_awq,3.303,refinement,biggen_240612,[],other +499,qwen1.5_72b_chat,3.868,refinement,biggen_240612,[],other +500,qwen_110b_chat,3.842,refinement,biggen_240612,[],other +501,gpt_3.5_turbo_1106,3.105,refinement,biggen_240612,[],other +502,gpt_3.5_turbo_0125,3.539,refinement,biggen_240612,[],other +503,gpt_4_1106_preview,4.263,refinement,biggen_240612,[],other +504,gpt_4_0125_preview,3.961,refinement,biggen_240612,[],other +505,gpt_4_turbo_2024_04_09,4.0,refinement,biggen_240612,[],other +506,gpt_4o_2024_05_13,3.855,refinement,biggen_240612,[],other +507,mistral_medium_hjpark,3.632,refinement,biggen_240612,[],other +508,mistral_large_hjpark,3.684,refinement,biggen_240612,[],other +509,gemini_1.0_pro,2.816,refinement,biggen_240612,[],other +510,gemini_pro_1.5,3.553,refinement,biggen_240612,[],other +511,gemini_flash_1.5,3.513,refinement,biggen_240612,[],other +512,claude_3_haiku_20240307,3.566,refinement,biggen_240612,[],other +513,claude_3_sonnet_20240229,3.658,refinement,biggen_240612,[],other +514,claude_3_opus_20240229,3.947,refinement,biggen_240612,[],other +515,phi_1,1.391,safety,biggen_240612,[],other +516,phi_1_5,2.87,safety,biggen_240612,[],other +517,phi_2,3.406,safety,biggen_240612,[],other +518,qwen1.5_0.5b,2.029,safety,biggen_240612,[],other +519,qwen1.5_1.8b,2.42,safety,biggen_240612,[],other +520,qwen1.5_4b,3.13,safety,biggen_240612,[],other +521,gemma_2b,2.42,safety,biggen_240612,[],other +522,olmo_1b,2.072,safety,biggen_240612,[],other +523,qwen1.5_0.5b_chat,2.594,safety,biggen_240612,[],other +524,qwen1.5_1.8b_chat,2.696,safety,biggen_240612,[],other +525,qwen1.5_4b_chat,3.362,safety,biggen_240612,[],other +526,phi_3_mini_4k_instruct,4.101,safety,biggen_240612,[],other +527,phi_3_mini_128k_instruct,3.986,safety,biggen_240612,[],other +528,gemma_2b_it,3.928,safety,biggen_240612,[],other +529,gemma_1.1_2b_it,3.884,safety,biggen_240612,[],other +530,gemma_7b,2.029,safety,biggen_240612,[],other +531,mistral_7b_v0.1,3.29,safety,biggen_240612,[],other +532,mistral_7b_v0.2,3.304,safety,biggen_240612,[],other +533,qwen1.5_7b,3.087,safety,biggen_240612,[],other +534,yi_6b,3.101,safety,biggen_240612,[],other +535,llama_2_7b,3.188,safety,biggen_240612,[],other +536,codellama_7b,2.377,safety,biggen_240612,[],other +537,meta_llama_3_8b,2.899,safety,biggen_240612,[],other +538,llemma_7b,2.435,safety,biggen_240612,[],other +539,olmo_7b,2.623,safety,biggen_240612,[],other +540,gemma_7b_it,3.768,safety,biggen_240612,[],other +541,gemma_1.1_7b_it,4.043,safety,biggen_240612,[],other +542,mistral_7b_instruct_v0.2,3.986,safety,biggen_240612,[],other +543,qwen1.5_7b_chat,3.928,safety,biggen_240612,[],other +544,yi_6b_chat,3.609,safety,biggen_240612,[],other +545,llama_2_7b_chat,4.261,safety,biggen_240612,[],other +546,codellama_7b_instruct,3.841,safety,biggen_240612,[],other +547,meta_llama_3_8b_instruct,3.652,safety,biggen_240612,[],other +548,olmo_7b_sft,3.435,safety,biggen_240612,[],other +549,olmo_7b_instruct,3.188,safety,biggen_240612,[],other +550,tulu_2_7b,3.797,safety,biggen_240612,[],other +551,tulu_2_dpo_7b,3.797,safety,biggen_240612,[],other +552,codetulu_2_7b,3.348,safety,biggen_240612,[],other +553,orca_2_7b,2.58,safety,biggen_240612,[],other +554,openchat_3.5_0106,3.971,safety,biggen_240612,[],other +555,openhermes_2_mistral_7b,3.203,safety,biggen_240612,[],other +556,openhermes_2.5_mistral_7b,3.101,safety,biggen_240612,[],other +557,nous_hermes_2_mistral_7b_dpo,3.333,safety,biggen_240612,[],other +558,starling_lm_7b_alpha,3.797,safety,biggen_240612,[],other +559,starling_lm_7b_beta,3.841,safety,biggen_240612,[],other +560,mistral_orpo_alpha,3.826,safety,biggen_240612,[],other +561,mistral_orpo_beta,3.609,safety,biggen_240612,[],other +562,zephyr_7b_beta,3.725,safety,biggen_240612,[],other +563,qwen1.5_14b,2.536,safety,biggen_240612,[],other +564,llama_2_13b,3.319,safety,biggen_240612,[],other +565,codellama_13b,2.304,safety,biggen_240612,[],other +566,solar_10.7b_v1.0,3.652,safety,biggen_240612,[],other +567,qwen1.5_14b_chat,4.058,safety,biggen_240612,[],other +568,solar_10.7b_instruct_v1.0,3.826,safety,biggen_240612,[],other +569,aya_101,1.58,safety,biggen_240612,[],other +570,llama_2_13b_chat,4.29,safety,biggen_240612,[],other +571,codellama_13b_instruct,4.116,safety,biggen_240612,[],other +572,tulu_2_13b,3.87,safety,biggen_240612,[],other +573,tulu_2_dpo_13b,3.928,safety,biggen_240612,[],other +574,codetulu_2_13b,3.42,safety,biggen_240612,[],other +575,orca_2_13b,2.913,safety,biggen_240612,[],other +576,yi_34b,3.464,safety,biggen_240612,[],other +577,llemma_34b,2.884,safety,biggen_240612,[],other +578,qwen1.5_32b,3.377,safety,biggen_240612,[],other +579,codellama_34b,2.536,safety,biggen_240612,[],other +580,mixtral_8x7b_v0.1,3.855,safety,biggen_240612,[],other +581,yi_34b_chat,4.087,safety,biggen_240612,[],other +582,nous_hermes_2_yi_34b,3.507,safety,biggen_240612,[],other +583,codellama_34b_instruct,4.145,safety,biggen_240612,[],other +584,codetulu_2_34b,3.739,safety,biggen_240612,[],other +585,qwen1.5_32b_chat,4.116,safety,biggen_240612,[],other +586,mixtral_8x7b_instruct_v0.1,3.884,safety,biggen_240612,[],other +587,nous_hermes_2_mixtral_8x7b_sft,3.551,safety,biggen_240612,[],other +588,nous_hermes_2_mixtral_8x7b_dpo,3.667,safety,biggen_240612,[],other +589,c4ai_command_r_v01,3.913,safety,biggen_240612,[],other +590,llama_2_70b,3.913,safety,biggen_240612,[],other +591,codellama_70b,2.754,safety,biggen_240612,[],other +592,mixtral_8x22b_v0.1_awq,3.754,safety,biggen_240612,[],other +593,meta_llama_3_70b,3.058,safety,biggen_240612,[],other +594,qwen1.5_72b,3.957,safety,biggen_240612,[],other +595,llama_2_70b_chat,4.536,safety,biggen_240612,[],other +596,codellama_70b_instruct,4.043,safety,biggen_240612,[],other +597,tulu_2_dpo_70b,3.913,safety,biggen_240612,[],other +598,c4ai_command_r_plus_gptq,3.986,safety,biggen_240612,[],other +599,meta_llama_3_70b_instruct,3.87,safety,biggen_240612,[],other +600,mixtral_8x22b_instruct_v0.1_awq,3.899,safety,biggen_240612,[],other +601,zephyr_orpo_141b_a35b_v0.1_awq,3.435,safety,biggen_240612,[],other +602,qwen1.5_72b_chat,4.0,safety,biggen_240612,[],other +603,qwen_110b_chat,3.971,safety,biggen_240612,[],other +604,gpt_3.5_turbo_1106,4.13,safety,biggen_240612,[],other +605,gpt_3.5_turbo_0125,3.957,safety,biggen_240612,[],other +606,gpt_4_1106_preview,4.594,safety,biggen_240612,[],other +607,gpt_4_0125_preview,4.203,safety,biggen_240612,[],other +608,gpt_4_turbo_2024_04_09,4.116,safety,biggen_240612,[],other +609,gpt_4o_2024_05_13,4.043,safety,biggen_240612,[],other +610,mistral_medium_hjpark,4.13,safety,biggen_240612,[],other +611,mistral_large_hjpark,4.087,safety,biggen_240612,[],other +612,gemini_1.0_pro,4.043,safety,biggen_240612,[],other +613,gemini_pro_1.5,3.971,safety,biggen_240612,[],other +614,gemini_flash_1.5,4.203,safety,biggen_240612,[],other +615,claude_3_haiku_20240307,4.29,safety,biggen_240612,[],other +616,claude_3_sonnet_20240229,4.362,safety,biggen_240612,[],other +617,claude_3_opus_20240229,4.551,safety,biggen_240612,[],other +618,phi_1,1.01,theory_of_mind,biggen_240612,[],reasoning +619,phi_1_5,2.7,theory_of_mind,biggen_240612,[],reasoning +620,phi_2,3.0,theory_of_mind,biggen_240612,[],reasoning +621,qwen1.5_0.5b,1.75,theory_of_mind,biggen_240612,[],reasoning +622,qwen1.5_1.8b,2.36,theory_of_mind,biggen_240612,[],reasoning +623,qwen1.5_4b,2.61,theory_of_mind,biggen_240612,[],reasoning +624,gemma_2b,2.24,theory_of_mind,biggen_240612,[],reasoning +625,olmo_1b,1.44,theory_of_mind,biggen_240612,[],reasoning +626,qwen1.5_0.5b_chat,2.26,theory_of_mind,biggen_240612,[],reasoning +627,qwen1.5_1.8b_chat,3.03,theory_of_mind,biggen_240612,[],reasoning +628,qwen1.5_4b_chat,2.89,theory_of_mind,biggen_240612,[],reasoning +629,phi_3_mini_4k_instruct,3.78,theory_of_mind,biggen_240612,[],reasoning +630,phi_3_mini_128k_instruct,3.66,theory_of_mind,biggen_240612,[],reasoning +631,gemma_2b_it,3.16,theory_of_mind,biggen_240612,[],reasoning +632,gemma_1.1_2b_it,3.15,theory_of_mind,biggen_240612,[],reasoning +633,gemma_7b,1.17,theory_of_mind,biggen_240612,[],reasoning +634,mistral_7b_v0.1,2.97,theory_of_mind,biggen_240612,[],reasoning +635,mistral_7b_v0.2,3.1,theory_of_mind,biggen_240612,[],reasoning +636,qwen1.5_7b,2.68,theory_of_mind,biggen_240612,[],reasoning +637,yi_6b,2.74,theory_of_mind,biggen_240612,[],reasoning +638,llama_2_7b,2.37,theory_of_mind,biggen_240612,[],reasoning +639,codellama_7b,1.77,theory_of_mind,biggen_240612,[],reasoning +640,meta_llama_3_8b,2.57,theory_of_mind,biggen_240612,[],reasoning +641,llemma_7b,2.02,theory_of_mind,biggen_240612,[],reasoning +642,olmo_7b,1.97,theory_of_mind,biggen_240612,[],reasoning +643,gemma_7b_it,3.19,theory_of_mind,biggen_240612,[],reasoning +644,gemma_1.1_7b_it,3.354,theory_of_mind,biggen_240612,[],reasoning +645,mistral_7b_instruct_v0.2,3.68,theory_of_mind,biggen_240612,[],reasoning +646,qwen1.5_7b_chat,3.67,theory_of_mind,biggen_240612,[],reasoning +647,yi_6b_chat,3.545,theory_of_mind,biggen_240612,[],reasoning +648,llama_2_7b_chat,3.6,theory_of_mind,biggen_240612,[],reasoning +649,codellama_7b_instruct,3.23,theory_of_mind,biggen_240612,[],reasoning +650,meta_llama_3_8b_instruct,3.65,theory_of_mind,biggen_240612,[],reasoning +651,olmo_7b_sft,2.85,theory_of_mind,biggen_240612,[],reasoning +652,olmo_7b_instruct,3.29,theory_of_mind,biggen_240612,[],reasoning +653,tulu_2_7b,3.17,theory_of_mind,biggen_240612,[],reasoning +654,tulu_2_dpo_7b,3.59,theory_of_mind,biggen_240612,[],reasoning +655,codetulu_2_7b,3.09,theory_of_mind,biggen_240612,[],reasoning +656,orca_2_7b,2.23,theory_of_mind,biggen_240612,[],reasoning +657,openchat_3.5_0106,3.56,theory_of_mind,biggen_240612,[],reasoning +658,openhermes_2_mistral_7b,3.3,theory_of_mind,biggen_240612,[],reasoning +659,openhermes_2.5_mistral_7b,3.35,theory_of_mind,biggen_240612,[],reasoning +660,nous_hermes_2_mistral_7b_dpo,3.51,theory_of_mind,biggen_240612,[],reasoning +661,starling_lm_7b_alpha,3.47,theory_of_mind,biggen_240612,[],reasoning +662,starling_lm_7b_beta,3.68,theory_of_mind,biggen_240612,[],reasoning +663,mistral_orpo_alpha,3.47,theory_of_mind,biggen_240612,[],reasoning +664,mistral_orpo_beta,3.47,theory_of_mind,biggen_240612,[],reasoning +665,zephyr_7b_beta,3.64,theory_of_mind,biggen_240612,[],reasoning +666,qwen1.5_14b,3.01,theory_of_mind,biggen_240612,[],reasoning +667,llama_2_13b,2.61,theory_of_mind,biggen_240612,[],reasoning +668,codellama_13b,1.98,theory_of_mind,biggen_240612,[],reasoning +669,solar_10.7b_v1.0,3.21,theory_of_mind,biggen_240612,[],reasoning +670,qwen1.5_14b_chat,3.51,theory_of_mind,biggen_240612,[],reasoning +671,solar_10.7b_instruct_v1.0,3.66,theory_of_mind,biggen_240612,[],reasoning +672,aya_101,1.37,theory_of_mind,biggen_240612,[],reasoning +673,llama_2_13b_chat,3.65,theory_of_mind,biggen_240612,[],reasoning +674,codellama_13b_instruct,3.25,theory_of_mind,biggen_240612,[],reasoning +675,tulu_2_13b,3.23,theory_of_mind,biggen_240612,[],reasoning +676,tulu_2_dpo_13b,3.61,theory_of_mind,biggen_240612,[],reasoning +677,codetulu_2_13b,3.31,theory_of_mind,biggen_240612,[],reasoning +678,orca_2_13b,2.77,theory_of_mind,biggen_240612,[],reasoning +679,yi_34b,3.26,theory_of_mind,biggen_240612,[],reasoning +680,llemma_34b,2.51,theory_of_mind,biggen_240612,[],reasoning +681,qwen1.5_32b,3.24,theory_of_mind,biggen_240612,[],reasoning +682,codellama_34b,2.56,theory_of_mind,biggen_240612,[],reasoning +683,mixtral_8x7b_v0.1,3.35,theory_of_mind,biggen_240612,[],reasoning +684,yi_34b_chat,3.84,theory_of_mind,biggen_240612,[],reasoning +685,nous_hermes_2_yi_34b,3.43,theory_of_mind,biggen_240612,[],reasoning +686,codellama_34b_instruct,3.34,theory_of_mind,biggen_240612,[],reasoning +687,codetulu_2_34b,3.45,theory_of_mind,biggen_240612,[],reasoning +688,qwen1.5_32b_chat,3.78,theory_of_mind,biggen_240612,[],reasoning +689,mixtral_8x7b_instruct_v0.1,3.8,theory_of_mind,biggen_240612,[],reasoning +690,nous_hermes_2_mixtral_8x7b_sft,3.47,theory_of_mind,biggen_240612,[],reasoning +691,nous_hermes_2_mixtral_8x7b_dpo,3.63,theory_of_mind,biggen_240612,[],reasoning +692,c4ai_command_r_v01,3.74,theory_of_mind,biggen_240612,[],reasoning +693,llama_2_70b,3.25,theory_of_mind,biggen_240612,[],reasoning +694,codellama_70b,2.3,theory_of_mind,biggen_240612,[],reasoning +695,mixtral_8x22b_v0.1_awq,3.39,theory_of_mind,biggen_240612,[],reasoning +696,meta_llama_3_70b,2.9,theory_of_mind,biggen_240612,[],reasoning +697,qwen1.5_72b,3.17,theory_of_mind,biggen_240612,[],reasoning +698,llama_2_70b_chat,3.75,theory_of_mind,biggen_240612,[],reasoning +699,codellama_70b_instruct,2.44,theory_of_mind,biggen_240612,[],reasoning +700,tulu_2_dpo_70b,3.79,theory_of_mind,biggen_240612,[],reasoning +701,c4ai_command_r_plus_gptq,3.87,theory_of_mind,biggen_240612,[],reasoning +702,meta_llama_3_70b_instruct,3.92,theory_of_mind,biggen_240612,[],reasoning +703,mixtral_8x22b_instruct_v0.1_awq,3.74,theory_of_mind,biggen_240612,[],reasoning +704,zephyr_orpo_141b_a35b_v0.1_awq,3.48,theory_of_mind,biggen_240612,[],reasoning +705,qwen1.5_72b_chat,3.92,theory_of_mind,biggen_240612,[],reasoning +706,qwen_110b_chat,3.94,theory_of_mind,biggen_240612,[],reasoning +707,gpt_3.5_turbo_1106,3.74,theory_of_mind,biggen_240612,[],reasoning +708,gpt_3.5_turbo_0125,3.8,theory_of_mind,biggen_240612,[],reasoning +709,gpt_4_1106_preview,4.07,theory_of_mind,biggen_240612,[],reasoning +710,gpt_4_0125_preview,4.21,theory_of_mind,biggen_240612,[],reasoning +711,gpt_4_turbo_2024_04_09,4.03,theory_of_mind,biggen_240612,[],reasoning +712,gpt_4o_2024_05_13,4.04,theory_of_mind,biggen_240612,[],reasoning +713,mistral_medium_hjpark,3.85,theory_of_mind,biggen_240612,[],reasoning +714,mistral_large_hjpark,3.93,theory_of_mind,biggen_240612,[],reasoning +715,gemini_1.0_pro,3.83,theory_of_mind,biggen_240612,[],reasoning +716,gemini_pro_1.5,3.96,theory_of_mind,biggen_240612,[],reasoning +717,gemini_flash_1.5,3.89,theory_of_mind,biggen_240612,[],reasoning +718,claude_3_haiku_20240307,3.97,theory_of_mind,biggen_240612,[],reasoning +719,claude_3_sonnet_20240229,3.97,theory_of_mind,biggen_240612,[],reasoning +720,claude_3_opus_20240229,4.08,theory_of_mind,biggen_240612,[],reasoning +721,phi_1,1.012,tool_usage,biggen_240612,[],other +722,phi_1_5,1.3,tool_usage,biggen_240612,[],other +723,phi_2,1.675,tool_usage,biggen_240612,[],other +724,qwen1.5_0.5b,1.15,tool_usage,biggen_240612,[],other +725,qwen1.5_1.8b,1.413,tool_usage,biggen_240612,[],other +726,qwen1.5_4b,1.688,tool_usage,biggen_240612,[],other +727,gemma_2b,1.35,tool_usage,biggen_240612,[],other +728,olmo_1b,1.087,tool_usage,biggen_240612,[],other +729,qwen1.5_0.5b_chat,1.25,tool_usage,biggen_240612,[],other +730,qwen1.5_1.8b_chat,1.688,tool_usage,biggen_240612,[],other +731,qwen1.5_4b_chat,2.05,tool_usage,biggen_240612,[],other +732,phi_3_mini_4k_instruct,3.112,tool_usage,biggen_240612,[],other +733,phi_3_mini_128k_instruct,2.7,tool_usage,biggen_240612,[],other +734,gemma_2b_it,1.812,tool_usage,biggen_240612,[],other +735,gemma_1.1_2b_it,1.675,tool_usage,biggen_240612,[],other +736,gemma_7b,1.025,tool_usage,biggen_240612,[],other +737,mistral_7b_v0.1,2.038,tool_usage,biggen_240612,[],other +738,mistral_7b_v0.2,1.962,tool_usage,biggen_240612,[],other +739,qwen1.5_7b,2.212,tool_usage,biggen_240612,[],other +740,yi_6b,1.425,tool_usage,biggen_240612,[],other +741,llama_2_7b,1.337,tool_usage,biggen_240612,[],other +742,codellama_7b,1.387,tool_usage,biggen_240612,[],other +743,meta_llama_3_8b,1.738,tool_usage,biggen_240612,[],other +744,llemma_7b,1.575,tool_usage,biggen_240612,[],other +745,olmo_7b,1.15,tool_usage,biggen_240612,[],other +746,gemma_7b_it,2.125,tool_usage,biggen_240612,[],other +747,gemma_1.1_7b_it,2.562,tool_usage,biggen_240612,[],other +748,mistral_7b_instruct_v0.2,3.175,tool_usage,biggen_240612,[],other +749,qwen1.5_7b_chat,3.013,tool_usage,biggen_240612,[],other +750,yi_6b_chat,2.05,tool_usage,biggen_240612,[],other +751,llama_2_7b_chat,2.075,tool_usage,biggen_240612,[],other +752,codellama_7b_instruct,2.288,tool_usage,biggen_240612,[],other +753,meta_llama_3_8b_instruct,3.263,tool_usage,biggen_240612,[],other +754,olmo_7b_sft,1.887,tool_usage,biggen_240612,[],other +755,olmo_7b_instruct,1.875,tool_usage,biggen_240612,[],other +756,tulu_2_7b,2.062,tool_usage,biggen_240612,[],other +757,tulu_2_dpo_7b,2.325,tool_usage,biggen_240612,[],other +758,codetulu_2_7b,2.65,tool_usage,biggen_240612,[],other +759,orca_2_7b,1.462,tool_usage,biggen_240612,[],other +760,openchat_3.5_0106,2.9,tool_usage,biggen_240612,[],other +761,openhermes_2_mistral_7b,2.663,tool_usage,biggen_240612,[],other +762,openhermes_2.5_mistral_7b,2.65,tool_usage,biggen_240612,[],other +763,nous_hermes_2_mistral_7b_dpo,2.837,tool_usage,biggen_240612,[],other +764,starling_lm_7b_alpha,2.95,tool_usage,biggen_240612,[],other +765,starling_lm_7b_beta,3.388,tool_usage,biggen_240612,[],other +766,mistral_orpo_alpha,2.675,tool_usage,biggen_240612,[],other +767,mistral_orpo_beta,2.775,tool_usage,biggen_240612,[],other +768,zephyr_7b_beta,3.175,tool_usage,biggen_240612,[],other +769,qwen1.5_14b,2.788,tool_usage,biggen_240612,[],other +770,llama_2_13b,1.575,tool_usage,biggen_240612,[],other +771,codellama_13b,1.525,tool_usage,biggen_240612,[],other +772,solar_10.7b_v1.0,2.312,tool_usage,biggen_240612,[],other +773,qwen1.5_14b_chat,3.075,tool_usage,biggen_240612,[],other +774,solar_10.7b_instruct_v1.0,3.188,tool_usage,biggen_240612,[],other +775,aya_101,1.163,tool_usage,biggen_240612,[],other +776,llama_2_13b_chat,2.3,tool_usage,biggen_240612,[],other +777,codellama_13b_instruct,2.388,tool_usage,biggen_240612,[],other +778,tulu_2_13b,2.5,tool_usage,biggen_240612,[],other +779,tulu_2_dpo_13b,2.763,tool_usage,biggen_240612,[],other +780,codetulu_2_13b,3.013,tool_usage,biggen_240612,[],other +781,orca_2_13b,2.013,tool_usage,biggen_240612,[],other +782,yi_34b,2.3,tool_usage,biggen_240612,[],other +783,llemma_34b,1.887,tool_usage,biggen_240612,[],other +784,qwen1.5_32b,2.712,tool_usage,biggen_240612,[],other +785,codellama_34b,1.875,tool_usage,biggen_240612,[],other +786,mixtral_8x7b_v0.1,2.538,tool_usage,biggen_240612,[],other +787,yi_34b_chat,3.075,tool_usage,biggen_240612,[],other +788,nous_hermes_2_yi_34b,3.013,tool_usage,biggen_240612,[],other +789,codellama_34b_instruct,2.487,tool_usage,biggen_240612,[],other +790,codetulu_2_34b,3.2,tool_usage,biggen_240612,[],other +791,qwen1.5_32b_chat,3.55,tool_usage,biggen_240612,[],other +792,mixtral_8x7b_instruct_v0.1,3.237,tool_usage,biggen_240612,[],other +793,nous_hermes_2_mixtral_8x7b_sft,3.288,tool_usage,biggen_240612,[],other +794,nous_hermes_2_mixtral_8x7b_dpo,3.413,tool_usage,biggen_240612,[],other +795,c4ai_command_r_v01,2.987,tool_usage,biggen_240612,[],other +796,llama_2_70b,2.487,tool_usage,biggen_240612,[],other +797,codellama_70b,2.138,tool_usage,biggen_240612,[],other +798,mixtral_8x22b_v0.1_awq,2.875,tool_usage,biggen_240612,[],other +799,meta_llama_3_70b,2.388,tool_usage,biggen_240612,[],other +800,qwen1.5_72b,2.875,tool_usage,biggen_240612,[],other +801,llama_2_70b_chat,2.875,tool_usage,biggen_240612,[],other +802,codellama_70b_instruct,1.712,tool_usage,biggen_240612,[],other +803,tulu_2_dpo_70b,3.5,tool_usage,biggen_240612,[],other +804,c4ai_command_r_plus_gptq,3.475,tool_usage,biggen_240612,[],other +805,meta_llama_3_70b_instruct,3.625,tool_usage,biggen_240612,[],other +806,mixtral_8x22b_instruct_v0.1_awq,3.462,tool_usage,biggen_240612,[],other +807,zephyr_orpo_141b_a35b_v0.1_awq,3.062,tool_usage,biggen_240612,[],other +808,qwen1.5_72b_chat,3.388,tool_usage,biggen_240612,[],other +809,qwen_110b_chat,3.438,tool_usage,biggen_240612,[],other +810,gpt_3.5_turbo_1106,3.062,tool_usage,biggen_240612,[],other +811,gpt_3.5_turbo_0125,2.987,tool_usage,biggen_240612,[],other +812,gpt_4_1106_preview,3.7,tool_usage,biggen_240612,[],other +813,gpt_4_0125_preview,3.675,tool_usage,biggen_240612,[],other +814,gpt_4_turbo_2024_04_09,3.712,tool_usage,biggen_240612,[],other +815,gpt_4o_2024_05_13,3.775,tool_usage,biggen_240612,[],other +816,mistral_medium_hjpark,3.737,tool_usage,biggen_240612,[],other +817,mistral_large_hjpark,3.638,tool_usage,biggen_240612,[],other +818,gemini_1.0_pro,3.138,tool_usage,biggen_240612,[],other +819,gemini_pro_1.5,3.337,tool_usage,biggen_240612,[],other +820,gemini_flash_1.5,3.337,tool_usage,biggen_240612,[],other +821,claude_3_haiku_20240307,3.775,tool_usage,biggen_240612,[],other +822,claude_3_sonnet_20240229,3.663,tool_usage,biggen_240612,[],other +823,claude_3_opus_20240229,3.775,tool_usage,biggen_240612,[],other +0,aya_101,0.029411764705882353,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +1,c4ai_command_r_plus_gptq,0.8382352941176471,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +2,c4ai_command_r_v01,0.6948529411764706,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +3,claude_3_haiku_20240307,0.9252450980392157,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +4,claude_3_opus_20240229,0.9681372549019608,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +5,claude_3_sonnet_20240229,0.9240196078431373,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +6,codellama_13b,0.07598039215686275,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +7,codellama_13b_instruct,0.4276960784313726,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +8,codellama_34b,0.1482843137254902,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +9,codellama_34b_instruct,0.5098039215686274,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +10,codellama_70b,0.18872549019607843,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +11,codellama_70b_instruct,0.27450980392156865,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +12,codellama_7b,0.05514705882352941,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +13,codellama_7b_instruct,0.36519607843137253,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +14,codetulu_2_13b,0.43137254901960786,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +15,codetulu_2_34b,0.5441176470588235,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +16,codetulu_2_7b,0.32598039215686275,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +17,gemini_1.0_pro,0.7107843137254902,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +18,gemini_flash_1.5,0.866421568627451,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +19,gemini_pro_1.5,0.8676470588235294,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +20,gemma_1.1_2b_it,0.33578431372549017,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +21,gemma_1.1_7b_it,0.5551470588235294,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +22,gemma_2b,0.09803921568627451,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +23,gemma_2b_it,0.3333333333333333,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +24,gemma_7b,0.013480392156862746,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +25,gemma_7b_it,0.40931372549019607,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +26,gpt_3.5_turbo_0125,0.7757352941176471,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +27,gpt_3.5_turbo_1106,0.758578431372549,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +28,gpt_4_0125_preview,0.9779411764705882,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +29,gpt_4_1106_preview,0.9889705882352942,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +30,gpt_4_turbo_2024_04_09,0.9558823529411765,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +31,gpt_4o_2024_05_13,0.9436274509803921,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +32,llama_2_13b,0.20220588235294118,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +33,llama_2_13b_chat,0.5968137254901961,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +34,llama_2_70b,0.4656862745098039,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +35,llama_2_70b_chat,0.7205882352941176,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +36,llama_2_7b,0.1446078431372549,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +37,llama_2_7b_chat,0.5355392156862745,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +38,llemma_34b,0.21200980392156862,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +39,llemma_7b,0.11029411764705882,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +40,meta_llama_3_70b,0.36887254901960786,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +41,meta_llama_3_70b_instruct,0.875,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +42,meta_llama_3_8b,0.2377450980392157,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +43,meta_llama_3_8b_instruct,0.7328431372549019,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +44,mistral_7b_instruct_v0.2,0.7156862745098039,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +45,mistral_7b_v0.1,0.3272058823529412,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +46,mistral_7b_v0.2,0.3137254901960784,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +47,mistral_large_hjpark,0.8762254901960784,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +48,mistral_medium_hjpark,0.8970588235294118,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +49,mistral_orpo_alpha,0.5392156862745098,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +50,mistral_orpo_beta,0.5477941176470589,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +51,mixtral_8x22b_instruct_v0.1_awq,0.8198529411764706,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +52,mixtral_8x22b_v0.1_awq,0.5968137254901961,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +53,mixtral_8x7b_instruct_v0.1,0.7647058823529411,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +54,mixtral_8x7b_v0.1,0.5453431372549019,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +55,nous_hermes_2_mistral_7b_dpo,0.571078431372549,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +56,nous_hermes_2_mixtral_8x7b_dpo,0.7095588235294118,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +57,nous_hermes_2_mixtral_8x7b_sft,0.6262254901960784,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +58,nous_hermes_2_yi_34b,0.5906862745098039,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +59,olmo_1b,0.028186274509803922,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +60,olmo_7b,0.07107843137254902,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +61,olmo_7b_instruct,0.30269607843137253,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +62,olmo_7b_sft,0.2549019607843137,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +63,openchat_3.5_0106,0.6825980392156863,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +64,openhermes_2.5_mistral_7b,0.4583333333333333,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +65,openhermes_2_mistral_7b,0.5122549019607843,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +66,orca_2_13b,0.17401960784313725,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +67,orca_2_7b,0.08700980392156862,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +68,phi_1,0.0,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +69,phi_1_5,0.15318627450980393,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +70,phi_2,0.29044117647058826,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +71,phi_3_mini_128k_instruct,0.6911764705882353,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +72,phi_3_mini_4k_instruct,0.7867647058823529,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +73,qwen1.5_0.5b,0.0428921568627451,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +74,qwen1.5_0.5b_chat,0.07965686274509803,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +75,qwen1.5_1.8b,0.12867647058823528,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +76,qwen1.5_1.8b_chat,0.21691176470588236,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +77,qwen1.5_14b,0.3946078431372549,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +78,qwen1.5_14b_chat,0.7267156862745098,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +79,qwen1.5_32b,0.4791666666666667,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +80,qwen1.5_32b_chat,0.8149509803921569,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +81,qwen1.5_4b,0.21323529411764705,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +82,qwen1.5_4b_chat,0.29411764705882354,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +83,qwen1.5_72b,0.5294117647058824,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +84,qwen1.5_72b_chat,0.8713235294117647,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +85,qwen1.5_7b,0.2610294117647059,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +86,qwen1.5_7b_chat,0.6580882352941176,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +87,qwen_110b_chat,0.8848039215686274,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +88,solar_10.7b_instruct_v1.0,0.6862745098039216,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +89,solar_10.7b_v1.0,0.43995098039215685,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +90,starling_lm_7b_alpha,0.6139705882352942,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +91,starling_lm_7b_beta,0.7573529411764706,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +92,tulu_2_13b,0.4313725490196078,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +93,tulu_2_7b,0.3553921568627451,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +94,tulu_2_dpo_13b,0.5833333333333333,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +95,tulu_2_dpo_70b,0.7708333333333334,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +96,tulu_2_dpo_7b,0.4767156862745098,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +97,yi_34b,0.46078431372549017,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +98,yi_34b_chat,0.7720588235294118,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +99,yi_6b,0.17892156862745098,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +100,yi_6b_chat,0.4117647058823529,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +101,zephyr_7b_beta,0.6200980392156863,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +102,zephyr_orpo_141b_a35b_v0.1_awq,0.6311274509803921,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']", +0,gpt_4o_0513,35.7,wildbench_mix,wildbench_240612,[],holistic +1,gpt_4_turbo_0409,34.6,wildbench_mix,wildbench_240612,[],holistic +2,gpt_4_turbo_0125,29.9,wildbench_mix,wildbench_240612,[],holistic +3,gemini_1.5_pro,27.8,wildbench_mix,wildbench_240612,[],holistic +4,llama_3_70b_inst,21.0,wildbench_mix,wildbench_240612,[],holistic +5,claude_3_opus,20.1,wildbench_mix,wildbench_240612,[],holistic +6,gemini_1.5_flash,17.4,wildbench_mix,wildbench_240612,[],holistic +7,yi_1.5_34b_chat,16.8,wildbench_mix,wildbench_240612,[],holistic +8,llama3_inst_8b_simpo,14.0,wildbench_mix,wildbench_240612,[],holistic +9,claude_3_sonnet,7.2,wildbench_mix,wildbench_240612,[],holistic +10,qwen1.5_72b_chat,4.4,wildbench_mix,wildbench_240612,[],holistic +11,command_r_plus,0.4,wildbench_mix,wildbench_240612,[],holistic +12,claude_3_haiku,-8.5,wildbench_mix,wildbench_240612,[],holistic +13,mistral_large,-10.5,wildbench_mix,wildbench_240612,[],holistic +14,starlinglm_7b_beta,-11.9,wildbench_mix,wildbench_240612,[],holistic +15,llama_3_8b_inst,-14.6,wildbench_mix,wildbench_240612,[],holistic +16,command_r,-16.0,wildbench_mix,wildbench_240612,[],holistic +17,mixtral_8x7b_inst,-18.8,wildbench_mix,wildbench_240612,[],holistic +18,dbrx_instruct,-21.6,wildbench_mix,wildbench_240612,[],holistic +19,yi_1.5_6b_chat,-24.3,wildbench_mix,wildbench_240612,[],holistic +20,mistral_7b_inst_v0.2,-25.0,wildbench_mix,wildbench_240612,[],holistic +21,tulu_2_dpo_70b,-25.4,wildbench_mix,wildbench_240612,[],holistic +22,llama_2_70b_chat,-26.8,wildbench_mix,wildbench_240612,[],holistic +23,qwen1.5_7b_chat,-27.0,wildbench_mix,wildbench_240612,[],holistic +24,phi_3_medium_128k,-33.3,wildbench_mix,wildbench_240612,[],holistic +25,gpt_3.5_turbo_0125,-33.5,wildbench_mix,wildbench_240612,[],holistic +26,llama_2_7b_chat,-48.0,wildbench_mix,wildbench_240612,[],holistic +27,gemma_7b_it,-57.0,wildbench_mix,wildbench_240612,[],holistic +28,gemma_2b_it,-74.1,wildbench_mix,wildbench_240612,[],holistic +29,gpt_4o_0513,1.5,wildbench_gpt4t,wildbench_240612,[],holistic +30,gpt_4_turbo_0409,0.0,wildbench_gpt4t,wildbench_240612,[],holistic +31,gpt_4_turbo_0125,4.4,wildbench_gpt4t,wildbench_240612,[],holistic +32,gemini_1.5_pro,-4.4,wildbench_gpt4t,wildbench_240612,[],holistic +33,llama_3_70b_inst,-19.0,wildbench_gpt4t,wildbench_240612,[],holistic +34,claude_3_opus,-20.4,wildbench_gpt4t,wildbench_240612,[],holistic +35,gemini_1.5_flash,-16.6,wildbench_gpt4t,wildbench_240612,[],holistic +36,yi_1.5_34b_chat,-18.3,wildbench_gpt4t,wildbench_240612,[],holistic +37,llama3_inst_8b_simpo,-22.5,wildbench_gpt4t,wildbench_240612,[],holistic +38,claude_3_sonnet,-31.6,wildbench_gpt4t,wildbench_240612,[],holistic +39,qwen1.5_72b_chat,-34.8,wildbench_gpt4t,wildbench_240612,[],holistic +40,command_r_plus,-36.3,wildbench_gpt4t,wildbench_240612,[],holistic +41,claude_3_haiku,-46.9,wildbench_gpt4t,wildbench_240612,[],holistic +42,mistral_large,-48.1,wildbench_gpt4t,wildbench_240612,[],holistic +43,starlinglm_7b_beta,-48.7,wildbench_gpt4t,wildbench_240612,[],holistic +44,llama_3_8b_inst,-49.8,wildbench_gpt4t,wildbench_240612,[],holistic +45,command_r,-48.4,wildbench_gpt4t,wildbench_240612,[],holistic +46,mixtral_8x7b_inst,-53.4,wildbench_gpt4t,wildbench_240612,[],holistic +47,dbrx_instruct,-57.3,wildbench_gpt4t,wildbench_240612,[],holistic +48,yi_1.5_6b_chat,-55.0,wildbench_gpt4t,wildbench_240612,[],holistic +49,mistral_7b_inst_v0.2,-58.1,wildbench_gpt4t,wildbench_240612,[],holistic +50,tulu_2_dpo_70b,-59.3,wildbench_gpt4t,wildbench_240612,[],holistic +51,llama_2_70b_chat,-56.9,wildbench_gpt4t,wildbench_240612,[],holistic +52,qwen1.5_7b_chat,-57.7,wildbench_gpt4t,wildbench_240612,[],holistic +53,phi_3_medium_128k,-66.4,wildbench_gpt4t,wildbench_240612,[],holistic +54,gpt_3.5_turbo_0125,-66.3,wildbench_gpt4t,wildbench_240612,[],holistic +55,llama_2_7b_chat,-71.8,wildbench_gpt4t,wildbench_240612,[],holistic +56,gemma_7b_it,-78.4,wildbench_gpt4t,wildbench_240612,[],holistic +57,gemma_2b_it,-87.8,wildbench_gpt4t,wildbench_240612,[],holistic +58,gpt_4o_0513,46.3,wildbench_haiku,wildbench_240612,[],holistic +59,gpt_4_turbo_0409,45.3,wildbench_haiku,wildbench_240612,[],holistic +60,gpt_4_turbo_0125,38.8,wildbench_haiku,wildbench_240612,[],holistic +61,gemini_1.5_pro,37.9,wildbench_haiku,wildbench_240612,[],holistic +62,llama_3_70b_inst,31.9,wildbench_haiku,wildbench_240612,[],holistic +63,claude_3_opus,34.3,wildbench_haiku,wildbench_240612,[],holistic +64,gemini_1.5_flash,26.3,wildbench_haiku,wildbench_240612,[],holistic +65,yi_1.5_34b_chat,24.1,wildbench_haiku,wildbench_240612,[],holistic +66,llama3_inst_8b_simpo,18.9,wildbench_haiku,wildbench_240612,[],holistic +67,claude_3_sonnet,19.4,wildbench_haiku,wildbench_240612,[],holistic +68,qwen1.5_72b_chat,13.1,wildbench_haiku,wildbench_240612,[],holistic +69,command_r_plus,7.4,wildbench_haiku,wildbench_240612,[],holistic +70,claude_3_haiku,0.0,wildbench_haiku,wildbench_240612,[],holistic +71,mistral_large,-4.0,wildbench_haiku,wildbench_240612,[],holistic +72,starlinglm_7b_beta,-5.0,wildbench_haiku,wildbench_240612,[],holistic +73,llama_3_8b_inst,-9.7,wildbench_haiku,wildbench_240612,[],holistic +74,command_r,-12.7,wildbench_haiku,wildbench_240612,[],holistic +75,mixtral_8x7b_inst,-13.5,wildbench_haiku,wildbench_240612,[],holistic +76,dbrx_instruct,-16.3,wildbench_haiku,wildbench_240612,[],holistic +77,yi_1.5_6b_chat,-19.9,wildbench_haiku,wildbench_240612,[],holistic +78,mistral_7b_inst_v0.2,-22.4,wildbench_haiku,wildbench_240612,[],holistic +79,tulu_2_dpo_70b,-20.3,wildbench_haiku,wildbench_240612,[],holistic +80,llama_2_70b_chat,-23.6,wildbench_haiku,wildbench_240612,[],holistic +81,qwen1.5_7b_chat,-23.0,wildbench_haiku,wildbench_240612,[],holistic +82,phi_3_medium_128k,-30.0,wildbench_haiku,wildbench_240612,[],holistic +83,gpt_3.5_turbo_0125,-30.0,wildbench_haiku,wildbench_240612,[],holistic +84,llama_2_7b_chat,-44.6,wildbench_haiku,wildbench_240612,[],holistic +85,gemma_7b_it,-55.8,wildbench_haiku,wildbench_240612,[],holistic +86,gemma_2b_it,-73.6,wildbench_haiku,wildbench_240612,[],holistic +87,gpt_4o_0513,59.3,wildbench_llama2,wildbench_240612,[],holistic +88,gpt_4_turbo_0409,58.4,wildbench_llama2,wildbench_240612,[],holistic +89,gpt_4_turbo_0125,55.2,wildbench_llama2,wildbench_240612,[],holistic +90,gemini_1.5_pro,50.0,wildbench_llama2,wildbench_240612,[],holistic +91,llama_3_70b_inst,50.2,wildbench_llama2,wildbench_240612,[],holistic +92,claude_3_opus,46.3,wildbench_llama2,wildbench_240612,[],holistic +93,gemini_1.5_flash,42.5,wildbench_llama2,wildbench_240612,[],holistic +94,yi_1.5_34b_chat,44.5,wildbench_llama2,wildbench_240612,[],holistic +95,llama3_inst_8b_simpo,45.7,wildbench_llama2,wildbench_240612,[],holistic +96,claude_3_sonnet,33.9,wildbench_llama2,wildbench_240612,[],holistic +97,qwen1.5_72b_chat,34.7,wildbench_llama2,wildbench_240612,[],holistic +98,command_r_plus,30.2,wildbench_llama2,wildbench_240612,[],holistic +99,claude_3_haiku,21.4,wildbench_llama2,wildbench_240612,[],holistic +100,mistral_large,20.5,wildbench_llama2,wildbench_240612,[],holistic +101,starlinglm_7b_beta,18.0,wildbench_llama2,wildbench_240612,[],holistic +102,llama_3_8b_inst,15.7,wildbench_llama2,wildbench_240612,[],holistic +103,command_r,13.1,wildbench_llama2,wildbench_240612,[],holistic +104,mixtral_8x7b_inst,10.4,wildbench_llama2,wildbench_240612,[],holistic +105,dbrx_instruct,8.7,wildbench_llama2,wildbench_240612,[],holistic +106,yi_1.5_6b_chat,2.1,wildbench_llama2,wildbench_240612,[],holistic +107,mistral_7b_inst_v0.2,5.5,wildbench_llama2,wildbench_240612,[],holistic +108,tulu_2_dpo_70b,3.3,wildbench_llama2,wildbench_240612,[],holistic +109,llama_2_70b_chat,0.0,wildbench_llama2,wildbench_240612,[],holistic +110,qwen1.5_7b_chat,-0.2,wildbench_llama2,wildbench_240612,[],holistic +111,phi_3_medium_128k,-3.6,wildbench_llama2,wildbench_240612,[],holistic +112,gpt_3.5_turbo_0125,-4.1,wildbench_llama2,wildbench_240612,[],holistic +113,llama_2_7b_chat,-27.8,wildbench_llama2,wildbench_240612,[],holistic +114,gemma_7b_it,-36.8,wildbench_llama2,wildbench_240612,[],holistic +115,gemma_2b_it,-60.8,wildbench_llama2,wildbench_240612,[],holistic +116,gpt_4o_0513,65.3,wb_score,wildbench_240612,[],holistic +117,gpt_4_turbo_0409,64.7,wb_score,wildbench_240612,[],holistic +118,gpt_4_turbo_0125,63.3,wb_score,wildbench_240612,[],holistic +119,gemini_1.5_pro,55.7,wb_score,wildbench_240612,[],holistic +120,llama_3_70b_inst,60.4,wb_score,wildbench_240612,[],holistic +121,claude_3_opus,63.1,wb_score,wildbench_240612,[],holistic +122,gemini_1.5_flash,53.1,wb_score,wildbench_240612,[],holistic +123,yi_1.5_34b_chat,57.8,wb_score,wildbench_240612,[],holistic +124,llama3_inst_8b_simpo,53.9,wb_score,wildbench_240612,[],holistic +125,claude_3_sonnet,55.5,wb_score,wildbench_240612,[],holistic +126,qwen1.5_72b_chat,56.5,wb_score,wildbench_240612,[],holistic +127,command_r_plus,51.4,wb_score,wildbench_240612,[],holistic +128,claude_3_haiku,50.4,wb_score,wildbench_240612,[],holistic +129,mistral_large,54.2,wb_score,wildbench_240612,[],holistic +130,starlinglm_7b_beta,46.8,wb_score,wildbench_240612,[],holistic +131,llama_3_8b_inst,45.7,wb_score,wildbench_240612,[],holistic +132,command_r,45.7,wb_score,wildbench_240612,[],holistic +133,mixtral_8x7b_inst,47.8,wb_score,wildbench_240612,[],holistic +134,dbrx_instruct,48.9,wb_score,wildbench_240612,[],holistic +135,yi_1.5_6b_chat,39.6,wb_score,wildbench_240612,[],holistic +136,mistral_7b_inst_v0.2,43.4,wb_score,wildbench_240612,[],holistic +137,tulu_2_dpo_70b,45.2,wb_score,wildbench_240612,[],holistic +138,llama_2_70b_chat,39.2,wb_score,wildbench_240612,[],holistic +139,qwen1.5_7b_chat,40.0,wb_score,wildbench_240612,[],holistic +140,phi_3_medium_128k,42.1,wb_score,wildbench_240612,[],holistic +141,gpt_3.5_turbo_0125,42.1,wb_score,wildbench_240612,[],holistic +142,llama_2_7b_chat,27.6,wb_score,wildbench_240612,[],holistic +143,gemma_7b_it,23.9,wb_score,wildbench_240612,[],holistic +144,gemma_2b_it,6.2,wb_score,wildbench_240612,[],holistic +145,gpt_4o_0513,1293.0,arena_elo,wildbench_240612,[],holistic +146,gpt_4_turbo_0409,1251.0,arena_elo,wildbench_240612,[],holistic +147,gpt_4_turbo_0125,1239.0,arena_elo,wildbench_240612,[],holistic +149,llama_3_70b_inst,1213.0,arena_elo,wildbench_240612,[],holistic +150,claude_3_opus,1232.0,arena_elo,wildbench_240612,[],holistic +154,claude_3_sonnet,1187.0,arena_elo,wildbench_240612,[],holistic +155,qwen1.5_72b_chat,1143.0,arena_elo,wildbench_240612,[],holistic +156,command_r_plus,1155.0,arena_elo,wildbench_240612,[],holistic +157,claude_3_haiku,1169.0,arena_elo,wildbench_240612,[],holistic +158,mistral_large,1158.0,arena_elo,wildbench_240612,[],holistic +159,starlinglm_7b_beta,1111.0,arena_elo,wildbench_240612,[],holistic +160,llama_3_8b_inst,1144.0,arena_elo,wildbench_240612,[],holistic +161,command_r,1106.0,arena_elo,wildbench_240612,[],holistic +162,mixtral_8x7b_inst,1114.0,arena_elo,wildbench_240612,[],holistic +163,dbrx_instruct,1106.0,arena_elo,wildbench_240612,[],holistic +165,mistral_7b_inst_v0.2,1071.0,arena_elo,wildbench_240612,[],holistic +166,tulu_2_dpo_70b,1099.0,arena_elo,wildbench_240612,[],holistic +167,llama_2_70b_chat,1070.0,arena_elo,wildbench_240612,[],holistic +168,qwen1.5_7b_chat,1059.0,arena_elo,wildbench_240612,[],holistic +170,gpt_3.5_turbo_0125,1105.0,arena_elo,wildbench_240612,[],holistic +171,llama_2_7b_chat,1012.0,arena_elo,wildbench_240612,[],holistic +172,gemma_7b_it,1047.0,arena_elo,wildbench_240612,[],holistic +173,gemma_2b_it,980.0,arena_elo,wildbench_240612,[],holistic +175,gpt_4_turbo_0409,82.6,arena_hard,wildbench_240612,[],holistic +176,gpt_4_turbo_0125,78.0,arena_hard,wildbench_240612,[],holistic +178,llama_3_70b_inst,41.1,arena_hard,wildbench_240612,[],holistic +179,claude_3_opus,60.4,arena_hard,wildbench_240612,[],holistic +182,llama3_inst_8b_simpo,33.8,arena_hard,wildbench_240612,[],holistic +183,claude_3_sonnet,46.8,arena_hard,wildbench_240612,[],holistic +184,qwen1.5_72b_chat,36.1,arena_hard,wildbench_240612,[],holistic +185,command_r_plus,33.1,arena_hard,wildbench_240612,[],holistic +186,claude_3_haiku,41.5,arena_hard,wildbench_240612,[],holistic +187,mistral_large,37.7,arena_hard,wildbench_240612,[],holistic +188,starlinglm_7b_beta,23.0,arena_hard,wildbench_240612,[],holistic +189,llama_3_8b_inst,20.6,arena_hard,wildbench_240612,[],holistic +190,command_r,17.0,arena_hard,wildbench_240612,[],holistic +191,mixtral_8x7b_inst,23.4,arena_hard,wildbench_240612,[],holistic +192,dbrx_instruct,23.9,arena_hard,wildbench_240612,[],holistic +195,tulu_2_dpo_70b,15.0,arena_hard,wildbench_240612,[],holistic +196,llama_2_70b_chat,11.6,arena_hard,wildbench_240612,[],holistic +199,gpt_3.5_turbo_0125,23.3,arena_hard,wildbench_240612,[],holistic +200,llama_2_7b_chat,4.6,arena_hard,wildbench_240612,[],holistic +201,gemma_7b_it,7.5,arena_hard,wildbench_240612,[],holistic +202,gemma_2b_it,3.0,arena_hard,wildbench_240612,[],holistic +203,gpt_4o_0513,57.5,alpacaeval2_lc,wildbench_240612,[],holistic +204,gpt_4_turbo_0409,55.0,alpacaeval2_lc,wildbench_240612,[],holistic +207,llama_3_70b_inst,34.4,alpacaeval2_lc,wildbench_240612,[],holistic +208,claude_3_opus,40.5,alpacaeval2_lc,wildbench_240612,[],holistic +211,llama3_inst_8b_simpo,44.7,alpacaeval2_lc,wildbench_240612,[],holistic +212,claude_3_sonnet,34.9,alpacaeval2_lc,wildbench_240612,[],holistic +213,qwen1.5_72b_chat,36.6,alpacaeval2_lc,wildbench_240612,[],holistic +216,mistral_large,32.7,alpacaeval2_lc,wildbench_240612,[],holistic +218,llama_3_8b_inst,22.9,alpacaeval2_lc,wildbench_240612,[],holistic +220,mixtral_8x7b_inst,23.7,alpacaeval2_lc,wildbench_240612,[],holistic +221,dbrx_instruct,25.4,alpacaeval2_lc,wildbench_240612,[],holistic +223,mistral_7b_inst_v0.2,17.1,alpacaeval2_lc,wildbench_240612,[],holistic +224,tulu_2_dpo_70b,21.2,alpacaeval2_lc,wildbench_240612,[],holistic +225,llama_2_70b_chat,14.7,alpacaeval2_lc,wildbench_240612,[],holistic +226,qwen1.5_7b_chat,14.7,alpacaeval2_lc,wildbench_240612,[],holistic +229,llama_2_7b_chat,5.4,alpacaeval2_lc,wildbench_240612,[],holistic +230,gemma_7b_it,10.4,alpacaeval2_lc,wildbench_240612,[],holistic +231,gemma_2b_it,5.4,alpacaeval2_lc,wildbench_240612,[],holistic +232,gpt_4o_0513,51.3,alpacav2,wildbench_240612,[],holistic +233,gpt_4_turbo_0409,46.1,alpacav2,wildbench_240612,[],holistic +236,llama_3_70b_inst,33.2,alpacav2,wildbench_240612,[],holistic +237,claude_3_opus,29.1,alpacav2,wildbench_240612,[],holistic +240,llama3_inst_8b_simpo,40.5,alpacav2,wildbench_240612,[],holistic +241,claude_3_sonnet,25.6,alpacav2,wildbench_240612,[],holistic +242,qwen1.5_72b_chat,26.5,alpacav2,wildbench_240612,[],holistic +245,mistral_large,21.4,alpacav2,wildbench_240612,[],holistic +247,llama_3_8b_inst,22.6,alpacav2,wildbench_240612,[],holistic +249,mixtral_8x7b_inst,18.3,alpacav2,wildbench_240612,[],holistic +250,dbrx_instruct,18.4,alpacav2,wildbench_240612,[],holistic +252,mistral_7b_inst_v0.2,14.7,alpacav2,wildbench_240612,[],holistic +253,tulu_2_dpo_70b,16.0,alpacav2,wildbench_240612,[],holistic +254,llama_2_70b_chat,13.9,alpacav2,wildbench_240612,[],holistic +255,qwen1.5_7b_chat,11.8,alpacav2,wildbench_240612,[],holistic +258,llama_2_7b_chat,5.0,alpacav2,wildbench_240612,[],holistic +259,gemma_7b_it,6.9,alpacav2,wildbench_240612,[],holistic +260,gemma_2b_it,3.4,alpacav2,wildbench_240612,[],holistic +0,gpt_4,4.41,agentbench_overall,agentbench_240720,[],agent +1,claude_v1.3,2.77,agentbench_overall,agentbench_240720,[],agent +2,gpt_3.5_turbo,2.55,agentbench_overall,agentbench_240720,[],agent +3,text_davinci_003,2.1,agentbench_overall,agentbench_240720,[],agent +4,claude_instant_v1.1,1.9,agentbench_overall,agentbench_240720,[],agent +5,text_davinci_002,1.46,agentbench_overall,agentbench_240720,[],agent +6,text_bison_001,1.39,agentbench_overall,agentbench_240720,[],agent +7,chatglm2_v0.2,1.31,agentbench_overall,agentbench_240720,[],agent +8,openchat_v3.2,1.15,agentbench_overall,agentbench_240720,[],agent +9,wizardlm_30b,0.83,agentbench_overall,agentbench_240720,[],agent +10,vicuna_13b,0.62,agentbench_overall,agentbench_240720,[],agent +11,wizardlm_13b,0.59,agentbench_overall,agentbench_240720,[],agent +12,llama2_13b_chat,0.55,agentbench_overall,agentbench_240720,[],agent +13,codegeex2_6b,0.53,agentbench_overall,agentbench_240720,[],agent +14,openchat_8192,0.51,agentbench_overall,agentbench_240720,[],agent +15,baichuan_13b_chat,0.36,agentbench_overall,agentbench_240720,[],agent +16,koala_13b,0.34,agentbench_overall,agentbench_240720,[],agent +17,llama2_7b_chat,0.31,agentbench_overall,agentbench_240720,[],agent +18,chatglm_6b,0.31,agentbench_overall,agentbench_240720,[],agent +19,vicuna_7b,0.24,agentbench_overall,agentbench_240720,[],agent +20,internlm_chat_7b,0.23,agentbench_overall,agentbench_240720,[],agent +21,baichuan_7b,0.22,agentbench_overall,agentbench_240720,[],agent +22,wizardcoder,0.21,agentbench_overall,agentbench_240720,[],agent +23,dolly_v2_12b,0.15,agentbench_overall,agentbench_240720,[],agent +24,oasst_sft_4_pythia_12b,0.07,agentbench_overall,agentbench_240720,[],agent +25,gpt_4,36.81,agentbench_os,agentbench_240720,[],agent +26,claude_v1.3,13.19,agentbench_os,agentbench_240720,[],agent +27,gpt_3.5_turbo,32.64,agentbench_os,agentbench_240720,[],agent +28,text_davinci_003,22.92,agentbench_os,agentbench_240720,[],agent +29,claude_instant_v1.1,14.58,agentbench_os,agentbench_240720,[],agent +30,text_davinci_002,4.86,agentbench_os,agentbench_240720,[],agent +31,text_bison_001,4.17,agentbench_os,agentbench_240720,[],agent +32,chatglm2_v0.2,14.58,agentbench_os,agentbench_240720,[],agent +33,openchat_v3.2,9.72,agentbench_os,agentbench_240720,[],agent +34,wizardlm_30b,14.58,agentbench_os,agentbench_240720,[],agent +35,vicuna_13b,8.33,agentbench_os,agentbench_240720,[],agent +36,wizardlm_13b,9.72,agentbench_os,agentbench_240720,[],agent +37,llama2_13b_chat,10.42,agentbench_os,agentbench_240720,[],agent +38,codegeex2_6b,12.5,agentbench_os,agentbench_240720,[],agent +39,openchat_8192,10.42,agentbench_os,agentbench_240720,[],agent +40,baichuan_13b_chat,11.81,agentbench_os,agentbench_240720,[],agent +41,koala_13b,2.78,agentbench_os,agentbench_240720,[],agent +42,llama2_7b_chat,10.42,agentbench_os,agentbench_240720,[],agent +43,chatglm_6b,4.86,agentbench_os,agentbench_240720,[],agent +44,vicuna_7b,6.25,agentbench_os,agentbench_240720,[],agent +45,internlm_chat_7b,3.47,agentbench_os,agentbench_240720,[],agent +46,baichuan_7b,4.17,agentbench_os,agentbench_240720,[],agent +47,wizardcoder,3.47,agentbench_os,agentbench_240720,[],agent +48,dolly_v2_12b,0.0,agentbench_os,agentbench_240720,[],agent +49,oasst_sft_4_pythia_12b,2.78,agentbench_os,agentbench_240720,[],agent +50,gpt_4,33.67,agentbench_db,agentbench_240720,[],agent +51,claude_v1.3,16.75,agentbench_db,agentbench_240720,[],agent +52,gpt_3.5_turbo,15.0,agentbench_db,agentbench_240720,[],agent +53,text_davinci_003,16.33,agentbench_db,agentbench_240720,[],agent +54,claude_instant_v1.1,8.0,agentbench_db,agentbench_240720,[],agent +55,text_davinci_002,13.67,agentbench_db,agentbench_240720,[],agent +56,text_bison_001,12.75,agentbench_db,agentbench_240720,[],agent +57,chatglm2_v0.2,13.67,agentbench_db,agentbench_240720,[],agent +58,openchat_v3.2,5.33,agentbench_db,agentbench_240720,[],agent +59,wizardlm_30b,12.67,agentbench_db,agentbench_240720,[],agent +60,vicuna_13b,11.33,agentbench_db,agentbench_240720,[],agent +61,wizardlm_13b,13.0,agentbench_db,agentbench_240720,[],agent +62,llama2_13b_chat,4.5,agentbench_db,agentbench_240720,[],agent +63,codegeex2_6b,6.5,agentbench_db,agentbench_240720,[],agent +64,openchat_8192,2.67,agentbench_db,agentbench_240720,[],agent +65,baichuan_13b_chat,3.0,agentbench_db,agentbench_240720,[],agent +66,koala_13b,5.33,agentbench_db,agentbench_240720,[],agent +67,llama2_7b_chat,2.75,agentbench_db,agentbench_240720,[],agent +68,chatglm_6b,0.33,agentbench_db,agentbench_240720,[],agent +69,vicuna_7b,3.33,agentbench_db,agentbench_240720,[],agent +70,internlm_chat_7b,6.33,agentbench_db,agentbench_240720,[],agent +71,baichuan_7b,0.0,agentbench_db,agentbench_240720,[],agent +72,wizardcoder,0.0,agentbench_db,agentbench_240720,[],agent +73,dolly_v2_12b,0.0,agentbench_db,agentbench_240720,[],agent +74,oasst_sft_4_pythia_12b,0.0,agentbench_db,agentbench_240720,[],agent +75,gpt_4,52.14,agentbench_kg,agentbench_240720,[],agent +76,claude_v1.3,36.22,agentbench_kg,agentbench_240720,[],agent +77,gpt_3.5_turbo,27.2,agentbench_kg,agentbench_240720,[],agent +78,text_davinci_003,30.82,agentbench_kg,agentbench_240720,[],agent +79,claude_instant_v1.1,29.67,agentbench_kg,agentbench_240720,[],agent +80,text_davinci_002,18.87,agentbench_kg,agentbench_240720,[],agent +81,text_bison_001,17.12,agentbench_kg,agentbench_240720,[],agent +82,chatglm2_v0.2,6.85,agentbench_kg,agentbench_240720,[],agent +83,openchat_v3.2,6.84,agentbench_kg,agentbench_240720,[],agent +84,wizardlm_30b,2.33,agentbench_kg,agentbench_240720,[],agent +85,vicuna_13b,1.24,agentbench_kg,agentbench_240720,[],agent +86,wizardlm_13b,0.44,agentbench_kg,agentbench_240720,[],agent +87,llama2_13b_chat,3.11,agentbench_kg,agentbench_240720,[],agent +88,codegeex2_6b,6.35,agentbench_kg,agentbench_240720,[],agent +89,openchat_8192,0.59,agentbench_kg,agentbench_240720,[],agent +90,baichuan_13b_chat,6.27,agentbench_kg,agentbench_240720,[],agent +91,koala_13b,0.0,agentbench_kg,agentbench_240720,[],agent +92,llama2_7b_chat,1.89,agentbench_kg,agentbench_240720,[],agent +93,chatglm_6b,0.0,agentbench_kg,agentbench_240720,[],agent +94,vicuna_7b,0.0,agentbench_kg,agentbench_240720,[],agent +95,internlm_chat_7b,0.0,agentbench_kg,agentbench_240720,[],agent +96,baichuan_7b,0.46,agentbench_kg,agentbench_240720,[],agent +97,wizardcoder,2.78,agentbench_kg,agentbench_240720,[],agent +98,dolly_v2_12b,0.0,agentbench_kg,agentbench_240720,[],agent +99,oasst_sft_4_pythia_12b,0.0,agentbench_kg,agentbench_240720,[],agent +100,gpt_4,50.0,agentbench_dcg,agentbench_240720,[],agent +101,claude_v1.3,30.0,agentbench_dcg,agentbench_240720,[],agent +102,gpt_3.5_turbo,30.0,agentbench_dcg,agentbench_240720,[],agent +103,text_davinci_003,15.0,agentbench_dcg,agentbench_240720,[],agent +104,claude_instant_v1.1,35.0,agentbench_dcg,agentbench_240720,[],agent +105,text_davinci_002,25.0,agentbench_dcg,agentbench_240720,[],agent +106,text_bison_001,20.0,agentbench_dcg,agentbench_240720,[],agent +107,chatglm2_v0.2,10.0,agentbench_dcg,agentbench_240720,[],agent +108,openchat_v3.2,0.0,agentbench_dcg,agentbench_240720,[],agent +109,wizardlm_30b,10.0,agentbench_dcg,agentbench_240720,[],agent +110,vicuna_13b,0.0,agentbench_dcg,agentbench_240720,[],agent +111,wizardlm_13b,0.0,agentbench_dcg,agentbench_240720,[],agent +112,llama2_13b_chat,0.0,agentbench_dcg,agentbench_240720,[],agent +113,codegeex2_6b,0.0,agentbench_dcg,agentbench_240720,[],agent +114,openchat_8192,10.0,agentbench_dcg,agentbench_240720,[],agent +115,baichuan_13b_chat,0.0,agentbench_dcg,agentbench_240720,[],agent +116,koala_13b,0.0,agentbench_dcg,agentbench_240720,[],agent +117,llama2_7b_chat,0.0,agentbench_dcg,agentbench_240720,[],agent +118,chatglm_6b,0.0,agentbench_dcg,agentbench_240720,[],agent +119,vicuna_7b,0.0,agentbench_dcg,agentbench_240720,[],agent +120,internlm_chat_7b,0.0,agentbench_dcg,agentbench_240720,[],agent +121,baichuan_7b,0.0,agentbench_dcg,agentbench_240720,[],agent +122,wizardcoder,0.0,agentbench_dcg,agentbench_240720,[],agent +123,dolly_v2_12b,0.0,agentbench_dcg,agentbench_240720,[],agent +124,oasst_sft_4_pythia_12b,0.0,agentbench_dcg,agentbench_240720,[],agent +125,gpt_4,17.6,agentbench_ltp,agentbench_240720,[],agent +126,claude_v1.3,6.39,agentbench_ltp,agentbench_240720,[],agent +127,gpt_3.5_turbo,14.85,agentbench_ltp,agentbench_240720,[],agent +128,text_davinci_003,5.21,agentbench_ltp,agentbench_240720,[],agent +129,claude_instant_v1.1,6.08,agentbench_ltp,agentbench_240720,[],agent +130,text_davinci_002,2.32,agentbench_ltp,agentbench_240720,[],agent +131,text_bison_001,0.12,agentbench_ltp,agentbench_240720,[],agent +132,chatglm2_v0.2,12.62,agentbench_ltp,agentbench_240720,[],agent +133,openchat_v3.2,9.54,agentbench_ltp,agentbench_240720,[],agent +134,wizardlm_30b,4.47,agentbench_ltp,agentbench_240720,[],agent +135,vicuna_13b,7.97,agentbench_ltp,agentbench_240720,[],agent +136,wizardlm_13b,4.06,agentbench_ltp,agentbench_240720,[],agent +137,llama2_13b_chat,3.69,agentbench_ltp,agentbench_240720,[],agent +138,codegeex2_6b,1.94,agentbench_ltp,agentbench_240720,[],agent +139,openchat_8192,0.0,agentbench_ltp,agentbench_240720,[],agent +140,baichuan_13b_chat,1.3,agentbench_ltp,agentbench_240720,[],agent +141,koala_13b,2.46,agentbench_ltp,agentbench_240720,[],agent +142,llama2_7b_chat,2.1,agentbench_ltp,agentbench_240720,[],agent +143,chatglm_6b,6.69,agentbench_ltp,agentbench_240720,[],agent +144,vicuna_7b,3.08,agentbench_ltp,agentbench_240720,[],agent +145,internlm_chat_7b,3.41,agentbench_ltp,agentbench_240720,[],agent +146,baichuan_7b,2.29,agentbench_ltp,agentbench_240720,[],agent +147,wizardcoder,1.32,agentbench_ltp,agentbench_240720,[],agent +148,dolly_v2_12b,3.36,agentbench_ltp,agentbench_240720,[],agent +149,oasst_sft_4_pythia_12b,1.48,agentbench_ltp,agentbench_240720,[],agent +150,gpt_4,78.0,agentbench_hh,agentbench_240720,[],agent +151,claude_v1.3,52.0,agentbench_hh,agentbench_240720,[],agent +152,gpt_3.5_turbo,14.0,agentbench_hh,agentbench_240720,[],agent +153,text_davinci_003,20.0,agentbench_hh,agentbench_240720,[],agent +154,claude_instant_v1.1,26.0,agentbench_hh,agentbench_240720,[],agent +155,text_davinci_002,14.0,agentbench_hh,agentbench_240720,[],agent +156,text_bison_001,4.0,agentbench_hh,agentbench_240720,[],agent +157,chatglm2_v0.2,6.0,agentbench_hh,agentbench_240720,[],agent +158,openchat_v3.2,8.0,agentbench_hh,agentbench_240720,[],agent +159,wizardlm_30b,6.0,agentbench_hh,agentbench_240720,[],agent +160,vicuna_13b,0.0,agentbench_hh,agentbench_240720,[],agent +161,wizardlm_13b,6.0,agentbench_hh,agentbench_240720,[],agent +162,llama2_13b_chat,2.0,agentbench_hh,agentbench_240720,[],agent +163,codegeex2_6b,0.0,agentbench_hh,agentbench_240720,[],agent +164,openchat_8192,4.0,agentbench_hh,agentbench_240720,[],agent +165,baichuan_13b_chat,0.0,agentbench_hh,agentbench_240720,[],agent +166,koala_13b,0.0,agentbench_hh,agentbench_240720,[],agent +167,llama2_7b_chat,0.0,agentbench_hh,agentbench_240720,[],agent +168,chatglm_6b,0.0,agentbench_hh,agentbench_240720,[],agent +169,vicuna_7b,0.0,agentbench_hh,agentbench_240720,[],agent +170,internlm_chat_7b,0.0,agentbench_hh,agentbench_240720,[],agent +171,baichuan_7b,0.0,agentbench_hh,agentbench_240720,[],agent +172,wizardcoder,0.0,agentbench_hh,agentbench_240720,[],agent +173,dolly_v2_12b,0.0,agentbench_hh,agentbench_240720,[],agent +174,oasst_sft_4_pythia_12b,0.0,agentbench_hh,agentbench_240720,[],agent +175,gpt_4,58.6,agentbench_ws,agentbench_240720,[],agent +176,claude_v1.3,59.26,agentbench_ws,agentbench_240720,[],agent +177,gpt_3.5_turbo,67.21,agentbench_ws,agentbench_240720,[],agent +178,text_davinci_003,61.43,agentbench_ws,agentbench_240720,[],agent +179,claude_instant_v1.1,44.22,agentbench_ws,agentbench_240720,[],agent +180,text_davinci_002,60.15,agentbench_ws,agentbench_240720,[],agent +181,text_bison_001,46.06,agentbench_ws,agentbench_240720,[],agent +182,chatglm2_v0.2,19.35,agentbench_ws,agentbench_240720,[],agent +183,openchat_v3.2,50.17,agentbench_ws,agentbench_240720,[],agent +184,wizardlm_30b,10.6,agentbench_ws,agentbench_240720,[],agent +185,vicuna_13b,12.57,agentbench_ws,agentbench_240720,[],agent +186,wizardlm_13b,1.2,agentbench_ws,agentbench_240720,[],agent +187,llama2_13b_chat,3.12,agentbench_ws,agentbench_240720,[],agent +188,codegeex2_6b,11.8,agentbench_ws,agentbench_240720,[],agent +189,openchat_8192,6.68,agentbench_ws,agentbench_240720,[],agent +190,baichuan_13b_chat,5.74,agentbench_ws,agentbench_240720,[],agent +191,koala_13b,5.96,agentbench_ws,agentbench_240720,[],agent +192,llama2_7b_chat,2.22,agentbench_ws,agentbench_240720,[],agent +193,chatglm_6b,0.5,agentbench_ws,agentbench_240720,[],agent +194,vicuna_7b,6.4,agentbench_ws,agentbench_240720,[],agent +195,internlm_chat_7b,0.0,agentbench_ws,agentbench_240720,[],agent +196,baichuan_7b,2.84,agentbench_ws,agentbench_240720,[],agent +197,wizardcoder,0.0,agentbench_ws,agentbench_240720,[],agent +198,dolly_v2_12b,0.38,agentbench_ws,agentbench_240720,[],agent +199,oasst_sft_4_pythia_12b,0.0,agentbench_ws,agentbench_240720,[],agent +200,gpt_4,22.59,agentbench_wb,agentbench_240720,[],agent +201,claude_v1.3,20.97,agentbench_wb,agentbench_240720,[],agent +202,gpt_3.5_turbo,15.69,agentbench_wb,agentbench_240720,[],agent +203,text_davinci_003,15.52,agentbench_wb,agentbench_240720,[],agent +204,claude_instant_v1.1,0.77,agentbench_wb,agentbench_240720,[],agent +205,text_davinci_002,1.11,agentbench_wb,agentbench_240720,[],agent +206,text_bison_001,20.46,agentbench_wb,agentbench_240720,[],agent +207,chatglm2_v0.2,12.87,agentbench_wb,agentbench_240720,[],agent +208,openchat_v3.2,14.92,agentbench_wb,agentbench_240720,[],agent +209,wizardlm_30b,3.07,agentbench_wb,agentbench_240720,[],agent +210,vicuna_13b,3.92,agentbench_wb,agentbench_240720,[],agent +211,wizardlm_13b,5.8,agentbench_wb,agentbench_240720,[],agent +212,llama2_13b_chat,11.94,agentbench_wb,agentbench_240720,[],agent +213,codegeex2_6b,5.37,agentbench_wb,agentbench_240720,[],agent +214,openchat_8192,7.08,agentbench_wb,agentbench_240720,[],agent +215,baichuan_13b_chat,2.3,agentbench_wb,agentbench_240720,[],agent +216,koala_13b,8.1,agentbench_wb,agentbench_240720,[],agent +217,llama2_7b_chat,3.75,agentbench_wb,agentbench_240720,[],agent +218,chatglm_6b,4.94,agentbench_wb,agentbench_240720,[],agent +219,vicuna_7b,0.17,agentbench_wb,agentbench_240720,[],agent +220,internlm_chat_7b,0.17,agentbench_wb,agentbench_240720,[],agent +221,baichuan_7b,5.8,agentbench_wb,agentbench_240720,[],agent +222,wizardcoder,6.65,agentbench_wb,agentbench_240720,[],agent +223,dolly_v2_12b,4.43,agentbench_wb,agentbench_240720,[],agent +224,oasst_sft_4_pythia_12b,0.34,agentbench_wb,agentbench_240720,[],agent +0,pythia_1b,31.4,arc_c,olmes_260624,[],reasoning +1,olmo_1b,38.6,arc_c,olmes_260624,[],reasoning +2,tinyllama_1.1b,38.1,arc_c,olmes_260624,[],reasoning +3,pythia_6.7b,44.6,arc_c,olmes_260624,[],reasoning +4,rpj_incite_7b,45.3,arc_c,olmes_260624,[],reasoning +5,stablelm2_1.6b,50.6,arc_c,olmes_260624,[],reasoning +6,olmo_7b,46.4,arc_c,olmes_260624,[],reasoning +7,mpt_7b,45.7,arc_c,olmes_260624,[],reasoning +8,falcon_7b,49.7,arc_c,olmes_260624,[],reasoning +9,llama2_7b,54.2,arc_c,olmes_260624,[],reasoning +10,llama2_13b,67.3,arc_c,olmes_260624,[],reasoning +11,olmo_1.7_7b,66.9,arc_c,olmes_260624,[],reasoning +12,llama3_8b,79.3,arc_c,olmes_260624,[],reasoning +13,mistral_7b_v0.1,78.6,arc_c,olmes_260624,[],reasoning +14,llama3_70b,93.7,arc_c,olmes_260624,[],reasoning +15,pythia_1b,63.4,arc_e,olmes_260624,[],reasoning +16,olmo_1b,68.3,arc_e,olmes_260624,[],reasoning +17,tinyllama_1.1b,69.5,arc_e,olmes_260624,[],reasoning +18,pythia_6.7b,72.6,arc_e,olmes_260624,[],reasoning +19,rpj_incite_7b,78.8,arc_e,olmes_260624,[],reasoning +20,stablelm2_1.6b,75.3,arc_e,olmes_260624,[],reasoning +21,olmo_7b,78.9,arc_e,olmes_260624,[],reasoning +22,mpt_7b,78.0,arc_e,olmes_260624,[],reasoning +23,falcon_7b,80.6,arc_e,olmes_260624,[],reasoning +24,llama2_7b,84.0,arc_e,olmes_260624,[],reasoning +25,llama2_13b,85.9,arc_e,olmes_260624,[],reasoning +26,olmo_1.7_7b,83.6,arc_e,olmes_260624,[],reasoning +27,llama3_8b,92.4,arc_e,olmes_260624,[],reasoning +28,mistral_7b_v0.1,90.8,arc_e,olmes_260624,[],reasoning +29,llama3_70b,97.7,arc_e,olmes_260624,[],reasoning +30,pythia_1b,56.8,boolq,olmes_260624,[],knowledge +31,olmo_1b,51.3,boolq,olmes_260624,[],knowledge +32,tinyllama_1.1b,63.6,boolq,olmes_260624,[],knowledge +33,pythia_6.7b,68.7,boolq,olmes_260624,[],knowledge +34,rpj_incite_7b,72.0,boolq,olmes_260624,[],knowledge +35,stablelm2_1.6b,82.3,boolq,olmes_260624,[],knowledge +36,olmo_7b,78.7,boolq,olmes_260624,[],knowledge +37,mpt_7b,82.4,boolq,olmes_260624,[],knowledge +38,falcon_7b,78.2,boolq,olmes_260624,[],knowledge +39,llama2_7b,86.1,boolq,olmes_260624,[],knowledge +40,llama2_13b,86.7,boolq,olmes_260624,[],knowledge +41,olmo_1.7_7b,85.9,boolq,olmes_260624,[],knowledge +42,llama3_8b,87.5,boolq,olmes_260624,[],knowledge +43,mistral_7b_v0.1,89.3,boolq,olmes_260624,[],knowledge +44,llama3_70b,91.7,boolq,olmes_260624,[],knowledge +45,pythia_1b,50.9,csqa,olmes_260624,[],knowledge +46,olmo_1b,62.2,csqa,olmes_260624,[],knowledge +47,tinyllama_1.1b,61.1,csqa,olmes_260624,[],knowledge +48,pythia_6.7b,62.1,csqa,olmes_260624,[],knowledge +49,rpj_incite_7b,69.2,csqa,olmes_260624,[],knowledge +50,stablelm2_1.6b,70.4,csqa,olmes_260624,[],knowledge +51,olmo_7b,70.8,csqa,olmes_260624,[],knowledge +52,mpt_7b,70.9,csqa,olmes_260624,[],knowledge +53,falcon_7b,73.4,csqa,olmes_260624,[],knowledge +54,llama2_7b,74.2,csqa,olmes_260624,[],knowledge +55,llama2_13b,74.0,csqa,olmes_260624,[],knowledge +56,olmo_1.7_7b,85.8,csqa,olmes_260624,[],knowledge +57,llama3_8b,73.9,csqa,olmes_260624,[],knowledge +58,mistral_7b_v0.1,72.4,csqa,olmes_260624,[],knowledge +59,llama3_70b,83.2,csqa,olmes_260624,[],knowledge +60,pythia_1b,48.0,hellaswag,olmes_260624,[],reasoning +61,olmo_1b,65.2,hellaswag,olmes_260624,[],reasoning +62,tinyllama_1.1b,60.8,hellaswag,olmes_260624,[],reasoning +63,pythia_6.7b,66.1,hellaswag,olmes_260624,[],reasoning +64,rpj_incite_7b,72.8,hellaswag,olmes_260624,[],reasoning +65,stablelm2_1.6b,70.3,hellaswag,olmes_260624,[],reasoning +66,olmo_7b,78.1,hellaswag,olmes_260624,[],reasoning +67,mpt_7b,79.6,hellaswag,olmes_260624,[],reasoning +68,falcon_7b,79.0,hellaswag,olmes_260624,[],reasoning +69,llama2_7b,78.9,hellaswag,olmes_260624,[],reasoning +70,llama2_13b,83.9,hellaswag,olmes_260624,[],reasoning +71,olmo_1.7_7b,80.1,hellaswag,olmes_260624,[],reasoning +72,llama3_8b,81.8,hellaswag,olmes_260624,[],reasoning +73,mistral_7b_v0.1,83.0,hellaswag,olmes_260624,[],reasoning +74,llama3_70b,89.5,hellaswag,olmes_260624,[],reasoning +75,pythia_1b,31.1,mmlu,olmes_260624,[],knowledge +76,olmo_1b,33.4,mmlu,olmes_260624,[],knowledge +77,tinyllama_1.1b,33.6,mmlu,olmes_260624,[],knowledge +78,pythia_6.7b,37.7,mmlu,olmes_260624,[],knowledge +79,rpj_incite_7b,40.1,mmlu,olmes_260624,[],knowledge +80,stablelm2_1.6b,40.4,mmlu,olmes_260624,[],knowledge +81,olmo_7b,40.5,mmlu,olmes_260624,[],knowledge +82,mpt_7b,40.6,mmlu,olmes_260624,[],knowledge +83,falcon_7b,42.1,mmlu,olmes_260624,[],knowledge +84,llama2_7b,46.2,mmlu,olmes_260624,[],knowledge +85,llama2_13b,55.8,mmlu,olmes_260624,[],knowledge +86,olmo_1.7_7b,54.4,mmlu,olmes_260624,[],knowledge +87,llama3_8b,66.6,mmlu,olmes_260624,[],knowledge +88,mistral_7b_v0.1,64.0,mmlu,olmes_260624,[],knowledge +89,llama3_70b,79.8,mmlu,olmes_260624,[],knowledge +90,pythia_1b,40.4,openbookqa,olmes_260624,[],knowledge +91,olmo_1b,47.6,openbookqa,olmes_260624,[],knowledge +92,tinyllama_1.1b,45.0,openbookqa,olmes_260624,[],knowledge +93,pythia_6.7b,50.4,openbookqa,olmes_260624,[],knowledge +94,rpj_incite_7b,49.0,openbookqa,olmes_260624,[],knowledge +95,stablelm2_1.6b,56.6,openbookqa,olmes_260624,[],knowledge +96,olmo_7b,55.8,openbookqa,olmes_260624,[],knowledge +97,mpt_7b,52.4,openbookqa,olmes_260624,[],knowledge +98,falcon_7b,55.2,openbookqa,olmes_260624,[],knowledge +99,llama2_7b,57.8,openbookqa,olmes_260624,[],knowledge +100,llama2_13b,65.4,openbookqa,olmes_260624,[],knowledge +101,olmo_1.7_7b,68.6,openbookqa,olmes_260624,[],knowledge +102,llama3_8b,77.2,openbookqa,olmes_260624,[],knowledge +103,mistral_7b_v0.1,80.6,openbookqa,olmes_260624,[],knowledge +104,llama3_70b,93.4,openbookqa,olmes_260624,[],knowledge +105,pythia_1b,68.9,piqa,olmes_260624,[],reasoning +106,olmo_1b,74.1,piqa,olmes_260624,[],reasoning +107,tinyllama_1.1b,71.7,piqa,olmes_260624,[],reasoning +108,pythia_6.7b,74.9,piqa,olmes_260624,[],reasoning +109,rpj_incite_7b,75.9,piqa,olmes_260624,[],reasoning +110,stablelm2_1.6b,75.6,piqa,olmes_260624,[],reasoning +111,olmo_7b,78.5,piqa,olmes_260624,[],reasoning +112,mpt_7b,79.2,piqa,olmes_260624,[],reasoning +113,falcon_7b,79.0,piqa,olmes_260624,[],reasoning +114,llama2_7b,77.5,piqa,olmes_260624,[],reasoning +115,llama2_13b,80.2,piqa,olmes_260624,[],reasoning +116,olmo_1.7_7b,80.3,piqa,olmes_260624,[],reasoning +117,llama3_8b,81.6,piqa,olmes_260624,[],reasoning +118,mistral_7b_v0.1,82.8,piqa,olmes_260624,[],reasoning +119,llama3_70b,91.6,piqa,olmes_260624,[],reasoning +120,pythia_1b,46.4,siqa,olmes_260624,[],other +121,olmo_1b,51.5,siqa,olmes_260624,[],other +122,tinyllama_1.1b,50.4,siqa,olmes_260624,[],other +123,pythia_6.7b,51.7,siqa,olmes_260624,[],other +124,rpj_incite_7b,56.6,siqa,olmes_260624,[],other +125,stablelm2_1.6b,64.3,siqa,olmes_260624,[],other +126,olmo_7b,56.5,siqa,olmes_260624,[],other +127,mpt_7b,57.4,siqa,olmes_260624,[],other +128,falcon_7b,60.1,siqa,olmes_260624,[],other +129,llama2_7b,59.6,siqa,olmes_260624,[],other +130,llama2_13b,65.9,siqa,olmes_260624,[],other +131,olmo_1.7_7b,76.1,siqa,olmes_260624,[],other +132,llama3_8b,70.2,siqa,olmes_260624,[],other +133,mistral_7b_v0.1,71.3,siqa,olmes_260624,[],other +134,llama3_70b,78.9,siqa,olmes_260624,[],other +135,pythia_1b,52.7,winogrande,olmes_260624,[],reasoning +136,olmo_1b,59.3,winogrande,olmes_260624,[],reasoning +137,tinyllama_1.1b,60.1,winogrande,olmes_260624,[],reasoning +138,pythia_6.7b,62.3,winogrande,olmes_260624,[],reasoning +139,rpj_incite_7b,68.0,winogrande,olmes_260624,[],reasoning +140,stablelm2_1.6b,65.7,winogrande,olmes_260624,[],reasoning +141,olmo_7b,68.5,winogrande,olmes_260624,[],reasoning +142,mpt_7b,70.2,winogrande,olmes_260624,[],reasoning +143,falcon_7b,71.3,winogrande,olmes_260624,[],reasoning +144,llama2_7b,71.7,winogrande,olmes_260624,[],reasoning +145,llama2_13b,74.9,winogrande,olmes_260624,[],reasoning +146,olmo_1.7_7b,73.6,winogrande,olmes_260624,[],reasoning +147,llama3_8b,76.2,winogrande,olmes_260624,[],reasoning +148,mistral_7b_v0.1,77.9,winogrande,olmes_260624,[],reasoning +149,llama3_70b,84.1,winogrande,olmes_260624,[],reasoning +150,pythia_1b,49.0,olmes_average,olmes_260624,[],holistic +151,olmo_1b,55.1,olmes_average,olmes_260624,[],holistic +152,tinyllama_1.1b,55.4,olmes_average,olmes_260624,[],holistic +153,pythia_6.7b,59.1,olmes_average,olmes_260624,[],holistic +154,rpj_incite_7b,62.8,olmes_average,olmes_260624,[],holistic +155,stablelm2_1.6b,65.1,olmes_average,olmes_260624,[],holistic +156,olmo_7b,65.3,olmes_average,olmes_260624,[],holistic +157,mpt_7b,65.6,olmes_average,olmes_260624,[],holistic +158,falcon_7b,66.9,olmes_average,olmes_260624,[],holistic +159,llama2_7b,69.0,olmes_average,olmes_260624,[],holistic +160,llama2_13b,74.0,olmes_average,olmes_260624,[],holistic +161,olmo_1.7_7b,75.5,olmes_average,olmes_260624,[],holistic +162,llama3_8b,78.7,olmes_average,olmes_260624,[],holistic +163,mistral_7b_v0.1,79.1,olmes_average,olmes_260624,[],holistic +164,llama3_70b,88.4,olmes_average,olmes_260624,[],holistic +0,llama_2_70b,0.3753,mmlu_pro,mmlu_pro_240610,[],knowledge +1,llama_3_8b,0.3536,mmlu_pro,mmlu_pro_240610,[],knowledge +2,deepseekmath_instruct,0.353,mmlu_pro,mmlu_pro_240610,[],knowledge +3,gemma_7b,0.3373,mmlu_pro,mmlu_pro_240610,[],knowledge +4,mistral_7b_v0.1,0.3088,mmlu_pro,mmlu_pro_240610,[],knowledge +5,mistral_7b_instruct_v0.2,0.3084,mmlu_pro,mmlu_pro_240610,[],knowledge +6,mistral_7b_v0.2,0.3043,mmlu_pro,mmlu_pro_240610,[],knowledge +7,qwen1.5_7b_chat,0.2906,mmlu_pro,mmlu_pro_240610,[],knowledge +8,yi_6b_chat,0.2884,mmlu_pro,mmlu_pro_240610,[],knowledge +9,yi_6b,0.2651,mmlu_pro,mmlu_pro_240610,[],knowledge +10,mistral_7b_instruct_v0.1,0.2575,mmlu_pro,mmlu_pro_240610,[],knowledge +11,llama_2_13b,0.2534,mmlu_pro,mmlu_pro_240610,[],knowledge +12,llemma_7b,0.2345,mmlu_pro,mmlu_pro_240610,[],knowledge +13,llama_2_7b,0.2032,mmlu_pro,mmlu_pro_240610,[],knowledge +14,gpt_4o,0.7255,mmlu_pro,mmlu_pro_240610,[],knowledge +15,claude_3_opus,0.6845,mmlu_pro,mmlu_pro_240610,[],knowledge +16,gpt_4_turbo,0.6371,mmlu_pro,mmlu_pro_240610,[],knowledge +17,gemini_1.5_flash,0.5912,mmlu_pro,mmlu_pro_240610,[],knowledge +18,yi_large,0.5753,mmlu_pro,mmlu_pro_240610,[],knowledge +19,claude_3_sonnet,0.568,mmlu_pro,mmlu_pro_240610,[],knowledge +20,llama_3_70b_instruct,0.562,mmlu_pro,mmlu_pro_240610,[],knowledge +21,deepseek_v2,0.5481,mmlu_pro,mmlu_pro_240610,[],knowledge +22,phi_3_medium_4k_instruct,0.5348,mmlu_pro,mmlu_pro_240610,[],knowledge +23,llama_3_70b,0.5278,mmlu_pro,mmlu_pro_240610,[],knowledge +24,qwen1.5_72b_chat,0.5162,mmlu_pro,mmlu_pro_240610,[],knowledge +25,mammoth2_8x7b_plus,0.504,mmlu_pro,mmlu_pro_240610,[],knowledge +26,qwen1.5_110b,0.4993,mmlu_pro,mmlu_pro_240610,[],knowledge +27,mammoth2_8b_plus,0.4335,mmlu_pro,mmlu_pro_240610,[],knowledge +28,mixtral_8x7b_instruct_v0.1,0.4327,mmlu_pro,mmlu_pro_240610,[],knowledge +29,phi_3_mini_4k_instruct,0.4317,mmlu_pro,mmlu_pro_240610,[],knowledge +30,yi_34b,0.4303,mmlu_pro,mmlu_pro_240610,[],knowledge +31,mixtral_8x7b_v0.1,0.4103,mmlu_pro,mmlu_pro_240610,[],knowledge +32,llama_3_8b_instruct,0.4098,mmlu_pro,mmlu_pro_240610,[],knowledge +33,mammoth2_7b_plus,0.4085,mmlu_pro,mmlu_pro_240610,[],knowledge +34,qwen1.5_14b_chat,0.3802,mmlu_pro,mmlu_pro_240610,[],knowledge +35,c4ai_command_r_v01,0.379,mmlu_pro,mmlu_pro_240610,[],knowledge +0,claude_3_5_sonnet_20240620,61.16,livebench_average,livebench_240701,[],holistic +1,gpt_4o_2024_05_13,54.96,livebench_average,livebench_240701,[],holistic +2,gpt_4_turbo_2024_04_09,53.0,livebench_average,livebench_240701,[],holistic +3,gpt_4_1106_preview,52.17,livebench_average,livebench_240701,[],holistic +4,claude_3_opus_20240229,50.75,livebench_average,livebench_240701,[],holistic +5,gpt_4_0125_preview,49.39,livebench_average,livebench_240701,[],holistic +6,deepseek_coder_v2,46.79,livebench_average,livebench_240701,[],holistic +7,gemini_1.5_pro_api_0514,44.35,livebench_average,livebench_240701,[],holistic +8,gemma_2_27b_it,41.22,livebench_average,livebench_240701,[],holistic +9,gemini_1.5_flash_api_0514,40.89,livebench_average,livebench_240701,[],holistic +10,qwen2_72b_instruct,40.16,livebench_average,livebench_240701,[],holistic +11,acm_rewrite_qwen2_72b_chat,39.6,livebench_average,livebench_240701,[],holistic +12,mistral_large_2402,38.92,livebench_average,livebench_240701,[],holistic +13,deepseek_chat_v2,38.39,livebench_average,livebench_240701,[],holistic +14,claude_3_sonnet_20240229,38.08,livebench_average,livebench_240701,[],holistic +15,meta_llama_3_70b_instruct,37.38,livebench_average,livebench_240701,[],holistic +16,claude_3_haiku_20240307,35.32,livebench_average,livebench_240701,[],holistic +17,mixtral_8x22b_instruct_v0.1,34.84,livebench_average,livebench_240701,[],holistic +18,gpt_3.5_turbo_0125,34.43,livebench_average,livebench_240701,[],holistic +19,gpt_3.5_turbo_1106,34.14,livebench_average,livebench_240701,[],holistic +20,command_r_plus,32.86,livebench_average,livebench_240701,[],holistic +21,mistral_small_2402,32.8,livebench_average,livebench_240701,[],holistic +22,gemma_2_9b_it,31.57,livebench_average,livebench_240701,[],holistic +23,phi_3_medium_4k_instruct,30.33,livebench_average,livebench_240701,[],holistic +24,phi_3_medium_128k_instruct,29.64,livebench_average,livebench_240701,[],holistic +25,deepseek_coder_v2_lite_instruct,29.15,livebench_average,livebench_240701,[],holistic +26,qwen1.5_110b_chat,28.96,livebench_average,livebench_240701,[],holistic +27,qwen1.5_72b_chat,28.89,livebench_average,livebench_240701,[],holistic +28,command_r,27.23,livebench_average,livebench_240701,[],holistic +29,phi_3_small_128k_instruct,27.19,livebench_average,livebench_240701,[],holistic +30,meta_llama_3_8b_instruct,26.67,livebench_average,livebench_240701,[],holistic +31,qwen2_7b_instruct,26.45,livebench_average,livebench_240701,[],holistic +32,phi_3_small_8k_instruct,26.24,livebench_average,livebench_240701,[],holistic +33,openhermes_2.5_mistral_7b,23.3,livebench_average,livebench_240701,[],holistic +34,mixtral_8x7b_instruct_v0.1,22.5,livebench_average,livebench_240701,[],holistic +35,mistral_7b_instruct_v0.2,19.33,livebench_average,livebench_240701,[],holistic +36,phi_3_mini_4k_instruct,19.27,livebench_average,livebench_240701,[],holistic +37,zephyr_7b_alpha,19.22,livebench_average,livebench_240701,[],holistic +38,phi_3_mini_128k_instruct,18.04,livebench_average,livebench_240701,[],holistic +39,zephyr_7b_beta,17.32,livebench_average,livebench_240701,[],holistic +40,deepseek_v2_lite_chat,17.14,livebench_average,livebench_240701,[],holistic +41,qwen1.5_7b_chat,16.5,livebench_average,livebench_240701,[],holistic +42,starling_lm_7b_beta,16.44,livebench_average,livebench_240701,[],holistic +43,vicuna_7b_v1.5_16k,13.71,livebench_average,livebench_240701,[],holistic +44,vicuna_7b_v1.5,11.73,livebench_average,livebench_240701,[],holistic +45,qwen1.5_4b_chat,11.13,livebench_average,livebench_240701,[],holistic +46,llama_2_7b_chat,10.25,livebench_average,livebench_240701,[],holistic +47,qwen2_1.5b_instruct,9.96,livebench_average,livebench_240701,[],holistic +48,yi_6b_chat,8.79,livebench_average,livebench_240701,[],holistic +49,qwen2_0.5b_instruct,6.78,livebench_average,livebench_240701,[],holistic +50,qwen1.5_1.8b_chat,6.09,livebench_average,livebench_240701,[],holistic +51,qwen1.5_0.5b_chat,5.26,livebench_average,livebench_240701,[],holistic +52,claude_3_5_sonnet_20240620,64.0,reasoning_average,livebench_240701,[],reasoning +53,gpt_4o_2024_05_13,55.0,reasoning_average,livebench_240701,[],reasoning +54,gpt_4_turbo_2024_04_09,54.0,reasoning_average,livebench_240701,[],reasoning +55,gpt_4_1106_preview,52.0,reasoning_average,livebench_240701,[],reasoning +56,claude_3_opus_20240229,41.0,reasoning_average,livebench_240701,[],reasoning +57,gpt_4_0125_preview,48.0,reasoning_average,livebench_240701,[],reasoning +58,deepseek_coder_v2,49.0,reasoning_average,livebench_240701,[],reasoning +59,gemini_1.5_pro_api_0514,33.0,reasoning_average,livebench_240701,[],reasoning +60,gemma_2_27b_it,31.0,reasoning_average,livebench_240701,[],reasoning +61,gemini_1.5_flash_api_0514,30.0,reasoning_average,livebench_240701,[],reasoning +62,qwen2_72b_instruct,42.0,reasoning_average,livebench_240701,[],reasoning +63,acm_rewrite_qwen2_72b_chat,37.0,reasoning_average,livebench_240701,[],reasoning +64,mistral_large_2402,35.0,reasoning_average,livebench_240701,[],reasoning +65,deepseek_chat_v2,29.0,reasoning_average,livebench_240701,[],reasoning +66,claude_3_sonnet_20240229,26.0,reasoning_average,livebench_240701,[],reasoning +67,meta_llama_3_70b_instruct,31.0,reasoning_average,livebench_240701,[],reasoning +68,claude_3_haiku_20240307,26.0,reasoning_average,livebench_240701,[],reasoning +69,mixtral_8x22b_instruct_v0.1,29.0,reasoning_average,livebench_240701,[],reasoning +70,gpt_3.5_turbo_0125,26.0,reasoning_average,livebench_240701,[],reasoning +71,gpt_3.5_turbo_1106,28.0,reasoning_average,livebench_240701,[],reasoning +72,command_r_plus,32.0,reasoning_average,livebench_240701,[],reasoning +73,mistral_small_2402,28.0,reasoning_average,livebench_240701,[],reasoning +74,gemma_2_9b_it,19.0,reasoning_average,livebench_240701,[],reasoning +75,phi_3_medium_4k_instruct,35.0,reasoning_average,livebench_240701,[],reasoning +76,phi_3_medium_128k_instruct,31.0,reasoning_average,livebench_240701,[],reasoning +77,deepseek_coder_v2_lite_instruct,22.0,reasoning_average,livebench_240701,[],reasoning +78,qwen1.5_110b_chat,26.0,reasoning_average,livebench_240701,[],reasoning +79,qwen1.5_72b_chat,21.0,reasoning_average,livebench_240701,[],reasoning +80,command_r,28.0,reasoning_average,livebench_240701,[],reasoning +81,phi_3_small_128k_instruct,36.0,reasoning_average,livebench_240701,[],reasoning +82,meta_llama_3_8b_instruct,25.0,reasoning_average,livebench_240701,[],reasoning +83,qwen2_7b_instruct,20.0,reasoning_average,livebench_240701,[],reasoning +84,phi_3_small_8k_instruct,23.0,reasoning_average,livebench_240701,[],reasoning +85,openhermes_2.5_mistral_7b,17.0,reasoning_average,livebench_240701,[],reasoning +86,mixtral_8x7b_instruct_v0.1,18.0,reasoning_average,livebench_240701,[],reasoning +87,mistral_7b_instruct_v0.2,13.0,reasoning_average,livebench_240701,[],reasoning +88,phi_3_mini_4k_instruct,19.0,reasoning_average,livebench_240701,[],reasoning +89,zephyr_7b_alpha,17.0,reasoning_average,livebench_240701,[],reasoning +90,phi_3_mini_128k_instruct,10.0,reasoning_average,livebench_240701,[],reasoning +91,zephyr_7b_beta,16.0,reasoning_average,livebench_240701,[],reasoning +92,deepseek_v2_lite_chat,13.0,reasoning_average,livebench_240701,[],reasoning +93,qwen1.5_7b_chat,13.0,reasoning_average,livebench_240701,[],reasoning +94,starling_lm_7b_beta,19.0,reasoning_average,livebench_240701,[],reasoning +95,vicuna_7b_v1.5_16k,15.0,reasoning_average,livebench_240701,[],reasoning +96,vicuna_7b_v1.5,12.0,reasoning_average,livebench_240701,[],reasoning +97,qwen1.5_4b_chat,13.0,reasoning_average,livebench_240701,[],reasoning +98,llama_2_7b_chat,5.0,reasoning_average,livebench_240701,[],reasoning +99,qwen2_1.5b_instruct,8.0,reasoning_average,livebench_240701,[],reasoning +100,yi_6b_chat,8.0,reasoning_average,livebench_240701,[],reasoning +101,qwen2_0.5b_instruct,3.0,reasoning_average,livebench_240701,[],reasoning +102,qwen1.5_1.8b_chat,5.0,reasoning_average,livebench_240701,[],reasoning +103,qwen1.5_0.5b_chat,4.0,reasoning_average,livebench_240701,[],reasoning +104,claude_3_5_sonnet_20240620,63.21,coding_average,livebench_240701,[],code +105,gpt_4o_2024_05_13,46.37,coding_average,livebench_240701,[],code +106,gpt_4_turbo_2024_04_09,47.05,coding_average,livebench_240701,[],code +107,gpt_4_1106_preview,44.37,coding_average,livebench_240701,[],code +108,claude_3_opus_20240229,40.05,coding_average,livebench_240701,[],code +109,gpt_4_0125_preview,44.05,coding_average,livebench_240701,[],code +110,deepseek_coder_v2,41.05,coding_average,livebench_240701,[],code +111,gemini_1.5_pro_api_0514,32.79,coding_average,livebench_240701,[],code +112,gemma_2_27b_it,36.74,coding_average,livebench_240701,[],code +113,gemini_1.5_flash_api_0514,39.05,coding_average,livebench_240701,[],code +114,qwen2_72b_instruct,31.79,coding_average,livebench_240701,[],code +115,acm_rewrite_qwen2_72b_chat,39.05,coding_average,livebench_240701,[],code +116,mistral_large_2402,26.84,coding_average,livebench_240701,[],code +117,deepseek_chat_v2,33.47,coding_average,livebench_240701,[],code +118,claude_3_sonnet_20240229,25.21,coding_average,livebench_240701,[],code +119,meta_llama_3_70b_instruct,20.95,coding_average,livebench_240701,[],code +120,claude_3_haiku_20240307,24.53,coding_average,livebench_240701,[],code +121,mixtral_8x22b_instruct_v0.1,33.11,coding_average,livebench_240701,[],code +122,gpt_3.5_turbo_0125,29.16,coding_average,livebench_240701,[],code +123,gpt_3.5_turbo_1106,26.84,coding_average,livebench_240701,[],code +124,command_r_plus,20.26,coding_average,livebench_240701,[],code +125,mistral_small_2402,24.21,coding_average,livebench_240701,[],code +126,gemma_2_9b_it,22.21,coding_average,livebench_240701,[],code +127,phi_3_medium_4k_instruct,20.58,coding_average,livebench_240701,[],code +128,phi_3_medium_128k_instruct,21.58,coding_average,livebench_240701,[],code +129,deepseek_coder_v2_lite_instruct,26.84,coding_average,livebench_240701,[],code +130,qwen1.5_110b_chat,22.21,coding_average,livebench_240701,[],code +131,qwen1.5_72b_chat,22.89,coding_average,livebench_240701,[],code +132,command_r,14.95,coding_average,livebench_240701,[],code +133,phi_3_small_128k_instruct,25.84,coding_average,livebench_240701,[],code +134,meta_llama_3_8b_instruct,18.26,coding_average,livebench_240701,[],code +135,qwen2_7b_instruct,29.21,coding_average,livebench_240701,[],code +136,phi_3_small_8k_instruct,19.58,coding_average,livebench_240701,[],code +137,openhermes_2.5_mistral_7b,11.63,coding_average,livebench_240701,[],code +138,mixtral_8x7b_instruct_v0.1,11.32,coding_average,livebench_240701,[],code +139,mistral_7b_instruct_v0.2,11.63,coding_average,livebench_240701,[],code +140,phi_3_mini_4k_instruct,14.95,coding_average,livebench_240701,[],code +141,zephyr_7b_alpha,11.32,coding_average,livebench_240701,[],code +142,phi_3_mini_128k_instruct,11.63,coding_average,livebench_240701,[],code +143,zephyr_7b_beta,8.32,coding_average,livebench_240701,[],code +144,deepseek_v2_lite_chat,8.63,coding_average,livebench_240701,[],code +145,qwen1.5_7b_chat,6.63,coding_average,livebench_240701,[],code +146,starling_lm_7b_beta,18.26,coding_average,livebench_240701,[],code +147,vicuna_7b_v1.5_16k,1.32,coding_average,livebench_240701,[],code +148,vicuna_7b_v1.5,1.0,coding_average,livebench_240701,[],code +149,qwen1.5_4b_chat,4.0,coding_average,livebench_240701,[],code +150,llama_2_7b_chat,0.0,coding_average,livebench_240701,[],code +151,qwen2_1.5b_instruct,5.63,coding_average,livebench_240701,[],code +152,yi_6b_chat,1.32,coding_average,livebench_240701,[],code +153,qwen2_0.5b_instruct,2.0,coding_average,livebench_240701,[],code +154,qwen1.5_1.8b_chat,0.0,coding_average,livebench_240701,[],code +155,qwen1.5_0.5b_chat,0.0,coding_average,livebench_240701,[],code +156,claude_3_5_sonnet_20240620,53.75,mathematics_average,livebench_240701,[],math +157,gpt_4o_2024_05_13,49.88,mathematics_average,livebench_240701,[],math +158,gpt_4_turbo_2024_04_09,48.99,mathematics_average,livebench_240701,[],math +159,gpt_4_1106_preview,47.55,mathematics_average,livebench_240701,[],math +160,claude_3_opus_20240229,46.54,mathematics_average,livebench_240701,[],math +161,gpt_4_0125_preview,42.75,mathematics_average,livebench_240701,[],math +162,deepseek_coder_v2,52.19,mathematics_average,livebench_240701,[],math +163,gemini_1.5_pro_api_0514,42.07,mathematics_average,livebench_240701,[],math +164,gemma_2_27b_it,36.23,mathematics_average,livebench_240701,[],math +165,gemini_1.5_flash_api_0514,38.54,mathematics_average,livebench_240701,[],math +166,qwen2_72b_instruct,43.44,mathematics_average,livebench_240701,[],math +167,acm_rewrite_qwen2_72b_chat,40.32,mathematics_average,livebench_240701,[],math +168,mistral_large_2402,32.2,mathematics_average,livebench_240701,[],math +169,deepseek_chat_v2,33.23,mathematics_average,livebench_240701,[],math +170,claude_3_sonnet_20240229,29.65,mathematics_average,livebench_240701,[],math +171,meta_llama_3_70b_instruct,32.31,mathematics_average,livebench_240701,[],math +172,claude_3_haiku_20240307,25.72,mathematics_average,livebench_240701,[],math +173,mixtral_8x22b_instruct_v0.1,26.94,mathematics_average,livebench_240701,[],math +174,gpt_3.5_turbo_0125,25.54,mathematics_average,livebench_240701,[],math +175,gpt_3.5_turbo_1106,28.13,mathematics_average,livebench_240701,[],math +176,command_r_plus,24.85,mathematics_average,livebench_240701,[],math +177,mistral_small_2402,26.76,mathematics_average,livebench_240701,[],math +178,gemma_2_9b_it,23.98,mathematics_average,livebench_240701,[],math +179,phi_3_medium_4k_instruct,27.54,mathematics_average,livebench_240701,[],math +180,phi_3_medium_128k_instruct,24.25,mathematics_average,livebench_240701,[],math +181,deepseek_coder_v2_lite_instruct,34.09,mathematics_average,livebench_240701,[],math +182,qwen1.5_110b_chat,25.58,mathematics_average,livebench_240701,[],math +183,qwen1.5_72b_chat,26.82,mathematics_average,livebench_240701,[],math +184,command_r,16.92,mathematics_average,livebench_240701,[],math +185,phi_3_small_128k_instruct,24.84,mathematics_average,livebench_240701,[],math +186,meta_llama_3_8b_instruct,17.58,mathematics_average,livebench_240701,[],math +187,qwen2_7b_instruct,25.83,mathematics_average,livebench_240701,[],math +188,phi_3_small_8k_instruct,24.15,mathematics_average,livebench_240701,[],math +189,openhermes_2.5_mistral_7b,20.1,mathematics_average,livebench_240701,[],math +190,mixtral_8x7b_instruct_v0.1,18.97,mathematics_average,livebench_240701,[],math +191,mistral_7b_instruct_v0.2,16.04,mathematics_average,livebench_240701,[],math +192,phi_3_mini_4k_instruct,19.88,mathematics_average,livebench_240701,[],math +193,zephyr_7b_alpha,9.61,mathematics_average,livebench_240701,[],math +194,phi_3_mini_128k_instruct,21.48,mathematics_average,livebench_240701,[],math +195,zephyr_7b_beta,11.23,mathematics_average,livebench_240701,[],math +196,deepseek_v2_lite_chat,11.99,mathematics_average,livebench_240701,[],math +197,qwen1.5_7b_chat,12.86,mathematics_average,livebench_240701,[],math +198,starling_lm_7b_beta,13.82,mathematics_average,livebench_240701,[],math +199,vicuna_7b_v1.5_16k,6.61,mathematics_average,livebench_240701,[],math +200,vicuna_7b_v1.5,4.33,mathematics_average,livebench_240701,[],math +201,qwen1.5_4b_chat,7.08,mathematics_average,livebench_240701,[],math +202,llama_2_7b_chat,4.78,mathematics_average,livebench_240701,[],math +203,qwen2_1.5b_instruct,7.16,mathematics_average,livebench_240701,[],math +204,yi_6b_chat,7.14,mathematics_average,livebench_240701,[],math +205,qwen2_0.5b_instruct,4.22,mathematics_average,livebench_240701,[],math +206,qwen1.5_1.8b_chat,2.14,mathematics_average,livebench_240701,[],math +207,qwen1.5_0.5b_chat,3.39,mathematics_average,livebench_240701,[],math +208,claude_3_5_sonnet_20240620,56.74,data_analysis_average,livebench_240701,[],knowledge +209,gpt_4o_2024_05_13,52.41,data_analysis_average,livebench_240701,[],knowledge +210,gpt_4_turbo_2024_04_09,51.32,data_analysis_average,livebench_240701,[],knowledge +211,gpt_4_1106_preview,51.33,data_analysis_average,livebench_240701,[],knowledge +212,claude_3_opus_20240229,54.32,data_analysis_average,livebench_240701,[],knowledge +213,gpt_4_0125_preview,54.06,data_analysis_average,livebench_240701,[],knowledge +214,deepseek_coder_v2,38.25,data_analysis_average,livebench_240701,[],knowledge +215,gemini_1.5_pro_api_0514,52.81,data_analysis_average,livebench_240701,[],knowledge +216,gemma_2_27b_it,43.58,data_analysis_average,livebench_240701,[],knowledge +217,gemini_1.5_flash_api_0514,44.03,data_analysis_average,livebench_240701,[],knowledge +218,qwen2_72b_instruct,26.24,data_analysis_average,livebench_240701,[],knowledge +219,acm_rewrite_qwen2_72b_chat,26.19,data_analysis_average,livebench_240701,[],knowledge +220,mistral_large_2402,42.55,data_analysis_average,livebench_240701,[],knowledge +221,deepseek_chat_v2,38.03,data_analysis_average,livebench_240701,[],knowledge +222,claude_3_sonnet_20240229,44.56,data_analysis_average,livebench_240701,[],knowledge +223,meta_llama_3_70b_instruct,42.41,data_analysis_average,livebench_240701,[],knowledge +224,claude_3_haiku_20240307,41.54,data_analysis_average,livebench_240701,[],knowledge +225,mixtral_8x22b_instruct_v0.1,30.33,data_analysis_average,livebench_240701,[],knowledge +226,gpt_3.5_turbo_0125,41.21,data_analysis_average,livebench_240701,[],knowledge +227,gpt_3.5_turbo_1106,41.7,data_analysis_average,livebench_240701,[],knowledge +228,command_r_plus,24.6,data_analysis_average,livebench_240701,[],knowledge +229,mistral_small_2402,31.88,data_analysis_average,livebench_240701,[],knowledge +230,gemma_2_9b_it,35.06,data_analysis_average,livebench_240701,[],knowledge +231,phi_3_medium_4k_instruct,31.63,data_analysis_average,livebench_240701,[],knowledge +232,phi_3_medium_128k_instruct,32.12,data_analysis_average,livebench_240701,[],knowledge +233,deepseek_coder_v2_lite_instruct,33.0,data_analysis_average,livebench_240701,[],knowledge +234,qwen1.5_110b_chat,31.45,data_analysis_average,livebench_240701,[],knowledge +235,qwen1.5_72b_chat,32.98,data_analysis_average,livebench_240701,[],knowledge +236,command_r,31.69,data_analysis_average,livebench_240701,[],knowledge +237,phi_3_small_128k_instruct,27.33,data_analysis_average,livebench_240701,[],knowledge +238,meta_llama_3_8b_instruct,23.33,data_analysis_average,livebench_240701,[],knowledge +239,qwen2_7b_instruct,28.75,data_analysis_average,livebench_240701,[],knowledge +240,phi_3_small_8k_instruct,27.5,data_analysis_average,livebench_240701,[],knowledge +241,openhermes_2.5_mistral_7b,26.92,data_analysis_average,livebench_240701,[],knowledge +242,mixtral_8x7b_instruct_v0.1,28.13,data_analysis_average,livebench_240701,[],knowledge +243,mistral_7b_instruct_v0.2,14.62,data_analysis_average,livebench_240701,[],knowledge +244,phi_3_mini_4k_instruct,14.67,data_analysis_average,livebench_240701,[],knowledge +245,zephyr_7b_alpha,17.4,data_analysis_average,livebench_240701,[],knowledge +246,phi_3_mini_128k_instruct,8.69,data_analysis_average,livebench_240701,[],knowledge +247,zephyr_7b_beta,15.75,data_analysis_average,livebench_240701,[],knowledge +248,deepseek_v2_lite_chat,18.19,data_analysis_average,livebench_240701,[],knowledge +249,qwen1.5_7b_chat,16.23,data_analysis_average,livebench_240701,[],knowledge +250,starling_lm_7b_beta,2.0,data_analysis_average,livebench_240701,[],knowledge +251,vicuna_7b_v1.5_16k,9.27,data_analysis_average,livebench_240701,[],knowledge +252,vicuna_7b_v1.5,2.67,data_analysis_average,livebench_240701,[],knowledge +253,qwen1.5_4b_chat,9.13,data_analysis_average,livebench_240701,[],knowledge +254,llama_2_7b_chat,0.0,data_analysis_average,livebench_240701,[],knowledge +255,qwen2_1.5b_instruct,10.01,data_analysis_average,livebench_240701,[],knowledge +256,yi_6b_chat,4.38,data_analysis_average,livebench_240701,[],knowledge +257,qwen2_0.5b_instruct,2.0,data_analysis_average,livebench_240701,[],knowledge +258,qwen1.5_1.8b_chat,3.33,data_analysis_average,livebench_240701,[],knowledge +259,qwen1.5_0.5b_chat,0.0,data_analysis_average,livebench_240701,[],knowledge +260,claude_3_5_sonnet_20240620,56.94,language_average,livebench_240701,[],other +261,gpt_4o_2024_05_13,53.94,language_average,livebench_240701,[],other +262,gpt_4_turbo_2024_04_09,45.26,language_average,livebench_240701,[],other +263,gpt_4_1106_preview,48.37,language_average,livebench_240701,[],other +264,claude_3_opus_20240229,51.72,language_average,livebench_240701,[],other +265,gpt_4_0125_preview,43.55,language_average,livebench_240701,[],other +266,deepseek_coder_v2,33.04,language_average,livebench_240701,[],other +267,gemini_1.5_pro_api_0514,38.25,language_average,livebench_240701,[],other +268,gemma_2_27b_it,32.4,language_average,livebench_240701,[],other +269,gemini_1.5_flash_api_0514,30.69,language_average,livebench_240701,[],other +270,qwen2_72b_instruct,29.21,language_average,livebench_240701,[],other +271,acm_rewrite_qwen2_72b_chat,30.03,language_average,livebench_240701,[],other +272,mistral_large_2402,28.74,language_average,livebench_240701,[],other +273,deepseek_chat_v2,32.29,language_average,livebench_240701,[],other +274,claude_3_sonnet_20240229,38.08,language_average,livebench_240701,[],other +275,meta_llama_3_70b_instruct,34.11,language_average,livebench_240701,[],other +276,claude_3_haiku_20240307,30.07,language_average,livebench_240701,[],other +277,mixtral_8x22b_instruct_v0.1,26.48,language_average,livebench_240701,[],other +278,gpt_3.5_turbo_0125,24.22,language_average,livebench_240701,[],other +279,gpt_3.5_turbo_1106,28.63,language_average,livebench_240701,[],other +280,command_r_plus,23.92,language_average,livebench_240701,[],other +281,mistral_small_2402,22.06,language_average,livebench_240701,[],other +282,gemma_2_9b_it,27.64,language_average,livebench_240701,[],other +283,phi_3_medium_4k_instruct,13.91,language_average,livebench_240701,[],other +284,phi_3_medium_128k_instruct,12.76,language_average,livebench_240701,[],other +285,deepseek_coder_v2_lite_instruct,10.64,language_average,livebench_240701,[],other +286,qwen1.5_110b_chat,13.22,language_average,livebench_240701,[],other +287,qwen1.5_72b_chat,11.37,language_average,livebench_240701,[],other +288,command_r,14.64,language_average,livebench_240701,[],other +289,phi_3_small_128k_instruct,12.28,language_average,livebench_240701,[],other +290,meta_llama_3_8b_instruct,18.72,language_average,livebench_240701,[],other +291,qwen2_7b_instruct,10.21,language_average,livebench_240701,[],other +292,phi_3_small_8k_instruct,14.96,language_average,livebench_240701,[],other +293,openhermes_2.5_mistral_7b,11.37,language_average,livebench_240701,[],other +294,mixtral_8x7b_instruct_v0.1,13.76,language_average,livebench_240701,[],other +295,mistral_7b_instruct_v0.2,9.05,language_average,livebench_240701,[],other +296,phi_3_mini_4k_instruct,7.1,language_average,livebench_240701,[],other +297,zephyr_7b_alpha,7.2,language_average,livebench_240701,[],other +298,phi_3_mini_128k_instruct,6.8,language_average,livebench_240701,[],other +299,zephyr_7b_beta,4.28,language_average,livebench_240701,[],other +300,deepseek_v2_lite_chat,9.2,language_average,livebench_240701,[],other +301,qwen1.5_7b_chat,6.18,language_average,livebench_240701,[],other +302,starling_lm_7b_beta,7.26,language_average,livebench_240701,[],other +303,vicuna_7b_v1.5_16k,7.92,language_average,livebench_240701,[],other +304,vicuna_7b_v1.5,8.66,language_average,livebench_240701,[],other +305,qwen1.5_4b_chat,5.8,language_average,livebench_240701,[],other +306,llama_2_7b_chat,6.86,language_average,livebench_240701,[],other +307,qwen2_1.5b_instruct,3.05,language_average,livebench_240701,[],other +308,yi_6b_chat,4.69,language_average,livebench_240701,[],other +309,qwen2_0.5b_instruct,2.8,language_average,livebench_240701,[],other +310,qwen1.5_1.8b_chat,3.16,language_average,livebench_240701,[],other +311,qwen1.5_0.5b_chat,2.88,language_average,livebench_240701,[],other +312,claude_3_5_sonnet_20240620,72.3,if_average,livebench_240701,[],other +313,gpt_4o_2024_05_13,72.17,if_average,livebench_240701,[],other +314,gpt_4_turbo_2024_04_09,71.39,if_average,livebench_240701,[],other +315,gpt_4_1106_preview,69.39,if_average,livebench_240701,[],other +316,claude_3_opus_20240229,70.87,if_average,livebench_240701,[],other +317,gpt_4_0125_preview,63.92,if_average,livebench_240701,[],other +318,deepseek_coder_v2,67.18,if_average,livebench_240701,[],other +319,gemini_1.5_pro_api_0514,67.2,if_average,livebench_240701,[],other +320,gemma_2_27b_it,67.37,if_average,livebench_240701,[],other +321,gemini_1.5_flash_api_0514,63.01,if_average,livebench_240701,[],other +322,qwen2_72b_instruct,68.27,if_average,livebench_240701,[],other +323,acm_rewrite_qwen2_72b_chat,65.0,if_average,livebench_240701,[],other +324,mistral_large_2402,68.19,if_average,livebench_240701,[],other +325,deepseek_chat_v2,64.34,if_average,livebench_240701,[],other +326,claude_3_sonnet_20240229,65.0,if_average,livebench_240701,[],other +327,meta_llama_3_70b_instruct,63.5,if_average,livebench_240701,[],other +328,claude_3_haiku_20240307,64.03,if_average,livebench_240701,[],other +329,mixtral_8x22b_instruct_v0.1,63.17,if_average,livebench_240701,[],other +330,gpt_3.5_turbo_0125,60.47,if_average,livebench_240701,[],other +331,gpt_3.5_turbo_1106,51.53,if_average,livebench_240701,[],other +332,command_r_plus,71.51,if_average,livebench_240701,[],other +333,mistral_small_2402,63.91,if_average,livebench_240701,[],other +334,gemma_2_9b_it,61.55,if_average,livebench_240701,[],other +335,phi_3_medium_4k_instruct,53.3,if_average,livebench_240701,[],other +336,phi_3_medium_128k_instruct,56.15,if_average,livebench_240701,[],other +337,deepseek_coder_v2_lite_instruct,48.34,if_average,livebench_240701,[],other +338,qwen1.5_110b_chat,55.26,if_average,livebench_240701,[],other +339,qwen1.5_72b_chat,58.25,if_average,livebench_240701,[],other +340,command_r,57.16,if_average,livebench_240701,[],other +341,phi_3_small_128k_instruct,36.88,if_average,livebench_240701,[],other +342,meta_llama_3_8b_instruct,57.14,if_average,livebench_240701,[],other +343,qwen2_7b_instruct,44.74,if_average,livebench_240701,[],other +344,phi_3_small_8k_instruct,48.24,if_average,livebench_240701,[],other +345,openhermes_2.5_mistral_7b,52.78,if_average,livebench_240701,[],other +346,mixtral_8x7b_instruct_v0.1,44.81,if_average,livebench_240701,[],other +347,mistral_7b_instruct_v0.2,51.65,if_average,livebench_240701,[],other +348,phi_3_mini_4k_instruct,40.05,if_average,livebench_240701,[],other +349,zephyr_7b_alpha,52.79,if_average,livebench_240701,[],other +350,phi_3_mini_128k_instruct,49.65,if_average,livebench_240701,[],other +351,zephyr_7b_beta,48.32,if_average,livebench_240701,[],other +352,deepseek_v2_lite_chat,41.83,if_average,livebench_240701,[],other +353,qwen1.5_7b_chat,44.12,if_average,livebench_240701,[],other +354,starling_lm_7b_beta,38.32,if_average,livebench_240701,[],other +355,vicuna_7b_v1.5_16k,42.12,if_average,livebench_240701,[],other +356,vicuna_7b_v1.5,41.75,if_average,livebench_240701,[],other +357,qwen1.5_4b_chat,27.75,if_average,livebench_240701,[],other +358,llama_2_7b_chat,44.88,if_average,livebench_240701,[],other +359,qwen2_1.5b_instruct,25.9,if_average,livebench_240701,[],other +360,yi_6b_chat,27.22,if_average,livebench_240701,[],other +361,qwen2_0.5b_instruct,26.63,if_average,livebench_240701,[],other +362,qwen1.5_1.8b_chat,22.9,if_average,livebench_240701,[],other +363,qwen1.5_0.5b_chat,21.3,if_average,livebench_240701,[],other