scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2778254199662385,0.2400384567875128 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.40368671387966554,0.08581278065055217 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2778254199662385,0.2400384567875128 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.018181818181818184,1.0 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.05454545454545454,0.8792698312489979 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,-0.018181818181818184,1.0 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.1272727272727273,0.6480954385121052 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.018181818181818184,1.0 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.05454545454545454,0.8792698312489979 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.23636363636363636,0.3587114698573032 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2,0.4453821448613115 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.697277051246695,0.003004262239398284 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.587180674734059,0.01246215829454031 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2727272727272727,0.2829668209876543 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.34545454545454546,0.16457331248997917 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.18349396085439343,0.43487965849578336 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.2935903373670295,0.21152242941072896 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2727272727272727,0.2829668209876543 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.18349396085439343,0.43487965849578336 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.2568915451961508,0.27429882739587574 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.34545454545454546,0.16457331248997917 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4909090909090909,0.04053235730319064 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.34545454545454546,0.16457331248997917 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6727272727272727,0.0031063111271444604 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.45454545454545453,0.06017015392015392 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4403855060505442,0.06091869077971648 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.45454545454545453,0.06017015392015392 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6605782590758164,0.004936818556325077 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4403855060505442,0.06091869077971648 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7339758434175737,0.0017872890369872653 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.587180674734059,0.01246215829454031 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6482593132545567,0.006117582447622459 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.759389481241052,0.0013210471654040124 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.759389481241052,0.0013210471654040124 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.38181818181818183,0.12097096961680295 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38895558795273394,0.10000137830747906 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6000000000000001,0.00994553671637005 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8807710121010884,0.00017812930545546289 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8807710121010884,0.00017812930545546289 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8807710121010884,0.00017812930545546289 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4770842982214229,0.042330229121360724 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6605782590758164,0.004936818556325077 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8440722199302099,0.0003281542287518694 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7339758434175737,0.0017872890369872653 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6363636363636364,0.005707170915504249 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.38181818181818183,0.12097096961680295 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4909090909090909,0.04053235730319064 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.34545454545454546,0.16457331248997917 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.41818181818181815,0.08656124739458072 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.5741725345968929,0.015177848122929492 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.3519121986239021,0.1366995137219537 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4403855060505442,0.06091869077971648 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.4403855060505442,0.06091869077971648 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,0,0.2,0.4453821448613115 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,2,0.41818181818181815,0.08656124739458072 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8073734277593311,0.0005907573118657002 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6482593132545567,0.006117582447622459 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7706746355884524,0.0010393630991335228 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5371291452680612,0.02311942970946668 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8440722199302099,0.0003281542287518694 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8073734277593311,0.0005907573118657002 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.0909090909090909,0.7611503928170594 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.07339758434175737,0.7547764265871044 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.3302891295379082,0.15985367483762747 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8705196492275474,0.00023202582506637044 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7479575920067658,0.001637274718449882 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5983660736054126,0.01175728488671479 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7706746355884524,0.0010393630991335228 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6731618328060892,0.004677734981047257 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.759389481241052,0.0013210471654040124 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7339758434175737,0.0017872890369872653 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6238794669049377,0.007931923532795268 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7706746355884524,0.0010393630991335228 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7706746355884524,0.0010393630991335228 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.5636363636363636,0.016540504248837583 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6727272727272727,0.0031063111271444604 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,0,0.2778254199662385,0.2400384567875128 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,1,0.40368671387966554,0.08581278065055217 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,3,0.2778254199662385,0.2400384567875128 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,0,-0.018181818181818184,1.0 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,2,-0.05454545454545454,0.8792698312489979 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,3,-0.018181818181818184,1.0 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,4,-0.1272727272727273,0.6480954385121052 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,2,-0.018181818181818184,1.0 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,4,-0.05454545454545454,0.8792698312489979 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,1,0.23636363636363636,0.3587114698573032 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,2,0.2,0.4453821448613115 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,2,0.697277051246695,0.003004262239398284 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,3,0.587180674734059,0.01246215829454031 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,0,0.2727272727272727,0.2829668209876543 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,1,0.34545454545454546,0.16457331248997917 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,0,0.18349396085439343,0.43487965849578336 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,1,0.2935903373670295,0.21152242941072896 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,2,0.2727272727272727,0.2829668209876543 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,3,0.18349396085439343,0.43487965849578336 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,4,0.2568915451961508,0.27429882739587574 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,0,0.34545454545454546,0.16457331248997917 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,2,0.4909090909090909,0.04053235730319064 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,3,0.34545454545454546,0.16457331248997917 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.45454545454545453,0.06017015392015392 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.4403855060505442,0.06091869077971648 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.45454545454545453,0.06017015392015392 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,1,0.6605782590758164,0.004936818556325077 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,1,0.4403855060505442,0.06091869077971648 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,0,0.7339758434175737,0.0017872890369872653 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,1,0.587180674734059,0.01246215829454031 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,2,0.6482593132545567,0.006117582447622459 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,3,0.759389481241052,0.0013210471654040124 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,4,0.759389481241052,0.0013210471654040124 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,0,0.38181818181818183,0.12097096961680295 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,2,0.38895558795273394,0.10000137830747906 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,4,0.6000000000000001,0.00994553671637005 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,1,0.8807710121010884,0.00017812930545546289 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,2,0.8807710121010884,0.00017812930545546289 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,3,0.8807710121010884,0.00017812930545546289 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,0,0.4770842982214229,0.042330229121360724 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,2,0.6605782590758164,0.004936818556325077 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,3,0.8440722199302099,0.0003281542287518694 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,4,0.7339758434175737,0.0017872890369872653 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,1,0.6363636363636364,0.005707170915504249 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.38181818181818183,0.12097096961680295 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4909090909090909,0.04053235730319064 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.34545454545454546,0.16457331248997917 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.41818181818181815,0.08656124739458072 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.5741725345968929,0.015177848122929492 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.3519121986239021,0.1366995137219537 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4403855060505442,0.06091869077971648 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.4403855060505442,0.06091869077971648 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,0,0.2,0.4453821448613115 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,2,0.41818181818181815,0.08656124739458072 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,3,0.8073734277593311,0.0005907573118657002 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,1,0.6482593132545567,0.006117582447622459 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,3,0.7706746355884524,0.0010393630991335228 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,4,0.5371291452680612,0.02311942970946668 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,2,0.8440722199302099,0.0003281542287518694 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,1,0.8073734277593311,0.0005907573118657002 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,0,-0.0909090909090909,0.7611503928170594 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,1,0.07339758434175737,0.7547764265871044 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,3,0.3302891295379082,0.15985367483762747 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,2,0.8705196492275474,0.00023202582506637044 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,3,0.7479575920067658,0.001637274718449882 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,4,0.5983660736054126,0.01175728488671479 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,0,0.7706746355884524,0.0010393630991335228 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,0,0.6731618328060892,0.004677734981047257 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,1,0.759389481241052,0.0013210471654040124 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,2,0.7339758434175737,0.0017872890369872653 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,3,0.6238794669049377,0.007931923532795268 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,2,0.7706746355884524,0.0010393630991335228 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,4,0.7706746355884524,0.0010393630991335228 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,0,0.5636363636363636,0.016540504248837583 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,0,0.6727272727272727,0.0031063111271444604 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822