benchbench / cache /agreements_cache_151f5bfbf87ac7384c2759731c72ec0c.csv
Yotam-Perlitz
update cache
f1c3da2
raw
history blame
120 kB
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.017485869096098686,0.9672206778351959
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.06826285140114943,0.8724042132624071
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.27291992568490936,0.5131179718629255
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.0623085741331382,0.8834734515868299
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,4,0.11553071904436202,0.7852997192967395
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8743737489954189,0.004501296794893102
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8019858294586086,0.01664169341252048
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865218326418788,0.005519059390504801
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9324959770534272,0.0007305971150650418
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9578331579912773,0.00018155839890573593
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.30992157835736617,0.4550353006304514
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.48460771469003827,0.2235972811859595
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.1162588388208577,0.78397092283469
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.03180360013624742,0.9404084479868535
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.6310234888301745,0.09339585968843296
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5719061307929368,0.1385541569597628
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.2953447949582872,0.47758892197811004
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.08547114468780825,0.8405203853999355
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.02680948636066538,0.9497562944796989
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.4016145018471783,0.32402730112296474
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7247956777996108,0.04194484960329344
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2767660595168839,0.5069548295866992
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3337223270100439,0.4191769676693079
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6126891094585267,0.10632638977302632
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8079257463851817,0.015261307993340337
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6647150497002838,0.07212235537894374
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8593434484023453,0.0062437049978399314
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7704800482268904,0.025262942539415363
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9028773381740962,0.002126756432137772
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.748982925973149,0.032470780295939985
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8599957450436625,0.006160409391629476
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8718735582848011,0.004766072993988772
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9069576656171551,0.001875739334441522
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9502933219669614,0.00029570003340264575
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8905328662549648,0.003016032865892646
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5058552901713423,0.20090402274559316
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6767432630833718,0.0652968761285632
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7135518769682414,0.04685902831102101
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.873661116609048,0.004575776138454243
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8775217778627072,0.004181622363896538
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7683490298001087,0.025928082489068475
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.741463148953373,0.035258455741147623
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7891209052525207,0.019892902878583873
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8157900850650412,0.013547661219765379
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8625206786227912,0.005844699973375535
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.49625129009057833,0.211004712621783
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7482300147416783,0.0327435760119495
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9237060456412569,0.0010476652712265917
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8540419074377281,0.00694751386877189
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827735900001105,0.021632253958226707
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7416615606437577,0.03518309274676423
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8208959354305796,0.01250307893717913
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9182336628416601,0.0012842298120423852
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9199026021249039,0.0012087423991030853
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7807842071724994,0.022196180227557687
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6016089012086534,0.11460809097860054
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.85978308688271,0.006187486327563118
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9471155608874564,0.00035525230596496123
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9238574615349179,0.0010415614421426264
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780599537830846,0.022248986205867058
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.753379355065838,0.030905705190702806
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8379676352721162,0.009384640911630616
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8462209992405952,0.008075105621350536
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9020771423654268,0.0021784040615750178
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9392379026634557,0.000535591367028614
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7412355057774336,0.035345043191044964
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8195179387247324,0.01277979740900836
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9356246311290696,0.0006351718939850358
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7466011946729814,0.03333852605723143
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9551682330569339,0.00021776057653192886
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.46353588273705637,0.24734250900688215
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8866352243352398,0.003339629955133934
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937902652612242,0.0005710971446370687
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.2831911510498836,0.4967225093410736
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2031844122583542,0.6293846722461313
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8894964926830444,0.0031000020401251533
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.549284007260608,0.15849945140105312
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7889373199563972,0.01994193933246426
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9415411104598773,0.00047780769988844555
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8981158348442198,0.0024460728519243077
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7526431927239958,0.0311644661156264
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8962925022649735,0.0025761063553240114
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937590300147702,0.0005796196796032962
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5831241321997315,0.12921116102954364
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5561145441014004,0.1523217142123119
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5664450708720614,0.14323389729888122
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.47517181530974595,0.23407895750101468
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.718855715365913,0.04449992445427745
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7168604276016974,0.04537877960385103
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.18264726732113173,0.6650765454064547
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.04614314940391431,0.9136043258512831
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6369093478690498,0.08944819108801377
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8650362997962656,0.005540656777637369
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9481614738377944,0.00033485605767966255
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8579024362848122,0.006430262194723998
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121630061872308,0.0015845787994022296
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7303458809128464,0.03963972108447683
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7466964409211542,0.03330355520543848
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8886798251454765,0.0031672235640011434
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9036719475219376,0.002076262347775526
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7623592248502944,0.02785522986224059
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8902509919824877,0.0030387234498153886
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8349964637145074,0.009887030967730168
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9513669166922365,0.00027717775621958416
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8898917220751776,0.0030678038612609354
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8000397965603336,0.01711033114623395
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7666453684194998,0.026467542617941944
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8751438663188438,0.004421691058140597
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8954496186826447,0.0026376993343606783
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8195357136433342,0.012776203631959988
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973997559676354,0.0024966210305528294
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9238541898435834,0.0010416930833947954
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9232578806881373,0.0010658683179569461
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9466806411756816,0.00036396834317210526
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9402048459613361,0.0005108048313780666
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7428545649568395,0.03473202812850355
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8962239297969814,0.0025810820467571426
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9528032040825007,0.0002536158007562822
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8136140570811612,0.01400900062666989
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5749045753814719,0.13602130778385005
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780595487125304,0.022250145374352125
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8389921086523722,0.009215256295109017
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8803463320171083,0.003907570379771439
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7142670311425445,0.04653663665491792
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7977979460712193,0.017660348313797546
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7240026280446691,0.04228069432019545
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8051290094703403,0.01590190576987268
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9226246952938778,0.0010919364406592675
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.770582228125362,0.025231318204288148
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5188109005585113,0.18769119165787862
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9642212364414142,0.00011145218096014672
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7836454491081474,0.021387948565361206
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865235745718993,0.005516995432107779
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,3,0.819500116935474,0.012783401302719894
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7432637726714306,0.034578129186903464
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9603201312455674,0.00015157780411521223
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9604114108423772,0.00015054459028416203
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9519258192529104,0.00026784516618954716
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9352773832366816,0.0006453340323628832
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6730282904268812,0.06736225845470355
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9361725603565639,0.0006193510978979659
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8618105831276622,0.005932414266978994
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9371490197710903,0.0005918014940797798
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8046621876144952,0.01601044603512172
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.41770329390345684,0.30313696659492734
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6529975286213465,0.07915856325659755
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6950517775314824,0.05566978580633573
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5130382972054114,0.19351964488420637
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6825577913683614,0.062140382561143265
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9059635004669196,0.0019350193188838174
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8702987510549938,0.00493787146977232
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8349295032906534,0.009898545248446817
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8245663895988613,0.011784555837564846
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9186996315597573,0.0012628532368153516
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.899783088468177,0.002330962388754791
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8724919719311256,0.004699674798249593
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9486250828884353,0.00032606741963897914
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9264530754805538,0.0009405124032405977
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.926933634016331,0.000922537739358256
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6984411569502376,0.05398723363884652
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.754828418128203,0.03040022622820331
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5655988276473191,0.14396676855997925
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9407474980820671,0.000497230334167822
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.770589245932409,0.025229147116181697
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7775815292717585,0.023123063813025962
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5611200837416681,0.14787988852194642
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.34646366697352105,0.40049416986179387
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7868643731535557,0.020500867535993103
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8114670933196435,0.014473750045325934
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.4013581254554363,0.32436552572418753
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.28341806840646894,0.4963625961904983
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3139211847524032,0.44892434309679713
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2606167560977108,0.5330194398770082
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.32260154615753545,0.43577896021471924
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827817854375669,0.021629949458519884
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9421767369217469,0.0004626159242720608
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5386185630062554,0.16841388744478442
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7045551126623175,0.05103000019308416
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8414540075802577,0.00881618884168942
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8748256107732684,0.0044544778532186755
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8614522174161048,0.005976999431835443
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7878166990611953,0.02024289628983945
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8381151096374623,0.009360136935052572
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.876154278920616,0.0043186280005204514
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9802952193136,1.884578972104051e-05
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8661864185981796,0.005405102460401999
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8297856426405835,0.010808669505560614
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9329487606730291,0.000716243089312378
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.822202489777381,0.01224422861798353
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6968865871905413,0.05475511707469452
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9318897100616549,0.0007501099193828288
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7939152572032528,0.018638835543465734
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7761614135775217,0.02354161442763604
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9079242687040253,0.0018192466167481706
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5935991848770941,0.12081484777974201
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.96841302674998,7.693398893847449e-05
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9131963004520903,0.001530535130781307
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7594573765014532,0.02881968270449265
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6622792441367216,0.07355344210000651
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5835165093102912,0.1288909419896904
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7271748558955601,0.04094703171178795
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7369082697183147,0.0370157216672518
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7219159720057066,0.04317213020613491
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973595810319037,0.002499476856786579
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6540145328427245,0.07853263145320354
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9470816844896075,0.0003559262259996983
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.798793471524343,0.017414760604056785
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.766501585020503,0.026513385703318352
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6776894663079587,0.06477689572321889
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6576248245381009,0.07633405000799688
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.796342090311639,0.018023378799051942
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.689140856921657,0.058678219175095074
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6705942614169457,0.06873614015066103
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6842754194067544,0.0612256583562849
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7338112096805872,0.03824046140795786
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8786344078919507,0.0040722405599500165
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8914863638509409,0.0029400900210167272
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8522000994286094,0.007203358614415384
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7479170810940026,0.03285737031031745
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5899049701184135,0.1237398240474465
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.864013241961245,0.005663050469813282
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.726560560314063,0.04120326937800088
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7600546147835674,0.02861953111724766
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8675817638279608,0.00524352512595729
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4358953069712842,0.280322780055143
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8724977849323057,0.004699053502733089
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.871502377377448,0.004806214049293794
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.636462032322589,0.08974474991245225
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7407371067623334,0.035535069908202585
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.13754152986907456,0.7453436298315592
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8584434869588686,0.006359804257501524
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9096718109287911,0.0017199423212977748
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.429513562091493,0.2882272134157949
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7258395762861067,0.04150524782255408
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4140057077993773,0.3078793667149351
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8860840192325219,0.003387122941063616
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8531999374729967,0.007063738601380546
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.570698753672453,0.13958138247636556
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9462124246513754,0.00037350751375720304
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,1,0.820982530302196,0.012485817170678851
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9284819872198913,0.0008661544234609058
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9226572389021586,0.0010905865909148318
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8996834645928126,0.0023377397968761906
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9467481050448351,0.00036260722071780783
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9051882617143683,0.001982079878231783
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8448816290057799,0.008279149903754354
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9486969514405281,0.0003247187445212263
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7138885174194392,0.046707103452906885
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.40763933138747765,0.3161269846214854
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5033557119680766,0.20350786972733814
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4943676910774294,0.21301612937354739
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.3662549994154035,0.3722134961617391
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6943274080319848,0.05603338677616118
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.888202282224346,0.0032069637473251308
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.862959786938574,0.0057908774192851585
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4422315456206938,0.2725814015162671
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9314197867245828,0.0007654668867563735
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8576726697477571,0.006460333718352682
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6477798867796105,0.08241558395766836
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7105249096891054,0.04823848031855015
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7433756448219943,0.034536127920169364
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.465629371128827,0.24492880327618063
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9215279351913577,0.0011380681078154023
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9028698976709195,0.0021272329705264844
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8115257987039834,0.014460915122317916
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8840656907304268,0.003564741739845647
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9288767434076772,0.0008521494712455959
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8762491857760322,0.004309027650395265
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.822174167720692,0.012249803466994006
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8388480886223416,0.009238949980481774
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9318866818637482,0.0007502082286076188
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6752208316271633,0.06613869004956173
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7677373687773497,0.026120973578910495
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7919204265038193,0.01915443839404165
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8238198607264919,0.01192852239680578
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8788769140000767,0.0040486473187813605
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5937971020205063,0.1206592532108973
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6743688104667733,0.0666125934693148
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6092910701405022,0.10882867605607495
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.02436876480189197,0.954326651607438
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7114255278499215,0.04782552820112736
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5887872724291499,0.12463254240428198
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4029552549015283,0.32226121873409685
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.19589220319331574,0.6419903458052949
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5147894627560958,0.1917415408232741
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.43696792691727815,0.2790047957490856
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9490060035318915,0.00031896092810029624
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9033732116949054,0.0020951534061901173
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9174158952141087,0.0013223130420052574
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8698029729880158,0.00499276771087744
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8938963574061565,0.002753683842916408
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9427230009399408,0.00044981624708065733
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9288091831587435,0.0008545357544848401
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9200698352872445,0.0012013420941124318
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8197843971795349,0.012725991028944833
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9135236868955329,0.0015136659995374103
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9482689395026054,0.000332805134027447
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9334433471484072,0.0007007762613840839
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8998371432675459,0.0023272903802322954
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9131450099069247,0.0015331889972515346
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9445409047411082,0.00040889964932544416
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8996453255999854,0.00234033776853281
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8662449830102448,0.005398257529969565
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9506955154682739,0.00028866872380162265
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121357775980045,0.0015860194531010332
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9469225816315634,0.000359102582060145
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.790872393374341,0.019428850798750914
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7384692720332464,0.03640761031575469
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9396936265489109,0.0005238133760109684
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7853349194194776,0.020919442242219075
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8636070293544758,0.005712124057773506
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.837126038633602,0.009525258316342535
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7663953319208139,0.026547294337781743
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8834569465544357,0.00361946726545403
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8480938359553485,0.00779520658099071
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9189017807616305,0.0012536521795481071
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.976785228034165,3.073554131266073e-05
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8793267175321069,0.004005119722136405
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8760721346635911,0.004326948446281908
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9315137258308156,0.0007623806815109492
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7675767218262903,0.026171781192995118
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8483878251754778,0.007751839541749867
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9222607240796445,0.0011071076795417618
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9440994017259922,0.00041860181264251746
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9640433681068886,0.00011310737614553013
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.692434840005101,0.056990052908859494
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9370054660599566,0.0005958002530390111
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.969420946106877,6.985512173523951e-05
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9611899818187688,0.00014192004448559492
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9411758308443503,0.0004866843681750784
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9597878054141521,0.00015769662952759886
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9404428288332258,0.0005048221249291256
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9412248237761267,0.000485487558057933
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9355663499255871,0.0006368701046576545
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9499604642147754,0.0003016036750416735
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7164442699126142,0.04556339297891151
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5643812833359342,0.14502482192576685
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.4448334653124403,0.269433453257965
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9020957808919513,0.002177191904645508
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9140262325400854,0.0014880077902407654
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6613543728531551,0.07410115498793113
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.4797794956768499,0.2289297958345603
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.49503702005526434,0.21230024172428238
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8658004484348707,0.005450353400185282
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9239450258900821,0.0010380421984977164
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6878185417270377,0.05936418242167244
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.6427492187377651,0.08562857067256696
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.19987101474191585,0.6351028985023905
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.7695981699173929,0.025536900476404875
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.851160886507116,0.00735033097799936
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7535063061583401,0.030861215825263487
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.26946310602236634,0.5186811891252074
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5071239778851739,0.19958915881626008
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.845558834843199,0.00817557674320208
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8223598748455347,0.01221327849153134
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7520379034546343,0.03137821860478068
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5986152394502113,0.1169062576526029
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.766509325140422,0.026510916638992615
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.6388656044215879,0.08815791552969902
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8220592376168137,0.012272442496278822
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.1610992186087647,0.7031245257171708
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.22938177579714764,0.584757473087143
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.16217150942988084,0.7012176634258844
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8536693780854105,0.0069987855857581984
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9079591032101378,0.0018172316533511903
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7448797028215589,0.033974472983626124
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.436470242791583,0.2796159471960331
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5113717481429286,0.195219904727713
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,0,0.8848684214582546,0.0034933971141531536
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,1,0.9247518427204778,0.0010059807632682822
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,2,0.7024798803756629,0.05202256738347333
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,3,0.6111548412929141,0.10745210550108082
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,4,0.8864983521119945,0.0033513827582610342
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8443252756395498,0.008364861793357709
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8199557285303699,0.012691469447090417
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6898121736766818,0.05833178396126367
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.1445400076243653,0.732738456710739
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.13444519427677581,0.7509364951619687
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9461712339012929,0.00037435448514068834
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8543556725359636,0.006904516600543572
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7671160990392422,0.026317800283773948
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4230508906614041,0.29634091151848907
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.29492042180464345,0.478252042515081
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8192056092552416,0.01284304904344425
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8053230426409881,0.015856927546595193
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6785867773117831,0.06428605698561919
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.021028776761034942,0.960582665935811
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.25337930013147175,0.5448562000018814
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8101772449555595,0.014757563523095152
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7844308170919763,0.021169355122089707
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6407686957715764,0.08691312009391092
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.042093006210129874,0.9211687904012325
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.2813292229519864,0.4996795026573654
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8350456630970934,0.00987857623206292
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.879311548672376,0.004006582681021272
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6951300585252861,0.0556305769370549
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.30955291195703166,0.4556002793087552
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.09897629382276267,0.8156278898050575
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8313126956210078,0.010533178480029779
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8169388413464165,0.01330802664448977
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8065284450649773,0.015579295379409611
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.23722382427262312,0.5716108619128892
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.026088426326565897,0.9511063910298649
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5558829816104426,0.15252894598370506
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6390946692796851,0.08800754271923365
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.24121345447897227,0.5649619826999719
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.13262144042688304,0.7542351704927408
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.46784288126219703,0.24238975539995447
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7467577882406231,0.03328104267130768
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7611545287510072,0.028253164658278467
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6541774611460981,0.07843262445172178
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.0830822493170678,0.8449361587214159
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.1985934514676979,0.6373119372341151
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9103256104990007,0.001683717098370581
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8079204807250888,0.015262498588799642
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7253154362419392,0.0417256201301186
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2776474358858506,0.5055464711128136
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.04029159995291984,0.9245349726533298
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.919432996814919,0.0012296819224052442
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.87005129824662,0.004965222567299112
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9073703100625691,0.001851485138509531
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8673887162219034,0.005265692212272121
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8916723527123611,0.0029254223429427636
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9165887813382055,0.001361572704071016
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9225103255266087,0.0010966889416837342
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9292369266176062,0.000839501038985727
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9505492134066896,0.00029121355501060477
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9415690777822339,0.00047713248045663163
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9576750897378552,0.00018358576102437457
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8850761460392197,0.0034750864462593195
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9598475365356987,0.00015700207944980397
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9317002702003969,0.000756276259880365
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8240635545541923,0.011881405061211926
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9645217100316719,0.00010869253777108847
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9447465624679983,0.00040443116308794275
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8760879368136391,0.0043253470355424355
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9469408250476264,0.0003587374254477132
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9498225876442147,0.000304071618749767
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9413785598975157,0.0004817446027243596
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8197292667265523,0.012737111858293043
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9057861973602506,0.0019457176947306907
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9413025091864188,0.000483593804288479
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9083254977326705,0.001796125778484392
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.8626635526406192,0.005827152548807454
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8043418970652331,0.016085184583393794
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8946872852632068,0.0026942203148939193
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9025950086780581,0.002144887259438991
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.7564264003460613,0.02984872863501939
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9033527343998258,0.002096452391428316
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8494277893147777,0.0075996673267298715
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8534145445088147,0.007033997470343221
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,0,-0.017485869096098686,0.9672206778351959
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,1,-0.06826285140114943,0.8724042132624071
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,2,-0.27291992568490936,0.5131179718629255
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,3,-0.0623085741331382,0.8834734515868299
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,4,0.11553071904436202,0.7852997192967395
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,0,0.8743737489954189,0.004501296794893102
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,1,0.8019858294586086,0.01664169341252048
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,2,0.865218326418788,0.005519059390504801
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,3,0.9324959770534272,0.0007305971150650418
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,4,0.9578331579912773,0.00018155839890573593
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,0,-0.30992157835736617,0.4550353006304514
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,1,-0.48460771469003827,0.2235972811859595
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,2,-0.1162588388208577,0.78397092283469
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,3,-0.03180360013624742,0.9404084479868535
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,4,-0.6310234888301745,0.09339585968843296
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,0,0.5719061307929368,0.1385541569597628
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,1,-0.2953447949582872,0.47758892197811004
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,2,-0.08547114468780825,0.8405203853999355
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,3,-0.02680948636066538,0.9497562944796989
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,4,-0.4016145018471783,0.32402730112296474
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,0,0.7247956777996108,0.04194484960329344
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,1,0.2767660595168839,0.5069548295866992
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,2,0.3337223270100439,0.4191769676693079
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,3,0.6126891094585267,0.10632638977302632
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,4,0.8079257463851817,0.015261307993340337
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,0,0.6647150497002838,0.07212235537894374
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,2,0.8593434484023453,0.0062437049978399314
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,3,0.7704800482268904,0.025262942539415363
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,4,0.9028773381740962,0.002126756432137772
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,0,0.748982925973149,0.032470780295939985
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,1,0.8599957450436625,0.006160409391629476
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,2,0.8718735582848011,0.004766072993988772
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,3,0.9069576656171551,0.001875739334441522
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,4,0.9502933219669614,0.00029570003340264575
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,0,0.8905328662549648,0.003016032865892646
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,1,0.5058552901713423,0.20090402274559316
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,2,0.6767432630833718,0.0652968761285632
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,3,0.7135518769682414,0.04685902831102101
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,4,0.873661116609048,0.004575776138454243
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,0,0.8775217778627072,0.004181622363896538
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,1,0.7683490298001087,0.025928082489068475
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,2,0.741463148953373,0.035258455741147623
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,3,0.7891209052525207,0.019892902878583873
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,4,0.8157900850650412,0.013547661219765379
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,0,0.8625206786227912,0.005844699973375535
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,1,0.49625129009057833,0.211004712621783
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,2,0.7482300147416783,0.0327435760119495
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,3,0.9237060456412569,0.0010476652712265917
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,4,0.8540419074377281,0.00694751386877189
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,0,0.7827735900001105,0.021632253958226707
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,1,0.7416615606437577,0.03518309274676423
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,2,0.8208959354305796,0.01250307893717913
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,3,0.9182336628416601,0.0012842298120423852
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,4,0.9199026021249039,0.0012087423991030853
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,0,0.7807842071724994,0.022196180227557687
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,1,0.6016089012086534,0.11460809097860054
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,2,0.85978308688271,0.006187486327563118
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,3,0.9471155608874564,0.00035525230596496123
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,4,0.9238574615349179,0.0010415614421426264
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.780599537830846,0.022248986205867058
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.753379355065838,0.030905705190702806
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8379676352721162,0.009384640911630616
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8462209992405952,0.008075105621350536
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9020771423654268,0.0021784040615750178
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9392379026634557,0.000535591367028614
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.7412355057774336,0.035345043191044964
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8195179387247324,0.01277979740900836
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9356246311290696,0.0006351718939850358
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7466011946729814,0.03333852605723143
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.9551682330569339,0.00021776057653192886
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.46353588273705637,0.24734250900688215
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8866352243352398,0.003339629955133934
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937902652612242,0.0005710971446370687
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.2831911510498836,0.4967225093410736
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.2031844122583542,0.6293846722461313
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8894964926830444,0.0031000020401251533
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.549284007260608,0.15849945140105312
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.7889373199563972,0.01994193933246426
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9415411104598773,0.00047780769988844555
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.8981158348442198,0.0024460728519243077
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.7526431927239958,0.0311644661156264
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8962925022649735,0.0025761063553240114
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937590300147702,0.0005796196796032962
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.5831241321997315,0.12921116102954364
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.5561145441014004,0.1523217142123119
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.5664450708720614,0.14323389729888122
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.47517181530974595,0.23407895750101468
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.718855715365913,0.04449992445427745
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7168604276016974,0.04537877960385103
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.18264726732113173,0.6650765454064547
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.04614314940391431,0.9136043258512831
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.6369093478690498,0.08944819108801377
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.8650362997962656,0.005540656777637369
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,0,0.9481614738377944,0.00033485605767966255
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,1,0.8579024362848122,0.006430262194723998
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,4,0.9121630061872308,0.0015845787994022296
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.7303458809128464,0.03963972108447683
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7466964409211542,0.03330355520543848
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8886798251454765,0.0031672235640011434
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.9036719475219376,0.002076262347775526
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.7623592248502944,0.02785522986224059
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,0,0.8902509919824877,0.0030387234498153886
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,1,0.8349964637145074,0.009887030967730168
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,2,0.9513669166922365,0.00027717775621958416
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,4,0.8898917220751776,0.0030678038612609354
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.8000397965603336,0.01711033114623395
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7666453684194998,0.026467542617941944
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8751438663188438,0.004421691058140597
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.8954496186826447,0.0026376993343606783
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.8195357136433342,0.012776203631959988
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,0,0.8973997559676354,0.0024966210305528294
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,1,0.9238541898435834,0.0010416930833947954
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,2,0.9232578806881373,0.0010658683179569461
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,3,0.9466806411756816,0.00036396834317210526
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,4,0.9402048459613361,0.0005108048313780666
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,0,0.7428545649568395,0.03473202812850355
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,1,0.8962239297969814,0.0025810820467571426
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,2,0.9528032040825007,0.0002536158007562822
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,3,0.8136140570811612,0.01400900062666989
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,4,0.5749045753814719,0.13602130778385005
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,0,0.780595487125304,0.022250145374352125
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8389921086523722,0.009215256295109017
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,2,0.8803463320171083,0.003907570379771439
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,3,0.7142670311425445,0.04653663665491792
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,4,0.7977979460712193,0.017660348313797546
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,0,0.7240026280446691,0.04228069432019545
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8051290094703403,0.01590190576987268
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,2,0.9226246952938778,0.0010919364406592675
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,3,0.770582228125362,0.025231318204288148
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,4,0.5188109005585113,0.18769119165787862
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,0,0.9642212364414142,0.00011145218096014672
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,1,0.7836454491081474,0.021387948565361206
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,2,0.865235745718993,0.005516995432107779
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,3,0.819500116935474,0.012783401302719894
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,4,0.7432637726714306,0.034578129186903464
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,0,0.9603201312455674,0.00015157780411521223
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,1,0.9604114108423772,0.00015054459028416203
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,2,0.9519258192529104,0.00026784516618954716
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,3,0.9352773832366816,0.0006453340323628832
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,0,0.6730282904268812,0.06736225845470355
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,1,0.9361725603565639,0.0006193510978979659
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,2,0.8618105831276622,0.005932414266978994
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,3,0.9371490197710903,0.0005918014940797798
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,4,0.8046621876144952,0.01601044603512172
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,0,0.41770329390345684,0.30313696659492734
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,1,0.6529975286213465,0.07915856325659755
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,2,0.6950517775314824,0.05566978580633573
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,3,0.5130382972054114,0.19351964488420637
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,4,0.6825577913683614,0.062140382561143265
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,0,0.9059635004669196,0.0019350193188838174
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,1,0.8702987510549938,0.00493787146977232
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,2,0.8349295032906534,0.009898545248446817
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,3,0.8245663895988613,0.011784555837564846
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,4,0.9186996315597573,0.0012628532368153516
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,0,0.899783088468177,0.002330962388754791
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,1,0.8724919719311256,0.004699674798249593
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,2,0.9486250828884353,0.00032606741963897914
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,3,0.9264530754805538,0.0009405124032405977
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,4,0.926933634016331,0.000922537739358256
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,0,0.6984411569502376,0.05398723363884652
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,1,0.754828418128203,0.03040022622820331
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,2,0.5655988276473191,0.14396676855997925
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,3,0.9407474980820671,0.000497230334167822
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,4,0.770589245932409,0.025229147116181697
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,0,0.7775815292717585,0.023123063813025962
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,1,0.5611200837416681,0.14787988852194642
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,2,0.34646366697352105,0.40049416986179387
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,3,0.7868643731535557,0.020500867535993103
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,4,0.8114670933196435,0.014473750045325934
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,0,0.4013581254554363,0.32436552572418753
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,1,0.28341806840646894,0.4963625961904983
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,2,0.3139211847524032,0.44892434309679713
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,3,0.2606167560977108,0.5330194398770082
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,4,0.32260154615753545,0.43577896021471924
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,0,0.7827817854375669,0.021629949458519884
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,1,0.9421767369217469,0.0004626159242720608
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,2,0.5386185630062554,0.16841388744478442
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,3,0.7045551126623175,0.05103000019308416
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,4,0.8414540075802577,0.00881618884168942
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,0,0.8748256107732684,0.0044544778532186755
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,1,0.8614522174161048,0.005976999431835443
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,2,0.7878166990611953,0.02024289628983945
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,3,0.8381151096374623,0.009360136935052572
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,4,0.876154278920616,0.0043186280005204514
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,0,0.9802952193136,1.884578972104051e-05
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,2,0.8661864185981796,0.005405102460401999
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,3,0.8297856426405835,0.010808669505560614
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,4,0.9329487606730291,0.000716243089312378
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,0,0.822202489777381,0.01224422861798353
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,1,0.6968865871905413,0.05475511707469452
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,2,0.9318897100616549,0.0007501099193828288
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,3,0.7939152572032528,0.018638835543465734
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,4,0.7761614135775217,0.02354161442763604
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,0,0.9079242687040253,0.0018192466167481706
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,1,0.5935991848770941,0.12081484777974201
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,2,0.96841302674998,7.693398893847449e-05
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,3,0.9131963004520903,0.001530535130781307
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,4,0.7594573765014532,0.02881968270449265
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,0,0.6622792441367216,0.07355344210000651
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,1,0.5835165093102912,0.1288909419896904
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,2,0.7271748558955601,0.04094703171178795
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,3,0.7369082697183147,0.0370157216672518
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,4,0.7219159720057066,0.04317213020613491
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,0,0.8973595810319037,0.002499476856786579
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,1,0.6540145328427245,0.07853263145320354
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,2,0.9470816844896075,0.0003559262259996983
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,3,0.798793471524343,0.017414760604056785
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,4,0.766501585020503,0.026513385703318352
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,0,0.6776894663079587,0.06477689572321889
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,1,0.6576248245381009,0.07633405000799688
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,2,0.796342090311639,0.018023378799051942
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,3,0.689140856921657,0.058678219175095074
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,4,0.6705942614169457,0.06873614015066103
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,0,0.6842754194067544,0.0612256583562849
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,1,0.7338112096805872,0.03824046140795786
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,2,0.8786344078919507,0.0040722405599500165
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,3,0.8914863638509409,0.0029400900210167272
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,4,0.8522000994286094,0.007203358614415384
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,0,0.7479170810940026,0.03285737031031745
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,1,0.5899049701184135,0.1237398240474465
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,2,0.864013241961245,0.005663050469813282
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,3,0.726560560314063,0.04120326937800088
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,4,0.7600546147835674,0.02861953111724766
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,0,0.8675817638279608,0.00524352512595729
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,1,0.4358953069712842,0.280322780055143
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,2,0.8724977849323057,0.004699053502733089
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,3,0.871502377377448,0.004806214049293794
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,4,0.636462032322589,0.08974474991245225
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,0,0.7407371067623334,0.035535069908202585
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,1,0.13754152986907456,0.7453436298315592
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,2,0.8584434869588686,0.006359804257501524
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,3,0.9096718109287911,0.0017199423212977748
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,4,0.429513562091493,0.2882272134157949
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,0,0.7258395762861067,0.04150524782255408
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,1,0.4140057077993773,0.3078793667149351
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,2,0.8860840192325219,0.003387122941063616
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,3,0.8531999374729967,0.007063738601380546
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,4,0.570698753672453,0.13958138247636556
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,0,0.9462124246513754,0.00037350751375720304
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,1,0.820982530302196,0.012485817170678851
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,2,0.9284819872198913,0.0008661544234609058
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,3,0.9226572389021586,0.0010905865909148318
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,4,0.8996834645928126,0.0023377397968761906
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,1,0.9467481050448351,0.00036260722071780783
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,2,0.9051882617143683,0.001982079878231783
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,3,0.8448816290057799,0.008279149903754354
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,4,0.9486969514405281,0.0003247187445212263
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,0,0.7138885174194392,0.046707103452906885
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,1,0.40763933138747765,0.3161269846214854
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,2,0.5033557119680766,0.20350786972733814
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,3,0.4943676910774294,0.21301612937354739
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,4,0.3662549994154035,0.3722134961617391
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,0,0.6943274080319848,0.05603338677616118
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,1,0.888202282224346,0.0032069637473251308
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,2,0.862959786938574,0.0057908774192851585
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,3,0.4422315456206938,0.2725814015162671
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,4,0.9314197867245828,0.0007654668867563735
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,0,0.8576726697477571,0.006460333718352682
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,1,0.6477798867796105,0.08241558395766836
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,2,0.7105249096891054,0.04823848031855015
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,3,0.7433756448219943,0.034536127920169364
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,4,0.465629371128827,0.24492880327618063
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,1,0.9215279351913577,0.0011380681078154023
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,2,0.9028698976709195,0.0021272329705264844
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,3,0.8115257987039834,0.014460915122317916
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,4,0.8840656907304268,0.003564741739845647
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,0,0.9288767434076772,0.0008521494712455959
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,1,0.8762491857760322,0.004309027650395265
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,2,0.822174167720692,0.012249803466994006
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,3,0.8388480886223416,0.009238949980481774
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,4,0.9318866818637482,0.0007502082286076188
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,0,0.6752208316271633,0.06613869004956173
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,1,0.7677373687773497,0.026120973578910495
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,2,0.7919204265038193,0.01915443839404165
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,3,0.8238198607264919,0.01192852239680578
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,4,0.8788769140000767,0.0040486473187813605
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,0,0.5937971020205063,0.1206592532108973
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,1,0.6743688104667733,0.0666125934693148
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,2,0.6092910701405022,0.10882867605607495
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,3,0.02436876480189197,0.954326651607438
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,4,0.7114255278499215,0.04782552820112736
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,0,0.5887872724291499,0.12463254240428198
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,1,0.4029552549015283,0.32226121873409685
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,2,0.19589220319331574,0.6419903458052949
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,3,0.5147894627560958,0.1917415408232741
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,4,0.43696792691727815,0.2790047957490856
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,1,0.9490060035318915,0.00031896092810029624
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,2,0.9033732116949054,0.0020951534061901173
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,4,0.9174158952141087,0.0013223130420052574
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,0,0.8698029729880158,0.00499276771087744
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,2,0.8938963574061565,0.002753683842916408
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,3,0.9427230009399408,0.00044981624708065733
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,4,0.9288091831587435,0.0008545357544848401
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,1,0.9200698352872445,0.0012013420941124318
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,2,0.8197843971795349,0.012725991028944833
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,4,0.9135236868955329,0.0015136659995374103
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,1,0.9482689395026054,0.000332805134027447
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,2,0.9334433471484072,0.0007007762613840839
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,3,0.8998371432675459,0.0023272903802322954
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,4,0.9131450099069247,0.0015331889972515346
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,0,0.9445409047411082,0.00040889964932544416
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,1,0.8996453255999854,0.00234033776853281
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,2,0.8662449830102448,0.005398257529969565
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,3,0.9506955154682739,0.00028866872380162265
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,4,0.9121357775980045,0.0015860194531010332
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,0,0.9469225816315634,0.000359102582060145
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,1,0.790872393374341,0.019428850798750914
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,2,0.7384692720332464,0.03640761031575469
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,3,0.9396936265489109,0.0005238133760109684
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,4,0.7853349194194776,0.020919442242219075
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,0,0.8636070293544758,0.005712124057773506
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,1,0.837126038633602,0.009525258316342535
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,2,0.7663953319208139,0.026547294337781743
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,3,0.8834569465544357,0.00361946726545403
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,4,0.8480938359553485,0.00779520658099071
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,2,0.9189017807616305,0.0012536521795481071
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,3,0.976785228034165,3.073554131266073e-05
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,0,0.8793267175321069,0.004005119722136405
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,1,0.8760721346635911,0.004326948446281908
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,2,0.9315137258308156,0.0007623806815109492
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,4,0.7675767218262903,0.026171781192995118
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,0,0.8483878251754778,0.007751839541749867
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,1,0.9222607240796445,0.0011071076795417618
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,2,0.9440994017259922,0.00041860181264251746
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,3,0.9640433681068886,0.00011310737614553013
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,4,0.692434840005101,0.056990052908859494
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,2,0.9370054660599566,0.0005958002530390111
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,3,0.969420946106877,6.985512173523951e-05
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,1,0.9611899818187688,0.00014192004448559492
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,2,0.9411758308443503,0.0004866843681750784
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,1,0.9597878054141521,0.00015769662952759886
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,2,0.9404428288332258,0.0005048221249291256
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,2,0.9412248237761267,0.000485487558057933
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9355663499255871,0.0006368701046576545
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9499604642147754,0.0003016036750416735
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7164442699126142,0.04556339297891151
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5643812833359342,0.14502482192576685
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.4448334653124403,0.269433453257965
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9020957808919513,0.002177191904645508
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9140262325400854,0.0014880077902407654
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6613543728531551,0.07410115498793113
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.4797794956768499,0.2289297958345603
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.49503702005526434,0.21230024172428238
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8658004484348707,0.005450353400185282
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9239450258900821,0.0010380421984977164
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6878185417270377,0.05936418242167244
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.6427492187377651,0.08562857067256696
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.19987101474191585,0.6351028985023905
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.7695981699173929,0.025536900476404875
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.851160886507116,0.00735033097799936
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7535063061583401,0.030861215825263487
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.26946310602236634,0.5186811891252074
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5071239778851739,0.19958915881626008
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.845558834843199,0.00817557674320208
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8223598748455347,0.01221327849153134
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7520379034546343,0.03137821860478068
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5986152394502113,0.1169062576526029
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.766509325140422,0.026510916638992615
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.6388656044215879,0.08815791552969902
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8220592376168137,0.012272442496278822
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.1610992186087647,0.7031245257171708
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.22938177579714764,0.584757473087143
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.16217150942988084,0.7012176634258844
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8536693780854105,0.0069987855857581984
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9079591032101378,0.0018172316533511903
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7448797028215589,0.033974472983626124
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.436470242791583,0.2796159471960331
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5113717481429286,0.195219904727713
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,0,0.8848684214582546,0.0034933971141531536
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,1,0.9247518427204778,0.0010059807632682822
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,2,0.7024798803756629,0.05202256738347333
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,3,0.6111548412929141,0.10745210550108082
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,4,0.8864983521119945,0.0033513827582610342
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,0,0.8443252756395498,0.008364861793357709
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,1,0.8199557285303699,0.012691469447090417
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,2,0.6898121736766818,0.05833178396126367
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,3,0.1445400076243653,0.732738456710739
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,4,-0.13444519427677581,0.7509364951619687
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,0,0.9461712339012929,0.00037435448514068834
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,1,0.8543556725359636,0.006904516600543572
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,2,0.7671160990392422,0.026317800283773948
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,3,0.4230508906614041,0.29634091151848907
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,4,0.29492042180464345,0.478252042515081
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,0,0.8192056092552416,0.01284304904344425
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,1,0.8053230426409881,0.015856927546595193
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,2,0.6785867773117831,0.06428605698561919
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,3,0.021028776761034942,0.960582665935811
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,4,-0.25337930013147175,0.5448562000018814
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,0,0.8101772449555595,0.014757563523095152
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,1,0.7844308170919763,0.021169355122089707
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,2,0.6407686957715764,0.08691312009391092
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,3,0.042093006210129874,0.9211687904012325
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,4,-0.2813292229519864,0.4996795026573654
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,0,0.8350456630970934,0.00987857623206292
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,1,0.879311548672376,0.004006582681021272
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,2,0.6951300585252861,0.0556305769370549
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,3,0.30955291195703166,0.4556002793087552
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,4,0.09897629382276267,0.8156278898050575
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,0,0.8313126956210078,0.010533178480029779
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,1,0.8169388413464165,0.01330802664448977
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,2,0.8065284450649773,0.015579295379409611
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,3,0.23722382427262312,0.5716108619128892
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,4,0.026088426326565897,0.9511063910298649
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,0,0.5558829816104426,0.15252894598370506
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,1,0.6390946692796851,0.08800754271923365
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,2,0.24121345447897227,0.5649619826999719
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,3,-0.13262144042688304,0.7542351704927408
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,4,-0.46784288126219703,0.24238975539995447
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,0,0.7467577882406231,0.03328104267130768
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,1,0.7611545287510072,0.028253164658278467
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,2,0.6541774611460981,0.07843262445172178
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,3,0.0830822493170678,0.8449361587214159
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,4,-0.1985934514676979,0.6373119372341151
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,0,0.9103256104990007,0.001683717098370581
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,1,0.8079204807250888,0.015262498588799642
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,2,0.7253154362419392,0.0417256201301186
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,3,0.2776474358858506,0.5055464711128136
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,4,-0.04029159995291984,0.9245349726533298
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,0,0.919432996814919,0.0012296819224052442
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,1,0.87005129824662,0.004965222567299112
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,2,0.9073703100625691,0.001851485138509531
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,3,0.8673887162219034,0.005265692212272121
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,4,0.8916723527123611,0.0029254223429427636
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,3,0.9165887813382055,0.001361572704071016
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,4,0.9225103255266087,0.0010966889416837342
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,0,0.9292369266176062,0.000839501038985727
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,1,0.9505492134066896,0.00029121355501060477
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,2,0.9415690777822339,0.00047713248045663163
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,3,0.9576750897378552,0.00018358576102437457
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,4,0.8850761460392197,0.0034750864462593195
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,2,0.9598475365356987,0.00015700207944980397
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,3,0.9317002702003969,0.000756276259880365
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,4,0.8240635545541923,0.011881405061211926
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,2,0.9645217100316719,0.00010869253777108847
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,3,0.9447465624679983,0.00040443116308794275
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,4,0.8760879368136391,0.0043253470355424355
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,0,0.9469408250476264,0.0003587374254477132
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,1,0.9498225876442147,0.000304071618749767
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,2,0.9413785598975157,0.0004817446027243596
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,3,0.8197292667265523,0.012737111858293043
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,4,0.9057861973602506,0.0019457176947306907
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,0,0.9413025091864188,0.000483593804288479
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,1,0.9083254977326705,0.001796125778484392
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,2,0.8626635526406192,0.005827152548807454
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,3,0.8043418970652331,0.016085184583393794
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,4,0.8946872852632068,0.0026942203148939193
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,0,0.9025950086780581,0.002144887259438991
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,1,0.7564264003460613,0.02984872863501939
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,2,0.9033527343998258,0.002096452391428316
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,3,0.8494277893147777,0.0075996673267298715
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,4,0.8534145445088147,0.007033997470343221