Yotam-Perlitz commited on
Commit
0c6c3ca
1 Parent(s): 6e15caa

remove old cache

Browse files

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

cache/aggregate_scoress_cache_151f5bfbf87ac7384c2759731c72ec0c.csv DELETED
@@ -1,122 +0,0 @@
1
- model,score
2
- gpt_4o_2024_05_13,0.9847612958226769
3
- claude_3_5_sonnet_20240620,0.982905982905983
4
- gpt_4o_2024_08_06,0.9575873827791986
5
- gpt_4_turbo_2024_04_09,0.9428463693169576
6
- gpt_4_0125_preview,0.9171132221004344
7
- mistral_large_2407,0.8868286445012787
8
- llama3_1_405b_instruct,0.8672150411280846
9
- yi_large_preview,0.8641553641553642
10
- hermes_3_llama3_1_70b,0.8626160990712074
11
- smaug_qwen2_72b_instruct,0.8593911248710011
12
- claude_3_opus_20240229,0.8573567665639277
13
- llama3_1_70b_instruct,0.8528408270971201
14
- athene_70b,0.8493788819875776
15
- deepseek_coder_v2,0.8444160272804775
16
- qwen2_72b_instruct,0.8354710666091739
17
- yi_large,0.8346273291925466
18
- gpt_4_0613,0.8146763722211293
19
- llama3_70b_instruct,0.8127546753337573
20
- llama3_70b,0.8105600539811066
21
- gemma_2_27b_it,0.8045273029120115
22
- gpt_4o_mini_2024_07_18,0.8032033326150972
23
- gemma_2_9b_it_dpo,0.790057915057915
24
- llama3_instruct_8b_simpo,0.7884068278805121
25
- phi_3_5_moe_instruct,0.7808307533539731
26
- qwen1_5_110b_chat,0.776004448721167
27
- qwen1_5_32b,0.7658569500674763
28
- yi_1_5_34b_chat,0.7553884711779449
29
- llama_2_70b,0.7303193882141251
30
- mixtral_8x22b_instruct_v0_1,0.7256023690940907
31
- gemma_2_9b_it_simpo,0.7199248120300753
32
- qwen1_5_32b_chat,0.7149122807017544
33
- mixtral_8x22b_v0_1,0.7135490753911806
34
- yi_34b,0.7128879892037787
35
- internlm2_5_20b_chat,0.6842105263157895
36
- phi_3_small_128k_instruct,0.66937564499484
37
- phi_3_medium_4k_instruct,0.6675079642841117
38
- claude_3_sonnet_20240229,0.653911731916847
39
- gemma_2_9b_it,0.6422797189051059
40
- infinity_instruct_3m_0625_llama3_8b,0.6273115220483642
41
- mistral_v0_1_7b,0.6239316239316239
42
- phi_3_5_mini_instruct,0.6202270381836945
43
- mistral_medium,0.6122209165687427
44
- mistral_large_2402,0.6058211467418628
45
- claude_instant_1_2,0.6049896049896051
46
- claude_2_0,0.6020066889632107
47
- yi_1_5_9b_chat,0.5881787802840435
48
- qwen1_5_14b,0.5770917678812416
49
- command_r_plus,0.5761033510394125
50
- llama_65b,0.5736992052781527
51
- gpt_3_5_turbo_0613,0.5724018332713985
52
- qwen1_5_72b_chat,0.5668371367348349
53
- phi_3_mini_4k_instruct,0.5548245614035088
54
- deepseek_llm_67b_chat,0.5506756756756757
55
- claude_3_haiku_20240307,0.549424005945745
56
- yi_34b_chat,0.5455449728905107
57
- dbrx_instructruct,0.5344129554655871
58
- jurassic_2_jumbo_178b,0.532051282051282
59
- llama3_1_8b_instruct,0.5175232440678665
60
- claude_2_1,0.5110980545763154
61
- qwen2_7b_instruct,0.5034227726178191
62
- mistral_small_2402,0.49924585218702866
63
- mixtral_8x7b_v0_1,0.49324324324324326
64
- glm_4_9b_chat,0.46499582289055974
65
- qwen1_5_14b_chat,0.4621068436857911
66
- phi_3_small_8k_instruct,0.45481670929241264
67
- gpt_3_5_turbo_0301,0.4528985507246377
68
- snorkel_mistral_pairrm_dpo,0.4521151586368978
69
- gemma_7b,0.4471997300944669
70
- gpt_3_5_turbo_0125,0.4401920188365201
71
- llama3_8b,0.43302968960863697
72
- dbrx_instruct,0.4266409266409266
73
- llama3_8b_instruct,0.420135922511747
74
- phi_3_mini_128k_instruct,0.4153205904787544
75
- llama_2_13b,0.41490478332583597
76
- jurassic_2_grande_17b,0.39529914529914534
77
- openhermes_2_5_mistral_7b,0.3832617447168531
78
- mistral_7b_v0_3,0.3737553342816501
79
- mixtral_8x7b_instruct_v0_1,0.3713078251895724
80
- qwen1_5_7b,0.3508771929824561
81
- yi_1_5_6b_chat,0.3354636591478697
82
- falcon_40b,0.32812265707002547
83
- command_r,0.32386140074759
84
- internlm2_chat_20b,0.32252252252252256
85
- mistral_7b_v0_2,0.31970128022759603
86
- luminous_supreme_70b,0.30128205128205127
87
- starling_lm_7b_alpha,0.29823530624445954
88
- yi_6b,0.29234143049932526
89
- mistral_7b_instruct_v0_2,0.28609513981031004
90
- zephyr_7b_alpha,0.2838442157327606
91
- zephyr_7b_beta,0.2666234345800909
92
- gemma_1_1_7b_it,0.26226051061156724
93
- mistral_7b_instruct_v0_3,0.2537839697282422
94
- starling_lm_7b_beta,0.25234441602728047
95
- llama_2_7b,0.2391288049182786
96
- luminous_extended_30b,0.2329059829059829
97
- alpaca_7b,0.22072072072072071
98
- vicuna_33b_v1_3,0.2056404230317274
99
- phi_2,0.20087901666849037
100
- qwen2_1_5b_instruct,0.19711042311661506
101
- yi_6b_chat,0.1938854489164087
102
- qwen1_5_7b_chat,0.1916569245052217
103
- tulu_2_dpo_70b,0.17624223602484473
104
- qwen1_5_4b_chat,0.1674406604747162
105
- llama_2_70b_chat,0.15527950310559005
106
- gpt_neox_20b,0.14400584795321636
107
- vicuna_7b_v1_5,0.13619501854795973
108
- falcon_40b_instruct,0.13264580369843526
109
- gemma_7b_it,0.12136319058515854
110
- falcon_7b,0.11407257459889038
111
- gpt_j_6b,0.10160818713450293
112
- luminous_base_13b,0.08333333333333333
113
- llama_2_7b_chat,0.08304448781801049
114
- gemma_1_1_2b_it,0.07665903890160183
115
- olmo_7b,0.06545209176788123
116
- gemma_2b_it,0.05921052631578947
117
- qwen1_5_1_8b_chat,0.059167526659786716
118
- qwen2_0_5b_instruct,0.059081527347781215
119
- pythia_12b,0.054093567251461985
120
- pythia_6_9b,0.019736842105263157
121
- falcon_7b_instruct,0.013513513513513514
122
- qwen1_5_0_5b_chat,0.013157894736842105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cache/aggregate_scoress_cache_1b58bbc4e0d124b0a524da1001369741.csv DELETED
@@ -1,122 +0,0 @@
1
- model,score
2
- gpt_4o_2024_05_13,0.9847612958226769
3
- claude_3_5_sonnet_20240620,0.982905982905983
4
- gpt_4o_2024_08_06,0.9575873827791986
5
- gpt_4_turbo_2024_04_09,0.9428463693169576
6
- gpt_4_0125_preview,0.9171132221004344
7
- mistral_large_2407,0.8868286445012787
8
- llama3_1_405b_instruct,0.8672150411280846
9
- yi_large_preview,0.8641553641553642
10
- hermes_3_llama3_1_70b,0.8626160990712074
11
- smaug_qwen2_72b_instruct,0.8593911248710011
12
- claude_3_opus_20240229,0.8573567665639277
13
- llama3_1_70b_instruct,0.8528408270971201
14
- athene_70b,0.8493788819875776
15
- deepseek_coder_v2,0.8444160272804775
16
- qwen2_72b_instruct,0.8354710666091739
17
- yi_large,0.8346273291925466
18
- gpt_4_0613,0.8146763722211293
19
- llama3_70b_instruct,0.8127546753337573
20
- llama3_70b,0.8105600539811066
21
- gemma_2_27b_it,0.8045273029120115
22
- gpt_4o_mini_2024_07_18,0.8032033326150972
23
- gemma_2_9b_it_dpo,0.790057915057915
24
- llama3_instruct_8b_simpo,0.7884068278805121
25
- phi_3_5_moe_instruct,0.7808307533539731
26
- qwen1_5_110b_chat,0.776004448721167
27
- qwen1_5_32b,0.7658569500674763
28
- yi_1_5_34b_chat,0.7553884711779449
29
- llama_2_70b,0.7303193882141251
30
- mixtral_8x22b_instruct_v0_1,0.7256023690940907
31
- gemma_2_9b_it_simpo,0.7199248120300753
32
- qwen1_5_32b_chat,0.7149122807017544
33
- mixtral_8x22b_v0_1,0.7135490753911806
34
- yi_34b,0.7128879892037787
35
- internlm2_5_20b_chat,0.6842105263157895
36
- phi_3_small_128k_instruct,0.66937564499484
37
- phi_3_medium_4k_instruct,0.6675079642841117
38
- claude_3_sonnet_20240229,0.653911731916847
39
- gemma_2_9b_it,0.6422797189051059
40
- infinity_instruct_3m_0625_llama3_8b,0.6273115220483642
41
- mistral_v0_1_7b,0.6239316239316239
42
- phi_3_5_mini_instruct,0.6202270381836945
43
- mistral_medium,0.6122209165687427
44
- mistral_large_2402,0.6058211467418628
45
- claude_instant_1_2,0.6049896049896051
46
- claude_2_0,0.6020066889632107
47
- yi_1_5_9b_chat,0.5881787802840435
48
- qwen1_5_14b,0.5770917678812416
49
- command_r_plus,0.5761033510394125
50
- llama_65b,0.5736992052781527
51
- gpt_3_5_turbo_0613,0.5724018332713985
52
- qwen1_5_72b_chat,0.5668371367348349
53
- phi_3_mini_4k_instruct,0.5548245614035088
54
- deepseek_llm_67b_chat,0.5506756756756757
55
- claude_3_haiku_20240307,0.549424005945745
56
- yi_34b_chat,0.5455449728905107
57
- dbrx_instructruct,0.5344129554655871
58
- jurassic_2_jumbo_178b,0.532051282051282
59
- llama3_1_8b_instruct,0.5175232440678665
60
- claude_2_1,0.5110980545763154
61
- qwen2_7b_instruct,0.5034227726178191
62
- mistral_small_2402,0.49924585218702866
63
- mixtral_8x7b_v0_1,0.49324324324324326
64
- glm_4_9b_chat,0.46499582289055974
65
- qwen1_5_14b_chat,0.4621068436857911
66
- phi_3_small_8k_instruct,0.45481670929241264
67
- gpt_3_5_turbo_0301,0.4528985507246377
68
- snorkel_mistral_pairrm_dpo,0.4521151586368978
69
- gemma_7b,0.4471997300944669
70
- gpt_3_5_turbo_0125,0.4401920188365201
71
- llama3_8b,0.43302968960863697
72
- dbrx_instruct,0.4266409266409266
73
- llama3_8b_instruct,0.420135922511747
74
- phi_3_mini_128k_instruct,0.4153205904787544
75
- llama_2_13b,0.41490478332583597
76
- jurassic_2_grande_17b,0.39529914529914534
77
- openhermes_2_5_mistral_7b,0.3832617447168531
78
- mistral_7b_v0_3,0.3737553342816501
79
- mixtral_8x7b_instruct_v0_1,0.3713078251895724
80
- qwen1_5_7b,0.3508771929824561
81
- yi_1_5_6b_chat,0.3354636591478697
82
- falcon_40b,0.32812265707002547
83
- command_r,0.32386140074759
84
- internlm2_chat_20b,0.32252252252252256
85
- mistral_7b_v0_2,0.31970128022759603
86
- luminous_supreme_70b,0.30128205128205127
87
- starling_lm_7b_alpha,0.29823530624445954
88
- yi_6b,0.29234143049932526
89
- mistral_7b_instruct_v0_2,0.28609513981031004
90
- zephyr_7b_alpha,0.2838442157327606
91
- zephyr_7b_beta,0.2666234345800909
92
- gemma_1_1_7b_it,0.26226051061156724
93
- mistral_7b_instruct_v0_3,0.2537839697282422
94
- starling_lm_7b_beta,0.25234441602728047
95
- llama_2_7b,0.2391288049182786
96
- luminous_extended_30b,0.2329059829059829
97
- alpaca_7b,0.22072072072072071
98
- vicuna_33b_v1_3,0.2056404230317274
99
- phi_2,0.20087901666849037
100
- qwen2_1_5b_instruct,0.19711042311661506
101
- yi_6b_chat,0.1938854489164087
102
- qwen1_5_7b_chat,0.1916569245052217
103
- tulu_2_dpo_70b,0.17624223602484473
104
- qwen1_5_4b_chat,0.1674406604747162
105
- llama_2_70b_chat,0.15527950310559005
106
- gpt_neox_20b,0.14400584795321636
107
- vicuna_7b_v1_5,0.13619501854795973
108
- falcon_40b_instruct,0.13264580369843526
109
- gemma_7b_it,0.12136319058515854
110
- falcon_7b,0.11407257459889038
111
- gpt_j_6b,0.10160818713450293
112
- luminous_base_13b,0.08333333333333333
113
- llama_2_7b_chat,0.08304448781801049
114
- gemma_1_1_2b_it,0.07665903890160183
115
- olmo_7b,0.06545209176788123
116
- gemma_2b_it,0.05921052631578947
117
- qwen1_5_1_8b_chat,0.059167526659786716
118
- qwen2_0_5b_instruct,0.059081527347781215
119
- pythia_12b,0.054093567251461985
120
- pythia_6_9b,0.019736842105263157
121
- falcon_7b_instruct,0.013513513513513514
122
- qwen1_5_0_5b_chat,0.013157894736842105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cache/aggregate_scoress_cache_741f08262e15cba4bd6c8b25f2b138ca.csv DELETED
@@ -1,62 +0,0 @@
1
- model,score
2
- claude_3_5_sonnet_20240620,1.0
3
- gpt_4o_2024_05_13,0.9833333333333333
4
- gpt_4_0125_preview,0.9666666666666667
5
- gpt_4o_2024_08_06,0.95
6
- athene_70b,0.9333333333333333
7
- gpt_4o_mini,0.9166666666666666
8
- gemini_1_5_pro_api_preview,0.9
9
- mistral_large_2407,0.8833333333333333
10
- llama3_1_405b_instruct,0.8666666666666667
11
- glm_4_0520,0.85
12
- yi_large,0.8333333333333334
13
- deepseek_coder_v2,0.8166666666666667
14
- claude_3_opus_20240229,0.8
15
- gemma_2_27b_it,0.7833333333333333
16
- llama3_1_70b_instruct,0.75
17
- glm_4_0116,0.75
18
- glm_4_air,0.7333333333333333
19
- gpt_4_0314,0.7166666666666667
20
- gemini_1_5_flash_api_preview,0.7
21
- qwen2_72b_instruct,0.6833333333333333
22
- claude_3_sonnet_20240229,0.6666666666666666
23
- llama3_70b_instruct,0.65
24
- claude_3_haiku_20240307,0.6333333333333333
25
- gpt_4_0613,0.6166666666666667
26
- mistral_large_2402,0.6
27
- mixtral_8x22b_instruct_v0_1,0.5833333333333334
28
- qwen1_5_72b_chat,0.5666666666666667
29
- phi_3_medium_4k_instruct,0.55
30
- command_r_plus,0.5333333333333333
31
- mistral_medium,0.5166666666666667
32
- internlm2_5_20b_chat,0.5
33
- phi_3_small_8k_instruct,0.48333333333333334
34
- mistral_next,0.4666666666666667
35
- gpt_3_5_turbo_0613,0.45
36
- dbrx_instructruct_preview,0.43333333333333335
37
- internlm2_20b_chat,0.4166666666666667
38
- claude_2_0,0.4
39
- mixtral_8x7b_instruct_v0_1,0.38333333333333336
40
- gpt_3_5_turbo_0125,0.36666666666666664
41
- yi_34b_chat,0.35
42
- starling_lm_7b_beta,0.3333333333333333
43
- claude_2_1,0.31666666666666665
44
- llama3_1_8b_instruct,0.3
45
- snorkel_mistral_pairrm_dpo,0.2833333333333333
46
- llama3_8b_instruct,0.26666666666666666
47
- gpt_3_5_turbo_1106,0.25
48
- gpt_3_5_turbo_0301,0.23333333333333334
49
- gemini_1_0_pro,0.21666666666666667
50
- snowflake_arctic_instruct,0.2
51
- command_r,0.18333333333333332
52
- phi_3_mini_128k_instruct,0.16666666666666666
53
- tulu_2_dpo_70b,0.15
54
- starling_lm_7b_alpha,0.13333333333333333
55
- mistral_7b_instruct,0.11666666666666667
56
- gemma_1_1_7b_it,0.1
57
- llama_2_70b_chat,0.08333333333333333
58
- vicuna_33b_v1_3,0.06666666666666667
59
- gemma_7b_it,0.05
60
- llama_2_7b_chat,0.03333333333333333
61
- gemma_1_1_2b_it,0.016666666666666666
62
- gemma_2b_it,0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cache/aggregate_scoress_cache_dcbcd453e19427bcbf89a901d3f2a925.csv DELETED
@@ -1,62 +0,0 @@
1
- model,score
2
- claude_3_5_sonnet_20240620,1.0
3
- gpt_4o_2024_05_13,0.9833333333333333
4
- gpt_4_0125_preview,0.9666666666666667
5
- gpt_4o_2024_08_06,0.95
6
- athene_70b,0.9333333333333333
7
- gpt_4o_mini,0.9166666666666666
8
- gemini_1_5_pro_api_preview,0.9
9
- mistral_large_2407,0.8833333333333333
10
- llama3_1_405b_instruct,0.8666666666666667
11
- glm_4_0520,0.85
12
- yi_large,0.8333333333333334
13
- deepseek_coder_v2,0.8166666666666667
14
- claude_3_opus_20240229,0.8
15
- gemma_2_27b_it,0.7833333333333333
16
- llama3_1_70b_instruct,0.75
17
- glm_4_0116,0.75
18
- glm_4_air,0.7333333333333333
19
- gpt_4_0314,0.7166666666666667
20
- gemini_1_5_flash_api_preview,0.7
21
- qwen2_72b_instruct,0.6833333333333333
22
- claude_3_sonnet_20240229,0.6666666666666666
23
- llama3_70b_instruct,0.65
24
- claude_3_haiku_20240307,0.6333333333333333
25
- gpt_4_0613,0.6166666666666667
26
- mistral_large_2402,0.6
27
- mixtral_8x22b_instruct_v0_1,0.5833333333333334
28
- qwen1_5_72b_chat,0.5666666666666667
29
- phi_3_medium_4k_instruct,0.55
30
- command_r_plus,0.5333333333333333
31
- mistral_medium,0.5166666666666667
32
- internlm2_5_20b_chat,0.5
33
- phi_3_small_8k_instruct,0.48333333333333334
34
- mistral_next,0.4666666666666667
35
- gpt_3_5_turbo_0613,0.45
36
- dbrx_instructruct_preview,0.43333333333333335
37
- internlm2_20b_chat,0.4166666666666667
38
- claude_2_0,0.4
39
- mixtral_8x7b_instruct_v0_1,0.38333333333333336
40
- gpt_3_5_turbo_0125,0.36666666666666664
41
- yi_34b_chat,0.35
42
- starling_lm_7b_beta,0.3333333333333333
43
- claude_2_1,0.31666666666666665
44
- llama3_1_8b_instruct,0.3
45
- snorkel_mistral_pairrm_dpo,0.2833333333333333
46
- llama3_8b_instruct,0.26666666666666666
47
- gpt_3_5_turbo_1106,0.25
48
- gpt_3_5_turbo_0301,0.23333333333333334
49
- gemini_1_0_pro,0.21666666666666667
50
- snowflake_arctic_instruct,0.2
51
- command_r,0.18333333333333332
52
- phi_3_mini_128k_instruct,0.16666666666666666
53
- tulu_2_dpo_70b,0.15
54
- starling_lm_7b_alpha,0.13333333333333333
55
- mistral_7b_instruct,0.11666666666666667
56
- gemma_1_1_7b_it,0.1
57
- llama_2_70b_chat,0.08333333333333333
58
- vicuna_33b_v1_3,0.06666666666666667
59
- gemma_7b_it,0.05
60
- llama_2_7b_chat,0.03333333333333333
61
- gemma_1_1_2b_it,0.016666666666666666
62
- gemma_2b_it,0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cache/agreements_cache_151f5bfbf87ac7384c2759731c72ec0c.csv DELETED
The diff for this file is too large to render. See raw diff
 
cache/agreements_cache_1b58bbc4e0d124b0a524da1001369741.csv DELETED
The diff for this file is too large to render. See raw diff
 
cache/agreements_cache_741f08262e15cba4bd6c8b25f2b138ca.csv DELETED
@@ -1,711 +0,0 @@
1
- scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
2
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2778254199662385,0.2400384567875128
3
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.40368671387966554,0.08581278065055217
4
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408
5
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2778254199662385,0.2400384567875128
6
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825
7
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.018181818181818184,1.0
8
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0
9
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.05454545454545454,0.8792698312489979
10
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,-0.018181818181818184,1.0
11
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.1272727272727273,0.6480954385121052
12
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979
13
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,-0.018181818181818184,1.0
14
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,-0.018181818181818184,1.0
15
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979
16
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,-0.05454545454545454,0.8792698312489979
17
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.05454545454545454,0.8792698312489979
18
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.23636363636363636,0.3587114698573032
19
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2,0.4453821448613115
20
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.05454545454545454,0.8792698312489979
21
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052
22
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031
23
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
24
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.697277051246695,0.003004262239398284
25
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.587180674734059,0.01246215829454031
26
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077
27
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
28
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
29
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
30
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
31
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249
32
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.2727272727272727,0.2829668209876543
33
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.34545454545454546,0.16457331248997917
34
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295
35
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543
36
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917
37
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249
38
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
39
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249
40
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249
41
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
42
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.18349396085439343,0.43487965849578336
43
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.2935903373670295,0.21152242941072896
44
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.2727272727272727,0.2829668209876543
45
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.18349396085439343,0.43487965849578336
46
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.2568915451961508,0.27429882739587574
47
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064
48
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
49
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583
50
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064
51
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583
52
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,0,0.34545454545454546,0.16457331248997917
53
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072
54
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4909090909090909,0.04053235730319064
55
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,3,0.34545454545454546,0.16457331248997917
56
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,11,4,0.34545454545454546,0.16457331248997917
57
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
58
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
59
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
60
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
61
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
62
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
63
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
64
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
65
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6727272727272727,0.0031063111271444604
66
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
67
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
68
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072
69
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.45454545454545453,0.06017015392015392
70
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
71
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
72
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
73
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583
74
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822
75
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
76
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
77
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
78
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
79
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
80
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
81
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
82
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
83
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305
84
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
85
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4403855060505442,0.06091869077971648
86
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031
87
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
88
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,1,0.45454545454545453,0.06017015392015392
89
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38181818181818183,0.12097096961680295
90
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249
91
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,11,4,0.587180674734059,0.01246215829454031
92
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
93
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
94
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
95
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
96
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
97
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
98
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08
99
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
100
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08
101
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
102
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
103
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
104
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
105
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
106
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
107
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
108
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
109
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
110
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064
111
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
112
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268
113
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6605782590758164,0.004936818556325077
114
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648
115
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
116
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284
117
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
118
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
119
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
120
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
121
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
122
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268
123
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4403855060505442,0.06091869077971648
124
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583
125
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6363636363636364,0.005707170915504249
126
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583
127
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
128
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583
129
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005
130
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
131
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5636363636363636,0.016540504248837583
132
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7339758434175737,0.0017872890369872653
133
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.587180674734059,0.01246215829454031
134
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6482593132545567,0.006117582447622459
135
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.759389481241052,0.0013210471654040124
136
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.759389481241052,0.0013210471654040124
137
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249
138
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
139
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
140
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.2727272727272727,0.2829668209876543
141
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
142
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.38181818181818183,0.12097096961680295
143
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064
144
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.38895558795273394,0.10000137830747906
145
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295
146
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.697277051246695,0.003004262239398284
147
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064
148
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,1,0.41818181818181815,0.08656124739458072
149
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6000000000000001,0.00994553671637005
150
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
151
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6000000000000001,0.00994553671637005
152
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
153
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8807710121010884,0.00017812930545546289
154
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8807710121010884,0.00017812930545546289
155
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8807710121010884,0.00017812930545546289
156
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
157
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
158
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
159
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
160
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
161
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
162
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4770842982214229,0.042330229121360724
163
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
164
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6605782590758164,0.004936818556325077
165
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8440722199302099,0.0003281542287518694
166
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7339758434175737,0.0017872890369872653
167
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
168
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
169
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
170
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
171
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
172
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
173
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
174
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
175
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
176
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
177
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.4909090909090909,0.04053235730319064
178
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6363636363636364,0.005707170915504249
179
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7454545454545454,0.000759529822029822
180
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
181
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
182
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
183
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
184
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249
185
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06
186
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
187
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
188
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
189
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
190
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.4909090909090909,0.04053235730319064
191
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
192
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
193
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
194
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
195
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
196
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
197
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
198
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
199
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
200
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
201
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
202
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
203
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
204
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
205
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
206
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
207
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
208
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
209
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
210
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
211
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
212
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
213
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08
214
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
215
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
216
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
217
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
218
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
219
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
220
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
221
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06
222
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
223
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
224
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
225
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
226
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07
227
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,0,1.0,5.010421677088344e-08
228
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,1,1.0,5.010421677088344e-08
229
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
230
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,3,1.0,5.010421677088344e-08
231
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,11,4,1.0,5.010421677088344e-08
232
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
233
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
234
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407
235
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305
236
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.38181818181818183,0.12097096961680295
237
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
238
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5636363636363636,0.016540504248837583
239
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305
240
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
241
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4909090909090909,0.04053235730319064
242
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.45454545454545453,0.06017015392015392
243
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
244
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.34545454545454546,0.16457331248997917
245
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.41818181818181815,0.08656124739458072
246
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543
247
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.5741725345968929,0.015177848122929492
248
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.3519121986239021,0.1366995137219537
249
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.42599897728156577,0.07162425926742408
250
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.38181818181818183,0.12097096961680295
251
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.4403855060505442,0.06091869077971648
252
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
253
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
254
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
255
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
256
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
257
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.6000000000000001,0.00994553671637005
258
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.5272727272727272,0.02638447971781305
259
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.3090909090909091,0.21834651074234407
260
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392
261
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.2727272727272727,0.2829668209876543
262
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,0,0.4403855060505442,0.06091869077971648
263
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295
264
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648
265
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,3,0.45454545454545453,0.06017015392015392
266
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825
267
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,0,0.2,0.4453821448613115
268
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,1,0.38181818181818183,0.12097096961680295
269
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,2,0.41818181818181815,0.08656124739458072
270
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,3,0.5272727272727272,0.02638447971781305
271
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
272
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
273
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
274
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
275
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
276
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
277
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7454545454545454,0.000759529822029822
278
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
279
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
280
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.8073734277593311,0.0005907573118657002
281
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
282
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.587180674734059,0.01246215829454031
283
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6482593132545567,0.006117582447622459
284
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
285
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7706746355884524,0.0010393630991335228
286
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5371291452680612,0.02311942970946668
287
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.6238794669049377,0.007931923532795268
288
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.4909090909090909,0.04053235730319064
289
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8440722199302099,0.0003281542287518694
290
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
291
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.36698792170878686,0.11834981273562825
292
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
293
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
294
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,1.0,5.010421677088344e-08
295
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
296
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
297
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
298
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8073734277593311,0.0005907573118657002
299
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
300
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
301
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.6605782590758164,0.004936818556325077
302
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,-0.0909090909090909,0.7611503928170594
303
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.07339758434175737,0.7547764265871044
304
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.4403855060505442,0.06091869077971648
305
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.3302891295379082,0.15985367483762747
306
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.1272727272727273,0.6480954385121052
307
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
308
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
309
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.8705196492275474,0.00023202582506637044
310
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7479575920067658,0.001637274718449882
311
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5983660736054126,0.01175728488671479
312
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
313
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.6000000000000001,0.00994553671637005
314
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
315
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
316
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.5272727272727272,0.02638447971781305
317
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,0,0.7706746355884524,0.0010393630991335228
318
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
319
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,2,0.5272727272727272,0.02638447971781305
320
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,3,0.7454545454545454,0.000759529822029822
321
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
322
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
323
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
324
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
325
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
326
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
327
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6731618328060892,0.004677734981047257
328
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.759389481241052,0.0013210471654040124
329
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7339758434175737,0.0017872890369872653
330
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6238794669049377,0.007931923532795268
331
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
332
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6363636363636364,0.005707170915504249
333
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
334
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7706746355884524,0.0010393630991335228
335
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
336
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7706746355884524,0.0010393630991335228
337
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
338
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
339
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
340
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
341
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
342
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.5636363636363636,0.016540504248837583
343
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
344
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.6363636363636364,0.005707170915504249
345
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
346
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.6363636363636364,0.005707170915504249
347
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
348
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
349
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.5636363636363636,0.016540504248837583
350
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
351
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
352
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,0,0.6727272727272727,0.0031063111271444604
353
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,1,0.7454545454545454,0.000759529822029822
354
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
355
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,3,0.6000000000000001,0.00994553671637005
356
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,11,4,0.7454545454545454,0.000759529822029822
357
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,0,0.2778254199662385,0.2400384567875128
358
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,1,0.40368671387966554,0.08581278065055217
359
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408
360
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,3,0.2778254199662385,0.2400384567875128
361
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825
362
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,0,-0.018181818181818184,1.0
363
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0
364
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,2,-0.05454545454545454,0.8792698312489979
365
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,3,-0.018181818181818184,1.0
366
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,11,4,-0.1272727272727273,0.6480954385121052
367
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979
368
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,1,-0.018181818181818184,1.0
369
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,2,-0.018181818181818184,1.0
370
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979
371
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,11,4,-0.05454545454545454,0.8792698312489979
372
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,0,0.05454545454545454,0.8792698312489979
373
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,1,0.23636363636363636,0.3587114698573032
374
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,2,0.2,0.4453821448613115
375
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,3,0.05454545454545454,0.8792698312489979
376
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052
377
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031
378
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
379
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,2,0.697277051246695,0.003004262239398284
380
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,3,0.587180674734059,0.01246215829454031
381
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077
382
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
383
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
384
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
385
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
386
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249
387
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,0,0.2727272727272727,0.2829668209876543
388
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,1,0.34545454545454546,0.16457331248997917
389
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295
390
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543
391
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917
392
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249
393
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
394
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249
395
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249
396
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
397
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,0,0.18349396085439343,0.43487965849578336
398
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,1,0.2935903373670295,0.21152242941072896
399
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,2,0.2727272727272727,0.2829668209876543
400
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,3,0.18349396085439343,0.43487965849578336
401
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,11,4,0.2568915451961508,0.27429882739587574
402
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064
403
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
404
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583
405
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064
406
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583
407
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,0,0.34545454545454546,0.16457331248997917
408
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072
409
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,2,0.4909090909090909,0.04053235730319064
410
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,3,0.34545454545454546,0.16457331248997917
411
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,11,4,0.34545454545454546,0.16457331248997917
412
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
413
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
414
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
415
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
416
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
417
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
418
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
419
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
420
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6727272727272727,0.0031063111271444604
421
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
422
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
423
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072
424
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.45454545454545453,0.06017015392015392
425
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
426
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
427
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
428
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583
429
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822
430
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
431
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
432
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
433
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
434
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
435
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
436
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
437
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
438
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305
439
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
440
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.4403855060505442,0.06091869077971648
441
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031
442
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
443
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,1,0.45454545454545453,0.06017015392015392
444
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,2,0.38181818181818183,0.12097096961680295
445
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249
446
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,11,4,0.587180674734059,0.01246215829454031
447
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
448
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
449
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
450
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
451
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
452
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
453
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08
454
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
455
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08
456
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
457
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
458
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
459
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
460
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
461
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
462
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
463
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
464
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
465
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064
466
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
467
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268
468
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,1,0.6605782590758164,0.004936818556325077
469
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648
470
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
471
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284
472
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,0,0.8545454545454545,4.624619207952541e-05
473
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
474
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
475
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
476
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
477
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268
478
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,1,0.4403855060505442,0.06091869077971648
479
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583
480
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,3,0.6363636363636364,0.005707170915504249
481
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583
482
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
483
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583
484
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005
485
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
486
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,11,4,0.5636363636363636,0.016540504248837583
487
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,0,0.7339758434175737,0.0017872890369872653
488
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,1,0.587180674734059,0.01246215829454031
489
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,2,0.6482593132545567,0.006117582447622459
490
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,3,0.759389481241052,0.0013210471654040124
491
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,11,4,0.759389481241052,0.0013210471654040124
492
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249
493
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
494
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
495
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,3,0.2727272727272727,0.2829668209876543
496
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
497
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,0,0.38181818181818183,0.12097096961680295
498
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064
499
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,2,0.38895558795273394,0.10000137830747906
500
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295
501
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,11,4,0.697277051246695,0.003004262239398284
502
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064
503
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,1,0.41818181818181815,0.08656124739458072
504
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,2,0.6000000000000001,0.00994553671637005
505
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
506
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,11,4,0.6000000000000001,0.00994553671637005
507
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
508
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,1,0.8807710121010884,0.00017812930545546289
509
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,2,0.8807710121010884,0.00017812930545546289
510
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,3,0.8807710121010884,0.00017812930545546289
511
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
512
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
513
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
514
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
515
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
516
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
517
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,0,0.4770842982214229,0.042330229121360724
518
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
519
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,2,0.6605782590758164,0.004936818556325077
520
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,3,0.8440722199302099,0.0003281542287518694
521
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,11,4,0.7339758434175737,0.0017872890369872653
522
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
523
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,1,0.6727272727272727,0.0031063111271444604
524
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
525
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
526
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
527
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
528
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
529
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
530
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
531
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
532
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,0,0.4909090909090909,0.04053235730319064
533
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,1,0.6363636363636364,0.005707170915504249
534
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,2,0.7454545454545454,0.000759529822029822
535
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
536
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
537
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
538
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
539
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249
540
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,3,0.9272727272727274,3.2567740901074234e-06
541
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
542
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
543
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,1,0.7090909090909091,0.0015912097162097162
544
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
545
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,3,0.4909090909090909,0.04053235730319064
546
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
547
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
548
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
549
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
550
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
551
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,11,4,0.8181818181818182,0.00013227513227513228
552
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
553
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
554
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
555
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
556
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
557
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
558
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
559
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
560
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
561
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
562
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
563
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
564
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
565
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
566
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
567
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
568
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08
569
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,2,0.9272727272727274,3.2567740901074234e-06
570
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,3,0.9636363636363636,5.511463844797178e-07
571
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
572
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,0,0.9272727272727274,3.2567740901074234e-06
573
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
574
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
575
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
576
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,11,4,0.9272727272727274,3.2567740901074234e-06
577
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,0,0.9636363636363636,5.511463844797178e-07
578
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,1,0.9636363636363636,5.511463844797178e-07
579
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
580
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,3,0.8909090909090909,1.3728555395222063e-05
581
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,11,4,0.9636363636363636,5.511463844797178e-07
582
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,0,1.0,5.010421677088344e-08
583
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,1,1.0,5.010421677088344e-08
584
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
585
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,3,1.0,5.010421677088344e-08
586
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,11,4,1.0,5.010421677088344e-08
587
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
588
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
589
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407
590
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305
591
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.38181818181818183,0.12097096961680295
592
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
593
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5636363636363636,0.016540504248837583
594
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305
595
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
596
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4909090909090909,0.04053235730319064
597
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.45454545454545453,0.06017015392015392
598
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
599
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.34545454545454546,0.16457331248997917
600
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.41818181818181815,0.08656124739458072
601
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543
602
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.5741725345968929,0.015177848122929492
603
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.3519121986239021,0.1366995137219537
604
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.42599897728156577,0.07162425926742408
605
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.38181818181818183,0.12097096961680295
606
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.4403855060505442,0.06091869077971648
607
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
608
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
609
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.6727272727272727,0.0031063111271444604
610
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
611
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
612
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.6000000000000001,0.00994553671637005
613
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.5272727272727272,0.02638447971781305
614
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.3090909090909091,0.21834651074234407
615
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392
616
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.2727272727272727,0.2829668209876543
617
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,0,0.4403855060505442,0.06091869077971648
618
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295
619
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648
620
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,3,0.45454545454545453,0.06017015392015392
621
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825
622
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,0,0.2,0.4453821448613115
623
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,1,0.38181818181818183,0.12097096961680295
624
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,2,0.41818181818181815,0.08656124739458072
625
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,3,0.5272727272727272,0.02638447971781305
626
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
627
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
628
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
629
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
630
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
631
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
632
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,0,0.7454545454545454,0.000759529822029822
633
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
634
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,2,0.8181818181818182,0.00013227513227513228
635
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,3,0.8073734277593311,0.0005907573118657002
636
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
637
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,0,0.587180674734059,0.01246215829454031
638
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,1,0.6482593132545567,0.006117582447622459
639
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,2,0.8545454545454545,4.624619207952541e-05
640
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,3,0.7706746355884524,0.0010393630991335228
641
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,11,4,0.5371291452680612,0.02311942970946668
642
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,0,0.6238794669049377,0.007931923532795268
643
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,1,0.4909090909090909,0.04053235730319064
644
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,2,0.8440722199302099,0.0003281542287518694
645
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
646
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,11,4,0.36698792170878686,0.11834981273562825
647
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,0,0.8909090909090909,1.3728555395222063e-05
648
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,1,0.9272727272727274,3.2567740901074234e-06
649
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,2,1.0,5.010421677088344e-08
650
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,3,0.7090909090909091,0.0015912097162097162
651
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,11,4,0.6727272727272727,0.0031063111271444604
652
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,0,0.8181818181818182,0.00013227513227513228
653
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,1,0.8073734277593311,0.0005907573118657002
654
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,2,0.8909090909090909,1.3728555395222063e-05
655
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
656
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,11,4,0.6605782590758164,0.004936818556325077
657
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,0,-0.0909090909090909,0.7611503928170594
658
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,1,0.07339758434175737,0.7547764265871044
659
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,2,0.4403855060505442,0.06091869077971648
660
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,3,0.3302891295379082,0.15985367483762747
661
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,11,4,0.1272727272727273,0.6480954385121052
662
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
663
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,1,0.7818181818181819,0.0003334435626102293
664
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,2,0.8705196492275474,0.00023202582506637044
665
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,3,0.7479575920067658,0.001637274718449882
666
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,11,4,0.5983660736054126,0.01175728488671479
667
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
668
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,1,0.6000000000000001,0.00994553671637005
669
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
670
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
671
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,11,4,0.5272727272727272,0.02638447971781305
672
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,0,0.7706746355884524,0.0010393630991335228
673
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
674
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,2,0.5272727272727272,0.02638447971781305
675
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,3,0.7454545454545454,0.000759529822029822
676
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
677
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,0,0.7818181818181819,0.0003334435626102293
678
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,1,0.8545454545454545,4.624619207952541e-05
679
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
680
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
681
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,11,4,0.7818181818181819,0.0003334435626102293
682
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,0,0.6731618328060892,0.004677734981047257
683
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,1,0.759389481241052,0.0013210471654040124
684
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,2,0.7339758434175737,0.0017872890369872653
685
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,3,0.6238794669049377,0.007931923532795268
686
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,11,4,0.7090909090909091,0.0015912097162097162
687
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,0,0.6363636363636364,0.005707170915504249
688
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
689
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,2,0.7706746355884524,0.0010393630991335228
690
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,3,0.8181818181818182,0.00013227513227513228
691
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,11,4,0.7706746355884524,0.0010393630991335228
692
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
693
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,1,0.8909090909090909,1.3728555395222063e-05
694
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,2,0.7818181818181819,0.0003334435626102293
695
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
696
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,11,4,0.8545454545454545,4.624619207952541e-05
697
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,0,0.5636363636363636,0.016540504248837583
698
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
699
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,2,0.6363636363636364,0.005707170915504249
700
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,3,0.7818181818181819,0.0003334435626102293
701
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,11,4,0.6363636363636364,0.005707170915504249
702
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,0,0.7090909090909091,0.0015912097162097162
703
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,1,0.8181818181818182,0.00013227513227513228
704
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,2,0.5636363636363636,0.016540504248837583
705
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,3,0.8545454545454545,4.624619207952541e-05
706
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
707
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,0,0.6727272727272727,0.0031063111271444604
708
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,1,0.7454545454545454,0.000759529822029822
709
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,2,0.7090909090909091,0.0015912097162097162
710
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,3,0.6000000000000001,0.00994553671637005
711
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,11,4,0.7454545454545454,0.000759529822029822
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cache/agreements_cache_dcbcd453e19427bcbf89a901d3f2a925.csv DELETED
@@ -1,731 +0,0 @@
1
- scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
2
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4447495899966607,0.1315867602811863
3
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
4
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
5
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.2545875386086578,0.38281014365989596
6
- Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.40006613209931935,0.17023995462900499
7
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
8
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.2857142857142857,0.39875992063492066
9
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
10
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,-0.21428571428571427,0.5484126984126985
11
- Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.3571428571428571,0.27509920634920637
12
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.07142857142857142,0.9048611111111111
13
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.2857142857142857,0.39875992063492066
14
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
15
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,-0.21428571428571427,0.5484126984126985
16
- Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.2857142857142857,0.39875992063492066
17
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
18
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
19
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
20
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.0,1.0
21
- Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.0,1.0
22
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7637626158259734,0.008839740160738534
23
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
24
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
25
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
26
- Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7637626158259734,0.008839740160738534
27
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
28
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
29
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
30
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
31
- Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
32
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
33
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
34
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
35
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.14285714285714285,0.7195436507936508
36
- Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.3571428571428571,0.27509920634920637
37
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
38
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
39
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
40
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
41
- Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
42
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
43
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985
44
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
45
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.036369648372665396,0.9007802600472398
46
- Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.21428571428571427,0.5484126984126985
47
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
48
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
49
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
50
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
51
- Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
52
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
53
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
54
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
55
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637
56
- Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.3571428571428571,0.27509920634920637
57
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
58
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
59
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
60
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
61
- HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
62
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
63
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
64
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
65
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
66
- HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
67
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
68
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
69
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
70
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
71
- HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.3571428571428571,0.27509920634920637
72
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
73
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
74
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
75
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
76
- HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
77
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
78
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
79
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
80
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
81
- HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
82
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
83
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
84
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
85
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.4999999999999999,0.10868055555555556
86
- HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
87
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
88
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
89
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
90
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
91
- HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
92
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
93
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
94
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
95
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
96
- Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
97
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
98
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
99
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
100
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
101
- LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
102
- MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
103
- MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
104
- MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
105
- MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
106
- MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
107
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
108
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
109
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
110
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
111
- MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
112
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
113
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
114
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
115
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
116
- MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
117
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
118
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
119
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
120
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
121
- MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
122
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
123
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
124
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
125
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
126
- MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
127
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6182840223353117,0.0340492747686748
128
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.18184824186332696,0.5330356744917513
129
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
130
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
131
- MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762
132
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
133
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
134
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
135
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
136
- MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762
137
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6910233190806425,0.017844011512848347
138
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132
139
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
140
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.836501912571304,0.004136737098676645
141
- MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6910233190806425,0.017844011512848347
142
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
143
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
144
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
145
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985
146
- MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762
147
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
148
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
149
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.22237479499833035,0.45088703102517036
150
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637
151
- MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
152
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508
153
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
154
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
155
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
156
- MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
157
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
158
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.836501912571304,0.004136737098676645
159
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
160
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
161
- AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
162
- OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
163
- OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
164
- OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
165
- OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637
166
- OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556
167
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
168
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
169
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
170
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
171
- LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
172
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
173
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
174
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
175
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
176
- LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5455447255899809,0.0614649096074132
177
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
178
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
179
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
180
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
181
- LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
182
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
183
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
184
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
185
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
186
- LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
187
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
188
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
189
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
190
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
191
- LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
192
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
193
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
194
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
195
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
196
- LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
197
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
198
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
199
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
200
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637
201
- LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762
202
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
203
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
204
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
205
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
206
- WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
207
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
208
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
209
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
210
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
211
- WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
212
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
213
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
214
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
215
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
216
- WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
217
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
218
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
219
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
220
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
221
- WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
222
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
223
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
224
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
225
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
226
- WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
227
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
228
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
229
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
230
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
231
- WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
232
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
233
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
234
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
235
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
236
- WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
237
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
238
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
239
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
240
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
241
- Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
242
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.14285714285714285,0.7195436507936508
243
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
244
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
245
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
246
- HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
247
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
248
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
249
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
250
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
251
- HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.3571428571428571,0.27509920634920637
252
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0
253
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
254
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
255
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
256
- HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
257
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
258
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.10910894511799618,0.7083840532183997
259
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748
260
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.2857142857142857,0.39875992063492066
261
- HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.40006613209931935,0.17023995462900499
262
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
263
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
264
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
265
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
266
- HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
267
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
268
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
269
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
270
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762
271
- HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.3571428571428571,0.27509920634920637
272
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
273
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
274
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748
275
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762
276
- HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.2857142857142857,0.39875992063492066
277
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
278
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
279
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
280
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,3,0.4999999999999999,0.10868055555555556
281
- BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,4,0.3571428571428571,0.27509920634920637
282
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
283
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
284
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
285
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
286
- BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
287
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
288
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
289
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
290
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
291
- BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
292
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.40006613209931935,0.17023995462900499
293
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347
294
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
295
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
296
- BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.47280542884465016,0.10506382347888965
297
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965
298
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
299
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534
300
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
301
- BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.10910894511799618,0.7083840532183997
302
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
303
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
304
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
305
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
306
- BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
307
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
308
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534
309
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
310
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
311
- BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7637626158259734,0.008839740160738534
312
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,-0.2857142857142857,0.39875992063492066
313
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2545875386086578,0.38281014365989596
314
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347
315
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.10910894511799618,0.7083840532183997
316
- BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.07142857142857142,0.9048611111111111
317
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
318
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
319
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.836501912571304,0.004136737098676645
320
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5669467095138409,0.05611472402809984
321
- BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6182840223353117,0.0340492747686748
322
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
323
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
324
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
325
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
326
- BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762
327
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
328
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
329
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
330
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
331
- BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
332
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
333
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
334
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
335
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
336
- LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
337
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7412493166611012,0.011966745157436277
338
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
339
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534
340
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
341
- LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
342
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
343
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
344
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
345
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
346
- LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7637626158259734,0.008839740160738534
347
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
348
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
349
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
350
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
351
- LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
352
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
353
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
354
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
355
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
356
- LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
357
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
358
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
359
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
360
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
361
- LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
362
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
363
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
364
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
365
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
366
- LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
367
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,0,0.4447495899966607,0.1315867602811863
368
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
369
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
370
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,3,0.2545875386086578,0.38281014365989596
371
- aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,4,0.40006613209931935,0.17023995462900499
372
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
373
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,1,-0.2857142857142857,0.39875992063492066
374
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
375
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,3,-0.21428571428571427,0.5484126984126985
376
- aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,4,-0.3571428571428571,0.27509920634920637
377
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,0,0.07142857142857142,0.9048611111111111
378
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,1,-0.2857142857142857,0.39875992063492066
379
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
380
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,3,-0.21428571428571427,0.5484126984126985
381
- aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,4,-0.2857142857142857,0.39875992063492066
382
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
383
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,1,0.0,1.0
384
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
385
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,3,0.0,1.0
386
- aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,4,0.0,1.0
387
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,0,0.7637626158259734,0.008839740160738534
388
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
389
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
390
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
391
- aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,4,0.7637626158259734,0.008839740160738534
392
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
393
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
394
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
395
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
396
- aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
397
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
398
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
399
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
400
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,3,0.14285714285714285,0.7195436507936508
401
- aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,4,0.3571428571428571,0.27509920634920637
402
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
403
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
404
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
405
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
406
- aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
407
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
408
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985
409
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
410
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,3,0.036369648372665396,0.9007802600472398
411
- aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,4,0.21428571428571427,0.5484126984126985
412
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
413
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
414
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
415
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
416
- aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
417
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
418
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
419
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
420
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637
421
- aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,4,0.3571428571428571,0.27509920634920637
422
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
423
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
424
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
425
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
426
- aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
427
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
428
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
429
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
430
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
431
- aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
432
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
433
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
434
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
435
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
436
- aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.3571428571428571,0.27509920634920637
437
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
438
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
439
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
440
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
441
- aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
442
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
443
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
444
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
445
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
446
- aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
447
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
448
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
449
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
450
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.4999999999999999,0.10868055555555556
451
- aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
452
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
453
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
454
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
455
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
456
- aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
457
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
458
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
459
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
460
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
461
- aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
462
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
463
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
464
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
465
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
466
- aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
467
- aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
468
- aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
469
- aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
470
- aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
471
- aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
472
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
473
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
474
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
475
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
476
- aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
477
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
478
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
479
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
480
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
481
- aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
482
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
483
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
484
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
485
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
486
- aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
487
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
488
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
489
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
490
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
491
- aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
492
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,0,0.6182840223353117,0.0340492747686748
493
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,1,0.18184824186332696,0.5330356744917513
494
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
495
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
496
- aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762
497
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
498
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
499
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
500
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
501
- aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762
502
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,0,0.6910233190806425,0.017844011512848347
503
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132
504
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
505
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,3,0.836501912571304,0.004136737098676645
506
- aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,4,0.6910233190806425,0.017844011512848347
507
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
508
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
509
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
510
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985
511
- aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762
512
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
513
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
514
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,2,0.22237479499833035,0.45088703102517036
515
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637
516
- aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
517
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508
518
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
519
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
520
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
521
- aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
522
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
523
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.836501912571304,0.004136737098676645
524
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
525
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
526
- aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
527
- aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
528
- aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
529
- aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
530
- aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637
531
- aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556
532
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
533
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
534
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
535
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
536
- aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
537
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
538
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
539
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
540
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
541
- aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,4,0.5455447255899809,0.0614649096074132
542
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
543
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
544
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
545
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
546
- aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
547
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
548
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
549
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
550
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
551
- aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
552
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
553
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
554
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
555
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
556
- aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
557
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
558
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
559
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
560
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
561
- aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
562
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
563
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
564
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
565
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637
566
- aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762
567
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
568
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
569
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
570
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
571
- aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
572
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
573
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
574
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
575
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
576
- aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
577
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
578
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
579
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
580
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
581
- aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
582
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
583
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
584
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
585
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
586
- aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
587
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
588
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
589
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
590
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
591
- aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
592
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
593
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
594
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
595
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
596
- aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
597
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
598
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
599
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
600
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
601
- aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
602
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
603
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
604
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
605
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
606
- aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
607
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.14285714285714285,0.7195436507936508
608
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
609
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
610
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
611
- aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
612
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
613
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
614
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
615
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
616
- aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.3571428571428571,0.27509920634920637
617
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.0,1.0
618
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
619
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
620
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
621
- aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
622
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
623
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.10910894511799618,0.7083840532183997
624
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748
625
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.2857142857142857,0.39875992063492066
626
- aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.40006613209931935,0.17023995462900499
627
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
628
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
629
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
630
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
631
- aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
632
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
633
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
634
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
635
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762
636
- aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.3571428571428571,0.27509920634920637
637
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
638
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
639
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748
640
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762
641
- aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.2857142857142857,0.39875992063492066
642
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
643
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
644
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
645
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,3,0.4999999999999999,0.10868055555555556
646
- aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,4,0.3571428571428571,0.27509920634920637
647
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
648
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
649
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
650
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
651
- aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
652
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
653
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
654
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
655
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
656
- aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
657
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,0,0.40006613209931935,0.17023995462900499
658
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347
659
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
660
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
661
- aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,4,0.47280542884465016,0.10506382347888965
662
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965
663
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
664
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534
665
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
666
- aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,4,0.10910894511799618,0.7083840532183997
667
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
668
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
669
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
670
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
671
- aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
672
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
673
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534
674
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
675
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
676
- aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,4,0.7637626158259734,0.008839740160738534
677
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,0,-0.2857142857142857,0.39875992063492066
678
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,1,0.2545875386086578,0.38281014365989596
679
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347
680
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,3,0.10910894511799618,0.7083840532183997
681
- aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,4,0.07142857142857142,0.9048611111111111
682
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
683
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
684
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,2,0.836501912571304,0.004136737098676645
685
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,3,0.5669467095138409,0.05611472402809984
686
- aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,4,0.6182840223353117,0.0340492747686748
687
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
688
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
689
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
690
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
691
- aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762
692
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
693
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
694
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
695
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
696
- aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
697
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
698
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
699
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
700
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
701
- aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
702
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,0,0.7412493166611012,0.011966745157436277
703
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
704
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534
705
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
706
- aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
707
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
708
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
709
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
710
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
711
- aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,4,0.7637626158259734,0.008839740160738534
712
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
713
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
714
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
715
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
716
- aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
717
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
718
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
719
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
720
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
721
- aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
722
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
723
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
724
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
725
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
726
- aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
727
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
728
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
729
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
730
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
731
- aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cache/allbenchs_cache_151f5bfbf87ac7384c2759731c72ec0c.csv DELETED
The diff for this file is too large to render. See raw diff
 
cache/allbenchs_cache_1b58bbc4e0d124b0a524da1001369741.csv DELETED
The diff for this file is too large to render. See raw diff
 
cache/allbenchs_cache_741f08262e15cba4bd6c8b25f2b138ca.csv DELETED
The diff for this file is too large to render. See raw diff
 
cache/allbenchs_cache_dcbcd453e19427bcbf89a901d3f2a925.csv DELETED
The diff for this file is too large to render. See raw diff