ChibuUkachi commited on
Commit
6cfaff9
·
1 Parent(s): da42ec0

add every eval results

Browse files
every_eval_ever/aime25.json ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "aime25/inference-optimization/MiniMax-M2.5-NVFP4/1777382291.417811",
4
+ "evaluation_timestamp": "3303957",
5
+ "retrieved_timestamp": "1777382291.417811",
6
+ "source_metadata": {
7
+ "source_name": "lighteval",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lighteval",
14
+ "version": "v0.13.0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5-NVFP4",
18
+ "id": "inference-optimization/MiniMax-M2.5-NVFP4",
19
+ "developer": "inference-optimization",
20
+ "inference_engine": {
21
+ "name": "vllm"
22
+ },
23
+ "additional_details": {
24
+ "provider": "hosted_vllm",
25
+ "base_url": "http://0.0.0.0:8003/v1",
26
+ "concurrent_requests": "8",
27
+ "verbose": "False",
28
+ "api_max_retry": "8",
29
+ "api_retry_sleep": "1.0",
30
+ "api_retry_multiplier": "2.0",
31
+ "timeout": "2400.0",
32
+ "num_seeds_merged": "8"
33
+ }
34
+ },
35
+ "evaluation_results": [
36
+ {
37
+ "evaluation_name": "aime25",
38
+ "source_data": {
39
+ "dataset_name": "aime25",
40
+ "source_type": "hf_dataset",
41
+ "hf_repo": "yentinglin/aime_2025",
42
+ "hf_split": "train"
43
+ },
44
+ "evaluation_timestamp": "3305786",
45
+ "metric_config": {
46
+ "evaluation_description": "pass@k:k=1&n=1",
47
+ "lower_is_better": false,
48
+ "score_type": "continuous",
49
+ "min_score": 0.0,
50
+ "max_score": 1.0
51
+ },
52
+ "score_details": {
53
+ "score": 0.7708333333333334,
54
+ "details": {
55
+ "seed_scores": "[0.8, 0.7, 0.8333333333333334, 0.7666666666666667, 0.8333333333333334, 0.7666666666666667, 0.6666666666666666, 0.8]",
56
+ "seed_values": "[1234, 1356, 3344, 4158, 42, 5322, 5678, 9843]"
57
+ },
58
+ "uncertainty": {
59
+ "standard_error": {
60
+ "value": 0.021304202581158678,
61
+ "method": "across_seeds"
62
+ },
63
+ "num_samples": 8
64
+ }
65
+ },
66
+ "generation_config": {
67
+ "generation_args": {
68
+ "temperature": 1.0,
69
+ "top_p": 0.95,
70
+ "top_k": 40.0,
71
+ "max_tokens": 64000,
72
+ "max_attempts": 1
73
+ },
74
+ "additional_details": {
75
+ "repetition_penalty": "1.0",
76
+ "presence_penalty": "1.5",
77
+ "seed": "1234",
78
+ "min_p": "0.0"
79
+ }
80
+ }
81
+ },
82
+ {
83
+ "evaluation_name": "aime25",
84
+ "source_data": {
85
+ "dataset_name": "aime25",
86
+ "source_type": "hf_dataset",
87
+ "hf_repo": "yentinglin/aime_2025",
88
+ "hf_split": "train"
89
+ },
90
+ "evaluation_timestamp": "3305786",
91
+ "metric_config": {
92
+ "evaluation_description": "avg@n:n=1",
93
+ "lower_is_better": false,
94
+ "score_type": "continuous",
95
+ "min_score": 0.0,
96
+ "max_score": 1.0
97
+ },
98
+ "score_details": {
99
+ "score": 0.7708333333333334,
100
+ "details": {
101
+ "seed_scores": "[0.8, 0.7, 0.8333333333333334, 0.7666666666666667, 0.8333333333333334, 0.7666666666666667, 0.6666666666666666, 0.8]",
102
+ "seed_values": "[1234, 1356, 3344, 4158, 42, 5322, 5678, 9843]"
103
+ },
104
+ "uncertainty": {
105
+ "standard_error": {
106
+ "value": 0.021304202581158678,
107
+ "method": "across_seeds"
108
+ },
109
+ "num_samples": 8
110
+ }
111
+ },
112
+ "generation_config": {
113
+ "generation_args": {
114
+ "temperature": 1.0,
115
+ "top_p": 0.95,
116
+ "top_k": 40.0,
117
+ "max_tokens": 64000,
118
+ "max_attempts": 1
119
+ },
120
+ "additional_details": {
121
+ "repetition_penalty": "1.0",
122
+ "presence_penalty": "1.5",
123
+ "seed": "1234",
124
+ "min_p": "0.0"
125
+ }
126
+ }
127
+ }
128
+ ]
129
+ }
every_eval_ever/gpqa_diamond.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "gpqa:diamond/inference-optimization/MiniMax-M2.5-NVFP4/1777382485.273284",
4
+ "evaluation_timestamp": "3286118",
5
+ "retrieved_timestamp": "1777382485.273284",
6
+ "source_metadata": {
7
+ "source_name": "lighteval",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lighteval",
14
+ "version": "v0.13.0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5-NVFP4",
18
+ "id": "inference-optimization/MiniMax-M2.5-NVFP4",
19
+ "developer": "inference-optimization",
20
+ "inference_engine": {
21
+ "name": "vllm"
22
+ },
23
+ "additional_details": {
24
+ "provider": "hosted_vllm",
25
+ "base_url": "http://0.0.0.0:8003/v1",
26
+ "concurrent_requests": "8",
27
+ "verbose": "False",
28
+ "api_max_retry": "8",
29
+ "api_retry_sleep": "1.0",
30
+ "api_retry_multiplier": "2.0",
31
+ "timeout": "2400.0",
32
+ "num_seeds_merged": "3"
33
+ }
34
+ },
35
+ "evaluation_results": [
36
+ {
37
+ "evaluation_name": "gpqa:diamond",
38
+ "source_data": {
39
+ "dataset_name": "gpqa:diamond",
40
+ "source_type": "hf_dataset",
41
+ "hf_repo": "Idavidrein/gpqa",
42
+ "hf_split": "train"
43
+ },
44
+ "evaluation_timestamp": "3288999",
45
+ "metric_config": {
46
+ "evaluation_description": "gpqa_pass@k:k=1",
47
+ "lower_is_better": false,
48
+ "score_type": "continuous",
49
+ "min_score": 0.0,
50
+ "max_score": 1.0
51
+ },
52
+ "score_details": {
53
+ "score": 0.803030303030303,
54
+ "details": {
55
+ "seed_scores": "[0.803030303030303, 0.7878787878787878, 0.8181818181818182]",
56
+ "seed_values": "[1234, 4158, 42]"
57
+ },
58
+ "uncertainty": {
59
+ "standard_error": {
60
+ "value": 0.008747731351357991,
61
+ "method": "across_seeds"
62
+ },
63
+ "num_samples": 3
64
+ }
65
+ },
66
+ "generation_config": {
67
+ "generation_args": {
68
+ "temperature": 1.0,
69
+ "top_p": 0.95,
70
+ "top_k": 40.0,
71
+ "max_tokens": 64000,
72
+ "max_attempts": 1
73
+ },
74
+ "additional_details": {
75
+ "repetition_penalty": "1.0",
76
+ "presence_penalty": "1.5",
77
+ "seed": "1234",
78
+ "min_p": "0.0"
79
+ }
80
+ }
81
+ }
82
+ ]
83
+ }
every_eval_ever/gsm8k_platinum_cot_llama.json ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "gsm8k_platinum_cot_llama/inference-optimization/MiniMax-M2.5-NVFP4/1777381438.664935",
4
+ "evaluation_timestamp": "1777306547",
5
+ "retrieved_timestamp": "1777381438.664935",
6
+ "source_metadata": {
7
+ "source_name": "lm-evaluation-harness",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lm_eval",
14
+ "version": "0.4.12.dev0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5-NVFP4",
18
+ "id": "inference-optimization/MiniMax-M2.5-NVFP4",
19
+ "developer": "inference-optimization",
20
+ "additional_details": {
21
+ "model_args": "{'model': 'inference-optimization/MiniMax-M2.5-NVFP4', 'max_length': 196608, 'base_url': 'http://0.0.0.0:8003/v1/chat/completions', 'num_concurrent': 128, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 2400}",
22
+ "seed": "1234",
23
+ "num_seeds_merged": "3"
24
+ }
25
+ },
26
+ "evaluation_results": [
27
+ {
28
+ "evaluation_name": "gsm8k_platinum_cot_llama/strict-match",
29
+ "source_data": {
30
+ "dataset_name": "gsm8k_platinum_cot_llama",
31
+ "source_type": "hf_dataset",
32
+ "hf_repo": "madrylab/gsm8k-platinum",
33
+ "hf_split": "test"
34
+ },
35
+ "evaluation_timestamp": "1777306728",
36
+ "metric_config": {
37
+ "evaluation_description": "exact_match (filter: strict-match)",
38
+ "lower_is_better": false,
39
+ "score_type": "continuous",
40
+ "min_score": 0.0,
41
+ "max_score": 1.0
42
+ },
43
+ "score_details": {
44
+ "score": 0.9390681003584229,
45
+ "details": {
46
+ "seed_scores": "[0.9396195202646815, 0.9321753515301903, 0.9454094292803971]",
47
+ "seed_values": "[1234, 4158, 42]"
48
+ },
49
+ "uncertainty": {
50
+ "standard_error": {
51
+ "value": 0.0038302850811827447,
52
+ "method": "across_seeds"
53
+ },
54
+ "num_samples": 3
55
+ }
56
+ },
57
+ "generation_config": {
58
+ "generation_args": {
59
+ "temperature": 1.0,
60
+ "top_p": 0.95,
61
+ "top_k": 40.0,
62
+ "max_tokens": 64000,
63
+ "max_attempts": 1
64
+ },
65
+ "additional_details": {
66
+ "do_sample": "true",
67
+ "until": "[\"<|eot_id|>\", \"<|start_header_id|>user<|end_header_id|>\", \"Q:\", \"</s>\", \"<|im_end|>\"]",
68
+ "min_p": "0.0",
69
+ "presence_penalty": "1.5",
70
+ "repetition_penalty": "1.0",
71
+ "seed": "1234",
72
+ "num_fewshot": "0"
73
+ }
74
+ }
75
+ },
76
+ {
77
+ "evaluation_name": "gsm8k_platinum_cot_llama/flexible-extract",
78
+ "source_data": {
79
+ "dataset_name": "gsm8k_platinum_cot_llama",
80
+ "source_type": "hf_dataset",
81
+ "hf_repo": "madrylab/gsm8k-platinum",
82
+ "hf_split": "test"
83
+ },
84
+ "evaluation_timestamp": "1777306728",
85
+ "metric_config": {
86
+ "evaluation_description": "exact_match (filter: flexible-extract)",
87
+ "lower_is_better": false,
88
+ "score_type": "continuous",
89
+ "min_score": 0.0,
90
+ "max_score": 1.0
91
+ },
92
+ "score_details": {
93
+ "score": 0.9567135373586987,
94
+ "details": {
95
+ "seed_scores": "[0.9578163771712159, 0.9520264681555004, 0.9602977667493796]",
96
+ "seed_values": "[1234, 4158, 42]"
97
+ },
98
+ "uncertainty": {
99
+ "standard_error": {
100
+ "value": 0.002450563666202254,
101
+ "method": "across_seeds"
102
+ },
103
+ "num_samples": 3
104
+ }
105
+ },
106
+ "generation_config": {
107
+ "generation_args": {
108
+ "temperature": 1.0,
109
+ "top_p": 0.95,
110
+ "top_k": 40.0,
111
+ "max_tokens": 64000,
112
+ "max_attempts": 1
113
+ },
114
+ "additional_details": {
115
+ "do_sample": "true",
116
+ "until": "[\"<|eot_id|>\", \"<|start_header_id|>user<|end_header_id|>\", \"Q:\", \"</s>\", \"<|im_end|>\"]",
117
+ "min_p": "0.0",
118
+ "presence_penalty": "1.5",
119
+ "repetition_penalty": "1.0",
120
+ "seed": "1234",
121
+ "num_fewshot": "0"
122
+ }
123
+ }
124
+ }
125
+ ]
126
+ }
every_eval_ever/ifeval.json ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "ifeval/inference-optimization/MiniMax-M2.5-NVFP4/1777381402.619951",
4
+ "evaluation_timestamp": "1777304021",
5
+ "retrieved_timestamp": "1777381402.619951",
6
+ "source_metadata": {
7
+ "source_name": "lm-evaluation-harness",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lm_eval",
14
+ "version": "0.4.12.dev0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5-NVFP4",
18
+ "id": "inference-optimization/MiniMax-M2.5-NVFP4",
19
+ "developer": "inference-optimization",
20
+ "additional_details": {
21
+ "model_args": "{'model': 'inference-optimization/MiniMax-M2.5-NVFP4', 'max_length': 196608, 'base_url': 'http://0.0.0.0:8003/v1/chat/completions', 'num_concurrent': 128, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 2400}",
22
+ "seed": "1234",
23
+ "num_seeds_merged": "3"
24
+ }
25
+ },
26
+ "evaluation_results": [
27
+ {
28
+ "evaluation_name": "ifeval",
29
+ "source_data": {
30
+ "dataset_name": "ifeval",
31
+ "source_type": "hf_dataset",
32
+ "hf_repo": "google/IFEval",
33
+ "hf_split": "train"
34
+ },
35
+ "evaluation_timestamp": "1777304940",
36
+ "metric_config": {
37
+ "evaluation_description": "prompt_level_strict_acc",
38
+ "lower_is_better": false,
39
+ "score_type": "continuous",
40
+ "min_score": 0.0,
41
+ "max_score": 1.0
42
+ },
43
+ "score_details": {
44
+ "score": 0.8539741219963032,
45
+ "details": {
46
+ "seed_scores": "[0.8595194085027726, 0.8521256931608133, 0.8502772643253235]",
47
+ "seed_values": "[1234, 4158, 42]"
48
+ },
49
+ "uncertainty": {
50
+ "standard_error": {
51
+ "value": 0.0028235216851237458,
52
+ "method": "across_seeds"
53
+ },
54
+ "num_samples": 3
55
+ }
56
+ },
57
+ "generation_config": {
58
+ "generation_args": {
59
+ "temperature": 1.0,
60
+ "top_p": 0.95,
61
+ "top_k": 40.0,
62
+ "max_tokens": 64000,
63
+ "max_attempts": 1
64
+ },
65
+ "additional_details": {
66
+ "until": "[]",
67
+ "do_sample": "true",
68
+ "min_p": "0.0",
69
+ "presence_penalty": "1.5",
70
+ "repetition_penalty": "1.0",
71
+ "seed": "1234",
72
+ "num_fewshot": "0"
73
+ }
74
+ }
75
+ },
76
+ {
77
+ "evaluation_name": "ifeval",
78
+ "source_data": {
79
+ "dataset_name": "ifeval",
80
+ "source_type": "hf_dataset",
81
+ "hf_repo": "google/IFEval",
82
+ "hf_split": "train"
83
+ },
84
+ "evaluation_timestamp": "1777304940",
85
+ "metric_config": {
86
+ "evaluation_description": "inst_level_strict_acc",
87
+ "lower_is_better": false,
88
+ "score_type": "continuous",
89
+ "min_score": 0.0,
90
+ "max_score": 1.0
91
+ },
92
+ "score_details": {
93
+ "score": 0.898880895283773,
94
+ "details": {
95
+ "seed_scores": "[0.9052757793764988, 0.9004796163069544, 0.8908872901678657]",
96
+ "seed_values": "[1234, 4158, 42]"
97
+ },
98
+ "uncertainty": {
99
+ "standard_error": {
100
+ "value": 0.004229818243108859,
101
+ "method": "across_seeds"
102
+ },
103
+ "num_samples": 3
104
+ }
105
+ },
106
+ "generation_config": {
107
+ "generation_args": {
108
+ "temperature": 1.0,
109
+ "top_p": 0.95,
110
+ "top_k": 40.0,
111
+ "max_tokens": 64000,
112
+ "max_attempts": 1
113
+ },
114
+ "additional_details": {
115
+ "until": "[]",
116
+ "do_sample": "true",
117
+ "min_p": "0.0",
118
+ "presence_penalty": "1.5",
119
+ "repetition_penalty": "1.0",
120
+ "seed": "1234",
121
+ "num_fewshot": "0"
122
+ }
123
+ }
124
+ },
125
+ {
126
+ "evaluation_name": "ifeval",
127
+ "source_data": {
128
+ "dataset_name": "ifeval",
129
+ "source_type": "hf_dataset",
130
+ "hf_repo": "google/IFEval",
131
+ "hf_split": "train"
132
+ },
133
+ "evaluation_timestamp": "1777304940",
134
+ "metric_config": {
135
+ "evaluation_description": "prompt_level_loose_acc",
136
+ "lower_is_better": false,
137
+ "score_type": "continuous",
138
+ "min_score": 0.0,
139
+ "max_score": 1.0
140
+ },
141
+ "score_details": {
142
+ "score": 0.8829328404189772,
143
+ "details": {
144
+ "seed_scores": "[0.8964879852125693, 0.8743068391866913, 0.878003696857671]",
145
+ "seed_values": "[1234, 4158, 42]"
146
+ },
147
+ "uncertainty": {
148
+ "standard_error": {
149
+ "value": 0.006861077464978476,
150
+ "method": "across_seeds"
151
+ },
152
+ "num_samples": 3
153
+ }
154
+ },
155
+ "generation_config": {
156
+ "generation_args": {
157
+ "temperature": 1.0,
158
+ "top_p": 0.95,
159
+ "top_k": 40.0,
160
+ "max_tokens": 64000,
161
+ "max_attempts": 1
162
+ },
163
+ "additional_details": {
164
+ "until": "[]",
165
+ "do_sample": "true",
166
+ "min_p": "0.0",
167
+ "presence_penalty": "1.5",
168
+ "repetition_penalty": "1.0",
169
+ "seed": "1234",
170
+ "num_fewshot": "0"
171
+ }
172
+ }
173
+ },
174
+ {
175
+ "evaluation_name": "ifeval",
176
+ "source_data": {
177
+ "dataset_name": "ifeval",
178
+ "source_type": "hf_dataset",
179
+ "hf_repo": "google/IFEval",
180
+ "hf_split": "train"
181
+ },
182
+ "evaluation_timestamp": "1777304940",
183
+ "metric_config": {
184
+ "evaluation_description": "inst_level_loose_acc",
185
+ "lower_is_better": false,
186
+ "score_type": "continuous",
187
+ "min_score": 0.0,
188
+ "max_score": 1.0
189
+ },
190
+ "score_details": {
191
+ "score": 0.9180655475619505,
192
+ "details": {
193
+ "seed_scores": "[0.9292565947242206, 0.9148681055155875, 0.9100719424460432]",
194
+ "seed_values": "[1234, 4158, 42]"
195
+ },
196
+ "uncertainty": {
197
+ "standard_error": {
198
+ "value": 0.005764270624242989,
199
+ "method": "across_seeds"
200
+ },
201
+ "num_samples": 3
202
+ }
203
+ },
204
+ "generation_config": {
205
+ "generation_args": {
206
+ "temperature": 1.0,
207
+ "top_p": 0.95,
208
+ "top_k": 40.0,
209
+ "max_tokens": 64000,
210
+ "max_attempts": 1
211
+ },
212
+ "additional_details": {
213
+ "until": "[]",
214
+ "do_sample": "true",
215
+ "min_p": "0.0",
216
+ "presence_penalty": "1.5",
217
+ "repetition_penalty": "1.0",
218
+ "seed": "1234",
219
+ "num_fewshot": "0"
220
+ }
221
+ }
222
+ }
223
+ ]
224
+ }
every_eval_ever/math_500.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "math_500/inference-optimization/MiniMax-M2.5-NVFP4/1777382453.327009",
4
+ "evaluation_timestamp": "3278606",
5
+ "retrieved_timestamp": "1777382453.327009",
6
+ "source_metadata": {
7
+ "source_name": "lighteval",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lighteval",
14
+ "version": "v0.13.0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5-NVFP4",
18
+ "id": "inference-optimization/MiniMax-M2.5-NVFP4",
19
+ "developer": "inference-optimization",
20
+ "inference_engine": {
21
+ "name": "vllm"
22
+ },
23
+ "additional_details": {
24
+ "provider": "hosted_vllm",
25
+ "base_url": "http://0.0.0.0:8003/v1",
26
+ "concurrent_requests": "8",
27
+ "verbose": "False",
28
+ "api_max_retry": "8",
29
+ "api_retry_sleep": "1.0",
30
+ "api_retry_multiplier": "2.0",
31
+ "timeout": "2400.0",
32
+ "num_seeds_merged": "3"
33
+ }
34
+ },
35
+ "evaluation_results": [
36
+ {
37
+ "evaluation_name": "math_500",
38
+ "source_data": {
39
+ "dataset_name": "math_500",
40
+ "source_type": "hf_dataset",
41
+ "hf_repo": "HuggingFaceH4/MATH-500",
42
+ "hf_split": "test"
43
+ },
44
+ "evaluation_timestamp": "3280699",
45
+ "metric_config": {
46
+ "evaluation_description": "pass@k:k=1&n=1",
47
+ "lower_is_better": false,
48
+ "score_type": "continuous",
49
+ "min_score": 0.0,
50
+ "max_score": 1.0
51
+ },
52
+ "score_details": {
53
+ "score": 0.8773333333333333,
54
+ "details": {
55
+ "seed_scores": "[0.882, 0.878, 0.872]",
56
+ "seed_values": "[1234, 4158, 42]"
57
+ },
58
+ "uncertainty": {
59
+ "standard_error": {
60
+ "value": 0.0029059326290271185,
61
+ "method": "across_seeds"
62
+ },
63
+ "num_samples": 3
64
+ }
65
+ },
66
+ "generation_config": {
67
+ "generation_args": {
68
+ "temperature": 1.0,
69
+ "top_p": 0.95,
70
+ "top_k": 40.0,
71
+ "max_tokens": 64000,
72
+ "max_attempts": 1
73
+ },
74
+ "additional_details": {
75
+ "repetition_penalty": "1.0",
76
+ "presence_penalty": "1.5",
77
+ "seed": "1234",
78
+ "min_p": "0.0"
79
+ }
80
+ }
81
+ }
82
+ ]
83
+ }
every_eval_ever/mmlu_pro_chat.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "schema_version": "0.2.2",
3
+ "evaluation_id": "mmlu_pro_chat/inference-optimization/MiniMax-M2.5-NVFP4/1777381372.569359",
4
+ "evaluation_timestamp": "1776209745",
5
+ "retrieved_timestamp": "1777381372.569359",
6
+ "source_metadata": {
7
+ "source_name": "lm-evaluation-harness",
8
+ "source_type": "evaluation_run",
9
+ "source_organization_name": "RedHatAI",
10
+ "evaluator_relationship": "third_party"
11
+ },
12
+ "eval_library": {
13
+ "name": "lm_eval",
14
+ "version": "0.4.12.dev0"
15
+ },
16
+ "model_info": {
17
+ "name": "inference-optimization/MiniMax-M2.5-NVFP4",
18
+ "id": "inference-optimization/MiniMax-M2.5-NVFP4",
19
+ "developer": "inference-optimization",
20
+ "additional_details": {
21
+ "model_args": "{'model': 'inference-optimization/MiniMax-M2.5-NVFP4', 'max_length': 196608, 'base_url': 'http://0.0.0.0:8000/v1/chat/completions', 'num_concurrent': 128, 'max_retries': 3, 'tokenized_requests': False, 'tokenizer_backend': None, 'timeout': 2400}",
22
+ "seed": "1234",
23
+ "num_seeds_merged": "3"
24
+ }
25
+ },
26
+ "evaluation_results": [
27
+ {
28
+ "evaluation_name": "mmlu_pro_chat_biology/custom-extract",
29
+ "source_data": {
30
+ "dataset_name": "mmlu_pro_chat",
31
+ "source_type": "hf_dataset",
32
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
33
+ "hf_split": "test"
34
+ },
35
+ "evaluation_timestamp": "1776226127",
36
+ "metric_config": {
37
+ "evaluation_description": "exact_match (filter: custom-extract)",
38
+ "lower_is_better": false,
39
+ "score_type": "continuous",
40
+ "min_score": 0.0,
41
+ "max_score": 1.0
42
+ },
43
+ "score_details": {
44
+ "score": 0.903300790330079,
45
+ "details": {
46
+ "seed_scores": "[0.9121338912133892, 0.9037656903765691, 0.8940027894002789]",
47
+ "seed_values": "[1234, 4158, 42]"
48
+ },
49
+ "uncertainty": {
50
+ "standard_error": {
51
+ "value": 0.005239157447505677,
52
+ "method": "across_seeds"
53
+ },
54
+ "num_samples": 3
55
+ }
56
+ },
57
+ "generation_config": {
58
+ "generation_args": {
59
+ "temperature": 1.0,
60
+ "top_p": 0.95,
61
+ "top_k": 40.0,
62
+ "max_tokens": 64000,
63
+ "max_attempts": 1
64
+ },
65
+ "additional_details": {
66
+ "until": "[]",
67
+ "do_sample": "true",
68
+ "min_p": "0.0",
69
+ "presence_penalty": "1.5",
70
+ "repetition_penalty": "1.0",
71
+ "seed": "1234",
72
+ "num_fewshot": "0"
73
+ }
74
+ }
75
+ },
76
+ {
77
+ "evaluation_name": "mmlu_pro_chat_business/custom-extract",
78
+ "source_data": {
79
+ "dataset_name": "mmlu_pro_chat",
80
+ "source_type": "hf_dataset",
81
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
82
+ "hf_split": "test"
83
+ },
84
+ "evaluation_timestamp": "1776226127",
85
+ "metric_config": {
86
+ "evaluation_description": "exact_match (filter: custom-extract)",
87
+ "lower_is_better": false,
88
+ "score_type": "continuous",
89
+ "min_score": 0.0,
90
+ "max_score": 1.0
91
+ },
92
+ "score_details": {
93
+ "score": 0.8588931136459653,
94
+ "details": {
95
+ "seed_scores": "[0.8605830164765526, 0.8504435994930292, 0.8656527249683144]",
96
+ "seed_values": "[1234, 4158, 42]"
97
+ },
98
+ "uncertainty": {
99
+ "standard_error": {
100
+ "value": 0.004471062629597958,
101
+ "method": "across_seeds"
102
+ },
103
+ "num_samples": 3
104
+ }
105
+ },
106
+ "generation_config": {
107
+ "generation_args": {
108
+ "temperature": 1.0,
109
+ "top_p": 0.95,
110
+ "top_k": 40.0,
111
+ "max_tokens": 64000,
112
+ "max_attempts": 1
113
+ },
114
+ "additional_details": {
115
+ "until": "[]",
116
+ "do_sample": "true",
117
+ "min_p": "0.0",
118
+ "presence_penalty": "1.5",
119
+ "repetition_penalty": "1.0",
120
+ "seed": "1234",
121
+ "num_fewshot": "0"
122
+ }
123
+ }
124
+ },
125
+ {
126
+ "evaluation_name": "mmlu_pro_chat_chemistry/custom-extract",
127
+ "source_data": {
128
+ "dataset_name": "mmlu_pro_chat",
129
+ "source_type": "hf_dataset",
130
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
131
+ "hf_split": "test"
132
+ },
133
+ "evaluation_timestamp": "1776226127",
134
+ "metric_config": {
135
+ "evaluation_description": "exact_match (filter: custom-extract)",
136
+ "lower_is_better": false,
137
+ "score_type": "continuous",
138
+ "min_score": 0.0,
139
+ "max_score": 1.0
140
+ },
141
+ "score_details": {
142
+ "score": 0.8674911660777385,
143
+ "details": {
144
+ "seed_scores": "[0.8666077738515902, 0.8683745583038869, 0.8674911660777385]",
145
+ "seed_values": "[1234, 4158, 42]"
146
+ },
147
+ "uncertainty": {
148
+ "standard_error": {
149
+ "value": 0.0005100267395667788,
150
+ "method": "across_seeds"
151
+ },
152
+ "num_samples": 3
153
+ }
154
+ },
155
+ "generation_config": {
156
+ "generation_args": {
157
+ "temperature": 1.0,
158
+ "top_p": 0.95,
159
+ "top_k": 40.0,
160
+ "max_tokens": 64000,
161
+ "max_attempts": 1
162
+ },
163
+ "additional_details": {
164
+ "until": "[]",
165
+ "do_sample": "true",
166
+ "min_p": "0.0",
167
+ "presence_penalty": "1.5",
168
+ "repetition_penalty": "1.0",
169
+ "seed": "1234",
170
+ "num_fewshot": "0"
171
+ }
172
+ }
173
+ },
174
+ {
175
+ "evaluation_name": "mmlu_pro_chat_computer_science/custom-extract",
176
+ "source_data": {
177
+ "dataset_name": "mmlu_pro_chat",
178
+ "source_type": "hf_dataset",
179
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
180
+ "hf_split": "test"
181
+ },
182
+ "evaluation_timestamp": "1776226127",
183
+ "metric_config": {
184
+ "evaluation_description": "exact_match (filter: custom-extract)",
185
+ "lower_is_better": false,
186
+ "score_type": "continuous",
187
+ "min_score": 0.0,
188
+ "max_score": 1.0
189
+ },
190
+ "score_details": {
191
+ "score": 0.8569105691056911,
192
+ "details": {
193
+ "seed_scores": "[0.875609756097561, 0.8512195121951219, 0.8439024390243902]",
194
+ "seed_values": "[1234, 4158, 42]"
195
+ },
196
+ "uncertainty": {
197
+ "standard_error": {
198
+ "value": 0.009585224489879356,
199
+ "method": "across_seeds"
200
+ },
201
+ "num_samples": 3
202
+ }
203
+ },
204
+ "generation_config": {
205
+ "generation_args": {
206
+ "temperature": 1.0,
207
+ "top_p": 0.95,
208
+ "top_k": 40.0,
209
+ "max_tokens": 64000,
210
+ "max_attempts": 1
211
+ },
212
+ "additional_details": {
213
+ "until": "[]",
214
+ "do_sample": "true",
215
+ "min_p": "0.0",
216
+ "presence_penalty": "1.5",
217
+ "repetition_penalty": "1.0",
218
+ "seed": "1234",
219
+ "num_fewshot": "0"
220
+ }
221
+ }
222
+ },
223
+ {
224
+ "evaluation_name": "mmlu_pro_chat_economics/custom-extract",
225
+ "source_data": {
226
+ "dataset_name": "mmlu_pro_chat",
227
+ "source_type": "hf_dataset",
228
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
229
+ "hf_split": "test"
230
+ },
231
+ "evaluation_timestamp": "1776226127",
232
+ "metric_config": {
233
+ "evaluation_description": "exact_match (filter: custom-extract)",
234
+ "lower_is_better": false,
235
+ "score_type": "continuous",
236
+ "min_score": 0.0,
237
+ "max_score": 1.0
238
+ },
239
+ "score_details": {
240
+ "score": 0.8404423380726699,
241
+ "details": {
242
+ "seed_scores": "[0.8376777251184834, 0.840047393364929, 0.8436018957345972]",
243
+ "seed_values": "[1234, 4158, 42]"
244
+ },
245
+ "uncertainty": {
246
+ "standard_error": {
247
+ "value": 0.001721524069328884,
248
+ "method": "across_seeds"
249
+ },
250
+ "num_samples": 3
251
+ }
252
+ },
253
+ "generation_config": {
254
+ "generation_args": {
255
+ "temperature": 1.0,
256
+ "top_p": 0.95,
257
+ "top_k": 40.0,
258
+ "max_tokens": 64000,
259
+ "max_attempts": 1
260
+ },
261
+ "additional_details": {
262
+ "until": "[]",
263
+ "do_sample": "true",
264
+ "min_p": "0.0",
265
+ "presence_penalty": "1.5",
266
+ "repetition_penalty": "1.0",
267
+ "seed": "1234",
268
+ "num_fewshot": "0"
269
+ }
270
+ }
271
+ },
272
+ {
273
+ "evaluation_name": "mmlu_pro_chat_engineering/custom-extract",
274
+ "source_data": {
275
+ "dataset_name": "mmlu_pro_chat",
276
+ "source_type": "hf_dataset",
277
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
278
+ "hf_split": "test"
279
+ },
280
+ "evaluation_timestamp": "1776226127",
281
+ "metric_config": {
282
+ "evaluation_description": "exact_match (filter: custom-extract)",
283
+ "lower_is_better": false,
284
+ "score_type": "continuous",
285
+ "min_score": 0.0,
286
+ "max_score": 1.0
287
+ },
288
+ "score_details": {
289
+ "score": 0.7220502235982112,
290
+ "details": {
291
+ "seed_scores": "[0.7409700722394221, 0.6996904024767802, 0.7254901960784313]",
292
+ "seed_values": "[1234, 4158, 42]"
293
+ },
294
+ "uncertainty": {
295
+ "standard_error": {
296
+ "value": 0.012039903680770573,
297
+ "method": "across_seeds"
298
+ },
299
+ "num_samples": 3
300
+ }
301
+ },
302
+ "generation_config": {
303
+ "generation_args": {
304
+ "temperature": 1.0,
305
+ "top_p": 0.95,
306
+ "top_k": 40.0,
307
+ "max_tokens": 64000,
308
+ "max_attempts": 1
309
+ },
310
+ "additional_details": {
311
+ "until": "[]",
312
+ "do_sample": "true",
313
+ "min_p": "0.0",
314
+ "presence_penalty": "1.5",
315
+ "repetition_penalty": "1.0",
316
+ "seed": "1234",
317
+ "num_fewshot": "0"
318
+ }
319
+ }
320
+ },
321
+ {
322
+ "evaluation_name": "mmlu_pro_chat_health/custom-extract",
323
+ "source_data": {
324
+ "dataset_name": "mmlu_pro_chat",
325
+ "source_type": "hf_dataset",
326
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
327
+ "hf_split": "test"
328
+ },
329
+ "evaluation_timestamp": "1776226127",
330
+ "metric_config": {
331
+ "evaluation_description": "exact_match (filter: custom-extract)",
332
+ "lower_is_better": false,
333
+ "score_type": "continuous",
334
+ "min_score": 0.0,
335
+ "max_score": 1.0
336
+ },
337
+ "score_details": {
338
+ "score": 0.7766911165444174,
339
+ "details": {
340
+ "seed_scores": "[0.7701711491442543, 0.78239608801956, 0.7775061124694377]",
341
+ "seed_values": "[1234, 4158, 42]"
342
+ },
343
+ "uncertainty": {
344
+ "standard_error": {
345
+ "value": 0.0035524848765612657,
346
+ "method": "across_seeds"
347
+ },
348
+ "num_samples": 3
349
+ }
350
+ },
351
+ "generation_config": {
352
+ "generation_args": {
353
+ "temperature": 1.0,
354
+ "top_p": 0.95,
355
+ "top_k": 40.0,
356
+ "max_tokens": 64000,
357
+ "max_attempts": 1
358
+ },
359
+ "additional_details": {
360
+ "until": "[]",
361
+ "do_sample": "true",
362
+ "min_p": "0.0",
363
+ "presence_penalty": "1.5",
364
+ "repetition_penalty": "1.0",
365
+ "seed": "1234",
366
+ "num_fewshot": "0"
367
+ }
368
+ }
369
+ },
370
+ {
371
+ "evaluation_name": "mmlu_pro_chat_history/custom-extract",
372
+ "source_data": {
373
+ "dataset_name": "mmlu_pro_chat",
374
+ "source_type": "hf_dataset",
375
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
376
+ "hf_split": "test"
377
+ },
378
+ "evaluation_timestamp": "1776226127",
379
+ "metric_config": {
380
+ "evaluation_description": "exact_match (filter: custom-extract)",
381
+ "lower_is_better": false,
382
+ "score_type": "continuous",
383
+ "min_score": 0.0,
384
+ "max_score": 1.0
385
+ },
386
+ "score_details": {
387
+ "score": 0.6500437445319335,
388
+ "details": {
389
+ "seed_scores": "[0.6272965879265092, 0.6509186351706037, 0.6719160104986877]",
390
+ "seed_values": "[1234, 4158, 42]"
391
+ },
392
+ "uncertainty": {
393
+ "standard_error": {
394
+ "value": 0.01288794388683834,
395
+ "method": "across_seeds"
396
+ },
397
+ "num_samples": 3
398
+ }
399
+ },
400
+ "generation_config": {
401
+ "generation_args": {
402
+ "temperature": 1.0,
403
+ "top_p": 0.95,
404
+ "top_k": 40.0,
405
+ "max_tokens": 64000,
406
+ "max_attempts": 1
407
+ },
408
+ "additional_details": {
409
+ "until": "[]",
410
+ "do_sample": "true",
411
+ "min_p": "0.0",
412
+ "presence_penalty": "1.5",
413
+ "repetition_penalty": "1.0",
414
+ "seed": "1234",
415
+ "num_fewshot": "0"
416
+ }
417
+ }
418
+ },
419
+ {
420
+ "evaluation_name": "mmlu_pro_chat_law/custom-extract",
421
+ "source_data": {
422
+ "dataset_name": "mmlu_pro_chat",
423
+ "source_type": "hf_dataset",
424
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
425
+ "hf_split": "test"
426
+ },
427
+ "evaluation_timestamp": "1776226127",
428
+ "metric_config": {
429
+ "evaluation_description": "exact_match (filter: custom-extract)",
430
+ "lower_is_better": false,
431
+ "score_type": "continuous",
432
+ "min_score": 0.0,
433
+ "max_score": 1.0
434
+ },
435
+ "score_details": {
436
+ "score": 0.55858310626703,
437
+ "details": {
438
+ "seed_scores": "[0.5640326975476839, 0.5404178019981835, 0.5712988192552225]",
439
+ "seed_values": "[1234, 4158, 42]"
440
+ },
441
+ "uncertainty": {
442
+ "standard_error": {
443
+ "value": 0.00932171015340034,
444
+ "method": "across_seeds"
445
+ },
446
+ "num_samples": 3
447
+ }
448
+ },
449
+ "generation_config": {
450
+ "generation_args": {
451
+ "temperature": 1.0,
452
+ "top_p": 0.95,
453
+ "top_k": 40.0,
454
+ "max_tokens": 64000,
455
+ "max_attempts": 1
456
+ },
457
+ "additional_details": {
458
+ "until": "[]",
459
+ "do_sample": "true",
460
+ "min_p": "0.0",
461
+ "presence_penalty": "1.5",
462
+ "repetition_penalty": "1.0",
463
+ "seed": "1234",
464
+ "num_fewshot": "0"
465
+ }
466
+ }
467
+ },
468
+ {
469
+ "evaluation_name": "mmlu_pro_chat_math/custom-extract",
470
+ "source_data": {
471
+ "dataset_name": "mmlu_pro_chat",
472
+ "source_type": "hf_dataset",
473
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
474
+ "hf_split": "test"
475
+ },
476
+ "evaluation_timestamp": "1776226127",
477
+ "metric_config": {
478
+ "evaluation_description": "exact_match (filter: custom-extract)",
479
+ "lower_is_better": false,
480
+ "score_type": "continuous",
481
+ "min_score": 0.0,
482
+ "max_score": 1.0
483
+ },
484
+ "score_details": {
485
+ "score": 0.9338761411300271,
486
+ "details": {
487
+ "seed_scores": "[0.9341228719467062, 0.9356032568467801, 0.9319022945965951]",
488
+ "seed_values": "[1234, 4158, 42]"
489
+ },
490
+ "uncertainty": {
491
+ "standard_error": {
492
+ "value": 0.0010754746961610265,
493
+ "method": "across_seeds"
494
+ },
495
+ "num_samples": 3
496
+ }
497
+ },
498
+ "generation_config": {
499
+ "generation_args": {
500
+ "temperature": 1.0,
501
+ "top_p": 0.95,
502
+ "top_k": 40.0,
503
+ "max_tokens": 64000,
504
+ "max_attempts": 1
505
+ },
506
+ "additional_details": {
507
+ "until": "[]",
508
+ "do_sample": "true",
509
+ "min_p": "0.0",
510
+ "presence_penalty": "1.5",
511
+ "repetition_penalty": "1.0",
512
+ "seed": "1234",
513
+ "num_fewshot": "0"
514
+ }
515
+ }
516
+ },
517
+ {
518
+ "evaluation_name": "mmlu_pro_chat_other/custom-extract",
519
+ "source_data": {
520
+ "dataset_name": "mmlu_pro_chat",
521
+ "source_type": "hf_dataset",
522
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
523
+ "hf_split": "test"
524
+ },
525
+ "evaluation_timestamp": "1776226127",
526
+ "metric_config": {
527
+ "evaluation_description": "exact_match (filter: custom-extract)",
528
+ "lower_is_better": false,
529
+ "score_type": "continuous",
530
+ "min_score": 0.0,
531
+ "max_score": 1.0
532
+ },
533
+ "score_details": {
534
+ "score": 0.7525252525252526,
535
+ "details": {
536
+ "seed_scores": "[0.762987012987013, 0.7435064935064936, 0.7510822510822511]",
537
+ "seed_values": "[1234, 4158, 42]"
538
+ },
539
+ "uncertainty": {
540
+ "standard_error": {
541
+ "value": 0.005669636957251685,
542
+ "method": "across_seeds"
543
+ },
544
+ "num_samples": 3
545
+ }
546
+ },
547
+ "generation_config": {
548
+ "generation_args": {
549
+ "temperature": 1.0,
550
+ "top_p": 0.95,
551
+ "top_k": 40.0,
552
+ "max_tokens": 64000,
553
+ "max_attempts": 1
554
+ },
555
+ "additional_details": {
556
+ "until": "[]",
557
+ "do_sample": "true",
558
+ "min_p": "0.0",
559
+ "presence_penalty": "1.5",
560
+ "repetition_penalty": "1.0",
561
+ "seed": "1234",
562
+ "num_fewshot": "0"
563
+ }
564
+ }
565
+ },
566
+ {
567
+ "evaluation_name": "mmlu_pro_chat_philosophy/custom-extract",
568
+ "source_data": {
569
+ "dataset_name": "mmlu_pro_chat",
570
+ "source_type": "hf_dataset",
571
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
572
+ "hf_split": "test"
573
+ },
574
+ "evaluation_timestamp": "1776226127",
575
+ "metric_config": {
576
+ "evaluation_description": "exact_match (filter: custom-extract)",
577
+ "lower_is_better": false,
578
+ "score_type": "continuous",
579
+ "min_score": 0.0,
580
+ "max_score": 1.0
581
+ },
582
+ "score_details": {
583
+ "score": 0.6893787575150301,
584
+ "details": {
585
+ "seed_scores": "[0.6833667334669339, 0.685370741482966, 0.6993987975951904]",
586
+ "seed_values": "[1234, 4158, 42]"
587
+ },
588
+ "uncertainty": {
589
+ "standard_error": {
590
+ "value": 0.0050433095759991695,
591
+ "method": "across_seeds"
592
+ },
593
+ "num_samples": 3
594
+ }
595
+ },
596
+ "generation_config": {
597
+ "generation_args": {
598
+ "temperature": 1.0,
599
+ "top_p": 0.95,
600
+ "top_k": 40.0,
601
+ "max_tokens": 64000,
602
+ "max_attempts": 1
603
+ },
604
+ "additional_details": {
605
+ "until": "[]",
606
+ "do_sample": "true",
607
+ "min_p": "0.0",
608
+ "presence_penalty": "1.5",
609
+ "repetition_penalty": "1.0",
610
+ "seed": "1234",
611
+ "num_fewshot": "0"
612
+ }
613
+ }
614
+ },
615
+ {
616
+ "evaluation_name": "mmlu_pro_chat_physics/custom-extract",
617
+ "source_data": {
618
+ "dataset_name": "mmlu_pro_chat",
619
+ "source_type": "hf_dataset",
620
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
621
+ "hf_split": "test"
622
+ },
623
+ "evaluation_timestamp": "1776226127",
624
+ "metric_config": {
625
+ "evaluation_description": "exact_match (filter: custom-extract)",
626
+ "lower_is_better": false,
627
+ "score_type": "continuous",
628
+ "min_score": 0.0,
629
+ "max_score": 1.0
630
+ },
631
+ "score_details": {
632
+ "score": 0.8763151141904029,
633
+ "details": {
634
+ "seed_scores": "[0.8752886836027713, 0.876058506543495, 0.8775981524249422]",
635
+ "seed_values": "[1234, 4158, 42]"
636
+ },
637
+ "uncertainty": {
638
+ "standard_error": {
639
+ "value": 0.0006789200182357032,
640
+ "method": "across_seeds"
641
+ },
642
+ "num_samples": 3
643
+ }
644
+ },
645
+ "generation_config": {
646
+ "generation_args": {
647
+ "temperature": 1.0,
648
+ "top_p": 0.95,
649
+ "top_k": 40.0,
650
+ "max_tokens": 64000,
651
+ "max_attempts": 1
652
+ },
653
+ "additional_details": {
654
+ "until": "[]",
655
+ "do_sample": "true",
656
+ "min_p": "0.0",
657
+ "presence_penalty": "1.5",
658
+ "repetition_penalty": "1.0",
659
+ "seed": "1234",
660
+ "num_fewshot": "0"
661
+ }
662
+ }
663
+ },
664
+ {
665
+ "evaluation_name": "mmlu_pro_chat_psychology/custom-extract",
666
+ "source_data": {
667
+ "dataset_name": "mmlu_pro_chat",
668
+ "source_type": "hf_dataset",
669
+ "hf_repo": "TIGER-Lab/MMLU-Pro",
670
+ "hf_split": "test"
671
+ },
672
+ "evaluation_timestamp": "1776226127",
673
+ "metric_config": {
674
+ "evaluation_description": "exact_match (filter: custom-extract)",
675
+ "lower_is_better": false,
676
+ "score_type": "continuous",
677
+ "min_score": 0.0,
678
+ "max_score": 1.0
679
+ },
680
+ "score_details": {
681
+ "score": 0.789891395154553,
682
+ "details": {
683
+ "seed_scores": "[0.7869674185463659, 0.7907268170426065, 0.7919799498746867]",
684
+ "seed_values": "[1234, 4158, 42]"
685
+ },
686
+ "uncertainty": {
687
+ "standard_error": {
688
+ "value": 0.0015060782270108586,
689
+ "method": "across_seeds"
690
+ },
691
+ "num_samples": 3
692
+ }
693
+ },
694
+ "generation_config": {
695
+ "generation_args": {
696
+ "temperature": 1.0,
697
+ "top_p": 0.95,
698
+ "top_k": 40.0,
699
+ "max_tokens": 64000,
700
+ "max_attempts": 1
701
+ },
702
+ "additional_details": {
703
+ "until": "[]",
704
+ "do_sample": "true",
705
+ "min_p": "0.0",
706
+ "presence_penalty": "1.5",
707
+ "repetition_penalty": "1.0",
708
+ "seed": "1234",
709
+ "num_fewshot": "0"
710
+ }
711
+ }
712
+ },
713
+ {
714
+ "evaluation_name": "mmlu_pro_chat/custom-extract",
715
+ "source_data": {
716
+ "dataset_name": "mmlu_pro_chat",
717
+ "source_type": "other"
718
+ },
719
+ "evaluation_timestamp": "1776226127",
720
+ "metric_config": {
721
+ "evaluation_description": "exact_match (filter: custom-extract)",
722
+ "lower_is_better": false,
723
+ "score_type": "continuous",
724
+ "min_score": 0.0,
725
+ "max_score": 1.0
726
+ },
727
+ "score_details": {
728
+ "score": 0.8008089539007092,
729
+ "details": {
730
+ "seed_scores": "[0.8029421542553191, 0.7964594414893617, 0.8030252659574468]",
731
+ "seed_values": "[1234, 4158, 42]"
732
+ },
733
+ "uncertainty": {
734
+ "standard_error": {
735
+ "value": 0.002174888545121904,
736
+ "method": "across_seeds"
737
+ },
738
+ "num_samples": 3
739
+ }
740
+ }
741
+ }
742
+ ]
743
+ }