lewtun HF staff committed on
Commit
fdd3c5a
·
1 Parent(s): cc42f20

Add Kaggle math

Browse files
app.py CHANGED
@@ -57,10 +57,7 @@ def get_leaderboard_df(merge_values: bool = True):
57
  elif task.lower() == "agieval":
58
  value = data["results"]["all"]["acc_norm"]
59
  # MATH reports qem
60
- elif task.lower() == "math":
61
- value = data["results"]["all"]["qem"]
62
- # MINIMATH reports qem
63
- elif task.lower() == "mini_math":
64
  value = data["results"]["all"]["qem"]
65
  else:
66
  first_metric_key = next(
 
57
  elif task.lower() == "agieval":
58
  value = data["results"]["all"]["acc_norm"]
59
  # MATH reports qem
60
+ elif task.lower() in ["math", "mini_math", "aimo_kaggle"]:
 
 
 
61
  value = data["results"]["all"]["qem"]
62
  else:
63
  first_metric_key = next(
eval_results/AI-MO/mistral-7b-sft/aimo_v03.00/aimo_kaggle/results_2024-04-22T15-38-51.171312.json DELETED
@@ -1,89 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 6210406.872827123,
9
- "end_time": 6210648.874373582,
10
- "total_evaluation_time_secondes": "242.0015464592725",
11
- "model_name": "AI-MO/mistral-7b-sft",
12
- "model_sha": "159047b1ab76bbb7c9369ee71bfef1d441fc029e",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "13.99 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|aimo_kaggle|0": {
19
- "qem": 0.34615384615384615,
20
- "qem_stderr": 0.05421594160377287
21
- },
22
- "all": {
23
- "qem": 0.34615384615384615,
24
- "qem_stderr": 0.05421594160377287
25
- }
26
- },
27
- "versions": {
28
- "custom|aimo_kaggle|0": 0
29
- },
30
- "config_tasks": {
31
- "custom|aimo_kaggle": {
32
- "name": "aimo_kaggle",
33
- "prompt_function": "kaggle_prompt_fn",
34
- "hf_repo": "AI-MO/kaggle-validation-set-v1",
35
- "hf_subset": "default",
36
- "metric": [
37
- "quasi_exact_match_math"
38
- ],
39
- "hf_avail_splits": [
40
- "train"
41
- ],
42
- "evaluation_splits": [
43
- "train"
44
- ],
45
- "few_shots_split": null,
46
- "few_shots_select": null,
47
- "generation_size": 2048,
48
- "stop_sequence": null,
49
- "output_regex": null,
50
- "frozen": false,
51
- "suite": [
52
- "custom"
53
- ],
54
- "original_num_docs": 78,
55
- "effective_num_docs": 78,
56
- "trust_dataset": null,
57
- "must_remove_duplicate_docs": null
58
- }
59
- },
60
- "summary_tasks": {
61
- "custom|aimo_kaggle|0": {
62
- "hashes": {
63
- "hash_examples": "df0b9e4031e0ac93",
64
- "hash_full_prompts": "7dc3f63cdfbfe7fe",
65
- "hash_input_tokens": "88b37a81252e9f7f",
66
- "hash_cont_tokens": "6fa54d6858c4d02d"
67
- },
68
- "truncated": 78,
69
- "non_truncated": 0,
70
- "padded": 44,
71
- "non_padded": 34,
72
- "effective_few_shots": 0.0,
73
- "num_truncated_few_shots": 0
74
- }
75
- },
76
- "summary_general": {
77
- "hashes": {
78
- "hash_examples": "f5b0a912024169d0",
79
- "hash_full_prompts": "4532c87f6934c354",
80
- "hash_input_tokens": "8e967efb2f3aee9d",
81
- "hash_cont_tokens": "8b4f673571e10c5d"
82
- },
83
- "truncated": 78,
84
- "non_truncated": 0,
85
- "padded": 44,
86
- "non_padded": 34,
87
- "num_truncated_few_shots": 0
88
- }
89
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eval_results/AI-MO/mistral-7b-sft/aimo_v03.00/mini_math/results_2024-04-23T12-07-13.400858.json DELETED
@@ -1,291 +0,0 @@
1
- {
2
- "config_general": {
3
- "lighteval_sha": "?",
4
- "num_fewshot_seeds": 1,
5
- "override_batch_size": 4,
6
- "max_samples": null,
7
- "job_id": "",
8
- "start_time": 1691110.31978568,
9
- "end_time": 1692074.373145221,
10
- "total_evaluation_time_secondes": "964.0533595411107",
11
- "model_name": "AI-MO/mistral-7b-sft",
12
- "model_sha": "159047b1ab76bbb7c9369ee71bfef1d441fc029e",
13
- "model_dtype": "torch.bfloat16",
14
- "model_size": "13.99 GB",
15
- "config": null
16
- },
17
- "results": {
18
- "custom|mini_math:level_1|0": {
19
- "qem": 0.6842105263157895,
20
- "qem_stderr": 0.07641750223933595
21
- },
22
- "custom|mini_math:level_2|0": {
23
- "qem": 0.4225352112676056,
24
- "qem_stderr": 0.05903984205682581
25
- },
26
- "custom|mini_math:level_3|0": {
27
- "qem": 0.28205128205128205,
28
- "qem_stderr": 0.051282051282051246
29
- },
30
- "custom|mini_math:level_4|0": {
31
- "qem": 0.24770642201834864,
32
- "qem_stderr": 0.04153846393641014
33
- },
34
- "custom|mini_math:level_5|0": {
35
- "qem": 0.06542056074766354,
36
- "qem_stderr": 0.02401664339953487
37
- },
38
- "custom|mini_math:_average|0": {
39
- "qem": 0.34038480048013786,
40
- "qem_stderr": 0.0504589005828316
41
- },
42
- "all": {
43
- "qem": 0.34038480048013786,
44
- "qem_stderr": 0.0504589005828316
45
- }
46
- },
47
- "versions": {
48
- "custom|mini_math:level_1|0": 0,
49
- "custom|mini_math:level_2|0": 0,
50
- "custom|mini_math:level_3|0": 0,
51
- "custom|mini_math:level_4|0": 0,
52
- "custom|mini_math:level_5|0": 0
53
- },
54
- "config_tasks": {
55
- "custom|mini_math:level_1": {
56
- "name": "mini_math:level_1",
57
- "prompt_function": "math",
58
- "hf_repo": "HuggingFaceH4/lighteval-mini-math",
59
- "hf_subset": "Level 1",
60
- "metric": [
61
- "quasi_exact_match_math"
62
- ],
63
- "hf_avail_splits": [
64
- "train",
65
- "test"
66
- ],
67
- "evaluation_splits": [
68
- "test"
69
- ],
70
- "few_shots_split": null,
71
- "few_shots_select": null,
72
- "generation_size": 2048,
73
- "stop_sequence": null,
74
- "output_regex": null,
75
- "frozen": false,
76
- "suite": [
77
- "custom",
78
- "mini_math"
79
- ],
80
- "original_num_docs": 38,
81
- "effective_num_docs": 38,
82
- "trust_dataset": null,
83
- "must_remove_duplicate_docs": null
84
- },
85
- "custom|mini_math:level_2": {
86
- "name": "mini_math:level_2",
87
- "prompt_function": "math",
88
- "hf_repo": "HuggingFaceH4/lighteval-mini-math",
89
- "hf_subset": "Level 2",
90
- "metric": [
91
- "quasi_exact_match_math"
92
- ],
93
- "hf_avail_splits": [
94
- "train",
95
- "test"
96
- ],
97
- "evaluation_splits": [
98
- "test"
99
- ],
100
- "few_shots_split": null,
101
- "few_shots_select": null,
102
- "generation_size": 2048,
103
- "stop_sequence": null,
104
- "output_regex": null,
105
- "frozen": false,
106
- "suite": [
107
- "custom",
108
- "mini_math"
109
- ],
110
- "original_num_docs": 71,
111
- "effective_num_docs": 71,
112
- "trust_dataset": null,
113
- "must_remove_duplicate_docs": null
114
- },
115
- "custom|mini_math:level_3": {
116
- "name": "mini_math:level_3",
117
- "prompt_function": "math",
118
- "hf_repo": "HuggingFaceH4/lighteval-mini-math",
119
- "hf_subset": "Level 3",
120
- "metric": [
121
- "quasi_exact_match_math"
122
- ],
123
- "hf_avail_splits": [
124
- "train",
125
- "test"
126
- ],
127
- "evaluation_splits": [
128
- "test"
129
- ],
130
- "few_shots_split": null,
131
- "few_shots_select": null,
132
- "generation_size": 2048,
133
- "stop_sequence": null,
134
- "output_regex": null,
135
- "frozen": false,
136
- "suite": [
137
- "custom",
138
- "mini_math"
139
- ],
140
- "original_num_docs": 78,
141
- "effective_num_docs": 78,
142
- "trust_dataset": null,
143
- "must_remove_duplicate_docs": null
144
- },
145
- "custom|mini_math:level_4": {
146
- "name": "mini_math:level_4",
147
- "prompt_function": "math",
148
- "hf_repo": "HuggingFaceH4/lighteval-mini-math",
149
- "hf_subset": "Level 4",
150
- "metric": [
151
- "quasi_exact_match_math"
152
- ],
153
- "hf_avail_splits": [
154
- "train",
155
- "test"
156
- ],
157
- "evaluation_splits": [
158
- "test"
159
- ],
160
- "few_shots_split": null,
161
- "few_shots_select": null,
162
- "generation_size": 2048,
163
- "stop_sequence": null,
164
- "output_regex": null,
165
- "frozen": false,
166
- "suite": [
167
- "custom",
168
- "mini_math"
169
- ],
170
- "original_num_docs": 109,
171
- "effective_num_docs": 109,
172
- "trust_dataset": null,
173
- "must_remove_duplicate_docs": null
174
- },
175
- "custom|mini_math:level_5": {
176
- "name": "mini_math:level_5",
177
- "prompt_function": "math",
178
- "hf_repo": "HuggingFaceH4/lighteval-mini-math",
179
- "hf_subset": "Level 5",
180
- "metric": [
181
- "quasi_exact_match_math"
182
- ],
183
- "hf_avail_splits": [
184
- "train",
185
- "test"
186
- ],
187
- "evaluation_splits": [
188
- "test"
189
- ],
190
- "few_shots_split": null,
191
- "few_shots_select": null,
192
- "generation_size": 2048,
193
- "stop_sequence": null,
194
- "output_regex": null,
195
- "frozen": false,
196
- "suite": [
197
- "custom",
198
- "mini_math"
199
- ],
200
- "original_num_docs": 107,
201
- "effective_num_docs": 107,
202
- "trust_dataset": null,
203
- "must_remove_duplicate_docs": null
204
- }
205
- },
206
- "summary_tasks": {
207
- "custom|mini_math:level_1|0": {
208
- "hashes": {
209
- "hash_examples": "781a25d9a2f29e87",
210
- "hash_full_prompts": "ab476c3f1e6189c8",
211
- "hash_input_tokens": "748b0dfb1d890562",
212
- "hash_cont_tokens": "719d7c50adc91f8f"
213
- },
214
- "truncated": 38,
215
- "non_truncated": 0,
216
- "padded": 23,
217
- "non_padded": 15,
218
- "effective_few_shots": 0.0,
219
- "num_truncated_few_shots": 0
220
- },
221
- "custom|mini_math:level_2|0": {
222
- "hashes": {
223
- "hash_examples": "471a4a8568c70994",
224
- "hash_full_prompts": "8ec479c7a8a3d59b",
225
- "hash_input_tokens": "79bfecc8111dea84",
226
- "hash_cont_tokens": "238f82a182eb52a4"
227
- },
228
- "truncated": 71,
229
- "non_truncated": 0,
230
- "padded": 35,
231
- "non_padded": 36,
232
- "effective_few_shots": 0.0,
233
- "num_truncated_few_shots": 0
234
- },
235
- "custom|mini_math:level_3|0": {
236
- "hashes": {
237
- "hash_examples": "aa6979a90c9a7776",
238
- "hash_full_prompts": "0c55bb3e608c2221",
239
- "hash_input_tokens": "152332152ec83113",
240
- "hash_cont_tokens": "15e129814ba3f8ce"
241
- },
242
- "truncated": 78,
243
- "non_truncated": 0,
244
- "padded": 36,
245
- "non_padded": 42,
246
- "effective_few_shots": 0.0,
247
- "num_truncated_few_shots": 0
248
- },
249
- "custom|mini_math:level_4|0": {
250
- "hashes": {
251
- "hash_examples": "2e9e3be067de6ec6",
252
- "hash_full_prompts": "e471b8649404f9a7",
253
- "hash_input_tokens": "b0305a77648ed6a6",
254
- "hash_cont_tokens": "0d133d0776517b35"
255
- },
256
- "truncated": 108,
257
- "non_truncated": 1,
258
- "padded": 36,
259
- "non_padded": 73,
260
- "effective_few_shots": 0.0,
261
- "num_truncated_few_shots": 0
262
- },
263
- "custom|mini_math:level_5|0": {
264
- "hashes": {
265
- "hash_examples": "18b402b510f9a746",
266
- "hash_full_prompts": "12e58a2a352d2763",
267
- "hash_input_tokens": "5c4b6f568f76445a",
268
- "hash_cont_tokens": "c138c90600d234a1"
269
- },
270
- "truncated": 107,
271
- "non_truncated": 0,
272
- "padded": 39,
273
- "non_padded": 68,
274
- "effective_few_shots": 0.0,
275
- "num_truncated_few_shots": 0
276
- }
277
- },
278
- "summary_general": {
279
- "hashes": {
280
- "hash_examples": "470ce01cab0383c9",
281
- "hash_full_prompts": "3a272d494f912b3f",
282
- "hash_input_tokens": "7d3b36c2ba40899d",
283
- "hash_cont_tokens": "017293ac30be3dc4"
284
- },
285
- "truncated": 402,
286
- "non_truncated": 1,
287
- "padded": 169,
288
- "non_padded": 234,
289
- "num_truncated_few_shots": 0
290
- }
291
- }