lewtun HF staff commited on
Commit
0b07157
·
verified ·
1 Parent(s): 77b3925

Upload eval_results/Qwen/Qwen1.5-7B-Chat/main/bbh/results_2024-03-28T13-49-45.748302.json with huggingface_hub

Browse files
eval_results/Qwen/Qwen1.5-7B-Chat/main/bbh/results_2024-03-28T13-49-45.748302.json ADDED
@@ -0,0 +1,1184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "?",
4
+ "num_fewshot_seeds": 1,
5
+ "override_batch_size": 1,
6
+ "max_samples": null,
7
+ "job_id": "",
8
+ "start_time": 2069217.71399371,
9
+ "end_time": 2069504.513928286,
10
+ "total_evaluation_time_secondes": "286.7999345760327",
11
+ "model_name": "Qwen/Qwen1.5-7B-Chat",
12
+ "model_sha": "a2662f4bc1afe913a91cd49f794d229a8c28f97e",
13
+ "model_dtype": "torch.bfloat16",
14
+ "model_size": "14.88 GB",
15
+ "config": null
16
+ },
17
+ "results": {
18
+ "harness|bbh:causal_judgment|3": {
19
+ "em": 0.5240641711229946,
20
+ "em_stderr": 0.03661929361528703,
21
+ "qem": 0.5240641711229946,
22
+ "qem_stderr": 0.03661929361528703,
23
+ "pem": 0.5240641711229946,
24
+ "pem_stderr": 0.03661929361528703,
25
+ "pqem": 0.5240641711229946,
26
+ "pqem_stderr": 0.03661929361528703,
27
+ "perfect_em": 0.5240641711229946,
28
+ "perfect_em_stderr": 0.03661929361528703
29
+ },
30
+ "harness|bbh:date_understanding|3": {
31
+ "em": 0.176,
32
+ "em_stderr": 0.024133497525457116,
33
+ "qem": 0.176,
34
+ "qem_stderr": 0.024133497525457116,
35
+ "pem": 0.22,
36
+ "pem_stderr": 0.026251792824605845,
37
+ "pqem": 0.4,
38
+ "pqem_stderr": 0.031046021028253247,
39
+ "perfect_em": 0.176,
40
+ "perfect_em_stderr": 0.024133497525457116
41
+ },
42
+ "harness|bbh:disambiguation_qa|3": {
43
+ "em": 0.404,
44
+ "em_stderr": 0.031096688184825295,
45
+ "qem": 0.404,
46
+ "qem_stderr": 0.031096688184825295,
47
+ "pem": 0.484,
48
+ "pem_stderr": 0.03166998503010742,
49
+ "pqem": 0.776,
50
+ "pqem_stderr": 0.0264213616873479,
51
+ "perfect_em": 0.404,
52
+ "perfect_em_stderr": 0.031096688184825295
53
+ },
54
+ "harness|bbh:geometric_shapes|3": {
55
+ "em": 0.232,
56
+ "em_stderr": 0.02675007037486516,
57
+ "qem": 0.232,
58
+ "qem_stderr": 0.02675007037486516,
59
+ "pem": 0.252,
60
+ "pem_stderr": 0.027513851933031352,
61
+ "pqem": 0.252,
62
+ "pqem_stderr": 0.027513851933031352,
63
+ "perfect_em": 0.232,
64
+ "perfect_em_stderr": 0.02675007037486516
65
+ },
66
+ "harness|bbh:logical_deduction_five_objects|3": {
67
+ "em": 0.38,
68
+ "em_stderr": 0.030760116042626046,
69
+ "qem": 0.38,
70
+ "qem_stderr": 0.030760116042626046,
71
+ "pem": 0.436,
72
+ "pem_stderr": 0.03142556706028128,
73
+ "pqem": 0.572,
74
+ "pqem_stderr": 0.0313559689237726,
75
+ "perfect_em": 0.38,
76
+ "perfect_em_stderr": 0.030760116042626046
77
+ },
78
+ "harness|bbh:logical_deduction_seven_objects|3": {
79
+ "em": 0.284,
80
+ "em_stderr": 0.028576958730437405,
81
+ "qem": 0.284,
82
+ "qem_stderr": 0.028576958730437405,
83
+ "pem": 0.38,
84
+ "pem_stderr": 0.03076011604262603,
85
+ "pqem": 0.488,
86
+ "pqem_stderr": 0.03167708558254709,
87
+ "perfect_em": 0.284,
88
+ "perfect_em_stderr": 0.028576958730437405
89
+ },
90
+ "harness|bbh:logical_deduction_three_objects|3": {
91
+ "em": 0.476,
92
+ "em_stderr": 0.03164968895968782,
93
+ "qem": 0.476,
94
+ "qem_stderr": 0.03164968895968782,
95
+ "pem": 0.52,
96
+ "pem_stderr": 0.03166085340849519,
97
+ "pqem": 0.756,
98
+ "pqem_stderr": 0.027217995464553182,
99
+ "perfect_em": 0.476,
100
+ "perfect_em_stderr": 0.03164968895968782
101
+ },
102
+ "harness|bbh:movie_recommendation|3": {
103
+ "em": 0.3092369477911647,
104
+ "em_stderr": 0.02934839790043843,
105
+ "qem": 0.3092369477911647,
106
+ "qem_stderr": 0.02934839790043843,
107
+ "pem": 0.5863453815261044,
108
+ "pem_stderr": 0.031273022170585044,
109
+ "pqem": 0.678714859437751,
110
+ "pqem_stderr": 0.029652625884384973,
111
+ "perfect_em": 0.3092369477911647,
112
+ "perfect_em_stderr": 0.02934839790043843
113
+ },
114
+ "harness|bbh:navigate|3": {
115
+ "em": 0.6,
116
+ "em_stderr": 0.03104602102825325,
117
+ "qem": 0.6,
118
+ "qem_stderr": 0.03104602102825325,
119
+ "pem": 0.6,
120
+ "pem_stderr": 0.03104602102825325,
121
+ "pqem": 0.6,
122
+ "pqem_stderr": 0.03104602102825325,
123
+ "perfect_em": 0.6,
124
+ "perfect_em_stderr": 0.03104602102825325
125
+ },
126
+ "harness|bbh:reasoning_about_colored_objects|3": {
127
+ "em": 0.128,
128
+ "em_stderr": 0.021172081336336506,
129
+ "qem": 0.128,
130
+ "qem_stderr": 0.021172081336336506,
131
+ "pem": 0.304,
132
+ "pem_stderr": 0.029150213374159677,
133
+ "pqem": 0.428,
134
+ "pqem_stderr": 0.03135596892377261,
135
+ "perfect_em": 0.128,
136
+ "perfect_em_stderr": 0.021172081336336506
137
+ },
138
+ "harness|bbh:ruin_names|3": {
139
+ "em": 0.3709677419354839,
140
+ "em_stderr": 0.030736616282226906,
141
+ "qem": 0.3709677419354839,
142
+ "qem_stderr": 0.030736616282226906,
143
+ "pem": 0.40725806451612906,
144
+ "pem_stderr": 0.03126217550035507,
145
+ "pqem": 0.5685483870967742,
146
+ "pqem_stderr": 0.03151383724269122,
147
+ "perfect_em": 0.3709677419354839,
148
+ "perfect_em_stderr": 0.030736616282226906
149
+ },
150
+ "harness|bbh:salient_translation_error_detection|3": {
151
+ "em": 0.412,
152
+ "em_stderr": 0.031191596026022894,
153
+ "qem": 0.412,
154
+ "qem_stderr": 0.031191596026022894,
155
+ "pem": 0.412,
156
+ "pem_stderr": 0.031191596026022894,
157
+ "pqem": 0.536,
158
+ "pqem_stderr": 0.03160397514522374,
159
+ "perfect_em": 0.412,
160
+ "perfect_em_stderr": 0.031191596026022894
161
+ },
162
+ "harness|bbh:snarks|3": {
163
+ "em": 0.5561797752808989,
164
+ "em_stderr": 0.03734431584194243,
165
+ "qem": 0.5561797752808989,
166
+ "qem_stderr": 0.03734431584194243,
167
+ "pem": 0.5561797752808989,
168
+ "pem_stderr": 0.03734431584194243,
169
+ "pqem": 0.7134831460674157,
170
+ "pqem_stderr": 0.033984455265461594,
171
+ "perfect_em": 0.5561797752808989,
172
+ "perfect_em_stderr": 0.03734431584194243
173
+ },
174
+ "harness|bbh:sports_understanding|3": {
175
+ "em": 0.548,
176
+ "em_stderr": 0.03153986449255663,
177
+ "qem": 0.548,
178
+ "qem_stderr": 0.03153986449255663,
179
+ "pem": 0.56,
180
+ "pem_stderr": 0.031457244522235625,
181
+ "pqem": 0.56,
182
+ "pqem_stderr": 0.031457244522235625,
183
+ "perfect_em": 0.548,
184
+ "perfect_em_stderr": 0.03153986449255663
185
+ },
186
+ "harness|bbh:temporal_sequences|3": {
187
+ "em": 0.284,
188
+ "em_stderr": 0.028576958730437394,
189
+ "qem": 0.284,
190
+ "qem_stderr": 0.028576958730437394,
191
+ "pem": 0.44,
192
+ "pem_stderr": 0.031457244522235715,
193
+ "pqem": 0.604,
194
+ "pqem_stderr": 0.030993197854577853,
195
+ "perfect_em": 0.284,
196
+ "perfect_em_stderr": 0.028576958730437394
197
+ },
198
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
199
+ "em": 0.064,
200
+ "em_stderr": 0.015510587134374155,
201
+ "qem": 0.064,
202
+ "qem_stderr": 0.015510587134374155,
203
+ "pem": 0.156,
204
+ "pem_stderr": 0.022995023034068755,
205
+ "pqem": 0.348,
206
+ "pqem_stderr": 0.03018656846451171,
207
+ "perfect_em": 0.064,
208
+ "perfect_em_stderr": 0.015510587134374155
209
+ },
210
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
211
+ "em": 0.104,
212
+ "em_stderr": 0.019345100974843897,
213
+ "qem": 0.104,
214
+ "qem_stderr": 0.019345100974843897,
215
+ "pem": 0.124,
216
+ "pem_stderr": 0.02088638225867326,
217
+ "pqem": 0.26,
218
+ "pqem_stderr": 0.027797315752644304,
219
+ "perfect_em": 0.104,
220
+ "perfect_em_stderr": 0.019345100974843897
221
+ },
222
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
223
+ "em": 0.172,
224
+ "em_stderr": 0.023915513944486218,
225
+ "qem": 0.172,
226
+ "qem_stderr": 0.023915513944486218,
227
+ "pem": 0.352,
228
+ "pem_stderr": 0.03026628805735993,
229
+ "pqem": 0.664,
230
+ "pqem_stderr": 0.029933259094191516,
231
+ "perfect_em": 0.172,
232
+ "perfect_em_stderr": 0.023915513944486218
233
+ },
234
+ "harness|bbh:_average|3": {
235
+ "em": 0.33469159089614126,
236
+ "em_stderr": 0.028295187062505816,
237
+ "qem": 0.33469159089614126,
238
+ "qem_stderr": 0.028295187062505816,
239
+ "pem": 0.40632485513589595,
240
+ "pem_stderr": 0.03023505479168477,
241
+ "pqem": 0.5404894757624965,
242
+ "pqem_stderr": 0.03063200263404116,
243
+ "perfect_em": 0.33469159089614126,
244
+ "perfect_em_stderr": 0.028295187062505816
245
+ }
246
+ },
247
+ "versions": {
248
+ "harness|bbh:causal_judgment|3": 0,
249
+ "harness|bbh:date_understanding|3": 0,
250
+ "harness|bbh:disambiguation_qa|3": 0,
251
+ "harness|bbh:geometric_shapes|3": 0,
252
+ "harness|bbh:logical_deduction_five_objects|3": 0,
253
+ "harness|bbh:logical_deduction_seven_objects|3": 0,
254
+ "harness|bbh:logical_deduction_three_objects|3": 0,
255
+ "harness|bbh:movie_recommendation|3": 0,
256
+ "harness|bbh:navigate|3": 0,
257
+ "harness|bbh:reasoning_about_colored_objects|3": 0,
258
+ "harness|bbh:ruin_names|3": 0,
259
+ "harness|bbh:salient_translation_error_detection|3": 0,
260
+ "harness|bbh:snarks|3": 0,
261
+ "harness|bbh:sports_understanding|3": 0,
262
+ "harness|bbh:temporal_sequences|3": 0,
263
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": 0,
264
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0,
265
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": 0
266
+ },
267
+ "config_tasks": {
268
+ "harness|bbh:causal_judgment": {
269
+ "name": "bbh:causal_judgment",
270
+ "prompt_function": "bbh_causal_judgment",
271
+ "hf_repo": "lukaemon/bbh",
272
+ "hf_subset": "causal_judgement",
273
+ "metric": [
274
+ "exact_match",
275
+ "quasi_exact_match",
276
+ "prefix_exact_match",
277
+ "prefix_quasi_exact_match",
278
+ "perfect_exact_match"
279
+ ],
280
+ "hf_avail_splits": [
281
+ "test"
282
+ ],
283
+ "evaluation_splits": [
284
+ "test"
285
+ ],
286
+ "few_shots_split": null,
287
+ "few_shots_select": null,
288
+ "generation_size": 20,
289
+ "stop_sequence": [
290
+ "</s>",
291
+ "Q:",
292
+ "\n\n"
293
+ ],
294
+ "output_regex": null,
295
+ "frozen": false,
296
+ "suite": [
297
+ "harness"
298
+ ],
299
+ "original_num_docs": 187,
300
+ "effective_num_docs": 187,
301
+ "trust_dataset": true,
302
+ "must_remove_duplicate_docs": null
303
+ },
304
+ "harness|bbh:date_understanding": {
305
+ "name": "bbh:date_understanding",
306
+ "prompt_function": "bbh_date_understanding",
307
+ "hf_repo": "lukaemon/bbh",
308
+ "hf_subset": "date_understanding",
309
+ "metric": [
310
+ "exact_match",
311
+ "quasi_exact_match",
312
+ "prefix_exact_match",
313
+ "prefix_quasi_exact_match",
314
+ "perfect_exact_match"
315
+ ],
316
+ "hf_avail_splits": [
317
+ "test"
318
+ ],
319
+ "evaluation_splits": [
320
+ "test"
321
+ ],
322
+ "few_shots_split": null,
323
+ "few_shots_select": null,
324
+ "generation_size": 20,
325
+ "stop_sequence": [
326
+ "</s>",
327
+ "Q:",
328
+ "\n\n"
329
+ ],
330
+ "output_regex": null,
331
+ "frozen": false,
332
+ "suite": [
333
+ "harness"
334
+ ],
335
+ "original_num_docs": 250,
336
+ "effective_num_docs": 250,
337
+ "trust_dataset": true,
338
+ "must_remove_duplicate_docs": null
339
+ },
340
+ "harness|bbh:disambiguation_qa": {
341
+ "name": "bbh:disambiguation_qa",
342
+ "prompt_function": "bbh_disambiguation_qa",
343
+ "hf_repo": "lukaemon/bbh",
344
+ "hf_subset": "disambiguation_qa",
345
+ "metric": [
346
+ "exact_match",
347
+ "quasi_exact_match",
348
+ "prefix_exact_match",
349
+ "prefix_quasi_exact_match",
350
+ "perfect_exact_match"
351
+ ],
352
+ "hf_avail_splits": [
353
+ "test"
354
+ ],
355
+ "evaluation_splits": [
356
+ "test"
357
+ ],
358
+ "few_shots_split": null,
359
+ "few_shots_select": null,
360
+ "generation_size": 20,
361
+ "stop_sequence": [
362
+ "</s>",
363
+ "Q:",
364
+ "\n\n"
365
+ ],
366
+ "output_regex": null,
367
+ "frozen": false,
368
+ "suite": [
369
+ "harness"
370
+ ],
371
+ "original_num_docs": 250,
372
+ "effective_num_docs": 250,
373
+ "trust_dataset": true,
374
+ "must_remove_duplicate_docs": null
375
+ },
376
+ "harness|bbh:geometric_shapes": {
377
+ "name": "bbh:geometric_shapes",
378
+ "prompt_function": "bbh_geometric_shapes",
379
+ "hf_repo": "lukaemon/bbh",
380
+ "hf_subset": "geometric_shapes",
381
+ "metric": [
382
+ "exact_match",
383
+ "quasi_exact_match",
384
+ "prefix_exact_match",
385
+ "prefix_quasi_exact_match",
386
+ "perfect_exact_match"
387
+ ],
388
+ "hf_avail_splits": [
389
+ "test"
390
+ ],
391
+ "evaluation_splits": [
392
+ "test"
393
+ ],
394
+ "few_shots_split": null,
395
+ "few_shots_select": null,
396
+ "generation_size": 20,
397
+ "stop_sequence": [
398
+ "</s>",
399
+ "Q:",
400
+ "\n\n"
401
+ ],
402
+ "output_regex": null,
403
+ "frozen": false,
404
+ "suite": [
405
+ "harness"
406
+ ],
407
+ "original_num_docs": 250,
408
+ "effective_num_docs": 250,
409
+ "trust_dataset": true,
410
+ "must_remove_duplicate_docs": null
411
+ },
412
+ "harness|bbh:logical_deduction_five_objects": {
413
+ "name": "bbh:logical_deduction_five_objects",
414
+ "prompt_function": "bbh_logical_deduction_five_objects",
415
+ "hf_repo": "lukaemon/bbh",
416
+ "hf_subset": "logical_deduction_five_objects",
417
+ "metric": [
418
+ "exact_match",
419
+ "quasi_exact_match",
420
+ "prefix_exact_match",
421
+ "prefix_quasi_exact_match",
422
+ "perfect_exact_match"
423
+ ],
424
+ "hf_avail_splits": [
425
+ "test"
426
+ ],
427
+ "evaluation_splits": [
428
+ "test"
429
+ ],
430
+ "few_shots_split": null,
431
+ "few_shots_select": null,
432
+ "generation_size": 20,
433
+ "stop_sequence": [
434
+ "</s>",
435
+ "Q:",
436
+ "\n\n"
437
+ ],
438
+ "output_regex": null,
439
+ "frozen": false,
440
+ "suite": [
441
+ "harness"
442
+ ],
443
+ "original_num_docs": 250,
444
+ "effective_num_docs": 250,
445
+ "trust_dataset": true,
446
+ "must_remove_duplicate_docs": null
447
+ },
448
+ "harness|bbh:logical_deduction_seven_objects": {
449
+ "name": "bbh:logical_deduction_seven_objects",
450
+ "prompt_function": "bbh_logical_deduction_seven_objects",
451
+ "hf_repo": "lukaemon/bbh",
452
+ "hf_subset": "logical_deduction_seven_objects",
453
+ "metric": [
454
+ "exact_match",
455
+ "quasi_exact_match",
456
+ "prefix_exact_match",
457
+ "prefix_quasi_exact_match",
458
+ "perfect_exact_match"
459
+ ],
460
+ "hf_avail_splits": [
461
+ "test"
462
+ ],
463
+ "evaluation_splits": [
464
+ "test"
465
+ ],
466
+ "few_shots_split": null,
467
+ "few_shots_select": null,
468
+ "generation_size": 20,
469
+ "stop_sequence": [
470
+ "</s>",
471
+ "Q:",
472
+ "\n\n"
473
+ ],
474
+ "output_regex": null,
475
+ "frozen": false,
476
+ "suite": [
477
+ "harness"
478
+ ],
479
+ "original_num_docs": 250,
480
+ "effective_num_docs": 250,
481
+ "trust_dataset": true,
482
+ "must_remove_duplicate_docs": null
483
+ },
484
+ "harness|bbh:logical_deduction_three_objects": {
485
+ "name": "bbh:logical_deduction_three_objects",
486
+ "prompt_function": "bbh_logical_deduction_three_objects",
487
+ "hf_repo": "lukaemon/bbh",
488
+ "hf_subset": "logical_deduction_three_objects",
489
+ "metric": [
490
+ "exact_match",
491
+ "quasi_exact_match",
492
+ "prefix_exact_match",
493
+ "prefix_quasi_exact_match",
494
+ "perfect_exact_match"
495
+ ],
496
+ "hf_avail_splits": [
497
+ "test"
498
+ ],
499
+ "evaluation_splits": [
500
+ "test"
501
+ ],
502
+ "few_shots_split": null,
503
+ "few_shots_select": null,
504
+ "generation_size": 20,
505
+ "stop_sequence": [
506
+ "</s>",
507
+ "Q:",
508
+ "\n\n"
509
+ ],
510
+ "output_regex": null,
511
+ "frozen": false,
512
+ "suite": [
513
+ "harness"
514
+ ],
515
+ "original_num_docs": 250,
516
+ "effective_num_docs": 250,
517
+ "trust_dataset": true,
518
+ "must_remove_duplicate_docs": null
519
+ },
520
+ "harness|bbh:movie_recommendation": {
521
+ "name": "bbh:movie_recommendation",
522
+ "prompt_function": "bbh_movie_recommendation",
523
+ "hf_repo": "lukaemon/bbh",
524
+ "hf_subset": "movie_recommendation",
525
+ "metric": [
526
+ "exact_match",
527
+ "quasi_exact_match",
528
+ "prefix_exact_match",
529
+ "prefix_quasi_exact_match",
530
+ "perfect_exact_match"
531
+ ],
532
+ "hf_avail_splits": [
533
+ "test"
534
+ ],
535
+ "evaluation_splits": [
536
+ "test"
537
+ ],
538
+ "few_shots_split": null,
539
+ "few_shots_select": null,
540
+ "generation_size": 20,
541
+ "stop_sequence": [
542
+ "</s>",
543
+ "Q:",
544
+ "\n\n"
545
+ ],
546
+ "output_regex": null,
547
+ "frozen": false,
548
+ "suite": [
549
+ "harness"
550
+ ],
551
+ "original_num_docs": 249,
552
+ "effective_num_docs": 249,
553
+ "trust_dataset": true,
554
+ "must_remove_duplicate_docs": null
555
+ },
556
+ "harness|bbh:navigate": {
557
+ "name": "bbh:navigate",
558
+ "prompt_function": "bbh_navigate",
559
+ "hf_repo": "lukaemon/bbh",
560
+ "hf_subset": "navigate",
561
+ "metric": [
562
+ "exact_match",
563
+ "quasi_exact_match",
564
+ "prefix_exact_match",
565
+ "prefix_quasi_exact_match",
566
+ "perfect_exact_match"
567
+ ],
568
+ "hf_avail_splits": [
569
+ "test"
570
+ ],
571
+ "evaluation_splits": [
572
+ "test"
573
+ ],
574
+ "few_shots_split": null,
575
+ "few_shots_select": null,
576
+ "generation_size": 20,
577
+ "stop_sequence": [
578
+ "</s>",
579
+ "Q:",
580
+ "\n\n"
581
+ ],
582
+ "output_regex": null,
583
+ "frozen": false,
584
+ "suite": [
585
+ "harness"
586
+ ],
587
+ "original_num_docs": 250,
588
+ "effective_num_docs": 250,
589
+ "trust_dataset": true,
590
+ "must_remove_duplicate_docs": null
591
+ },
592
+ "harness|bbh:reasoning_about_colored_objects": {
593
+ "name": "bbh:reasoning_about_colored_objects",
594
+ "prompt_function": "bbh_reasoning_about_colored_objects",
595
+ "hf_repo": "lukaemon/bbh",
596
+ "hf_subset": "reasoning_about_colored_objects",
597
+ "metric": [
598
+ "exact_match",
599
+ "quasi_exact_match",
600
+ "prefix_exact_match",
601
+ "prefix_quasi_exact_match",
602
+ "perfect_exact_match"
603
+ ],
604
+ "hf_avail_splits": [
605
+ "test"
606
+ ],
607
+ "evaluation_splits": [
608
+ "test"
609
+ ],
610
+ "few_shots_split": null,
611
+ "few_shots_select": null,
612
+ "generation_size": 20,
613
+ "stop_sequence": [
614
+ "</s>",
615
+ "Q:",
616
+ "\n\n"
617
+ ],
618
+ "output_regex": null,
619
+ "frozen": false,
620
+ "suite": [
621
+ "harness"
622
+ ],
623
+ "original_num_docs": 250,
624
+ "effective_num_docs": 250,
625
+ "trust_dataset": true,
626
+ "must_remove_duplicate_docs": null
627
+ },
628
+ "harness|bbh:ruin_names": {
629
+ "name": "bbh:ruin_names",
630
+ "prompt_function": "bbh_ruin_names",
631
+ "hf_repo": "lukaemon/bbh",
632
+ "hf_subset": "ruin_names",
633
+ "metric": [
634
+ "exact_match",
635
+ "quasi_exact_match",
636
+ "prefix_exact_match",
637
+ "prefix_quasi_exact_match",
638
+ "perfect_exact_match"
639
+ ],
640
+ "hf_avail_splits": [
641
+ "test"
642
+ ],
643
+ "evaluation_splits": [
644
+ "test"
645
+ ],
646
+ "few_shots_split": null,
647
+ "few_shots_select": null,
648
+ "generation_size": 20,
649
+ "stop_sequence": [
650
+ "</s>",
651
+ "Q:",
652
+ "\n\n"
653
+ ],
654
+ "output_regex": null,
655
+ "frozen": false,
656
+ "suite": [
657
+ "harness"
658
+ ],
659
+ "original_num_docs": 248,
660
+ "effective_num_docs": 248,
661
+ "trust_dataset": true,
662
+ "must_remove_duplicate_docs": null
663
+ },
664
+ "harness|bbh:salient_translation_error_detection": {
665
+ "name": "bbh:salient_translation_error_detection",
666
+ "prompt_function": "bbh_salient_translation_error_detection",
667
+ "hf_repo": "lukaemon/bbh",
668
+ "hf_subset": "salient_translation_error_detection",
669
+ "metric": [
670
+ "exact_match",
671
+ "quasi_exact_match",
672
+ "prefix_exact_match",
673
+ "prefix_quasi_exact_match",
674
+ "perfect_exact_match"
675
+ ],
676
+ "hf_avail_splits": [
677
+ "test"
678
+ ],
679
+ "evaluation_splits": [
680
+ "test"
681
+ ],
682
+ "few_shots_split": null,
683
+ "few_shots_select": null,
684
+ "generation_size": 20,
685
+ "stop_sequence": [
686
+ "</s>",
687
+ "Q:",
688
+ "\n\n"
689
+ ],
690
+ "output_regex": null,
691
+ "frozen": false,
692
+ "suite": [
693
+ "harness"
694
+ ],
695
+ "original_num_docs": 250,
696
+ "effective_num_docs": 250,
697
+ "trust_dataset": true,
698
+ "must_remove_duplicate_docs": null
699
+ },
700
+ "harness|bbh:snarks": {
701
+ "name": "bbh:snarks",
702
+ "prompt_function": "bbh_snarks",
703
+ "hf_repo": "lukaemon/bbh",
704
+ "hf_subset": "snarks",
705
+ "metric": [
706
+ "exact_match",
707
+ "quasi_exact_match",
708
+ "prefix_exact_match",
709
+ "prefix_quasi_exact_match",
710
+ "perfect_exact_match"
711
+ ],
712
+ "hf_avail_splits": [
713
+ "test"
714
+ ],
715
+ "evaluation_splits": [
716
+ "test"
717
+ ],
718
+ "few_shots_split": null,
719
+ "few_shots_select": null,
720
+ "generation_size": 20,
721
+ "stop_sequence": [
722
+ "</s>",
723
+ "Q:",
724
+ "\n\n"
725
+ ],
726
+ "output_regex": null,
727
+ "frozen": false,
728
+ "suite": [
729
+ "harness"
730
+ ],
731
+ "original_num_docs": 178,
732
+ "effective_num_docs": 178,
733
+ "trust_dataset": true,
734
+ "must_remove_duplicate_docs": null
735
+ },
736
+ "harness|bbh:sports_understanding": {
737
+ "name": "bbh:sports_understanding",
738
+ "prompt_function": "bbh_sports_understanding",
739
+ "hf_repo": "lukaemon/bbh",
740
+ "hf_subset": "sports_understanding",
741
+ "metric": [
742
+ "exact_match",
743
+ "quasi_exact_match",
744
+ "prefix_exact_match",
745
+ "prefix_quasi_exact_match",
746
+ "perfect_exact_match"
747
+ ],
748
+ "hf_avail_splits": [
749
+ "test"
750
+ ],
751
+ "evaluation_splits": [
752
+ "test"
753
+ ],
754
+ "few_shots_split": null,
755
+ "few_shots_select": null,
756
+ "generation_size": 20,
757
+ "stop_sequence": [
758
+ "</s>",
759
+ "Q:",
760
+ "\n\n"
761
+ ],
762
+ "output_regex": null,
763
+ "frozen": false,
764
+ "suite": [
765
+ "harness"
766
+ ],
767
+ "original_num_docs": 250,
768
+ "effective_num_docs": 250,
769
+ "trust_dataset": true,
770
+ "must_remove_duplicate_docs": null
771
+ },
772
+ "harness|bbh:temporal_sequences": {
773
+ "name": "bbh:temporal_sequences",
774
+ "prompt_function": "bbh_temporal_sequences",
775
+ "hf_repo": "lukaemon/bbh",
776
+ "hf_subset": "temporal_sequences",
777
+ "metric": [
778
+ "exact_match",
779
+ "quasi_exact_match",
780
+ "prefix_exact_match",
781
+ "prefix_quasi_exact_match",
782
+ "perfect_exact_match"
783
+ ],
784
+ "hf_avail_splits": [
785
+ "test"
786
+ ],
787
+ "evaluation_splits": [
788
+ "test"
789
+ ],
790
+ "few_shots_split": null,
791
+ "few_shots_select": null,
792
+ "generation_size": 20,
793
+ "stop_sequence": [
794
+ "</s>",
795
+ "Q:",
796
+ "\n\n"
797
+ ],
798
+ "output_regex": null,
799
+ "frozen": false,
800
+ "suite": [
801
+ "harness"
802
+ ],
803
+ "original_num_docs": 250,
804
+ "effective_num_docs": 250,
805
+ "trust_dataset": true,
806
+ "must_remove_duplicate_docs": null
807
+ },
808
+ "harness|bbh:tracking_shuffled_objects_five_objects": {
809
+ "name": "bbh:tracking_shuffled_objects_five_objects",
810
+ "prompt_function": "bbh_tracking_shuffled_objects_five_objects",
811
+ "hf_repo": "lukaemon/bbh",
812
+ "hf_subset": "tracking_shuffled_objects_five_objects",
813
+ "metric": [
814
+ "exact_match",
815
+ "quasi_exact_match",
816
+ "prefix_exact_match",
817
+ "prefix_quasi_exact_match",
818
+ "perfect_exact_match"
819
+ ],
820
+ "hf_avail_splits": [
821
+ "test"
822
+ ],
823
+ "evaluation_splits": [
824
+ "test"
825
+ ],
826
+ "few_shots_split": null,
827
+ "few_shots_select": null,
828
+ "generation_size": 20,
829
+ "stop_sequence": [
830
+ "</s>",
831
+ "Q:",
832
+ "\n\n"
833
+ ],
834
+ "output_regex": null,
835
+ "frozen": false,
836
+ "suite": [
837
+ "harness"
838
+ ],
839
+ "original_num_docs": 250,
840
+ "effective_num_docs": 250,
841
+ "trust_dataset": true,
842
+ "must_remove_duplicate_docs": null
843
+ },
844
+ "harness|bbh:tracking_shuffled_objects_seven_objects": {
845
+ "name": "bbh:tracking_shuffled_objects_seven_objects",
846
+ "prompt_function": "bbh_tracking_shuffled_objects_seven_objects",
847
+ "hf_repo": "lukaemon/bbh",
848
+ "hf_subset": "tracking_shuffled_objects_seven_objects",
849
+ "metric": [
850
+ "exact_match",
851
+ "quasi_exact_match",
852
+ "prefix_exact_match",
853
+ "prefix_quasi_exact_match",
854
+ "perfect_exact_match"
855
+ ],
856
+ "hf_avail_splits": [
857
+ "test"
858
+ ],
859
+ "evaluation_splits": [
860
+ "test"
861
+ ],
862
+ "few_shots_split": null,
863
+ "few_shots_select": null,
864
+ "generation_size": 20,
865
+ "stop_sequence": [
866
+ "</s>",
867
+ "Q:",
868
+ "\n\n"
869
+ ],
870
+ "output_regex": null,
871
+ "frozen": false,
872
+ "suite": [
873
+ "harness"
874
+ ],
875
+ "original_num_docs": 250,
876
+ "effective_num_docs": 250,
877
+ "trust_dataset": true,
878
+ "must_remove_duplicate_docs": null
879
+ },
880
+ "harness|bbh:tracking_shuffled_objects_three_objects": {
881
+ "name": "bbh:tracking_shuffled_objects_three_objects",
882
+ "prompt_function": "bbh_tracking_shuffled_objects_three_objects",
883
+ "hf_repo": "lukaemon/bbh",
884
+ "hf_subset": "tracking_shuffled_objects_three_objects",
885
+ "metric": [
886
+ "exact_match",
887
+ "quasi_exact_match",
888
+ "prefix_exact_match",
889
+ "prefix_quasi_exact_match",
890
+ "perfect_exact_match"
891
+ ],
892
+ "hf_avail_splits": [
893
+ "test"
894
+ ],
895
+ "evaluation_splits": [
896
+ "test"
897
+ ],
898
+ "few_shots_split": null,
899
+ "few_shots_select": null,
900
+ "generation_size": 20,
901
+ "stop_sequence": [
902
+ "</s>",
903
+ "Q:",
904
+ "\n\n"
905
+ ],
906
+ "output_regex": null,
907
+ "frozen": false,
908
+ "suite": [
909
+ "harness"
910
+ ],
911
+ "original_num_docs": 250,
912
+ "effective_num_docs": 250,
913
+ "trust_dataset": true,
914
+ "must_remove_duplicate_docs": null
915
+ }
916
+ },
917
+ "summary_tasks": {
918
+ "harness|bbh:causal_judgment|3": {
919
+ "hashes": {
920
+ "hash_examples": "63218f5ae055ab2b",
921
+ "hash_full_prompts": "148a4c45a8d2b858",
922
+ "hash_input_tokens": "41f3903cf7efd5be",
923
+ "hash_cont_tokens": "3b4e0e3575a9fe46"
924
+ },
925
+ "truncated": 187,
926
+ "non_truncated": 0,
927
+ "padded": 0,
928
+ "non_padded": 187,
929
+ "effective_few_shots": 3.0,
930
+ "num_truncated_few_shots": 0
931
+ },
932
+ "harness|bbh:date_understanding|3": {
933
+ "hashes": {
934
+ "hash_examples": "f145c7a06def3c8e",
935
+ "hash_full_prompts": "e79a3237877b106e",
936
+ "hash_input_tokens": "ae3f6744fd7add8b",
937
+ "hash_cont_tokens": "e8ae17ee5b188129"
938
+ },
939
+ "truncated": 250,
940
+ "non_truncated": 0,
941
+ "padded": 0,
942
+ "non_padded": 250,
943
+ "effective_few_shots": 3.0,
944
+ "num_truncated_few_shots": 0
945
+ },
946
+ "harness|bbh:disambiguation_qa|3": {
947
+ "hashes": {
948
+ "hash_examples": "19677fd1773f7eb9",
949
+ "hash_full_prompts": "9458fa1926b438bb",
950
+ "hash_input_tokens": "a9ea8e42e7ca4f0f",
951
+ "hash_cont_tokens": "e3bf43a8ea23285a"
952
+ },
953
+ "truncated": 250,
954
+ "non_truncated": 0,
955
+ "padded": 0,
956
+ "non_padded": 250,
957
+ "effective_few_shots": 3.0,
958
+ "num_truncated_few_shots": 0
959
+ },
960
+ "harness|bbh:geometric_shapes|3": {
961
+ "hashes": {
962
+ "hash_examples": "76c7b11a13cc72a9",
963
+ "hash_full_prompts": "78ead1e22de562a8",
964
+ "hash_input_tokens": "ded49f2dd58c24f6",
965
+ "hash_cont_tokens": "45151dea1e2e71f3"
966
+ },
967
+ "truncated": 250,
968
+ "non_truncated": 0,
969
+ "padded": 0,
970
+ "non_padded": 250,
971
+ "effective_few_shots": 3.0,
972
+ "num_truncated_few_shots": 0
973
+ },
974
+ "harness|bbh:logical_deduction_five_objects|3": {
975
+ "hashes": {
976
+ "hash_examples": "0e958c856332a745",
977
+ "hash_full_prompts": "0d7cff0e511b49e7",
978
+ "hash_input_tokens": "d16bae7ef2034333",
979
+ "hash_cont_tokens": "c51b8361a6f4e6bd"
980
+ },
981
+ "truncated": 250,
982
+ "non_truncated": 0,
983
+ "padded": 0,
984
+ "non_padded": 250,
985
+ "effective_few_shots": 3.0,
986
+ "num_truncated_few_shots": 0
987
+ },
988
+ "harness|bbh:logical_deduction_seven_objects|3": {
989
+ "hashes": {
990
+ "hash_examples": "ab9de25a5eb40d09",
991
+ "hash_full_prompts": "db7b7b19919ef4a7",
992
+ "hash_input_tokens": "8b1a10d2204d99c3",
993
+ "hash_cont_tokens": "ca75dd49f4c8d505"
994
+ },
995
+ "truncated": 250,
996
+ "non_truncated": 0,
997
+ "padded": 0,
998
+ "non_padded": 250,
999
+ "effective_few_shots": 3.0,
1000
+ "num_truncated_few_shots": 0
1001
+ },
1002
+ "harness|bbh:logical_deduction_three_objects|3": {
1003
+ "hashes": {
1004
+ "hash_examples": "3c6bf52517714218",
1005
+ "hash_full_prompts": "fd6a5580415c1e21",
1006
+ "hash_input_tokens": "d48e4b55d3c0ab6c",
1007
+ "hash_cont_tokens": "14d8d9e3f4729056"
1008
+ },
1009
+ "truncated": 250,
1010
+ "non_truncated": 0,
1011
+ "padded": 0,
1012
+ "non_padded": 250,
1013
+ "effective_few_shots": 3.0,
1014
+ "num_truncated_few_shots": 0
1015
+ },
1016
+ "harness|bbh:movie_recommendation|3": {
1017
+ "hashes": {
1018
+ "hash_examples": "2d9dc4975935d31a",
1019
+ "hash_full_prompts": "b7c9fdf1a2ad8106",
1020
+ "hash_input_tokens": "019f6011d8ebb7b4",
1021
+ "hash_cont_tokens": "4e32ce157b48f24a"
1022
+ },
1023
+ "truncated": 249,
1024
+ "non_truncated": 0,
1025
+ "padded": 0,
1026
+ "non_padded": 249,
1027
+ "effective_few_shots": 3.0,
1028
+ "num_truncated_few_shots": 0
1029
+ },
1030
+ "harness|bbh:navigate|3": {
1031
+ "hashes": {
1032
+ "hash_examples": "ba91dcdb9a064255",
1033
+ "hash_full_prompts": "35aa68650803f91c",
1034
+ "hash_input_tokens": "fbf7bbc927857899",
1035
+ "hash_cont_tokens": "6a051a13fb1325c7"
1036
+ },
1037
+ "truncated": 250,
1038
+ "non_truncated": 0,
1039
+ "padded": 0,
1040
+ "non_padded": 250,
1041
+ "effective_few_shots": 3.0,
1042
+ "num_truncated_few_shots": 0
1043
+ },
1044
+ "harness|bbh:reasoning_about_colored_objects|3": {
1045
+ "hashes": {
1046
+ "hash_examples": "a6ba328c4c3385d2",
1047
+ "hash_full_prompts": "954618143d9d5c6d",
1048
+ "hash_input_tokens": "eab386685d65e6d7",
1049
+ "hash_cont_tokens": "28c75cb68920c5e5"
1050
+ },
1051
+ "truncated": 250,
1052
+ "non_truncated": 0,
1053
+ "padded": 0,
1054
+ "non_padded": 250,
1055
+ "effective_few_shots": 3.0,
1056
+ "num_truncated_few_shots": 0
1057
+ },
1058
+ "harness|bbh:ruin_names|3": {
1059
+ "hashes": {
1060
+ "hash_examples": "2ef28d5f2d4fdd25",
1061
+ "hash_full_prompts": "fd807f4380c14312",
1062
+ "hash_input_tokens": "f17c7a83f10826d5",
1063
+ "hash_cont_tokens": "e2b9bda947d0d72f"
1064
+ },
1065
+ "truncated": 248,
1066
+ "non_truncated": 0,
1067
+ "padded": 0,
1068
+ "non_padded": 248,
1069
+ "effective_few_shots": 3.0,
1070
+ "num_truncated_few_shots": 0
1071
+ },
1072
+ "harness|bbh:salient_translation_error_detection|3": {
1073
+ "hashes": {
1074
+ "hash_examples": "c13f25ec8ffed496",
1075
+ "hash_full_prompts": "49ab3bc1ed62613f",
1076
+ "hash_input_tokens": "4ecdc0361737b984",
1077
+ "hash_cont_tokens": "650e93e42a6b12c3"
1078
+ },
1079
+ "truncated": 250,
1080
+ "non_truncated": 0,
1081
+ "padded": 0,
1082
+ "non_padded": 250,
1083
+ "effective_few_shots": 3.0,
1084
+ "num_truncated_few_shots": 0
1085
+ },
1086
+ "harness|bbh:snarks|3": {
1087
+ "hashes": {
1088
+ "hash_examples": "5f6db7bff7f6f22e",
1089
+ "hash_full_prompts": "fa5c1ca26f4a8d48",
1090
+ "hash_input_tokens": "e0e9ee6065d02980",
1091
+ "hash_cont_tokens": "c7cb581edecb36c5"
1092
+ },
1093
+ "truncated": 178,
1094
+ "non_truncated": 0,
1095
+ "padded": 0,
1096
+ "non_padded": 178,
1097
+ "effective_few_shots": 3.0,
1098
+ "num_truncated_few_shots": 0
1099
+ },
1100
+ "harness|bbh:sports_understanding|3": {
1101
+ "hashes": {
1102
+ "hash_examples": "042afbe5d9c1f02d",
1103
+ "hash_full_prompts": "607b29401b4907ec",
1104
+ "hash_input_tokens": "e2868a0e92e68765",
1105
+ "hash_cont_tokens": "96ff555916e57fb2"
1106
+ },
1107
+ "truncated": 250,
1108
+ "non_truncated": 0,
1109
+ "padded": 0,
1110
+ "non_padded": 250,
1111
+ "effective_few_shots": 3.0,
1112
+ "num_truncated_few_shots": 0
1113
+ },
1114
+ "harness|bbh:temporal_sequences|3": {
1115
+ "hashes": {
1116
+ "hash_examples": "803a05f352eb6afc",
1117
+ "hash_full_prompts": "faab4b5e14b9304e",
1118
+ "hash_input_tokens": "26a0ab6e389383c3",
1119
+ "hash_cont_tokens": "819d7bf2bc094710"
1120
+ },
1121
+ "truncated": 250,
1122
+ "non_truncated": 0,
1123
+ "padded": 0,
1124
+ "non_padded": 250,
1125
+ "effective_few_shots": 3.0,
1126
+ "num_truncated_few_shots": 0
1127
+ },
1128
+ "harness|bbh:tracking_shuffled_objects_five_objects|3": {
1129
+ "hashes": {
1130
+ "hash_examples": "2bbac6db7ab0d527",
1131
+ "hash_full_prompts": "7e689cfb3916666f",
1132
+ "hash_input_tokens": "d0e6770bedbf925a",
1133
+ "hash_cont_tokens": "16010037e6da1d04"
1134
+ },
1135
+ "truncated": 250,
1136
+ "non_truncated": 0,
1137
+ "padded": 0,
1138
+ "non_padded": 250,
1139
+ "effective_few_shots": 3.0,
1140
+ "num_truncated_few_shots": 0
1141
+ },
1142
+ "harness|bbh:tracking_shuffled_objects_seven_objects|3": {
1143
+ "hashes": {
1144
+ "hash_examples": "845caf093ac2b58c",
1145
+ "hash_full_prompts": "a80a61e259878fa0",
1146
+ "hash_input_tokens": "9b1288a55911b79d",
1147
+ "hash_cont_tokens": "2eb386173e7fff58"
1148
+ },
1149
+ "truncated": 250,
1150
+ "non_truncated": 0,
1151
+ "padded": 0,
1152
+ "non_padded": 250,
1153
+ "effective_few_shots": 3.0,
1154
+ "num_truncated_few_shots": 0
1155
+ },
1156
+ "harness|bbh:tracking_shuffled_objects_three_objects|3": {
1157
+ "hashes": {
1158
+ "hash_examples": "9004f14d5a32b9a8",
1159
+ "hash_full_prompts": "fc66cf32f54cd46f",
1160
+ "hash_input_tokens": "1328545243e4b107",
1161
+ "hash_cont_tokens": "405575bc8e10ea9d"
1162
+ },
1163
+ "truncated": 250,
1164
+ "non_truncated": 0,
1165
+ "padded": 0,
1166
+ "non_padded": 250,
1167
+ "effective_few_shots": 3.0,
1168
+ "num_truncated_few_shots": 0
1169
+ }
1170
+ },
1171
+ "summary_general": {
1172
+ "hashes": {
1173
+ "hash_examples": "4ff1e3dc5703575d",
1174
+ "hash_full_prompts": "3758756e616fd780",
1175
+ "hash_input_tokens": "3ee0dfd31c556ab0",
1176
+ "hash_cont_tokens": "b178ec6cb27af027"
1177
+ },
1178
+ "truncated": 4362,
1179
+ "non_truncated": 0,
1180
+ "padded": 0,
1181
+ "non_padded": 4362,
1182
+ "num_truncated_few_shots": 0
1183
+ }
1184
+ }