pminervini commited on
Commit
142beab
1 Parent(s): 4d9b54f
meta-llama/Llama-2-13b-chat-hf/results.json DELETED
@@ -1,871 +0,0 @@
1
- {
2
- "results": {
3
- "harness|arc:challenge|25": {
4
- "acc": 0.5563139931740614,
5
- "acc_stderr": 0.014518421825670449,
6
- "acc_norm": 0.590443686006826,
7
- "acc_norm_stderr": 0.01437035863247244
8
- },
9
- "harness|hellaswag|10": {
10
- "acc": 0.6293567018522207,
11
- "acc_stderr": 0.004819899945342489,
12
- "acc_norm": 0.8193586934873531,
13
- "acc_norm_stderr": 0.0038393444971919545
14
- },
15
- "harness|hendrycksTest-abstract_algebra|5": {
16
- "acc": 0.31,
17
- "acc_stderr": 0.046482319871173156,
18
- "acc_norm": 0.31,
19
- "acc_norm_stderr": 0.046482319871173156
20
- },
21
- "harness|hendrycksTest-anatomy|5": {
22
- "acc": 0.4740740740740741,
23
- "acc_stderr": 0.04313531696750574,
24
- "acc_norm": 0.4740740740740741,
25
- "acc_norm_stderr": 0.04313531696750574
26
- },
27
- "harness|hendrycksTest-astronomy|5": {
28
- "acc": 0.5460526315789473,
29
- "acc_stderr": 0.04051646342874142,
30
- "acc_norm": 0.5460526315789473,
31
- "acc_norm_stderr": 0.04051646342874142
32
- },
33
- "harness|hendrycksTest-business_ethics|5": {
34
- "acc": 0.53,
35
- "acc_stderr": 0.05016135580465919,
36
- "acc_norm": 0.53,
37
- "acc_norm_stderr": 0.05016135580465919
38
- },
39
- "harness|hendrycksTest-clinical_knowledge|5": {
40
- "acc": 0.5849056603773585,
41
- "acc_stderr": 0.03032594578928611,
42
- "acc_norm": 0.5849056603773585,
43
- "acc_norm_stderr": 0.03032594578928611
44
- },
45
- "harness|hendrycksTest-college_biology|5": {
46
- "acc": 0.5833333333333334,
47
- "acc_stderr": 0.04122728707651282,
48
- "acc_norm": 0.5833333333333334,
49
- "acc_norm_stderr": 0.04122728707651282
50
- },
51
- "harness|hendrycksTest-college_chemistry|5": {
52
- "acc": 0.38,
53
- "acc_stderr": 0.048783173121456316,
54
- "acc_norm": 0.38,
55
- "acc_norm_stderr": 0.048783173121456316
56
- },
57
- "harness|hendrycksTest-college_computer_science|5": {
58
- "acc": 0.47,
59
- "acc_stderr": 0.05016135580465919,
60
- "acc_norm": 0.47,
61
- "acc_norm_stderr": 0.05016135580465919
62
- },
63
- "harness|hendrycksTest-college_mathematics|5": {
64
- "acc": 0.32,
65
- "acc_stderr": 0.046882617226215034,
66
- "acc_norm": 0.32,
67
- "acc_norm_stderr": 0.046882617226215034
68
- },
69
- "harness|hendrycksTest-college_medicine|5": {
70
- "acc": 0.4624277456647399,
71
- "acc_stderr": 0.0380168510452446,
72
- "acc_norm": 0.4624277456647399,
73
- "acc_norm_stderr": 0.0380168510452446
74
- },
75
- "harness|hendrycksTest-college_physics|5": {
76
- "acc": 0.3137254901960784,
77
- "acc_stderr": 0.04617034827006717,
78
- "acc_norm": 0.3137254901960784,
79
- "acc_norm_stderr": 0.04617034827006717
80
- },
81
- "harness|hendrycksTest-computer_security|5": {
82
- "acc": 0.68,
83
- "acc_stderr": 0.04688261722621505,
84
- "acc_norm": 0.68,
85
- "acc_norm_stderr": 0.04688261722621505
86
- },
87
- "harness|hendrycksTest-conceptual_physics|5": {
88
- "acc": 0.4,
89
- "acc_stderr": 0.03202563076101735,
90
- "acc_norm": 0.4,
91
- "acc_norm_stderr": 0.03202563076101735
92
- },
93
- "harness|hendrycksTest-econometrics|5": {
94
- "acc": 0.3157894736842105,
95
- "acc_stderr": 0.043727482902780064,
96
- "acc_norm": 0.3157894736842105,
97
- "acc_norm_stderr": 0.043727482902780064
98
- },
99
- "harness|hendrycksTest-electrical_engineering|5": {
100
- "acc": 0.503448275862069,
101
- "acc_stderr": 0.0416656757710158,
102
- "acc_norm": 0.503448275862069,
103
- "acc_norm_stderr": 0.0416656757710158
104
- },
105
- "harness|hendrycksTest-elementary_mathematics|5": {
106
- "acc": 0.3412698412698413,
107
- "acc_stderr": 0.024419234966819064,
108
- "acc_norm": 0.3412698412698413,
109
- "acc_norm_stderr": 0.024419234966819064
110
- },
111
- "harness|hendrycksTest-formal_logic|5": {
112
- "acc": 0.30952380952380953,
113
- "acc_stderr": 0.04134913018303316,
114
- "acc_norm": 0.30952380952380953,
115
- "acc_norm_stderr": 0.04134913018303316
116
- },
117
- "harness|hendrycksTest-global_facts|5": {
118
- "acc": 0.3,
119
- "acc_stderr": 0.046056618647183814,
120
- "acc_norm": 0.3,
121
- "acc_norm_stderr": 0.046056618647183814
122
- },
123
- "harness|hendrycksTest-high_school_biology|5": {
124
- "acc": 0.6419354838709678,
125
- "acc_stderr": 0.02727389059430064,
126
- "acc_norm": 0.6419354838709678,
127
- "acc_norm_stderr": 0.02727389059430064
128
- },
129
- "harness|hendrycksTest-high_school_chemistry|5": {
130
- "acc": 0.4630541871921182,
131
- "acc_stderr": 0.035083705204426656,
132
- "acc_norm": 0.4630541871921182,
133
- "acc_norm_stderr": 0.035083705204426656
134
- },
135
- "harness|hendrycksTest-high_school_computer_science|5": {
136
- "acc": 0.59,
137
- "acc_stderr": 0.04943110704237102,
138
- "acc_norm": 0.59,
139
- "acc_norm_stderr": 0.04943110704237102
140
- },
141
- "harness|hendrycksTest-high_school_european_history|5": {
142
- "acc": 0.6727272727272727,
143
- "acc_stderr": 0.03663974994391244,
144
- "acc_norm": 0.6727272727272727,
145
- "acc_norm_stderr": 0.03663974994391244
146
- },
147
- "harness|hendrycksTest-high_school_geography|5": {
148
- "acc": 0.702020202020202,
149
- "acc_stderr": 0.03258630383836556,
150
- "acc_norm": 0.702020202020202,
151
- "acc_norm_stderr": 0.03258630383836556
152
- },
153
- "harness|hendrycksTest-high_school_government_and_politics|5": {
154
- "acc": 0.7875647668393783,
155
- "acc_stderr": 0.029519282616817234,
156
- "acc_norm": 0.7875647668393783,
157
- "acc_norm_stderr": 0.029519282616817234
158
- },
159
- "harness|hendrycksTest-high_school_macroeconomics|5": {
160
- "acc": 0.49230769230769234,
161
- "acc_stderr": 0.025348006031534788,
162
- "acc_norm": 0.49230769230769234,
163
- "acc_norm_stderr": 0.025348006031534788
164
- },
165
- "harness|hendrycksTest-high_school_mathematics|5": {
166
- "acc": 0.3111111111111111,
167
- "acc_stderr": 0.028226446749683522,
168
- "acc_norm": 0.3111111111111111,
169
- "acc_norm_stderr": 0.028226446749683522
170
- },
171
- "harness|hendrycksTest-high_school_microeconomics|5": {
172
- "acc": 0.5294117647058824,
173
- "acc_stderr": 0.03242225027115007,
174
- "acc_norm": 0.5294117647058824,
175
- "acc_norm_stderr": 0.03242225027115007
176
- },
177
- "harness|hendrycksTest-high_school_physics|5": {
178
- "acc": 0.33774834437086093,
179
- "acc_stderr": 0.038615575462551684,
180
- "acc_norm": 0.33774834437086093,
181
- "acc_norm_stderr": 0.038615575462551684
182
- },
183
- "harness|hendrycksTest-high_school_psychology|5": {
184
- "acc": 0.7321100917431193,
185
- "acc_stderr": 0.018987462257978652,
186
- "acc_norm": 0.7321100917431193,
187
- "acc_norm_stderr": 0.018987462257978652
188
- },
189
- "harness|hendrycksTest-high_school_statistics|5": {
190
- "acc": 0.3888888888888889,
191
- "acc_stderr": 0.03324708911809117,
192
- "acc_norm": 0.3888888888888889,
193
- "acc_norm_stderr": 0.03324708911809117
194
- },
195
- "harness|hendrycksTest-high_school_us_history|5": {
196
- "acc": 0.75,
197
- "acc_stderr": 0.03039153369274154,
198
- "acc_norm": 0.75,
199
- "acc_norm_stderr": 0.03039153369274154
200
- },
201
- "harness|hendrycksTest-high_school_world_history|5": {
202
- "acc": 0.7172995780590717,
203
- "acc_stderr": 0.02931281415395592,
204
- "acc_norm": 0.7172995780590717,
205
- "acc_norm_stderr": 0.02931281415395592
206
- },
207
- "harness|hendrycksTest-human_aging|5": {
208
- "acc": 0.6457399103139013,
209
- "acc_stderr": 0.032100621541349864,
210
- "acc_norm": 0.6457399103139013,
211
- "acc_norm_stderr": 0.032100621541349864
212
- },
213
- "harness|hendrycksTest-human_sexuality|5": {
214
- "acc": 0.6335877862595419,
215
- "acc_stderr": 0.04225875451969637,
216
- "acc_norm": 0.6335877862595419,
217
- "acc_norm_stderr": 0.04225875451969637
218
- },
219
- "harness|hendrycksTest-international_law|5": {
220
- "acc": 0.768595041322314,
221
- "acc_stderr": 0.03849856098794089,
222
- "acc_norm": 0.768595041322314,
223
- "acc_norm_stderr": 0.03849856098794089
224
- },
225
- "harness|hendrycksTest-jurisprudence|5": {
226
- "acc": 0.6944444444444444,
227
- "acc_stderr": 0.044531975073749834,
228
- "acc_norm": 0.6944444444444444,
229
- "acc_norm_stderr": 0.044531975073749834
230
- },
231
- "harness|hendrycksTest-logical_fallacies|5": {
232
- "acc": 0.6503067484662577,
233
- "acc_stderr": 0.037466683254700206,
234
- "acc_norm": 0.6503067484662577,
235
- "acc_norm_stderr": 0.037466683254700206
236
- },
237
- "harness|hendrycksTest-machine_learning|5": {
238
- "acc": 0.35714285714285715,
239
- "acc_stderr": 0.04547960999764376,
240
- "acc_norm": 0.35714285714285715,
241
- "acc_norm_stderr": 0.04547960999764376
242
- },
243
- "harness|hendrycksTest-management|5": {
244
- "acc": 0.7378640776699029,
245
- "acc_stderr": 0.04354631077260595,
246
- "acc_norm": 0.7378640776699029,
247
- "acc_norm_stderr": 0.04354631077260595
248
- },
249
- "harness|hendrycksTest-marketing|5": {
250
- "acc": 0.7863247863247863,
251
- "acc_stderr": 0.026853450377009175,
252
- "acc_norm": 0.7863247863247863,
253
- "acc_norm_stderr": 0.026853450377009175
254
- },
255
- "harness|hendrycksTest-medical_genetics|5": {
256
- "acc": 0.57,
257
- "acc_stderr": 0.049756985195624284,
258
- "acc_norm": 0.57,
259
- "acc_norm_stderr": 0.049756985195624284
260
- },
261
- "harness|hendrycksTest-miscellaneous|5": {
262
- "acc": 0.7471264367816092,
263
- "acc_stderr": 0.015543377313719681,
264
- "acc_norm": 0.7471264367816092,
265
- "acc_norm_stderr": 0.015543377313719681
266
- },
267
- "harness|hendrycksTest-moral_disputes|5": {
268
- "acc": 0.6127167630057804,
269
- "acc_stderr": 0.026226158605124655,
270
- "acc_norm": 0.6127167630057804,
271
- "acc_norm_stderr": 0.026226158605124655
272
- },
273
- "harness|hendrycksTest-moral_scenarios|5": {
274
- "acc": 0.30502793296089387,
275
- "acc_stderr": 0.015398723510916716,
276
- "acc_norm": 0.30502793296089387,
277
- "acc_norm_stderr": 0.015398723510916716
278
- },
279
- "harness|hendrycksTest-nutrition|5": {
280
- "acc": 0.5947712418300654,
281
- "acc_stderr": 0.028110928492809068,
282
- "acc_norm": 0.5947712418300654,
283
- "acc_norm_stderr": 0.028110928492809068
284
- },
285
- "harness|hendrycksTest-philosophy|5": {
286
- "acc": 0.5884244372990354,
287
- "acc_stderr": 0.02795048149440127,
288
- "acc_norm": 0.5884244372990354,
289
- "acc_norm_stderr": 0.02795048149440127
290
- },
291
- "harness|hendrycksTest-prehistory|5": {
292
- "acc": 0.6111111111111112,
293
- "acc_stderr": 0.02712511551316687,
294
- "acc_norm": 0.6111111111111112,
295
- "acc_norm_stderr": 0.02712511551316687
296
- },
297
- "harness|hendrycksTest-professional_accounting|5": {
298
- "acc": 0.38652482269503546,
299
- "acc_stderr": 0.029049190342543454,
300
- "acc_norm": 0.38652482269503546,
301
- "acc_norm_stderr": 0.029049190342543454
302
- },
303
- "harness|hendrycksTest-professional_law|5": {
304
- "acc": 0.39113428943937417,
305
- "acc_stderr": 0.012463861839982058,
306
- "acc_norm": 0.39113428943937417,
307
- "acc_norm_stderr": 0.012463861839982058
308
- },
309
- "harness|hendrycksTest-professional_medicine|5": {
310
- "acc": 0.5,
311
- "acc_stderr": 0.030372836961539352,
312
- "acc_norm": 0.5,
313
- "acc_norm_stderr": 0.030372836961539352
314
- },
315
- "harness|hendrycksTest-professional_psychology|5": {
316
- "acc": 0.5424836601307189,
317
- "acc_stderr": 0.020154685712590888,
318
- "acc_norm": 0.5424836601307189,
319
- "acc_norm_stderr": 0.020154685712590888
320
- },
321
- "harness|hendrycksTest-public_relations|5": {
322
- "acc": 0.6636363636363637,
323
- "acc_stderr": 0.04525393596302505,
324
- "acc_norm": 0.6636363636363637,
325
- "acc_norm_stderr": 0.04525393596302505
326
- },
327
- "harness|hendrycksTest-security_studies|5": {
328
- "acc": 0.6408163265306123,
329
- "acc_stderr": 0.030713560455108493,
330
- "acc_norm": 0.6408163265306123,
331
- "acc_norm_stderr": 0.030713560455108493
332
- },
333
- "harness|hendrycksTest-sociology|5": {
334
- "acc": 0.7512437810945274,
335
- "acc_stderr": 0.030567675938916714,
336
- "acc_norm": 0.7512437810945274,
337
- "acc_norm_stderr": 0.030567675938916714
338
- },
339
- "harness|hendrycksTest-us_foreign_policy|5": {
340
- "acc": 0.81,
341
- "acc_stderr": 0.03942772444036625,
342
- "acc_norm": 0.81,
343
- "acc_norm_stderr": 0.03942772444036625
344
- },
345
- "harness|hendrycksTest-virology|5": {
346
- "acc": 0.4819277108433735,
347
- "acc_stderr": 0.038899512528272166,
348
- "acc_norm": 0.4819277108433735,
349
- "acc_norm_stderr": 0.038899512528272166
350
- },
351
- "harness|hendrycksTest-world_religions|5": {
352
- "acc": 0.7309941520467836,
353
- "acc_stderr": 0.03401052620104089,
354
- "acc_norm": 0.7309941520467836,
355
- "acc_norm_stderr": 0.03401052620104089
356
- },
357
- "harness|truthfulqa:mc|0": {
358
- "mc1": 0.28518971848225216,
359
- "mc1_stderr": 0.015805827874454895,
360
- "mc2": 0.4411794590119937,
361
- "mc2_stderr": 0.015755921757439843
362
- },
363
- "all": {
364
- "acc": 0.5479380524707899,
365
- "acc_stderr": 0.03451142729909022,
366
- "acc_norm": 0.5517368945804153,
367
- "acc_norm_stderr": 0.03449229816957583,
368
- "mc1": 0.28518971848225216,
369
- "mc1_stderr": 0.015805827874454895,
370
- "mc2": 0.4411794590119937,
371
- "mc2_stderr": 0.015755921757439843
372
- }
373
- },
374
- "versions": {
375
- "harness|arc:challenge|25": 0,
376
- "harness|hellaswag|10": 0,
377
- "harness|hendrycksTest-abstract_algebra|5": 1,
378
- "harness|hendrycksTest-anatomy|5": 1,
379
- "harness|hendrycksTest-astronomy|5": 1,
380
- "harness|hendrycksTest-business_ethics|5": 1,
381
- "harness|hendrycksTest-clinical_knowledge|5": 1,
382
- "harness|hendrycksTest-college_biology|5": 1,
383
- "harness|hendrycksTest-college_chemistry|5": 1,
384
- "harness|hendrycksTest-college_computer_science|5": 1,
385
- "harness|hendrycksTest-college_mathematics|5": 1,
386
- "harness|hendrycksTest-college_medicine|5": 1,
387
- "harness|hendrycksTest-college_physics|5": 1,
388
- "harness|hendrycksTest-computer_security|5": 1,
389
- "harness|hendrycksTest-conceptual_physics|5": 1,
390
- "harness|hendrycksTest-econometrics|5": 1,
391
- "harness|hendrycksTest-electrical_engineering|5": 1,
392
- "harness|hendrycksTest-elementary_mathematics|5": 1,
393
- "harness|hendrycksTest-formal_logic|5": 1,
394
- "harness|hendrycksTest-global_facts|5": 1,
395
- "harness|hendrycksTest-high_school_biology|5": 1,
396
- "harness|hendrycksTest-high_school_chemistry|5": 1,
397
- "harness|hendrycksTest-high_school_computer_science|5": 1,
398
- "harness|hendrycksTest-high_school_european_history|5": 1,
399
- "harness|hendrycksTest-high_school_geography|5": 1,
400
- "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
- "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
- "harness|hendrycksTest-high_school_mathematics|5": 1,
403
- "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
- "harness|hendrycksTest-high_school_physics|5": 1,
405
- "harness|hendrycksTest-high_school_psychology|5": 1,
406
- "harness|hendrycksTest-high_school_statistics|5": 1,
407
- "harness|hendrycksTest-high_school_us_history|5": 1,
408
- "harness|hendrycksTest-high_school_world_history|5": 1,
409
- "harness|hendrycksTest-human_aging|5": 1,
410
- "harness|hendrycksTest-human_sexuality|5": 1,
411
- "harness|hendrycksTest-international_law|5": 1,
412
- "harness|hendrycksTest-jurisprudence|5": 1,
413
- "harness|hendrycksTest-logical_fallacies|5": 1,
414
- "harness|hendrycksTest-machine_learning|5": 1,
415
- "harness|hendrycksTest-management|5": 1,
416
- "harness|hendrycksTest-marketing|5": 1,
417
- "harness|hendrycksTest-medical_genetics|5": 1,
418
- "harness|hendrycksTest-miscellaneous|5": 1,
419
- "harness|hendrycksTest-moral_disputes|5": 1,
420
- "harness|hendrycksTest-moral_scenarios|5": 1,
421
- "harness|hendrycksTest-nutrition|5": 1,
422
- "harness|hendrycksTest-philosophy|5": 1,
423
- "harness|hendrycksTest-prehistory|5": 1,
424
- "harness|hendrycksTest-professional_accounting|5": 1,
425
- "harness|hendrycksTest-professional_law|5": 1,
426
- "harness|hendrycksTest-professional_medicine|5": 1,
427
- "harness|hendrycksTest-professional_psychology|5": 1,
428
- "harness|hendrycksTest-public_relations|5": 1,
429
- "harness|hendrycksTest-security_studies|5": 1,
430
- "harness|hendrycksTest-sociology|5": 1,
431
- "harness|hendrycksTest-us_foreign_policy|5": 1,
432
- "harness|hendrycksTest-virology|5": 1,
433
- "harness|hendrycksTest-world_religions|5": 1,
434
- "harness|truthfulqa:mc|0": 1,
435
- "all": 0
436
- },
437
- "config": {
438
- "model_name": "meta-llama/Llama-2-13b-chat-hf",
439
- "model_sha": "f848cf15ab9a51ae5735ab28120a9a0773eeb541",
440
- "model_dtype": "torch.float16",
441
- "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
442
- "num_few_shot_default": 0,
443
- "num_fewshot_seeds": 1,
444
- "override_batch_size": 1,
445
- "max_samples": null
446
- },
447
- "task_config": {
448
- "harness|arc:challenge": "LM Harness task",
449
- "harness|hellaswag": "LM Harness task",
450
- "harness|hendrycksTest-abstract_algebra": "LM Harness task",
451
- "harness|hendrycksTest-anatomy": "LM Harness task",
452
- "harness|hendrycksTest-astronomy": "LM Harness task",
453
- "harness|hendrycksTest-business_ethics": "LM Harness task",
454
- "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
455
- "harness|hendrycksTest-college_biology": "LM Harness task",
456
- "harness|hendrycksTest-college_chemistry": "LM Harness task",
457
- "harness|hendrycksTest-college_computer_science": "LM Harness task",
458
- "harness|hendrycksTest-college_mathematics": "LM Harness task",
459
- "harness|hendrycksTest-college_medicine": "LM Harness task",
460
- "harness|hendrycksTest-college_physics": "LM Harness task",
461
- "harness|hendrycksTest-computer_security": "LM Harness task",
462
- "harness|hendrycksTest-conceptual_physics": "LM Harness task",
463
- "harness|hendrycksTest-econometrics": "LM Harness task",
464
- "harness|hendrycksTest-electrical_engineering": "LM Harness task",
465
- "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
466
- "harness|hendrycksTest-formal_logic": "LM Harness task",
467
- "harness|hendrycksTest-global_facts": "LM Harness task",
468
- "harness|hendrycksTest-high_school_biology": "LM Harness task",
469
- "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
470
- "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
471
- "harness|hendrycksTest-high_school_european_history": "LM Harness task",
472
- "harness|hendrycksTest-high_school_geography": "LM Harness task",
473
- "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
474
- "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
475
- "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
476
- "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
477
- "harness|hendrycksTest-high_school_physics": "LM Harness task",
478
- "harness|hendrycksTest-high_school_psychology": "LM Harness task",
479
- "harness|hendrycksTest-high_school_statistics": "LM Harness task",
480
- "harness|hendrycksTest-high_school_us_history": "LM Harness task",
481
- "harness|hendrycksTest-high_school_world_history": "LM Harness task",
482
- "harness|hendrycksTest-human_aging": "LM Harness task",
483
- "harness|hendrycksTest-human_sexuality": "LM Harness task",
484
- "harness|hendrycksTest-international_law": "LM Harness task",
485
- "harness|hendrycksTest-jurisprudence": "LM Harness task",
486
- "harness|hendrycksTest-logical_fallacies": "LM Harness task",
487
- "harness|hendrycksTest-machine_learning": "LM Harness task",
488
- "harness|hendrycksTest-management": "LM Harness task",
489
- "harness|hendrycksTest-marketing": "LM Harness task",
490
- "harness|hendrycksTest-medical_genetics": "LM Harness task",
491
- "harness|hendrycksTest-miscellaneous": "LM Harness task",
492
- "harness|hendrycksTest-moral_disputes": "LM Harness task",
493
- "harness|hendrycksTest-moral_scenarios": "LM Harness task",
494
- "harness|hendrycksTest-nutrition": "LM Harness task",
495
- "harness|hendrycksTest-philosophy": "LM Harness task",
496
- "harness|hendrycksTest-prehistory": "LM Harness task",
497
- "harness|hendrycksTest-professional_accounting": "LM Harness task",
498
- "harness|hendrycksTest-professional_law": "LM Harness task",
499
- "harness|hendrycksTest-professional_medicine": "LM Harness task",
500
- "harness|hendrycksTest-professional_psychology": "LM Harness task",
501
- "harness|hendrycksTest-public_relations": "LM Harness task",
502
- "harness|hendrycksTest-security_studies": "LM Harness task",
503
- "harness|hendrycksTest-sociology": "LM Harness task",
504
- "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
505
- "harness|hendrycksTest-virology": "LM Harness task",
506
- "harness|hendrycksTest-world_religions": "LM Harness task",
507
- "harness|truthfulqa:mc": "LM Harness task"
508
- },
509
- "hashes": {
510
- "harness|arc:challenge|25": {
511
- "hash_examples": "fb8c51b1872daeda",
512
- "hash_full_prompts": "045cbb916e5145c6",
513
- "hash_input_tokens": "61571bf68d6d89aa",
514
- "hash_cont_tokens": "8210decc6ff6f7df"
515
- },
516
- "harness|hellaswag|10": {
517
- "hash_examples": "e1768ecb99d7ecf0",
518
- "hash_full_prompts": "0b4c16983130f84f",
519
- "hash_input_tokens": "29906669b1c7054a",
520
- "hash_cont_tokens": "b3b9e9017afa63af"
521
- },
522
- "harness|hendrycksTest-abstract_algebra|5": {
523
- "hash_examples": "280f9f325b40559a",
524
- "hash_full_prompts": "2f776a367d23aea2",
525
- "hash_input_tokens": "c54ff61ad0273dd7",
526
- "hash_cont_tokens": "50421e30bef398f9"
527
- },
528
- "harness|hendrycksTest-anatomy|5": {
529
- "hash_examples": "2f83a4f1cab4ba18",
530
- "hash_full_prompts": "516f74bef25df620",
531
- "hash_input_tokens": "be31a1e22aef5f90",
532
- "hash_cont_tokens": "f11971a765cb609f"
533
- },
534
- "harness|hendrycksTest-astronomy|5": {
535
- "hash_examples": "7d587b908da4d762",
536
- "hash_full_prompts": "faf4e80f65de93ca",
537
- "hash_input_tokens": "277a7b1fad566940",
538
- "hash_cont_tokens": "bf30e5d3f48250cb"
539
- },
540
- "harness|hendrycksTest-business_ethics|5": {
541
- "hash_examples": "33e51740670de686",
542
- "hash_full_prompts": "db01c3ef8e1479d4",
543
- "hash_input_tokens": "ba552605bc116de5",
544
- "hash_cont_tokens": "bc1dd9b2d995eb61"
545
- },
546
- "harness|hendrycksTest-clinical_knowledge|5": {
547
- "hash_examples": "f3366dbe7eefffa4",
548
- "hash_full_prompts": "49654f71d94b65c3",
549
- "hash_input_tokens": "428c7563d0b98ab9",
550
- "hash_cont_tokens": "890a119624b3b935"
551
- },
552
- "harness|hendrycksTest-college_biology|5": {
553
- "hash_examples": "ca2b6753a0193e7f",
554
- "hash_full_prompts": "2b460b75f1fdfefd",
555
- "hash_input_tokens": "da036601573942e2",
556
- "hash_cont_tokens": "875cde3af7a0ee14"
557
- },
558
- "harness|hendrycksTest-college_chemistry|5": {
559
- "hash_examples": "22ff85f1d34f42d1",
560
- "hash_full_prompts": "242c9be6da583e95",
561
- "hash_input_tokens": "94e0196d6aded13d",
562
- "hash_cont_tokens": "50421e30bef398f9"
563
- },
564
- "harness|hendrycksTest-college_computer_science|5": {
565
- "hash_examples": "30318289d717a5cf",
566
- "hash_full_prompts": "ed2bdb4e87c4b371",
567
- "hash_input_tokens": "6e4d0f4a8d36690b",
568
- "hash_cont_tokens": "ffc0fe414cdc4a83"
569
- },
570
- "harness|hendrycksTest-college_mathematics|5": {
571
- "hash_examples": "4944d1f0b6b5d911",
572
- "hash_full_prompts": "770bc4281c973190",
573
- "hash_input_tokens": "614054d17109a25d",
574
- "hash_cont_tokens": "50421e30bef398f9"
575
- },
576
- "harness|hendrycksTest-college_medicine|5": {
577
- "hash_examples": "dd69cc33381275af",
578
- "hash_full_prompts": "ad2a53e5250ab46e",
579
- "hash_input_tokens": "1d633b3cc0524ba8",
580
- "hash_cont_tokens": "1f88b00d41957d82"
581
- },
582
- "harness|hendrycksTest-college_physics|5": {
583
- "hash_examples": "875dd26d22655b0d",
584
- "hash_full_prompts": "833a0d7b55aed500",
585
- "hash_input_tokens": "5421d9a1af86cbd4",
586
- "hash_cont_tokens": "f7b8097afc16a47c"
587
- },
588
- "harness|hendrycksTest-computer_security|5": {
589
- "hash_examples": "006451eedc0ededb",
590
- "hash_full_prompts": "94034c97e85d8f46",
591
- "hash_input_tokens": "5e6b70ecb333cf18",
592
- "hash_cont_tokens": "50421e30bef398f9"
593
- },
594
- "harness|hendrycksTest-conceptual_physics|5": {
595
- "hash_examples": "8874ece872d2ca4c",
596
- "hash_full_prompts": "e40d15a34640d6fa",
597
- "hash_input_tokens": "c2ef11a87264ceed",
598
- "hash_cont_tokens": "aa0e8bc655f2f641"
599
- },
600
- "harness|hendrycksTest-econometrics|5": {
601
- "hash_examples": "64d3623b0bfaa43f",
602
- "hash_full_prompts": "612f340fae41338d",
603
- "hash_input_tokens": "ecaccd912a4c3978",
604
- "hash_cont_tokens": "bfb7e3c3c88313f1"
605
- },
606
- "harness|hendrycksTest-electrical_engineering|5": {
607
- "hash_examples": "e98f51780c674d7e",
608
- "hash_full_prompts": "10275b312d812ae6",
609
- "hash_input_tokens": "1590c84291399be8",
610
- "hash_cont_tokens": "2425a3f084a591ef"
611
- },
612
- "harness|hendrycksTest-elementary_mathematics|5": {
613
- "hash_examples": "fc48208a5ac1c0ce",
614
- "hash_full_prompts": "5ec274c6c82aca23",
615
- "hash_input_tokens": "3269597f715b0da1",
616
- "hash_cont_tokens": "f52691aef15a407b"
617
- },
618
- "harness|hendrycksTest-formal_logic|5": {
619
- "hash_examples": "5a6525665f63ea72",
620
- "hash_full_prompts": "07b92638c4a6b500",
621
- "hash_input_tokens": "a2800d20f3ab8d7c",
622
- "hash_cont_tokens": "f515d598d9c21263"
623
- },
624
- "harness|hendrycksTest-global_facts|5": {
625
- "hash_examples": "371d70d743b2b89b",
626
- "hash_full_prompts": "332fdee50a1921b4",
627
- "hash_input_tokens": "94ed44b3772505ad",
628
- "hash_cont_tokens": "50421e30bef398f9"
629
- },
630
- "harness|hendrycksTest-high_school_biology|5": {
631
- "hash_examples": "a79e1018b1674052",
632
- "hash_full_prompts": "e624e26ede922561",
633
- "hash_input_tokens": "24423acb928db768",
634
- "hash_cont_tokens": "bd85a4156a3613ee"
635
- },
636
- "harness|hendrycksTest-high_school_chemistry|5": {
637
- "hash_examples": "44bfc25c389f0e03",
638
- "hash_full_prompts": "0e3e5f5d9246482a",
639
- "hash_input_tokens": "831ff35c474e5cef",
640
- "hash_cont_tokens": "a95c97af1c14e068"
641
- },
642
- "harness|hendrycksTest-high_school_computer_science|5": {
643
- "hash_examples": "8b8cdb1084f24169",
644
- "hash_full_prompts": "c00487e67c1813cc",
645
- "hash_input_tokens": "8c34e0f2bda77358",
646
- "hash_cont_tokens": "8abfedef914e33c9"
647
- },
648
- "harness|hendrycksTest-high_school_european_history|5": {
649
- "hash_examples": "11cd32d0ef440171",
650
- "hash_full_prompts": "318f4513c537c6bf",
651
- "hash_input_tokens": "f1f73dd687da18d7",
652
- "hash_cont_tokens": "674fc454bdc5ac93"
653
- },
654
- "harness|hendrycksTest-high_school_geography|5": {
655
- "hash_examples": "b60019b9e80b642f",
656
- "hash_full_prompts": "ee5789fcc1a81b1e",
657
- "hash_input_tokens": "7c5547c7da5bc793",
658
- "hash_cont_tokens": "03a5012b916274ea"
659
- },
660
- "harness|hendrycksTest-high_school_government_and_politics|5": {
661
- "hash_examples": "d221ec983d143dc3",
662
- "hash_full_prompts": "ac42d888e1ce1155",
663
- "hash_input_tokens": "f62991cb6a496b05",
664
- "hash_cont_tokens": "a83effb8f76b7d7c"
665
- },
666
- "harness|hendrycksTest-high_school_macroeconomics|5": {
667
- "hash_examples": "59c2915cacfd3fbb",
668
- "hash_full_prompts": "c6bd9d25158abd0e",
669
- "hash_input_tokens": "4cef2aff6e3d59ed",
670
- "hash_cont_tokens": "c583432ad27fcfe0"
671
- },
672
- "harness|hendrycksTest-high_school_mathematics|5": {
673
- "hash_examples": "1f8ac897608de342",
674
- "hash_full_prompts": "5d88f41fc2d643a8",
675
- "hash_input_tokens": "6e2577ea4082ed2b",
676
- "hash_cont_tokens": "24f5dc613660300b"
677
- },
678
- "harness|hendrycksTest-high_school_microeconomics|5": {
679
- "hash_examples": "ead6a0f2f6c83370",
680
- "hash_full_prompts": "bfc393381298609e",
681
- "hash_input_tokens": "c5fc9aeb1079c8e4",
682
- "hash_cont_tokens": "f47f041de50333b9"
683
- },
684
- "harness|hendrycksTest-high_school_physics|5": {
685
- "hash_examples": "c3f2025990afec64",
686
- "hash_full_prompts": "fc78b4997e436734",
687
- "hash_input_tokens": "555fc385cffa84ca",
688
- "hash_cont_tokens": "ba2efcd283e938cc"
689
- },
690
- "harness|hendrycksTest-high_school_psychology|5": {
691
- "hash_examples": "21f8aab618f6d636",
692
- "hash_full_prompts": "d5c76aa40b9dbc43",
693
- "hash_input_tokens": "febd23cbf9973b7f",
694
- "hash_cont_tokens": "942069cd363844d9"
695
- },
696
- "harness|hendrycksTest-high_school_statistics|5": {
697
- "hash_examples": "2386a60a11fc5de3",
698
- "hash_full_prompts": "4c5c8be5aafac432",
699
- "hash_input_tokens": "424b02981230ee83",
700
- "hash_cont_tokens": "955ed42b6f7fa019"
701
- },
702
- "harness|hendrycksTest-high_school_us_history|5": {
703
- "hash_examples": "74961543be40f04f",
704
- "hash_full_prompts": "5d5ca4840131ba21",
705
- "hash_input_tokens": "50c9ff438c85a69e",
706
- "hash_cont_tokens": "cdd0b3dc06d933e5"
707
- },
708
- "harness|hendrycksTest-high_school_world_history|5": {
709
- "hash_examples": "2ad2f6b7198b2234",
710
- "hash_full_prompts": "11845057459afd72",
711
- "hash_input_tokens": "054824cc474caef5",
712
- "hash_cont_tokens": "9a864184946033ac"
713
- },
714
- "harness|hendrycksTest-human_aging|5": {
715
- "hash_examples": "1a7199dc733e779b",
716
- "hash_full_prompts": "756b9096b8eaf892",
717
- "hash_input_tokens": "541a75f071dcf579",
718
- "hash_cont_tokens": "142a4a8a1138a214"
719
- },
720
- "harness|hendrycksTest-human_sexuality|5": {
721
- "hash_examples": "7acb8fdad97f88a6",
722
- "hash_full_prompts": "731a52ff15b8cfdb",
723
- "hash_input_tokens": "04269e5c5a257dd9",
724
- "hash_cont_tokens": "bc54813e809b796d"
725
- },
726
- "harness|hendrycksTest-international_law|5": {
727
- "hash_examples": "1300bfd0dfc59114",
728
- "hash_full_prompts": "db2aefbff5eec996",
729
- "hash_input_tokens": "d93ba9d9d38e4397",
730
- "hash_cont_tokens": "dc45b45fcda18e5d"
731
- },
732
- "harness|hendrycksTest-jurisprudence|5": {
733
- "hash_examples": "083b1e4904c48dc2",
734
- "hash_full_prompts": "0f89ee3fe03d6a21",
735
- "hash_input_tokens": "9eeaccd2698b4f5a",
736
- "hash_cont_tokens": "e3a8cd951b6e3469"
737
- },
738
- "harness|hendrycksTest-logical_fallacies|5": {
739
- "hash_examples": "709128f9926a634c",
740
- "hash_full_prompts": "98a04b1f8f841069",
741
- "hash_input_tokens": "b4f08f544f2b7576",
742
- "hash_cont_tokens": "1e80dbd30f6453d5"
743
- },
744
- "harness|hendrycksTest-machine_learning|5": {
745
- "hash_examples": "88f22a636029ae47",
746
- "hash_full_prompts": "2e1c8d4b1e0cc921",
747
- "hash_input_tokens": "900c2a51f1174b9f",
748
- "hash_cont_tokens": "9b37da7777378ca9"
749
- },
750
- "harness|hendrycksTest-management|5": {
751
- "hash_examples": "8c8a1e07a2151dca",
752
- "hash_full_prompts": "f51611f514b265b0",
753
- "hash_input_tokens": "6b36efb4689c6eca",
754
- "hash_cont_tokens": "a01d6d39a83c4597"
755
- },
756
- "harness|hendrycksTest-marketing|5": {
757
- "hash_examples": "2668953431f91e96",
758
- "hash_full_prompts": "77562bef997c7650",
759
- "hash_input_tokens": "2aaac78a0cfed47a",
760
- "hash_cont_tokens": "6aeaed4d823c98aa"
761
- },
762
- "harness|hendrycksTest-medical_genetics|5": {
763
- "hash_examples": "9c2dda34a2ea4fd2",
764
- "hash_full_prompts": "202139046daa118f",
765
- "hash_input_tokens": "886ca823b41c094a",
766
- "hash_cont_tokens": "50421e30bef398f9"
767
- },
768
- "harness|hendrycksTest-miscellaneous|5": {
769
- "hash_examples": "41adb694024809c2",
770
- "hash_full_prompts": "bffec9fc237bcf93",
771
- "hash_input_tokens": "72fd71de7675e7d0",
772
- "hash_cont_tokens": "9b0ab02a64603081"
773
- },
774
- "harness|hendrycksTest-moral_disputes|5": {
775
- "hash_examples": "3171c13ba3c594c4",
776
- "hash_full_prompts": "170831fc36f1d59e",
777
- "hash_input_tokens": "f3ca0dd8e7a1eb09",
778
- "hash_cont_tokens": "8badf768f7b0467a"
779
- },
780
- "harness|hendrycksTest-moral_scenarios|5": {
781
- "hash_examples": "9873e077e83e0546",
782
- "hash_full_prompts": "08f4ceba3131a068",
783
- "hash_input_tokens": "3e793631e951f23c",
784
- "hash_cont_tokens": "32ae620376b2bbba"
785
- },
786
- "harness|hendrycksTest-nutrition|5": {
787
- "hash_examples": "7db1d8142ec14323",
788
- "hash_full_prompts": "4c0e68e3586cb453",
789
- "hash_input_tokens": "59753c2144ea93af",
790
- "hash_cont_tokens": "3071def75bacc404"
791
- },
792
- "harness|hendrycksTest-philosophy|5": {
793
- "hash_examples": "9b455b7d72811cc8",
794
- "hash_full_prompts": "e467f822d8a0d3ff",
795
- "hash_input_tokens": "bd8d3dbed15a8c34",
796
- "hash_cont_tokens": "9f6ff69d23a48783"
797
- },
798
- "harness|hendrycksTest-prehistory|5": {
799
- "hash_examples": "8be90d0f538f1560",
800
- "hash_full_prompts": "152187949bcd0921",
801
- "hash_input_tokens": "3573cd87facbb7c5",
802
- "hash_cont_tokens": "de469d2b981e32a3"
803
- },
804
- "harness|hendrycksTest-professional_accounting|5": {
805
- "hash_examples": "8d377597916cd07e",
806
- "hash_full_prompts": "0eb7345d6144ee0d",
807
- "hash_input_tokens": "17e721bc1a7cbb47",
808
- "hash_cont_tokens": "c46f74d2dfc7b13b"
809
- },
810
- "harness|hendrycksTest-professional_law|5": {
811
- "hash_examples": "cd9dbc52b3c932d6",
812
- "hash_full_prompts": "36ac764272bfb182",
813
- "hash_input_tokens": "9178e10bd0763ec4",
814
- "hash_cont_tokens": "2e590029ef41fbcd"
815
- },
816
- "harness|hendrycksTest-professional_medicine|5": {
817
- "hash_examples": "b20e4e816c1e383e",
818
- "hash_full_prompts": "7b8d69ea2acaf2f7",
819
- "hash_input_tokens": "f5a22012a54f70ea",
820
- "hash_cont_tokens": "fe35cfa9c6ca802e"
821
- },
822
- "harness|hendrycksTest-professional_psychology|5": {
823
- "hash_examples": "d45b73b22f9cc039",
824
- "hash_full_prompts": "fe8937e9ffc99771",
825
- "hash_input_tokens": "0dfb73a8eb3f692c",
826
- "hash_cont_tokens": "f020fbddf72c8652"
827
- },
828
- "harness|hendrycksTest-public_relations|5": {
829
- "hash_examples": "0d25072e1761652a",
830
- "hash_full_prompts": "f9adc39cfa9f42ba",
831
- "hash_input_tokens": "1710c6ba4c9f3cbd",
832
- "hash_cont_tokens": "568f585a259965c1"
833
- },
834
- "harness|hendrycksTest-security_studies|5": {
835
- "hash_examples": "62bb8197e63d60d4",
836
- "hash_full_prompts": "869c9c3ae196b7c3",
837
- "hash_input_tokens": "d49711415961ced7",
838
- "hash_cont_tokens": "cc6fd7cccd64cd5d"
839
- },
840
- "harness|hendrycksTest-sociology|5": {
841
- "hash_examples": "e7959df87dea8672",
842
- "hash_full_prompts": "1a1fc00e17b3a52a",
843
- "hash_input_tokens": "828999f7624cbe7e",
844
- "hash_cont_tokens": "c3a3bdfd177eed5b"
845
- },
846
- "harness|hendrycksTest-us_foreign_policy|5": {
847
- "hash_examples": "4a56a01ddca44dca",
848
- "hash_full_prompts": "0c7a7081c71c07b6",
849
- "hash_input_tokens": "42054621e718dbee",
850
- "hash_cont_tokens": "2568d0e8e36fa959"
851
- },
852
- "harness|hendrycksTest-virology|5": {
853
- "hash_examples": "451cc86a8c4f4fe9",
854
- "hash_full_prompts": "01e95325d8b738e4",
855
- "hash_input_tokens": "6c4f0aa4dc859c04",
856
- "hash_cont_tokens": "926cf60b0891f374"
857
- },
858
- "harness|hendrycksTest-world_religions|5": {
859
- "hash_examples": "3b29cfaf1a81c379",
860
- "hash_full_prompts": "e0d79a15083dfdff",
861
- "hash_input_tokens": "6c75d44e092ff24f",
862
- "hash_cont_tokens": "c525a5de974c1ea3"
863
- },
864
- "harness|truthfulqa:mc|0": {
865
- "hash_examples": "23176c0531c7b867",
866
- "hash_full_prompts": "36a6d90e75d92d4a",
867
- "hash_input_tokens": "2738d7ed7075faa7",
868
- "hash_cont_tokens": "c014154380b74b9e"
869
- }
870
- }
871
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-13b-chat-hf/results_2023-10-14T19-39-26.636545.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-13b-chat-hf",
4
- "model_sha": "13f8d72c0456c17e41b3d8b4327259125cd0defa",
5
- "model_size": "24.32 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.1782718120805369,
17
- "em_stderr": 0.003919630092588375,
18
- "f1": 0.2387195889261742,
19
- "f1_stderr": 0.003944947017182046
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.15238817285822592,
23
- "acc_stderr": 0.009899572254794204
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.745067087608524,
27
- "acc_stderr": 0.012248806969376422
28
- },
29
- "all": {
30
- "em": 0.1782718120805369,
31
- "em_stderr": 0.003919630092588375,
32
- "f1": 0.2387195889261742,
33
- "f1_stderr": 0.003944947017182046,
34
- "acc": 0.448727630233375,
35
- "acc_stderr": 0.011074189612085313
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "eaa0f770b728538e"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "9956899ac09638ce"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "8197d42f9e3e7f68"
99
- },
100
- "total_evaluation_time_secondes": "10933.773993730545",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-13b-hf/results_2023-08-20T22-26-02.660247.json DELETED
@@ -1,871 +0,0 @@
1
- {
2
- "results": {
3
- "harness|arc:challenge|25": {
4
- "acc": 0.5486348122866894,
5
- "acc_stderr": 0.014542104569955265,
6
- "acc_norm": 0.5938566552901023,
7
- "acc_norm_stderr": 0.014351656690097862
8
- },
9
- "harness|hellaswag|10": {
10
- "acc": 0.614618601872137,
11
- "acc_stderr": 0.004856906473719381,
12
- "acc_norm": 0.8212507468631747,
13
- "acc_norm_stderr": 0.003823591814133036
14
- },
15
- "harness|hendrycksTest-abstract_algebra|5": {
16
- "acc": 0.35,
17
- "acc_stderr": 0.04793724854411022,
18
- "acc_norm": 0.35,
19
- "acc_norm_stderr": 0.04793724854411022
20
- },
21
- "harness|hendrycksTest-anatomy|5": {
22
- "acc": 0.4666666666666667,
23
- "acc_stderr": 0.043097329010363554,
24
- "acc_norm": 0.4666666666666667,
25
- "acc_norm_stderr": 0.043097329010363554
26
- },
27
- "harness|hendrycksTest-astronomy|5": {
28
- "acc": 0.5263157894736842,
29
- "acc_stderr": 0.04063302731486671,
30
- "acc_norm": 0.5263157894736842,
31
- "acc_norm_stderr": 0.04063302731486671
32
- },
33
- "harness|hendrycksTest-business_ethics|5": {
34
- "acc": 0.55,
35
- "acc_stderr": 0.049999999999999996,
36
- "acc_norm": 0.55,
37
- "acc_norm_stderr": 0.049999999999999996
38
- },
39
- "harness|hendrycksTest-clinical_knowledge|5": {
40
- "acc": 0.6037735849056604,
41
- "acc_stderr": 0.030102793781791197,
42
- "acc_norm": 0.6037735849056604,
43
- "acc_norm_stderr": 0.030102793781791197
44
- },
45
- "harness|hendrycksTest-college_biology|5": {
46
- "acc": 0.6180555555555556,
47
- "acc_stderr": 0.040629907841466674,
48
- "acc_norm": 0.6180555555555556,
49
- "acc_norm_stderr": 0.040629907841466674
50
- },
51
- "harness|hendrycksTest-college_chemistry|5": {
52
- "acc": 0.44,
53
- "acc_stderr": 0.04988876515698589,
54
- "acc_norm": 0.44,
55
- "acc_norm_stderr": 0.04988876515698589
56
- },
57
- "harness|hendrycksTest-college_computer_science|5": {
58
- "acc": 0.47,
59
- "acc_stderr": 0.05016135580465919,
60
- "acc_norm": 0.47,
61
- "acc_norm_stderr": 0.05016135580465919
62
- },
63
- "harness|hendrycksTest-college_mathematics|5": {
64
- "acc": 0.31,
65
- "acc_stderr": 0.04648231987117316,
66
- "acc_norm": 0.31,
67
- "acc_norm_stderr": 0.04648231987117316
68
- },
69
- "harness|hendrycksTest-college_medicine|5": {
70
- "acc": 0.5317919075144508,
71
- "acc_stderr": 0.03804749744364764,
72
- "acc_norm": 0.5317919075144508,
73
- "acc_norm_stderr": 0.03804749744364764
74
- },
75
- "harness|hendrycksTest-college_physics|5": {
76
- "acc": 0.24509803921568626,
77
- "acc_stderr": 0.04280105837364395,
78
- "acc_norm": 0.24509803921568626,
79
- "acc_norm_stderr": 0.04280105837364395
80
- },
81
- "harness|hendrycksTest-computer_security|5": {
82
- "acc": 0.7,
83
- "acc_stderr": 0.046056618647183814,
84
- "acc_norm": 0.7,
85
- "acc_norm_stderr": 0.046056618647183814
86
- },
87
- "harness|hendrycksTest-conceptual_physics|5": {
88
- "acc": 0.425531914893617,
89
- "acc_stderr": 0.032321469162244675,
90
- "acc_norm": 0.425531914893617,
91
- "acc_norm_stderr": 0.032321469162244675
92
- },
93
- "harness|hendrycksTest-econometrics|5": {
94
- "acc": 0.32456140350877194,
95
- "acc_stderr": 0.04404556157374768,
96
- "acc_norm": 0.32456140350877194,
97
- "acc_norm_stderr": 0.04404556157374768
98
- },
99
- "harness|hendrycksTest-electrical_engineering|5": {
100
- "acc": 0.503448275862069,
101
- "acc_stderr": 0.04166567577101579,
102
- "acc_norm": 0.503448275862069,
103
- "acc_norm_stderr": 0.04166567577101579
104
- },
105
- "harness|hendrycksTest-elementary_mathematics|5": {
106
- "acc": 0.3386243386243386,
107
- "acc_stderr": 0.02437319786798306,
108
- "acc_norm": 0.3386243386243386,
109
- "acc_norm_stderr": 0.02437319786798306
110
- },
111
- "harness|hendrycksTest-formal_logic|5": {
112
- "acc": 0.3888888888888889,
113
- "acc_stderr": 0.04360314860077459,
114
- "acc_norm": 0.3888888888888889,
115
- "acc_norm_stderr": 0.04360314860077459
116
- },
117
- "harness|hendrycksTest-global_facts|5": {
118
- "acc": 0.33,
119
- "acc_stderr": 0.04725815626252604,
120
- "acc_norm": 0.33,
121
- "acc_norm_stderr": 0.04725815626252604
122
- },
123
- "harness|hendrycksTest-high_school_biology|5": {
124
- "acc": 0.6741935483870968,
125
- "acc_stderr": 0.026662010578567107,
126
- "acc_norm": 0.6741935483870968,
127
- "acc_norm_stderr": 0.026662010578567107
128
- },
129
- "harness|hendrycksTest-high_school_chemistry|5": {
130
- "acc": 0.4482758620689655,
131
- "acc_stderr": 0.034991131376767445,
132
- "acc_norm": 0.4482758620689655,
133
- "acc_norm_stderr": 0.034991131376767445
134
- },
135
- "harness|hendrycksTest-high_school_computer_science|5": {
136
- "acc": 0.57,
137
- "acc_stderr": 0.04975698519562427,
138
- "acc_norm": 0.57,
139
- "acc_norm_stderr": 0.04975698519562427
140
- },
141
- "harness|hendrycksTest-high_school_european_history|5": {
142
- "acc": 0.6484848484848484,
143
- "acc_stderr": 0.037282069986826503,
144
- "acc_norm": 0.6484848484848484,
145
- "acc_norm_stderr": 0.037282069986826503
146
- },
147
- "harness|hendrycksTest-high_school_geography|5": {
148
- "acc": 0.6919191919191919,
149
- "acc_stderr": 0.032894773300986155,
150
- "acc_norm": 0.6919191919191919,
151
- "acc_norm_stderr": 0.032894773300986155
152
- },
153
- "harness|hendrycksTest-high_school_government_and_politics|5": {
154
- "acc": 0.8186528497409327,
155
- "acc_stderr": 0.02780703236068609,
156
- "acc_norm": 0.8186528497409327,
157
- "acc_norm_stderr": 0.02780703236068609
158
- },
159
- "harness|hendrycksTest-high_school_macroeconomics|5": {
160
- "acc": 0.5102564102564102,
161
- "acc_stderr": 0.025345672221942374,
162
- "acc_norm": 0.5102564102564102,
163
- "acc_norm_stderr": 0.025345672221942374
164
- },
165
- "harness|hendrycksTest-high_school_mathematics|5": {
166
- "acc": 0.2777777777777778,
167
- "acc_stderr": 0.02730914058823018,
168
- "acc_norm": 0.2777777777777778,
169
- "acc_norm_stderr": 0.02730914058823018
170
- },
171
- "harness|hendrycksTest-high_school_microeconomics|5": {
172
- "acc": 0.5840336134453782,
173
- "acc_stderr": 0.032016501007396114,
174
- "acc_norm": 0.5840336134453782,
175
- "acc_norm_stderr": 0.032016501007396114
176
- },
177
- "harness|hendrycksTest-high_school_physics|5": {
178
- "acc": 0.36423841059602646,
179
- "acc_stderr": 0.03929111781242742,
180
- "acc_norm": 0.36423841059602646,
181
- "acc_norm_stderr": 0.03929111781242742
182
- },
183
- "harness|hendrycksTest-high_school_psychology|5": {
184
- "acc": 0.7614678899082569,
185
- "acc_stderr": 0.01827257581023187,
186
- "acc_norm": 0.7614678899082569,
187
- "acc_norm_stderr": 0.01827257581023187
188
- },
189
- "harness|hendrycksTest-high_school_statistics|5": {
190
- "acc": 0.4398148148148148,
191
- "acc_stderr": 0.03385177976044812,
192
- "acc_norm": 0.4398148148148148,
193
- "acc_norm_stderr": 0.03385177976044812
194
- },
195
- "harness|hendrycksTest-high_school_us_history|5": {
196
- "acc": 0.7450980392156863,
197
- "acc_stderr": 0.030587591351604246,
198
- "acc_norm": 0.7450980392156863,
199
- "acc_norm_stderr": 0.030587591351604246
200
- },
201
- "harness|hendrycksTest-high_school_world_history|5": {
202
- "acc": 0.7215189873417721,
203
- "acc_stderr": 0.029178682304842538,
204
- "acc_norm": 0.7215189873417721,
205
- "acc_norm_stderr": 0.029178682304842538
206
- },
207
- "harness|hendrycksTest-human_aging|5": {
208
- "acc": 0.6367713004484304,
209
- "acc_stderr": 0.03227790442850499,
210
- "acc_norm": 0.6367713004484304,
211
- "acc_norm_stderr": 0.03227790442850499
212
- },
213
- "harness|hendrycksTest-human_sexuality|5": {
214
- "acc": 0.6183206106870229,
215
- "acc_stderr": 0.04260735157644559,
216
- "acc_norm": 0.6183206106870229,
217
- "acc_norm_stderr": 0.04260735157644559
218
- },
219
- "harness|hendrycksTest-international_law|5": {
220
- "acc": 0.743801652892562,
221
- "acc_stderr": 0.03984979653302873,
222
- "acc_norm": 0.743801652892562,
223
- "acc_norm_stderr": 0.03984979653302873
224
- },
225
- "harness|hendrycksTest-jurisprudence|5": {
226
- "acc": 0.7407407407407407,
227
- "acc_stderr": 0.04236511258094633,
228
- "acc_norm": 0.7407407407407407,
229
- "acc_norm_stderr": 0.04236511258094633
230
- },
231
- "harness|hendrycksTest-logical_fallacies|5": {
232
- "acc": 0.6687116564417178,
233
- "acc_stderr": 0.03697983910025588,
234
- "acc_norm": 0.6687116564417178,
235
- "acc_norm_stderr": 0.03697983910025588
236
- },
237
- "harness|hendrycksTest-machine_learning|5": {
238
- "acc": 0.2857142857142857,
239
- "acc_stderr": 0.042878587513404565,
240
- "acc_norm": 0.2857142857142857,
241
- "acc_norm_stderr": 0.042878587513404565
242
- },
243
- "harness|hendrycksTest-management|5": {
244
- "acc": 0.7378640776699029,
245
- "acc_stderr": 0.04354631077260595,
246
- "acc_norm": 0.7378640776699029,
247
- "acc_norm_stderr": 0.04354631077260595
248
- },
249
- "harness|hendrycksTest-marketing|5": {
250
- "acc": 0.7948717948717948,
251
- "acc_stderr": 0.02645350805404032,
252
- "acc_norm": 0.7948717948717948,
253
- "acc_norm_stderr": 0.02645350805404032
254
- },
255
- "harness|hendrycksTest-medical_genetics|5": {
256
- "acc": 0.55,
257
- "acc_stderr": 0.04999999999999999,
258
- "acc_norm": 0.55,
259
- "acc_norm_stderr": 0.04999999999999999
260
- },
261
- "harness|hendrycksTest-miscellaneous|5": {
262
- "acc": 0.7471264367816092,
263
- "acc_stderr": 0.015543377313719681,
264
- "acc_norm": 0.7471264367816092,
265
- "acc_norm_stderr": 0.015543377313719681
266
- },
267
- "harness|hendrycksTest-moral_disputes|5": {
268
- "acc": 0.6473988439306358,
269
- "acc_stderr": 0.025722802200895803,
270
- "acc_norm": 0.6473988439306358,
271
- "acc_norm_stderr": 0.025722802200895803
272
- },
273
- "harness|hendrycksTest-moral_scenarios|5": {
274
- "acc": 0.39776536312849164,
275
- "acc_stderr": 0.01636920497126298,
276
- "acc_norm": 0.39776536312849164,
277
- "acc_norm_stderr": 0.01636920497126298
278
- },
279
- "harness|hendrycksTest-nutrition|5": {
280
- "acc": 0.6241830065359477,
281
- "acc_stderr": 0.027732834353363947,
282
- "acc_norm": 0.6241830065359477,
283
- "acc_norm_stderr": 0.027732834353363947
284
- },
285
- "harness|hendrycksTest-philosophy|5": {
286
- "acc": 0.6463022508038585,
287
- "acc_stderr": 0.02715520810320086,
288
- "acc_norm": 0.6463022508038585,
289
- "acc_norm_stderr": 0.02715520810320086
290
- },
291
- "harness|hendrycksTest-prehistory|5": {
292
- "acc": 0.6512345679012346,
293
- "acc_stderr": 0.026517597724465013,
294
- "acc_norm": 0.6512345679012346,
295
- "acc_norm_stderr": 0.026517597724465013
296
- },
297
- "harness|hendrycksTest-professional_accounting|5": {
298
- "acc": 0.3900709219858156,
299
- "acc_stderr": 0.029097675599463926,
300
- "acc_norm": 0.3900709219858156,
301
- "acc_norm_stderr": 0.029097675599463926
302
- },
303
- "harness|hendrycksTest-professional_law|5": {
304
- "acc": 0.424380704041721,
305
- "acc_stderr": 0.012623343757430018,
306
- "acc_norm": 0.424380704041721,
307
- "acc_norm_stderr": 0.012623343757430018
308
- },
309
- "harness|hendrycksTest-professional_medicine|5": {
310
- "acc": 0.5404411764705882,
311
- "acc_stderr": 0.030273325077345755,
312
- "acc_norm": 0.5404411764705882,
313
- "acc_norm_stderr": 0.030273325077345755
314
- },
315
- "harness|hendrycksTest-professional_psychology|5": {
316
- "acc": 0.5490196078431373,
317
- "acc_stderr": 0.020130388312904528,
318
- "acc_norm": 0.5490196078431373,
319
- "acc_norm_stderr": 0.020130388312904528
320
- },
321
- "harness|hendrycksTest-public_relations|5": {
322
- "acc": 0.6090909090909091,
323
- "acc_stderr": 0.04673752333670239,
324
- "acc_norm": 0.6090909090909091,
325
- "acc_norm_stderr": 0.04673752333670239
326
- },
327
- "harness|hendrycksTest-security_studies|5": {
328
- "acc": 0.636734693877551,
329
- "acc_stderr": 0.030789051139030806,
330
- "acc_norm": 0.636734693877551,
331
- "acc_norm_stderr": 0.030789051139030806
332
- },
333
- "harness|hendrycksTest-sociology|5": {
334
- "acc": 0.7263681592039801,
335
- "acc_stderr": 0.031524391865554016,
336
- "acc_norm": 0.7263681592039801,
337
- "acc_norm_stderr": 0.031524391865554016
338
- },
339
- "harness|hendrycksTest-us_foreign_policy|5": {
340
- "acc": 0.82,
341
- "acc_stderr": 0.038612291966536934,
342
- "acc_norm": 0.82,
343
- "acc_norm_stderr": 0.038612291966536934
344
- },
345
- "harness|hendrycksTest-virology|5": {
346
- "acc": 0.463855421686747,
347
- "acc_stderr": 0.03882310850890593,
348
- "acc_norm": 0.463855421686747,
349
- "acc_norm_stderr": 0.03882310850890593
350
- },
351
- "harness|hendrycksTest-world_religions|5": {
352
- "acc": 0.7602339181286549,
353
- "acc_stderr": 0.03274485211946956,
354
- "acc_norm": 0.7602339181286549,
355
- "acc_norm_stderr": 0.03274485211946956
356
- },
357
- "harness|truthfulqa:mc|0": {
358
- "mc1": 0.26805385556915545,
359
- "mc1_stderr": 0.01550620472283456,
360
- "mc2": 0.37375264473944586,
361
- "mc2_stderr": 0.01368799302217441
362
- },
363
- "all": {
364
- "acc": 0.5585210868491985,
365
- "acc_stderr": 0.03442553546843938,
366
- "acc_norm": 0.5627897985101215,
367
- "acc_norm_stderr": 0.034404793730482705,
368
- "mc1": 0.26805385556915545,
369
- "mc1_stderr": 0.01550620472283456,
370
- "mc2": 0.37375264473944586,
371
- "mc2_stderr": 0.01368799302217441
372
- }
373
- },
374
- "versions": {
375
- "harness|arc:challenge|25": 0,
376
- "harness|hellaswag|10": 0,
377
- "harness|hendrycksTest-abstract_algebra|5": 1,
378
- "harness|hendrycksTest-anatomy|5": 1,
379
- "harness|hendrycksTest-astronomy|5": 1,
380
- "harness|hendrycksTest-business_ethics|5": 1,
381
- "harness|hendrycksTest-clinical_knowledge|5": 1,
382
- "harness|hendrycksTest-college_biology|5": 1,
383
- "harness|hendrycksTest-college_chemistry|5": 1,
384
- "harness|hendrycksTest-college_computer_science|5": 1,
385
- "harness|hendrycksTest-college_mathematics|5": 1,
386
- "harness|hendrycksTest-college_medicine|5": 1,
387
- "harness|hendrycksTest-college_physics|5": 1,
388
- "harness|hendrycksTest-computer_security|5": 1,
389
- "harness|hendrycksTest-conceptual_physics|5": 1,
390
- "harness|hendrycksTest-econometrics|5": 1,
391
- "harness|hendrycksTest-electrical_engineering|5": 1,
392
- "harness|hendrycksTest-elementary_mathematics|5": 1,
393
- "harness|hendrycksTest-formal_logic|5": 1,
394
- "harness|hendrycksTest-global_facts|5": 1,
395
- "harness|hendrycksTest-high_school_biology|5": 1,
396
- "harness|hendrycksTest-high_school_chemistry|5": 1,
397
- "harness|hendrycksTest-high_school_computer_science|5": 1,
398
- "harness|hendrycksTest-high_school_european_history|5": 1,
399
- "harness|hendrycksTest-high_school_geography|5": 1,
400
- "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
- "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
- "harness|hendrycksTest-high_school_mathematics|5": 1,
403
- "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
- "harness|hendrycksTest-high_school_physics|5": 1,
405
- "harness|hendrycksTest-high_school_psychology|5": 1,
406
- "harness|hendrycksTest-high_school_statistics|5": 1,
407
- "harness|hendrycksTest-high_school_us_history|5": 1,
408
- "harness|hendrycksTest-high_school_world_history|5": 1,
409
- "harness|hendrycksTest-human_aging|5": 1,
410
- "harness|hendrycksTest-human_sexuality|5": 1,
411
- "harness|hendrycksTest-international_law|5": 1,
412
- "harness|hendrycksTest-jurisprudence|5": 1,
413
- "harness|hendrycksTest-logical_fallacies|5": 1,
414
- "harness|hendrycksTest-machine_learning|5": 1,
415
- "harness|hendrycksTest-management|5": 1,
416
- "harness|hendrycksTest-marketing|5": 1,
417
- "harness|hendrycksTest-medical_genetics|5": 1,
418
- "harness|hendrycksTest-miscellaneous|5": 1,
419
- "harness|hendrycksTest-moral_disputes|5": 1,
420
- "harness|hendrycksTest-moral_scenarios|5": 1,
421
- "harness|hendrycksTest-nutrition|5": 1,
422
- "harness|hendrycksTest-philosophy|5": 1,
423
- "harness|hendrycksTest-prehistory|5": 1,
424
- "harness|hendrycksTest-professional_accounting|5": 1,
425
- "harness|hendrycksTest-professional_law|5": 1,
426
- "harness|hendrycksTest-professional_medicine|5": 1,
427
- "harness|hendrycksTest-professional_psychology|5": 1,
428
- "harness|hendrycksTest-public_relations|5": 1,
429
- "harness|hendrycksTest-security_studies|5": 1,
430
- "harness|hendrycksTest-sociology|5": 1,
431
- "harness|hendrycksTest-us_foreign_policy|5": 1,
432
- "harness|hendrycksTest-virology|5": 1,
433
- "harness|hendrycksTest-world_religions|5": 1,
434
- "harness|truthfulqa:mc|0": 1,
435
- "all": 0
436
- },
437
- "config": {
438
- "model_name": "meta-llama/Llama-2-13b-hf",
439
- "model_sha": "7da18fb10421c3ae2a1eb92815bad75e84816e35",
440
- "model_dtype": "torch.float16",
441
- "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
442
- "num_few_shot_default": 0,
443
- "num_fewshot_seeds": 1,
444
- "override_batch_size": 1,
445
- "max_samples": null
446
- },
447
- "task_config": {
448
- "harness|arc:challenge": "LM Harness task",
449
- "harness|hellaswag": "LM Harness task",
450
- "harness|hendrycksTest-abstract_algebra": "LM Harness task",
451
- "harness|hendrycksTest-anatomy": "LM Harness task",
452
- "harness|hendrycksTest-astronomy": "LM Harness task",
453
- "harness|hendrycksTest-business_ethics": "LM Harness task",
454
- "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
455
- "harness|hendrycksTest-college_biology": "LM Harness task",
456
- "harness|hendrycksTest-college_chemistry": "LM Harness task",
457
- "harness|hendrycksTest-college_computer_science": "LM Harness task",
458
- "harness|hendrycksTest-college_mathematics": "LM Harness task",
459
- "harness|hendrycksTest-college_medicine": "LM Harness task",
460
- "harness|hendrycksTest-college_physics": "LM Harness task",
461
- "harness|hendrycksTest-computer_security": "LM Harness task",
462
- "harness|hendrycksTest-conceptual_physics": "LM Harness task",
463
- "harness|hendrycksTest-econometrics": "LM Harness task",
464
- "harness|hendrycksTest-electrical_engineering": "LM Harness task",
465
- "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
466
- "harness|hendrycksTest-formal_logic": "LM Harness task",
467
- "harness|hendrycksTest-global_facts": "LM Harness task",
468
- "harness|hendrycksTest-high_school_biology": "LM Harness task",
469
- "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
470
- "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
471
- "harness|hendrycksTest-high_school_european_history": "LM Harness task",
472
- "harness|hendrycksTest-high_school_geography": "LM Harness task",
473
- "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
474
- "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
475
- "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
476
- "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
477
- "harness|hendrycksTest-high_school_physics": "LM Harness task",
478
- "harness|hendrycksTest-high_school_psychology": "LM Harness task",
479
- "harness|hendrycksTest-high_school_statistics": "LM Harness task",
480
- "harness|hendrycksTest-high_school_us_history": "LM Harness task",
481
- "harness|hendrycksTest-high_school_world_history": "LM Harness task",
482
- "harness|hendrycksTest-human_aging": "LM Harness task",
483
- "harness|hendrycksTest-human_sexuality": "LM Harness task",
484
- "harness|hendrycksTest-international_law": "LM Harness task",
485
- "harness|hendrycksTest-jurisprudence": "LM Harness task",
486
- "harness|hendrycksTest-logical_fallacies": "LM Harness task",
487
- "harness|hendrycksTest-machine_learning": "LM Harness task",
488
- "harness|hendrycksTest-management": "LM Harness task",
489
- "harness|hendrycksTest-marketing": "LM Harness task",
490
- "harness|hendrycksTest-medical_genetics": "LM Harness task",
491
- "harness|hendrycksTest-miscellaneous": "LM Harness task",
492
- "harness|hendrycksTest-moral_disputes": "LM Harness task",
493
- "harness|hendrycksTest-moral_scenarios": "LM Harness task",
494
- "harness|hendrycksTest-nutrition": "LM Harness task",
495
- "harness|hendrycksTest-philosophy": "LM Harness task",
496
- "harness|hendrycksTest-prehistory": "LM Harness task",
497
- "harness|hendrycksTest-professional_accounting": "LM Harness task",
498
- "harness|hendrycksTest-professional_law": "LM Harness task",
499
- "harness|hendrycksTest-professional_medicine": "LM Harness task",
500
- "harness|hendrycksTest-professional_psychology": "LM Harness task",
501
- "harness|hendrycksTest-public_relations": "LM Harness task",
502
- "harness|hendrycksTest-security_studies": "LM Harness task",
503
- "harness|hendrycksTest-sociology": "LM Harness task",
504
- "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
505
- "harness|hendrycksTest-virology": "LM Harness task",
506
- "harness|hendrycksTest-world_religions": "LM Harness task",
507
- "harness|truthfulqa:mc": "LM Harness task"
508
- },
509
- "hashes": {
510
- "harness|arc:challenge|25": {
511
- "hash_examples": "fb8c51b1872daeda",
512
- "hash_full_prompts": "045cbb916e5145c6",
513
- "hash_input_tokens": "61571bf68d6d89aa",
514
- "hash_cont_tokens": "8210decc6ff6f7df"
515
- },
516
- "harness|hellaswag|10": {
517
- "hash_examples": "e1768ecb99d7ecf0",
518
- "hash_full_prompts": "0b4c16983130f84f",
519
- "hash_input_tokens": "29906669b1c7054a",
520
- "hash_cont_tokens": "b3b9e9017afa63af"
521
- },
522
- "harness|hendrycksTest-abstract_algebra|5": {
523
- "hash_examples": "280f9f325b40559a",
524
- "hash_full_prompts": "2f776a367d23aea2",
525
- "hash_input_tokens": "c54ff61ad0273dd7",
526
- "hash_cont_tokens": "50421e30bef398f9"
527
- },
528
- "harness|hendrycksTest-anatomy|5": {
529
- "hash_examples": "2f83a4f1cab4ba18",
530
- "hash_full_prompts": "516f74bef25df620",
531
- "hash_input_tokens": "be31a1e22aef5f90",
532
- "hash_cont_tokens": "f11971a765cb609f"
533
- },
534
- "harness|hendrycksTest-astronomy|5": {
535
- "hash_examples": "7d587b908da4d762",
536
- "hash_full_prompts": "faf4e80f65de93ca",
537
- "hash_input_tokens": "277a7b1fad566940",
538
- "hash_cont_tokens": "bf30e5d3f48250cb"
539
- },
540
- "harness|hendrycksTest-business_ethics|5": {
541
- "hash_examples": "33e51740670de686",
542
- "hash_full_prompts": "db01c3ef8e1479d4",
543
- "hash_input_tokens": "ba552605bc116de5",
544
- "hash_cont_tokens": "bc1dd9b2d995eb61"
545
- },
546
- "harness|hendrycksTest-clinical_knowledge|5": {
547
- "hash_examples": "f3366dbe7eefffa4",
548
- "hash_full_prompts": "49654f71d94b65c3",
549
- "hash_input_tokens": "428c7563d0b98ab9",
550
- "hash_cont_tokens": "890a119624b3b935"
551
- },
552
- "harness|hendrycksTest-college_biology|5": {
553
- "hash_examples": "ca2b6753a0193e7f",
554
- "hash_full_prompts": "2b460b75f1fdfefd",
555
- "hash_input_tokens": "da036601573942e2",
556
- "hash_cont_tokens": "875cde3af7a0ee14"
557
- },
558
- "harness|hendrycksTest-college_chemistry|5": {
559
- "hash_examples": "22ff85f1d34f42d1",
560
- "hash_full_prompts": "242c9be6da583e95",
561
- "hash_input_tokens": "94e0196d6aded13d",
562
- "hash_cont_tokens": "50421e30bef398f9"
563
- },
564
- "harness|hendrycksTest-college_computer_science|5": {
565
- "hash_examples": "30318289d717a5cf",
566
- "hash_full_prompts": "ed2bdb4e87c4b371",
567
- "hash_input_tokens": "6e4d0f4a8d36690b",
568
- "hash_cont_tokens": "ffc0fe414cdc4a83"
569
- },
570
- "harness|hendrycksTest-college_mathematics|5": {
571
- "hash_examples": "4944d1f0b6b5d911",
572
- "hash_full_prompts": "770bc4281c973190",
573
- "hash_input_tokens": "614054d17109a25d",
574
- "hash_cont_tokens": "50421e30bef398f9"
575
- },
576
- "harness|hendrycksTest-college_medicine|5": {
577
- "hash_examples": "dd69cc33381275af",
578
- "hash_full_prompts": "ad2a53e5250ab46e",
579
- "hash_input_tokens": "1d633b3cc0524ba8",
580
- "hash_cont_tokens": "1f88b00d41957d82"
581
- },
582
- "harness|hendrycksTest-college_physics|5": {
583
- "hash_examples": "875dd26d22655b0d",
584
- "hash_full_prompts": "833a0d7b55aed500",
585
- "hash_input_tokens": "5421d9a1af86cbd4",
586
- "hash_cont_tokens": "f7b8097afc16a47c"
587
- },
588
- "harness|hendrycksTest-computer_security|5": {
589
- "hash_examples": "006451eedc0ededb",
590
- "hash_full_prompts": "94034c97e85d8f46",
591
- "hash_input_tokens": "5e6b70ecb333cf18",
592
- "hash_cont_tokens": "50421e30bef398f9"
593
- },
594
- "harness|hendrycksTest-conceptual_physics|5": {
595
- "hash_examples": "8874ece872d2ca4c",
596
- "hash_full_prompts": "e40d15a34640d6fa",
597
- "hash_input_tokens": "c2ef11a87264ceed",
598
- "hash_cont_tokens": "aa0e8bc655f2f641"
599
- },
600
- "harness|hendrycksTest-econometrics|5": {
601
- "hash_examples": "64d3623b0bfaa43f",
602
- "hash_full_prompts": "612f340fae41338d",
603
- "hash_input_tokens": "ecaccd912a4c3978",
604
- "hash_cont_tokens": "bfb7e3c3c88313f1"
605
- },
606
- "harness|hendrycksTest-electrical_engineering|5": {
607
- "hash_examples": "e98f51780c674d7e",
608
- "hash_full_prompts": "10275b312d812ae6",
609
- "hash_input_tokens": "1590c84291399be8",
610
- "hash_cont_tokens": "2425a3f084a591ef"
611
- },
612
- "harness|hendrycksTest-elementary_mathematics|5": {
613
- "hash_examples": "fc48208a5ac1c0ce",
614
- "hash_full_prompts": "5ec274c6c82aca23",
615
- "hash_input_tokens": "3269597f715b0da1",
616
- "hash_cont_tokens": "f52691aef15a407b"
617
- },
618
- "harness|hendrycksTest-formal_logic|5": {
619
- "hash_examples": "5a6525665f63ea72",
620
- "hash_full_prompts": "07b92638c4a6b500",
621
- "hash_input_tokens": "a2800d20f3ab8d7c",
622
- "hash_cont_tokens": "f515d598d9c21263"
623
- },
624
- "harness|hendrycksTest-global_facts|5": {
625
- "hash_examples": "371d70d743b2b89b",
626
- "hash_full_prompts": "332fdee50a1921b4",
627
- "hash_input_tokens": "94ed44b3772505ad",
628
- "hash_cont_tokens": "50421e30bef398f9"
629
- },
630
- "harness|hendrycksTest-high_school_biology|5": {
631
- "hash_examples": "a79e1018b1674052",
632
- "hash_full_prompts": "e624e26ede922561",
633
- "hash_input_tokens": "24423acb928db768",
634
- "hash_cont_tokens": "bd85a4156a3613ee"
635
- },
636
- "harness|hendrycksTest-high_school_chemistry|5": {
637
- "hash_examples": "44bfc25c389f0e03",
638
- "hash_full_prompts": "0e3e5f5d9246482a",
639
- "hash_input_tokens": "831ff35c474e5cef",
640
- "hash_cont_tokens": "a95c97af1c14e068"
641
- },
642
- "harness|hendrycksTest-high_school_computer_science|5": {
643
- "hash_examples": "8b8cdb1084f24169",
644
- "hash_full_prompts": "c00487e67c1813cc",
645
- "hash_input_tokens": "8c34e0f2bda77358",
646
- "hash_cont_tokens": "8abfedef914e33c9"
647
- },
648
- "harness|hendrycksTest-high_school_european_history|5": {
649
- "hash_examples": "11cd32d0ef440171",
650
- "hash_full_prompts": "318f4513c537c6bf",
651
- "hash_input_tokens": "f1f73dd687da18d7",
652
- "hash_cont_tokens": "674fc454bdc5ac93"
653
- },
654
- "harness|hendrycksTest-high_school_geography|5": {
655
- "hash_examples": "b60019b9e80b642f",
656
- "hash_full_prompts": "ee5789fcc1a81b1e",
657
- "hash_input_tokens": "7c5547c7da5bc793",
658
- "hash_cont_tokens": "03a5012b916274ea"
659
- },
660
- "harness|hendrycksTest-high_school_government_and_politics|5": {
661
- "hash_examples": "d221ec983d143dc3",
662
- "hash_full_prompts": "ac42d888e1ce1155",
663
- "hash_input_tokens": "f62991cb6a496b05",
664
- "hash_cont_tokens": "a83effb8f76b7d7c"
665
- },
666
- "harness|hendrycksTest-high_school_macroeconomics|5": {
667
- "hash_examples": "59c2915cacfd3fbb",
668
- "hash_full_prompts": "c6bd9d25158abd0e",
669
- "hash_input_tokens": "4cef2aff6e3d59ed",
670
- "hash_cont_tokens": "c583432ad27fcfe0"
671
- },
672
- "harness|hendrycksTest-high_school_mathematics|5": {
673
- "hash_examples": "1f8ac897608de342",
674
- "hash_full_prompts": "5d88f41fc2d643a8",
675
- "hash_input_tokens": "6e2577ea4082ed2b",
676
- "hash_cont_tokens": "24f5dc613660300b"
677
- },
678
- "harness|hendrycksTest-high_school_microeconomics|5": {
679
- "hash_examples": "ead6a0f2f6c83370",
680
- "hash_full_prompts": "bfc393381298609e",
681
- "hash_input_tokens": "c5fc9aeb1079c8e4",
682
- "hash_cont_tokens": "f47f041de50333b9"
683
- },
684
- "harness|hendrycksTest-high_school_physics|5": {
685
- "hash_examples": "c3f2025990afec64",
686
- "hash_full_prompts": "fc78b4997e436734",
687
- "hash_input_tokens": "555fc385cffa84ca",
688
- "hash_cont_tokens": "ba2efcd283e938cc"
689
- },
690
- "harness|hendrycksTest-high_school_psychology|5": {
691
- "hash_examples": "21f8aab618f6d636",
692
- "hash_full_prompts": "d5c76aa40b9dbc43",
693
- "hash_input_tokens": "febd23cbf9973b7f",
694
- "hash_cont_tokens": "942069cd363844d9"
695
- },
696
- "harness|hendrycksTest-high_school_statistics|5": {
697
- "hash_examples": "2386a60a11fc5de3",
698
- "hash_full_prompts": "4c5c8be5aafac432",
699
- "hash_input_tokens": "424b02981230ee83",
700
- "hash_cont_tokens": "955ed42b6f7fa019"
701
- },
702
- "harness|hendrycksTest-high_school_us_history|5": {
703
- "hash_examples": "74961543be40f04f",
704
- "hash_full_prompts": "5d5ca4840131ba21",
705
- "hash_input_tokens": "50c9ff438c85a69e",
706
- "hash_cont_tokens": "cdd0b3dc06d933e5"
707
- },
708
- "harness|hendrycksTest-high_school_world_history|5": {
709
- "hash_examples": "2ad2f6b7198b2234",
710
- "hash_full_prompts": "11845057459afd72",
711
- "hash_input_tokens": "054824cc474caef5",
712
- "hash_cont_tokens": "9a864184946033ac"
713
- },
714
- "harness|hendrycksTest-human_aging|5": {
715
- "hash_examples": "1a7199dc733e779b",
716
- "hash_full_prompts": "756b9096b8eaf892",
717
- "hash_input_tokens": "541a75f071dcf579",
718
- "hash_cont_tokens": "142a4a8a1138a214"
719
- },
720
- "harness|hendrycksTest-human_sexuality|5": {
721
- "hash_examples": "7acb8fdad97f88a6",
722
- "hash_full_prompts": "731a52ff15b8cfdb",
723
- "hash_input_tokens": "04269e5c5a257dd9",
724
- "hash_cont_tokens": "bc54813e809b796d"
725
- },
726
- "harness|hendrycksTest-international_law|5": {
727
- "hash_examples": "1300bfd0dfc59114",
728
- "hash_full_prompts": "db2aefbff5eec996",
729
- "hash_input_tokens": "d93ba9d9d38e4397",
730
- "hash_cont_tokens": "dc45b45fcda18e5d"
731
- },
732
- "harness|hendrycksTest-jurisprudence|5": {
733
- "hash_examples": "083b1e4904c48dc2",
734
- "hash_full_prompts": "0f89ee3fe03d6a21",
735
- "hash_input_tokens": "9eeaccd2698b4f5a",
736
- "hash_cont_tokens": "e3a8cd951b6e3469"
737
- },
738
- "harness|hendrycksTest-logical_fallacies|5": {
739
- "hash_examples": "709128f9926a634c",
740
- "hash_full_prompts": "98a04b1f8f841069",
741
- "hash_input_tokens": "b4f08f544f2b7576",
742
- "hash_cont_tokens": "1e80dbd30f6453d5"
743
- },
744
- "harness|hendrycksTest-machine_learning|5": {
745
- "hash_examples": "88f22a636029ae47",
746
- "hash_full_prompts": "2e1c8d4b1e0cc921",
747
- "hash_input_tokens": "900c2a51f1174b9f",
748
- "hash_cont_tokens": "9b37da7777378ca9"
749
- },
750
- "harness|hendrycksTest-management|5": {
751
- "hash_examples": "8c8a1e07a2151dca",
752
- "hash_full_prompts": "f51611f514b265b0",
753
- "hash_input_tokens": "6b36efb4689c6eca",
754
- "hash_cont_tokens": "a01d6d39a83c4597"
755
- },
756
- "harness|hendrycksTest-marketing|5": {
757
- "hash_examples": "2668953431f91e96",
758
- "hash_full_prompts": "77562bef997c7650",
759
- "hash_input_tokens": "2aaac78a0cfed47a",
760
- "hash_cont_tokens": "6aeaed4d823c98aa"
761
- },
762
- "harness|hendrycksTest-medical_genetics|5": {
763
- "hash_examples": "9c2dda34a2ea4fd2",
764
- "hash_full_prompts": "202139046daa118f",
765
- "hash_input_tokens": "886ca823b41c094a",
766
- "hash_cont_tokens": "50421e30bef398f9"
767
- },
768
- "harness|hendrycksTest-miscellaneous|5": {
769
- "hash_examples": "41adb694024809c2",
770
- "hash_full_prompts": "bffec9fc237bcf93",
771
- "hash_input_tokens": "72fd71de7675e7d0",
772
- "hash_cont_tokens": "9b0ab02a64603081"
773
- },
774
- "harness|hendrycksTest-moral_disputes|5": {
775
- "hash_examples": "3171c13ba3c594c4",
776
- "hash_full_prompts": "170831fc36f1d59e",
777
- "hash_input_tokens": "f3ca0dd8e7a1eb09",
778
- "hash_cont_tokens": "8badf768f7b0467a"
779
- },
780
- "harness|hendrycksTest-moral_scenarios|5": {
781
- "hash_examples": "9873e077e83e0546",
782
- "hash_full_prompts": "08f4ceba3131a068",
783
- "hash_input_tokens": "3e793631e951f23c",
784
- "hash_cont_tokens": "32ae620376b2bbba"
785
- },
786
- "harness|hendrycksTest-nutrition|5": {
787
- "hash_examples": "7db1d8142ec14323",
788
- "hash_full_prompts": "4c0e68e3586cb453",
789
- "hash_input_tokens": "59753c2144ea93af",
790
- "hash_cont_tokens": "3071def75bacc404"
791
- },
792
- "harness|hendrycksTest-philosophy|5": {
793
- "hash_examples": "9b455b7d72811cc8",
794
- "hash_full_prompts": "e467f822d8a0d3ff",
795
- "hash_input_tokens": "bd8d3dbed15a8c34",
796
- "hash_cont_tokens": "9f6ff69d23a48783"
797
- },
798
- "harness|hendrycksTest-prehistory|5": {
799
- "hash_examples": "8be90d0f538f1560",
800
- "hash_full_prompts": "152187949bcd0921",
801
- "hash_input_tokens": "3573cd87facbb7c5",
802
- "hash_cont_tokens": "de469d2b981e32a3"
803
- },
804
- "harness|hendrycksTest-professional_accounting|5": {
805
- "hash_examples": "8d377597916cd07e",
806
- "hash_full_prompts": "0eb7345d6144ee0d",
807
- "hash_input_tokens": "17e721bc1a7cbb47",
808
- "hash_cont_tokens": "c46f74d2dfc7b13b"
809
- },
810
- "harness|hendrycksTest-professional_law|5": {
811
- "hash_examples": "cd9dbc52b3c932d6",
812
- "hash_full_prompts": "36ac764272bfb182",
813
- "hash_input_tokens": "9178e10bd0763ec4",
814
- "hash_cont_tokens": "2e590029ef41fbcd"
815
- },
816
- "harness|hendrycksTest-professional_medicine|5": {
817
- "hash_examples": "b20e4e816c1e383e",
818
- "hash_full_prompts": "7b8d69ea2acaf2f7",
819
- "hash_input_tokens": "f5a22012a54f70ea",
820
- "hash_cont_tokens": "fe35cfa9c6ca802e"
821
- },
822
- "harness|hendrycksTest-professional_psychology|5": {
823
- "hash_examples": "d45b73b22f9cc039",
824
- "hash_full_prompts": "fe8937e9ffc99771",
825
- "hash_input_tokens": "0dfb73a8eb3f692c",
826
- "hash_cont_tokens": "f020fbddf72c8652"
827
- },
828
- "harness|hendrycksTest-public_relations|5": {
829
- "hash_examples": "0d25072e1761652a",
830
- "hash_full_prompts": "f9adc39cfa9f42ba",
831
- "hash_input_tokens": "1710c6ba4c9f3cbd",
832
- "hash_cont_tokens": "568f585a259965c1"
833
- },
834
- "harness|hendrycksTest-security_studies|5": {
835
- "hash_examples": "62bb8197e63d60d4",
836
- "hash_full_prompts": "869c9c3ae196b7c3",
837
- "hash_input_tokens": "d49711415961ced7",
838
- "hash_cont_tokens": "cc6fd7cccd64cd5d"
839
- },
840
- "harness|hendrycksTest-sociology|5": {
841
- "hash_examples": "e7959df87dea8672",
842
- "hash_full_prompts": "1a1fc00e17b3a52a",
843
- "hash_input_tokens": "828999f7624cbe7e",
844
- "hash_cont_tokens": "c3a3bdfd177eed5b"
845
- },
846
- "harness|hendrycksTest-us_foreign_policy|5": {
847
- "hash_examples": "4a56a01ddca44dca",
848
- "hash_full_prompts": "0c7a7081c71c07b6",
849
- "hash_input_tokens": "42054621e718dbee",
850
- "hash_cont_tokens": "2568d0e8e36fa959"
851
- },
852
- "harness|hendrycksTest-virology|5": {
853
- "hash_examples": "451cc86a8c4f4fe9",
854
- "hash_full_prompts": "01e95325d8b738e4",
855
- "hash_input_tokens": "6c4f0aa4dc859c04",
856
- "hash_cont_tokens": "926cf60b0891f374"
857
- },
858
- "harness|hendrycksTest-world_religions|5": {
859
- "hash_examples": "3b29cfaf1a81c379",
860
- "hash_full_prompts": "e0d79a15083dfdff",
861
- "hash_input_tokens": "6c75d44e092ff24f",
862
- "hash_cont_tokens": "c525a5de974c1ea3"
863
- },
864
- "harness|truthfulqa:mc|0": {
865
- "hash_examples": "23176c0531c7b867",
866
- "hash_full_prompts": "36a6d90e75d92d4a",
867
- "hash_input_tokens": "2738d7ed7075faa7",
868
- "hash_cont_tokens": "c014154380b74b9e"
869
- }
870
- }
871
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-13b-hf/results_2023-08-29T22-26-02.660247.json DELETED
@@ -1,1366 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-13b-hf",
4
- "model_sha": "db6b8eb1feabb38985fdf785a89895959e944936",
5
- "model_dtype": "4bit",
6
- "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63",
7
- "num_few_shot_default": 0,
8
- "num_fewshot_seeds": 1,
9
- "override_batch_size": 1,
10
- "max_samples": null,
11
- "job_id": ""
12
- },
13
- "results": {
14
- "harness|arc:challenge|25": {
15
- "acc": 0.5366894197952219,
16
- "acc_stderr": 0.01457200052775699,
17
- "acc_norm": 0.5810580204778157,
18
- "acc_norm_stderr": 0.014418106953639013
19
- },
20
- "harness|hellaswag|10": {
21
- "acc": 0.6059549890460068,
22
- "acc_stderr": 0.004876459434619801,
23
- "acc_norm": 0.809699263095001,
24
- "acc_norm_stderr": 0.003917361254101999
25
- },
26
- "harness|hendrycksTest-abstract_algebra|5": {
27
- "acc": 0.33,
28
- "acc_stderr": 0.04725815626252605,
29
- "acc_norm": 0.33,
30
- "acc_norm_stderr": 0.04725815626252605
31
- },
32
- "harness|hendrycksTest-anatomy|5": {
33
- "acc": 0.5111111111111111,
34
- "acc_stderr": 0.04318275491977976,
35
- "acc_norm": 0.5111111111111111,
36
- "acc_norm_stderr": 0.04318275491977976
37
- },
38
- "harness|hendrycksTest-astronomy|5": {
39
- "acc": 0.5131578947368421,
40
- "acc_stderr": 0.04067533136309174,
41
- "acc_norm": 0.5131578947368421,
42
- "acc_norm_stderr": 0.04067533136309174
43
- },
44
- "harness|hendrycksTest-business_ethics|5": {
45
- "acc": 0.54,
46
- "acc_stderr": 0.05009082659620332,
47
- "acc_norm": 0.54,
48
- "acc_norm_stderr": 0.05009082659620332
49
- },
50
- "harness|hendrycksTest-clinical_knowledge|5": {
51
- "acc": 0.6075471698113207,
52
- "acc_stderr": 0.03005258057955785,
53
- "acc_norm": 0.6075471698113207,
54
- "acc_norm_stderr": 0.03005258057955785
55
- },
56
- "harness|hendrycksTest-college_biology|5": {
57
- "acc": 0.5555555555555556,
58
- "acc_stderr": 0.041553199555931467,
59
- "acc_norm": 0.5555555555555556,
60
- "acc_norm_stderr": 0.041553199555931467
61
- },
62
- "harness|hendrycksTest-college_chemistry|5": {
63
- "acc": 0.43,
64
- "acc_stderr": 0.04975698519562428,
65
- "acc_norm": 0.43,
66
- "acc_norm_stderr": 0.04975698519562428
67
- },
68
- "harness|hendrycksTest-college_computer_science|5": {
69
- "acc": 0.44,
70
- "acc_stderr": 0.04988876515698589,
71
- "acc_norm": 0.44,
72
- "acc_norm_stderr": 0.04988876515698589
73
- },
74
- "harness|hendrycksTest-college_mathematics|5": {
75
- "acc": 0.34,
76
- "acc_stderr": 0.04760952285695235,
77
- "acc_norm": 0.34,
78
- "acc_norm_stderr": 0.04760952285695235
79
- },
80
- "harness|hendrycksTest-college_medicine|5": {
81
- "acc": 0.5144508670520231,
82
- "acc_stderr": 0.03810871630454764,
83
- "acc_norm": 0.5144508670520231,
84
- "acc_norm_stderr": 0.03810871630454764
85
- },
86
- "harness|hendrycksTest-college_physics|5": {
87
- "acc": 0.21568627450980393,
88
- "acc_stderr": 0.040925639582376536,
89
- "acc_norm": 0.21568627450980393,
90
- "acc_norm_stderr": 0.040925639582376536
91
- },
92
- "harness|hendrycksTest-computer_security|5": {
93
- "acc": 0.68,
94
- "acc_stderr": 0.04688261722621504,
95
- "acc_norm": 0.68,
96
- "acc_norm_stderr": 0.04688261722621504
97
- },
98
- "harness|hendrycksTest-conceptual_physics|5": {
99
- "acc": 0.4553191489361702,
100
- "acc_stderr": 0.03255525359340354,
101
- "acc_norm": 0.4553191489361702,
102
- "acc_norm_stderr": 0.03255525359340354
103
- },
104
- "harness|hendrycksTest-econometrics|5": {
105
- "acc": 0.2894736842105263,
106
- "acc_stderr": 0.04266339443159394,
107
- "acc_norm": 0.2894736842105263,
108
- "acc_norm_stderr": 0.04266339443159394
109
- },
110
- "harness|hendrycksTest-electrical_engineering|5": {
111
- "acc": 0.496551724137931,
112
- "acc_stderr": 0.041665675771015785,
113
- "acc_norm": 0.496551724137931,
114
- "acc_norm_stderr": 0.041665675771015785
115
- },
116
- "harness|hendrycksTest-elementary_mathematics|5": {
117
- "acc": 0.3439153439153439,
118
- "acc_stderr": 0.024464426625596433,
119
- "acc_norm": 0.3439153439153439,
120
- "acc_norm_stderr": 0.024464426625596433
121
- },
122
- "harness|hendrycksTest-formal_logic|5": {
123
- "acc": 0.3333333333333333,
124
- "acc_stderr": 0.04216370213557835,
125
- "acc_norm": 0.3333333333333333,
126
- "acc_norm_stderr": 0.04216370213557835
127
- },
128
- "harness|hendrycksTest-global_facts|5": {
129
- "acc": 0.32,
130
- "acc_stderr": 0.046882617226215034,
131
- "acc_norm": 0.32,
132
- "acc_norm_stderr": 0.046882617226215034
133
- },
134
- "harness|hendrycksTest-high_school_biology|5": {
135
- "acc": 0.6580645161290323,
136
- "acc_stderr": 0.02698528957655274,
137
- "acc_norm": 0.6580645161290323,
138
- "acc_norm_stderr": 0.02698528957655274
139
- },
140
- "harness|hendrycksTest-high_school_chemistry|5": {
141
- "acc": 0.458128078817734,
142
- "acc_stderr": 0.03505630140785741,
143
- "acc_norm": 0.458128078817734,
144
- "acc_norm_stderr": 0.03505630140785741
145
- },
146
- "harness|hendrycksTest-high_school_computer_science|5": {
147
- "acc": 0.55,
148
- "acc_stderr": 0.05,
149
- "acc_norm": 0.55,
150
- "acc_norm_stderr": 0.05
151
- },
152
- "harness|hendrycksTest-high_school_european_history|5": {
153
- "acc": 0.6484848484848484,
154
- "acc_stderr": 0.037282069986826503,
155
- "acc_norm": 0.6484848484848484,
156
- "acc_norm_stderr": 0.037282069986826503
157
- },
158
- "harness|hendrycksTest-high_school_geography|5": {
159
- "acc": 0.6868686868686869,
160
- "acc_stderr": 0.033042050878136525,
161
- "acc_norm": 0.6868686868686869,
162
- "acc_norm_stderr": 0.033042050878136525
163
- },
164
- "harness|hendrycksTest-high_school_government_and_politics|5": {
165
- "acc": 0.7668393782383419,
166
- "acc_stderr": 0.03051611137147601,
167
- "acc_norm": 0.7668393782383419,
168
- "acc_norm_stderr": 0.03051611137147601
169
- },
170
- "harness|hendrycksTest-high_school_macroeconomics|5": {
171
- "acc": 0.47692307692307695,
172
- "acc_stderr": 0.025323990861736118,
173
- "acc_norm": 0.47692307692307695,
174
- "acc_norm_stderr": 0.025323990861736118
175
- },
176
- "harness|hendrycksTest-high_school_mathematics|5": {
177
- "acc": 0.2814814814814815,
178
- "acc_stderr": 0.027420019350945277,
179
- "acc_norm": 0.2814814814814815,
180
- "acc_norm_stderr": 0.027420019350945277
181
- },
182
- "harness|hendrycksTest-high_school_microeconomics|5": {
183
- "acc": 0.5546218487394958,
184
- "acc_stderr": 0.032284106267163895,
185
- "acc_norm": 0.5546218487394958,
186
- "acc_norm_stderr": 0.032284106267163895
187
- },
188
- "harness|hendrycksTest-high_school_physics|5": {
189
- "acc": 0.39072847682119205,
190
- "acc_stderr": 0.039837983066598075,
191
- "acc_norm": 0.39072847682119205,
192
- "acc_norm_stderr": 0.039837983066598075
193
- },
194
- "harness|hendrycksTest-high_school_psychology|5": {
195
- "acc": 0.7339449541284404,
196
- "acc_stderr": 0.018946022322225597,
197
- "acc_norm": 0.7339449541284404,
198
- "acc_norm_stderr": 0.018946022322225597
199
- },
200
- "harness|hendrycksTest-high_school_statistics|5": {
201
- "acc": 0.4675925925925926,
202
- "acc_stderr": 0.034028015813589656,
203
- "acc_norm": 0.4675925925925926,
204
- "acc_norm_stderr": 0.034028015813589656
205
- },
206
- "harness|hendrycksTest-high_school_us_history|5": {
207
- "acc": 0.6911764705882353,
208
- "acc_stderr": 0.03242661719827218,
209
- "acc_norm": 0.6911764705882353,
210
- "acc_norm_stderr": 0.03242661719827218
211
- },
212
- "harness|hendrycksTest-high_school_world_history|5": {
213
- "acc": 0.7172995780590717,
214
- "acc_stderr": 0.029312814153955924,
215
- "acc_norm": 0.7172995780590717,
216
- "acc_norm_stderr": 0.029312814153955924
217
- },
218
- "harness|hendrycksTest-human_aging|5": {
219
- "acc": 0.6636771300448431,
220
- "acc_stderr": 0.031708824268455,
221
- "acc_norm": 0.6636771300448431,
222
- "acc_norm_stderr": 0.031708824268455
223
- },
224
- "harness|hendrycksTest-human_sexuality|5": {
225
- "acc": 0.6030534351145038,
226
- "acc_stderr": 0.04291135671009224,
227
- "acc_norm": 0.6030534351145038,
228
- "acc_norm_stderr": 0.04291135671009224
229
- },
230
- "harness|hendrycksTest-international_law|5": {
231
- "acc": 0.7024793388429752,
232
- "acc_stderr": 0.04173349148083499,
233
- "acc_norm": 0.7024793388429752,
234
- "acc_norm_stderr": 0.04173349148083499
235
- },
236
- "harness|hendrycksTest-jurisprudence|5": {
237
- "acc": 0.7037037037037037,
238
- "acc_stderr": 0.044143436668549335,
239
- "acc_norm": 0.7037037037037037,
240
- "acc_norm_stderr": 0.044143436668549335
241
- },
242
- "harness|hendrycksTest-logical_fallacies|5": {
243
- "acc": 0.6871165644171779,
244
- "acc_stderr": 0.03642914578292406,
245
- "acc_norm": 0.6871165644171779,
246
- "acc_norm_stderr": 0.03642914578292406
247
- },
248
- "harness|hendrycksTest-machine_learning|5": {
249
- "acc": 0.26785714285714285,
250
- "acc_stderr": 0.04203277291467762,
251
- "acc_norm": 0.26785714285714285,
252
- "acc_norm_stderr": 0.04203277291467762
253
- },
254
- "harness|hendrycksTest-management|5": {
255
- "acc": 0.7572815533980582,
256
- "acc_stderr": 0.04245022486384495,
257
- "acc_norm": 0.7572815533980582,
258
- "acc_norm_stderr": 0.04245022486384495
259
- },
260
- "harness|hendrycksTest-marketing|5": {
261
- "acc": 0.7991452991452992,
262
- "acc_stderr": 0.026246772946890477,
263
- "acc_norm": 0.7991452991452992,
264
- "acc_norm_stderr": 0.026246772946890477
265
- },
266
- "harness|hendrycksTest-medical_genetics|5": {
267
- "acc": 0.55,
268
- "acc_stderr": 0.049999999999999996,
269
- "acc_norm": 0.55,
270
- "acc_norm_stderr": 0.049999999999999996
271
- },
272
- "harness|hendrycksTest-miscellaneous|5": {
273
- "acc": 0.7266922094508301,
274
- "acc_stderr": 0.015936681062628556,
275
- "acc_norm": 0.7266922094508301,
276
- "acc_norm_stderr": 0.015936681062628556
277
- },
278
- "harness|hendrycksTest-moral_disputes|5": {
279
- "acc": 0.6358381502890174,
280
- "acc_stderr": 0.025906632631016124,
281
- "acc_norm": 0.6358381502890174,
282
- "acc_norm_stderr": 0.025906632631016124
283
- },
284
- "harness|hendrycksTest-moral_scenarios|5": {
285
- "acc": 0.2759776536312849,
286
- "acc_stderr": 0.014950103002475356,
287
- "acc_norm": 0.2759776536312849,
288
- "acc_norm_stderr": 0.014950103002475356
289
- },
290
- "harness|hendrycksTest-nutrition|5": {
291
- "acc": 0.5947712418300654,
292
- "acc_stderr": 0.028110928492809075,
293
- "acc_norm": 0.5947712418300654,
294
- "acc_norm_stderr": 0.028110928492809075
295
- },
296
- "harness|hendrycksTest-philosophy|5": {
297
- "acc": 0.6495176848874598,
298
- "acc_stderr": 0.027098652621301754,
299
- "acc_norm": 0.6495176848874598,
300
- "acc_norm_stderr": 0.027098652621301754
301
- },
302
- "harness|hendrycksTest-prehistory|5": {
303
- "acc": 0.6080246913580247,
304
- "acc_stderr": 0.027163686038271146,
305
- "acc_norm": 0.6080246913580247,
306
- "acc_norm_stderr": 0.027163686038271146
307
- },
308
- "harness|hendrycksTest-professional_accounting|5": {
309
- "acc": 0.3971631205673759,
310
- "acc_stderr": 0.02918980567358711,
311
- "acc_norm": 0.3971631205673759,
312
- "acc_norm_stderr": 0.02918980567358711
313
- },
314
- "harness|hendrycksTest-professional_law|5": {
315
- "acc": 0.41590612777053454,
316
- "acc_stderr": 0.012588323850313611,
317
- "acc_norm": 0.41590612777053454,
318
- "acc_norm_stderr": 0.012588323850313611
319
- },
320
- "harness|hendrycksTest-professional_medicine|5": {
321
- "acc": 0.5147058823529411,
322
- "acc_stderr": 0.03035969707904611,
323
- "acc_norm": 0.5147058823529411,
324
- "acc_norm_stderr": 0.03035969707904611
325
- },
326
- "harness|hendrycksTest-professional_psychology|5": {
327
- "acc": 0.5424836601307189,
328
- "acc_stderr": 0.020154685712590888,
329
- "acc_norm": 0.5424836601307189,
330
- "acc_norm_stderr": 0.020154685712590888
331
- },
332
- "harness|hendrycksTest-public_relations|5": {
333
- "acc": 0.5727272727272728,
334
- "acc_stderr": 0.04738198703545483,
335
- "acc_norm": 0.5727272727272728,
336
- "acc_norm_stderr": 0.04738198703545483
337
- },
338
- "harness|hendrycksTest-security_studies|5": {
339
- "acc": 0.6122448979591837,
340
- "acc_stderr": 0.031192230726795656,
341
- "acc_norm": 0.6122448979591837,
342
- "acc_norm_stderr": 0.031192230726795656
343
- },
344
- "harness|hendrycksTest-sociology|5": {
345
- "acc": 0.7114427860696517,
346
- "acc_stderr": 0.03203841040213322,
347
- "acc_norm": 0.7114427860696517,
348
- "acc_norm_stderr": 0.03203841040213322
349
- },
350
- "harness|hendrycksTest-us_foreign_policy|5": {
351
- "acc": 0.83,
352
- "acc_stderr": 0.0377525168068637,
353
- "acc_norm": 0.83,
354
- "acc_norm_stderr": 0.0377525168068637
355
- },
356
- "harness|hendrycksTest-virology|5": {
357
- "acc": 0.41566265060240964,
358
- "acc_stderr": 0.038367221765980515,
359
- "acc_norm": 0.41566265060240964,
360
- "acc_norm_stderr": 0.038367221765980515
361
- },
362
- "harness|hendrycksTest-world_religions|5": {
363
- "acc": 0.7368421052631579,
364
- "acc_stderr": 0.03377310252209204,
365
- "acc_norm": 0.7368421052631579,
366
- "acc_norm_stderr": 0.03377310252209204
367
- },
368
- "harness|truthfulqa:mc|0": {
369
- "mc1": 0.2386780905752754,
370
- "mc1_stderr": 0.014922629695456421,
371
- "mc2": 0.34172402963708387,
372
- "mc2_stderr": 0.01332205356000871
373
- },
374
- "all": {
375
- "acc": 0.5443256746853737,
376
- "acc_stderr": 0.034540859468822654,
377
- "acc_norm": 0.5485309776469262,
378
- "acc_norm_stderr": 0.03452199520179493,
379
- "mc1": 0.2386780905752754,
380
- "mc1_stderr": 0.014922629695456421,
381
- "mc2": 0.34172402963708387,
382
- "mc2_stderr": 0.01332205356000871
383
- }
384
- },
385
- "versions": {
386
- "harness|arc:challenge|25": 0,
387
- "harness|hellaswag|10": 0,
388
- "harness|hendrycksTest-abstract_algebra|5": 1,
389
- "harness|hendrycksTest-anatomy|5": 1,
390
- "harness|hendrycksTest-astronomy|5": 1,
391
- "harness|hendrycksTest-business_ethics|5": 1,
392
- "harness|hendrycksTest-clinical_knowledge|5": 1,
393
- "harness|hendrycksTest-college_biology|5": 1,
394
- "harness|hendrycksTest-college_chemistry|5": 1,
395
- "harness|hendrycksTest-college_computer_science|5": 1,
396
- "harness|hendrycksTest-college_mathematics|5": 1,
397
- "harness|hendrycksTest-college_medicine|5": 1,
398
- "harness|hendrycksTest-college_physics|5": 1,
399
- "harness|hendrycksTest-computer_security|5": 1,
400
- "harness|hendrycksTest-conceptual_physics|5": 1,
401
- "harness|hendrycksTest-econometrics|5": 1,
402
- "harness|hendrycksTest-electrical_engineering|5": 1,
403
- "harness|hendrycksTest-elementary_mathematics|5": 1,
404
- "harness|hendrycksTest-formal_logic|5": 1,
405
- "harness|hendrycksTest-global_facts|5": 1,
406
- "harness|hendrycksTest-high_school_biology|5": 1,
407
- "harness|hendrycksTest-high_school_chemistry|5": 1,
408
- "harness|hendrycksTest-high_school_computer_science|5": 1,
409
- "harness|hendrycksTest-high_school_european_history|5": 1,
410
- "harness|hendrycksTest-high_school_geography|5": 1,
411
- "harness|hendrycksTest-high_school_government_and_politics|5": 1,
412
- "harness|hendrycksTest-high_school_macroeconomics|5": 1,
413
- "harness|hendrycksTest-high_school_mathematics|5": 1,
414
- "harness|hendrycksTest-high_school_microeconomics|5": 1,
415
- "harness|hendrycksTest-high_school_physics|5": 1,
416
- "harness|hendrycksTest-high_school_psychology|5": 1,
417
- "harness|hendrycksTest-high_school_statistics|5": 1,
418
- "harness|hendrycksTest-high_school_us_history|5": 1,
419
- "harness|hendrycksTest-high_school_world_history|5": 1,
420
- "harness|hendrycksTest-human_aging|5": 1,
421
- "harness|hendrycksTest-human_sexuality|5": 1,
422
- "harness|hendrycksTest-international_law|5": 1,
423
- "harness|hendrycksTest-jurisprudence|5": 1,
424
- "harness|hendrycksTest-logical_fallacies|5": 1,
425
- "harness|hendrycksTest-machine_learning|5": 1,
426
- "harness|hendrycksTest-management|5": 1,
427
- "harness|hendrycksTest-marketing|5": 1,
428
- "harness|hendrycksTest-medical_genetics|5": 1,
429
- "harness|hendrycksTest-miscellaneous|5": 1,
430
- "harness|hendrycksTest-moral_disputes|5": 1,
431
- "harness|hendrycksTest-moral_scenarios|5": 1,
432
- "harness|hendrycksTest-nutrition|5": 1,
433
- "harness|hendrycksTest-philosophy|5": 1,
434
- "harness|hendrycksTest-prehistory|5": 1,
435
- "harness|hendrycksTest-professional_accounting|5": 1,
436
- "harness|hendrycksTest-professional_law|5": 1,
437
- "harness|hendrycksTest-professional_medicine|5": 1,
438
- "harness|hendrycksTest-professional_psychology|5": 1,
439
- "harness|hendrycksTest-public_relations|5": 1,
440
- "harness|hendrycksTest-security_studies|5": 1,
441
- "harness|hendrycksTest-sociology|5": 1,
442
- "harness|hendrycksTest-us_foreign_policy|5": 1,
443
- "harness|hendrycksTest-virology|5": 1,
444
- "harness|hendrycksTest-world_religions|5": 1,
445
- "harness|truthfulqa:mc|0": 1,
446
- "all": 0
447
- },
448
- "config_tasks": {
449
- "harness|arc:challenge": "LM Harness task",
450
- "harness|hellaswag": "LM Harness task",
451
- "harness|hendrycksTest-abstract_algebra": "LM Harness task",
452
- "harness|hendrycksTest-anatomy": "LM Harness task",
453
- "harness|hendrycksTest-astronomy": "LM Harness task",
454
- "harness|hendrycksTest-business_ethics": "LM Harness task",
455
- "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
456
- "harness|hendrycksTest-college_biology": "LM Harness task",
457
- "harness|hendrycksTest-college_chemistry": "LM Harness task",
458
- "harness|hendrycksTest-college_computer_science": "LM Harness task",
459
- "harness|hendrycksTest-college_mathematics": "LM Harness task",
460
- "harness|hendrycksTest-college_medicine": "LM Harness task",
461
- "harness|hendrycksTest-college_physics": "LM Harness task",
462
- "harness|hendrycksTest-computer_security": "LM Harness task",
463
- "harness|hendrycksTest-conceptual_physics": "LM Harness task",
464
- "harness|hendrycksTest-econometrics": "LM Harness task",
465
- "harness|hendrycksTest-electrical_engineering": "LM Harness task",
466
- "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
467
- "harness|hendrycksTest-formal_logic": "LM Harness task",
468
- "harness|hendrycksTest-global_facts": "LM Harness task",
469
- "harness|hendrycksTest-high_school_biology": "LM Harness task",
470
- "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
471
- "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
472
- "harness|hendrycksTest-high_school_european_history": "LM Harness task",
473
- "harness|hendrycksTest-high_school_geography": "LM Harness task",
474
- "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
475
- "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
476
- "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
477
- "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
478
- "harness|hendrycksTest-high_school_physics": "LM Harness task",
479
- "harness|hendrycksTest-high_school_psychology": "LM Harness task",
480
- "harness|hendrycksTest-high_school_statistics": "LM Harness task",
481
- "harness|hendrycksTest-high_school_us_history": "LM Harness task",
482
- "harness|hendrycksTest-high_school_world_history": "LM Harness task",
483
- "harness|hendrycksTest-human_aging": "LM Harness task",
484
- "harness|hendrycksTest-human_sexuality": "LM Harness task",
485
- "harness|hendrycksTest-international_law": "LM Harness task",
486
- "harness|hendrycksTest-jurisprudence": "LM Harness task",
487
- "harness|hendrycksTest-logical_fallacies": "LM Harness task",
488
- "harness|hendrycksTest-machine_learning": "LM Harness task",
489
- "harness|hendrycksTest-management": "LM Harness task",
490
- "harness|hendrycksTest-marketing": "LM Harness task",
491
- "harness|hendrycksTest-medical_genetics": "LM Harness task",
492
- "harness|hendrycksTest-miscellaneous": "LM Harness task",
493
- "harness|hendrycksTest-moral_disputes": "LM Harness task",
494
- "harness|hendrycksTest-moral_scenarios": "LM Harness task",
495
- "harness|hendrycksTest-nutrition": "LM Harness task",
496
- "harness|hendrycksTest-philosophy": "LM Harness task",
497
- "harness|hendrycksTest-prehistory": "LM Harness task",
498
- "harness|hendrycksTest-professional_accounting": "LM Harness task",
499
- "harness|hendrycksTest-professional_law": "LM Harness task",
500
- "harness|hendrycksTest-professional_medicine": "LM Harness task",
501
- "harness|hendrycksTest-professional_psychology": "LM Harness task",
502
- "harness|hendrycksTest-public_relations": "LM Harness task",
503
- "harness|hendrycksTest-security_studies": "LM Harness task",
504
- "harness|hendrycksTest-sociology": "LM Harness task",
505
- "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
506
- "harness|hendrycksTest-virology": "LM Harness task",
507
- "harness|hendrycksTest-world_religions": "LM Harness task",
508
- "harness|truthfulqa:mc": "LM Harness task"
509
- },
510
- "summary_tasks": {
511
- "harness|arc:challenge|25": {
512
- "hashes": {
513
- "hash_examples": "17b0cae357c0259e",
514
- "hash_full_prompts": "045cbb916e5145c6",
515
- "hash_input_tokens": "3722289b79076c44",
516
- "hash_cont_tokens": "8210decc6ff6f7df"
517
- },
518
- "truncated": 0,
519
- "non-truncated": 4687,
520
- "padded": 4687,
521
- "non-padded": 0,
522
- "effective_few_shots": 25.0,
523
- "num_truncated_few_shots": 0
524
- },
525
- "harness|hellaswag|10": {
526
- "hashes": {
527
- "hash_examples": "e1768ecb99d7ecf0",
528
- "hash_full_prompts": "0b4c16983130f84f",
529
- "hash_input_tokens": "ececd684171f1ef2",
530
- "hash_cont_tokens": "b3b9e9017afa63af"
531
- },
532
- "truncated": 0,
533
- "non-truncated": 40168,
534
- "padded": 40113,
535
- "non-padded": 55,
536
- "effective_few_shots": 10.0,
537
- "num_truncated_few_shots": 0
538
- },
539
- "harness|hendrycksTest-abstract_algebra|5": {
540
- "hashes": {
541
- "hash_examples": "280f9f325b40559a",
542
- "hash_full_prompts": "2f776a367d23aea2",
543
- "hash_input_tokens": "c54ff61ad0273dd7",
544
- "hash_cont_tokens": "50421e30bef398f9"
545
- },
546
- "truncated": 0,
547
- "non-truncated": 400,
548
- "padded": 400,
549
- "non-padded": 0,
550
- "effective_few_shots": 5.0,
551
- "num_truncated_few_shots": 0
552
- },
553
- "harness|hendrycksTest-anatomy|5": {
554
- "hashes": {
555
- "hash_examples": "2f83a4f1cab4ba18",
556
- "hash_full_prompts": "516f74bef25df620",
557
- "hash_input_tokens": "be31a1e22aef5f90",
558
- "hash_cont_tokens": "f11971a765cb609f"
559
- },
560
- "truncated": 0,
561
- "non-truncated": 540,
562
- "padded": 540,
563
- "non-padded": 0,
564
- "effective_few_shots": 5.0,
565
- "num_truncated_few_shots": 0
566
- },
567
- "harness|hendrycksTest-astronomy|5": {
568
- "hashes": {
569
- "hash_examples": "7d587b908da4d762",
570
- "hash_full_prompts": "faf4e80f65de93ca",
571
- "hash_input_tokens": "277a7b1fad566940",
572
- "hash_cont_tokens": "bf30e5d3f48250cb"
573
- },
574
- "truncated": 0,
575
- "non-truncated": 608,
576
- "padded": 608,
577
- "non-padded": 0,
578
- "effective_few_shots": 5.0,
579
- "num_truncated_few_shots": 0
580
- },
581
- "harness|hendrycksTest-business_ethics|5": {
582
- "hashes": {
583
- "hash_examples": "33e51740670de686",
584
- "hash_full_prompts": "db01c3ef8e1479d4",
585
- "hash_input_tokens": "ba552605bc116de5",
586
- "hash_cont_tokens": "bc1dd9b2d995eb61"
587
- },
588
- "truncated": 0,
589
- "non-truncated": 400,
590
- "padded": 400,
591
- "non-padded": 0,
592
- "effective_few_shots": 5.0,
593
- "num_truncated_few_shots": 0
594
- },
595
- "harness|hendrycksTest-clinical_knowledge|5": {
596
- "hashes": {
597
- "hash_examples": "f3366dbe7eefffa4",
598
- "hash_full_prompts": "49654f71d94b65c3",
599
- "hash_input_tokens": "428c7563d0b98ab9",
600
- "hash_cont_tokens": "890a119624b3b935"
601
- },
602
- "truncated": 0,
603
- "non-truncated": 1060,
604
- "padded": 1060,
605
- "non-padded": 0,
606
- "effective_few_shots": 5.0,
607
- "num_truncated_few_shots": 0
608
- },
609
- "harness|hendrycksTest-college_biology|5": {
610
- "hashes": {
611
- "hash_examples": "ca2b6753a0193e7f",
612
- "hash_full_prompts": "2b460b75f1fdfefd",
613
- "hash_input_tokens": "da036601573942e2",
614
- "hash_cont_tokens": "875cde3af7a0ee14"
615
- },
616
- "truncated": 0,
617
- "non-truncated": 576,
618
- "padded": 576,
619
- "non-padded": 0,
620
- "effective_few_shots": 5.0,
621
- "num_truncated_few_shots": 0
622
- },
623
- "harness|hendrycksTest-college_chemistry|5": {
624
- "hashes": {
625
- "hash_examples": "22ff85f1d34f42d1",
626
- "hash_full_prompts": "242c9be6da583e95",
627
- "hash_input_tokens": "94e0196d6aded13d",
628
- "hash_cont_tokens": "50421e30bef398f9"
629
- },
630
- "truncated": 0,
631
- "non-truncated": 400,
632
- "padded": 400,
633
- "non-padded": 0,
634
- "effective_few_shots": 5.0,
635
- "num_truncated_few_shots": 0
636
- },
637
- "harness|hendrycksTest-college_computer_science|5": {
638
- "hashes": {
639
- "hash_examples": "30318289d717a5cf",
640
- "hash_full_prompts": "ed2bdb4e87c4b371",
641
- "hash_input_tokens": "6e4d0f4a8d36690b",
642
- "hash_cont_tokens": "ffc0fe414cdc4a83"
643
- },
644
- "truncated": 0,
645
- "non-truncated": 400,
646
- "padded": 400,
647
- "non-padded": 0,
648
- "effective_few_shots": 5.0,
649
- "num_truncated_few_shots": 0
650
- },
651
- "harness|hendrycksTest-college_mathematics|5": {
652
- "hashes": {
653
- "hash_examples": "4944d1f0b6b5d911",
654
- "hash_full_prompts": "770bc4281c973190",
655
- "hash_input_tokens": "614054d17109a25d",
656
- "hash_cont_tokens": "50421e30bef398f9"
657
- },
658
- "truncated": 0,
659
- "non-truncated": 400,
660
- "padded": 400,
661
- "non-padded": 0,
662
- "effective_few_shots": 5.0,
663
- "num_truncated_few_shots": 0
664
- },
665
- "harness|hendrycksTest-college_medicine|5": {
666
- "hashes": {
667
- "hash_examples": "dd69cc33381275af",
668
- "hash_full_prompts": "ad2a53e5250ab46e",
669
- "hash_input_tokens": "081bb2b524defd1c",
670
- "hash_cont_tokens": "1f88b00d41957d82"
671
- },
672
- "truncated": 0,
673
- "non-truncated": 692,
674
- "padded": 692,
675
- "non-padded": 0,
676
- "effective_few_shots": 5.0,
677
- "num_truncated_few_shots": 0
678
- },
679
- "harness|hendrycksTest-college_physics|5": {
680
- "hashes": {
681
- "hash_examples": "875dd26d22655b0d",
682
- "hash_full_prompts": "833a0d7b55aed500",
683
- "hash_input_tokens": "5421d9a1af86cbd4",
684
- "hash_cont_tokens": "f7b8097afc16a47c"
685
- },
686
- "truncated": 0,
687
- "non-truncated": 408,
688
- "padded": 408,
689
- "non-padded": 0,
690
- "effective_few_shots": 5.0,
691
- "num_truncated_few_shots": 0
692
- },
693
- "harness|hendrycksTest-computer_security|5": {
694
- "hashes": {
695
- "hash_examples": "006451eedc0ededb",
696
- "hash_full_prompts": "94034c97e85d8f46",
697
- "hash_input_tokens": "5e6b70ecb333cf18",
698
- "hash_cont_tokens": "50421e30bef398f9"
699
- },
700
- "truncated": 0,
701
- "non-truncated": 400,
702
- "padded": 400,
703
- "non-padded": 0,
704
- "effective_few_shots": 5.0,
705
- "num_truncated_few_shots": 0
706
- },
707
- "harness|hendrycksTest-conceptual_physics|5": {
708
- "hashes": {
709
- "hash_examples": "8874ece872d2ca4c",
710
- "hash_full_prompts": "e40d15a34640d6fa",
711
- "hash_input_tokens": "c2ef11a87264ceed",
712
- "hash_cont_tokens": "aa0e8bc655f2f641"
713
- },
714
- "truncated": 0,
715
- "non-truncated": 940,
716
- "padded": 940,
717
- "non-padded": 0,
718
- "effective_few_shots": 5.0,
719
- "num_truncated_few_shots": 0
720
- },
721
- "harness|hendrycksTest-econometrics|5": {
722
- "hashes": {
723
- "hash_examples": "64d3623b0bfaa43f",
724
- "hash_full_prompts": "612f340fae41338d",
725
- "hash_input_tokens": "ecaccd912a4c3978",
726
- "hash_cont_tokens": "bfb7e3c3c88313f1"
727
- },
728
- "truncated": 0,
729
- "non-truncated": 456,
730
- "padded": 456,
731
- "non-padded": 0,
732
- "effective_few_shots": 5.0,
733
- "num_truncated_few_shots": 0
734
- },
735
- "harness|hendrycksTest-electrical_engineering|5": {
736
- "hashes": {
737
- "hash_examples": "e98f51780c674d7e",
738
- "hash_full_prompts": "10275b312d812ae6",
739
- "hash_input_tokens": "1590c84291399be8",
740
- "hash_cont_tokens": "2425a3f084a591ef"
741
- },
742
- "truncated": 0,
743
- "non-truncated": 580,
744
- "padded": 580,
745
- "non-padded": 0,
746
- "effective_few_shots": 5.0,
747
- "num_truncated_few_shots": 0
748
- },
749
- "harness|hendrycksTest-elementary_mathematics|5": {
750
- "hashes": {
751
- "hash_examples": "fc48208a5ac1c0ce",
752
- "hash_full_prompts": "5ec274c6c82aca23",
753
- "hash_input_tokens": "3269597f715b0da1",
754
- "hash_cont_tokens": "f52691aef15a407b"
755
- },
756
- "truncated": 0,
757
- "non-truncated": 1512,
758
- "padded": 1512,
759
- "non-padded": 0,
760
- "effective_few_shots": 5.0,
761
- "num_truncated_few_shots": 0
762
- },
763
- "harness|hendrycksTest-formal_logic|5": {
764
- "hashes": {
765
- "hash_examples": "5a6525665f63ea72",
766
- "hash_full_prompts": "07b92638c4a6b500",
767
- "hash_input_tokens": "a2800d20f3ab8d7c",
768
- "hash_cont_tokens": "f515d598d9c21263"
769
- },
770
- "truncated": 0,
771
- "non-truncated": 504,
772
- "padded": 504,
773
- "non-padded": 0,
774
- "effective_few_shots": 5.0,
775
- "num_truncated_few_shots": 0
776
- },
777
- "harness|hendrycksTest-global_facts|5": {
778
- "hashes": {
779
- "hash_examples": "371d70d743b2b89b",
780
- "hash_full_prompts": "332fdee50a1921b4",
781
- "hash_input_tokens": "94ed44b3772505ad",
782
- "hash_cont_tokens": "50421e30bef398f9"
783
- },
784
- "truncated": 0,
785
- "non-truncated": 400,
786
- "padded": 400,
787
- "non-padded": 0,
788
- "effective_few_shots": 5.0,
789
- "num_truncated_few_shots": 0
790
- },
791
- "harness|hendrycksTest-high_school_biology|5": {
792
- "hashes": {
793
- "hash_examples": "a79e1018b1674052",
794
- "hash_full_prompts": "e624e26ede922561",
795
- "hash_input_tokens": "24423acb928db768",
796
- "hash_cont_tokens": "bd85a4156a3613ee"
797
- },
798
- "truncated": 0,
799
- "non-truncated": 1240,
800
- "padded": 1240,
801
- "non-padded": 0,
802
- "effective_few_shots": 5.0,
803
- "num_truncated_few_shots": 0
804
- },
805
- "harness|hendrycksTest-high_school_chemistry|5": {
806
- "hashes": {
807
- "hash_examples": "44bfc25c389f0e03",
808
- "hash_full_prompts": "0e3e5f5d9246482a",
809
- "hash_input_tokens": "831ff35c474e5cef",
810
- "hash_cont_tokens": "a95c97af1c14e068"
811
- },
812
- "truncated": 0,
813
- "non-truncated": 812,
814
- "padded": 812,
815
- "non-padded": 0,
816
- "effective_few_shots": 5.0,
817
- "num_truncated_few_shots": 0
818
- },
819
- "harness|hendrycksTest-high_school_computer_science|5": {
820
- "hashes": {
821
- "hash_examples": "8b8cdb1084f24169",
822
- "hash_full_prompts": "c00487e67c1813cc",
823
- "hash_input_tokens": "a20a96b44dcc5b30",
824
- "hash_cont_tokens": "8abfedef914e33c9"
825
- },
826
- "truncated": 0,
827
- "non-truncated": 400,
828
- "padded": 400,
829
- "non-padded": 0,
830
- "effective_few_shots": 5.0,
831
- "num_truncated_few_shots": 0
832
- },
833
- "harness|hendrycksTest-high_school_european_history|5": {
834
- "hashes": {
835
- "hash_examples": "11cd32d0ef440171",
836
- "hash_full_prompts": "318f4513c537c6bf",
837
- "hash_input_tokens": "5002f4ac8b1562ca",
838
- "hash_cont_tokens": "674fc454bdc5ac93"
839
- },
840
- "truncated": 0,
841
- "non-truncated": 660,
842
- "padded": 656,
843
- "non-padded": 4,
844
- "effective_few_shots": 5.0,
845
- "num_truncated_few_shots": 0
846
- },
847
- "harness|hendrycksTest-high_school_geography|5": {
848
- "hashes": {
849
- "hash_examples": "b60019b9e80b642f",
850
- "hash_full_prompts": "ee5789fcc1a81b1e",
851
- "hash_input_tokens": "7c5547c7da5bc793",
852
- "hash_cont_tokens": "03a5012b916274ea"
853
- },
854
- "truncated": 0,
855
- "non-truncated": 792,
856
- "padded": 792,
857
- "non-padded": 0,
858
- "effective_few_shots": 5.0,
859
- "num_truncated_few_shots": 0
860
- },
861
- "harness|hendrycksTest-high_school_government_and_politics|5": {
862
- "hashes": {
863
- "hash_examples": "d221ec983d143dc3",
864
- "hash_full_prompts": "ac42d888e1ce1155",
865
- "hash_input_tokens": "f62991cb6a496b05",
866
- "hash_cont_tokens": "a83effb8f76b7d7c"
867
- },
868
- "truncated": 0,
869
- "non-truncated": 772,
870
- "padded": 772,
871
- "non-padded": 0,
872
- "effective_few_shots": 5.0,
873
- "num_truncated_few_shots": 0
874
- },
875
- "harness|hendrycksTest-high_school_macroeconomics|5": {
876
- "hashes": {
877
- "hash_examples": "59c2915cacfd3fbb",
878
- "hash_full_prompts": "c6bd9d25158abd0e",
879
- "hash_input_tokens": "4cef2aff6e3d59ed",
880
- "hash_cont_tokens": "c583432ad27fcfe0"
881
- },
882
- "truncated": 0,
883
- "non-truncated": 1560,
884
- "padded": 1560,
885
- "non-padded": 0,
886
- "effective_few_shots": 5.0,
887
- "num_truncated_few_shots": 0
888
- },
889
- "harness|hendrycksTest-high_school_mathematics|5": {
890
- "hashes": {
891
- "hash_examples": "1f8ac897608de342",
892
- "hash_full_prompts": "5d88f41fc2d643a8",
893
- "hash_input_tokens": "6e2577ea4082ed2b",
894
- "hash_cont_tokens": "24f5dc613660300b"
895
- },
896
- "truncated": 0,
897
- "non-truncated": 1080,
898
- "padded": 1080,
899
- "non-padded": 0,
900
- "effective_few_shots": 5.0,
901
- "num_truncated_few_shots": 0
902
- },
903
- "harness|hendrycksTest-high_school_microeconomics|5": {
904
- "hashes": {
905
- "hash_examples": "ead6a0f2f6c83370",
906
- "hash_full_prompts": "bfc393381298609e",
907
- "hash_input_tokens": "c5fc9aeb1079c8e4",
908
- "hash_cont_tokens": "f47f041de50333b9"
909
- },
910
- "truncated": 0,
911
- "non-truncated": 952,
912
- "padded": 952,
913
- "non-padded": 0,
914
- "effective_few_shots": 5.0,
915
- "num_truncated_few_shots": 0
916
- },
917
- "harness|hendrycksTest-high_school_physics|5": {
918
- "hashes": {
919
- "hash_examples": "c3f2025990afec64",
920
- "hash_full_prompts": "fc78b4997e436734",
921
- "hash_input_tokens": "555fc385cffa84ca",
922
- "hash_cont_tokens": "ba2efcd283e938cc"
923
- },
924
- "truncated": 0,
925
- "non-truncated": 604,
926
- "padded": 604,
927
- "non-padded": 0,
928
- "effective_few_shots": 5.0,
929
- "num_truncated_few_shots": 0
930
- },
931
- "harness|hendrycksTest-high_school_psychology|5": {
932
- "hashes": {
933
- "hash_examples": "21f8aab618f6d636",
934
- "hash_full_prompts": "d5c76aa40b9dbc43",
935
- "hash_input_tokens": "febd23cbf9973b7f",
936
- "hash_cont_tokens": "942069cd363844d9"
937
- },
938
- "truncated": 0,
939
- "non-truncated": 2180,
940
- "padded": 2180,
941
- "non-padded": 0,
942
- "effective_few_shots": 5.0,
943
- "num_truncated_few_shots": 0
944
- },
945
- "harness|hendrycksTest-high_school_statistics|5": {
946
- "hashes": {
947
- "hash_examples": "2386a60a11fc5de3",
948
- "hash_full_prompts": "4c5c8be5aafac432",
949
- "hash_input_tokens": "400e55b56ee6fbd7",
950
- "hash_cont_tokens": "955ed42b6f7fa019"
951
- },
952
- "truncated": 0,
953
- "non-truncated": 864,
954
- "padded": 864,
955
- "non-padded": 0,
956
- "effective_few_shots": 5.0,
957
- "num_truncated_few_shots": 0
958
- },
959
- "harness|hendrycksTest-high_school_us_history|5": {
960
- "hashes": {
961
- "hash_examples": "74961543be40f04f",
962
- "hash_full_prompts": "5d5ca4840131ba21",
963
- "hash_input_tokens": "c639cce12a46ebad",
964
- "hash_cont_tokens": "cdd0b3dc06d933e5"
965
- },
966
- "truncated": 0,
967
- "non-truncated": 816,
968
- "padded": 816,
969
- "non-padded": 0,
970
- "effective_few_shots": 5.0,
971
- "num_truncated_few_shots": 0
972
- },
973
- "harness|hendrycksTest-high_school_world_history|5": {
974
- "hashes": {
975
- "hash_examples": "2ad2f6b7198b2234",
976
- "hash_full_prompts": "11845057459afd72",
977
- "hash_input_tokens": "b9762065cce6f3a6",
978
- "hash_cont_tokens": "9a864184946033ac"
979
- },
980
- "truncated": 0,
981
- "non-truncated": 948,
982
- "padded": 948,
983
- "non-padded": 0,
984
- "effective_few_shots": 5.0,
985
- "num_truncated_few_shots": 0
986
- },
987
- "harness|hendrycksTest-human_aging|5": {
988
- "hashes": {
989
- "hash_examples": "1a7199dc733e779b",
990
- "hash_full_prompts": "756b9096b8eaf892",
991
- "hash_input_tokens": "541a75f071dcf579",
992
- "hash_cont_tokens": "142a4a8a1138a214"
993
- },
994
- "truncated": 0,
995
- "non-truncated": 892,
996
- "padded": 892,
997
- "non-padded": 0,
998
- "effective_few_shots": 5.0,
999
- "num_truncated_few_shots": 0
1000
- },
1001
- "harness|hendrycksTest-human_sexuality|5": {
1002
- "hashes": {
1003
- "hash_examples": "7acb8fdad97f88a6",
1004
- "hash_full_prompts": "731a52ff15b8cfdb",
1005
- "hash_input_tokens": "04269e5c5a257dd9",
1006
- "hash_cont_tokens": "bc54813e809b796d"
1007
- },
1008
- "truncated": 0,
1009
- "non-truncated": 524,
1010
- "padded": 524,
1011
- "non-padded": 0,
1012
- "effective_few_shots": 5.0,
1013
- "num_truncated_few_shots": 0
1014
- },
1015
- "harness|hendrycksTest-international_law|5": {
1016
- "hashes": {
1017
- "hash_examples": "1300bfd0dfc59114",
1018
- "hash_full_prompts": "db2aefbff5eec996",
1019
- "hash_input_tokens": "d93ba9d9d38e4397",
1020
- "hash_cont_tokens": "dc45b45fcda18e5d"
1021
- },
1022
- "truncated": 0,
1023
- "non-truncated": 484,
1024
- "padded": 484,
1025
- "non-padded": 0,
1026
- "effective_few_shots": 5.0,
1027
- "num_truncated_few_shots": 0
1028
- },
1029
- "harness|hendrycksTest-jurisprudence|5": {
1030
- "hashes": {
1031
- "hash_examples": "083b1e4904c48dc2",
1032
- "hash_full_prompts": "0f89ee3fe03d6a21",
1033
- "hash_input_tokens": "9eeaccd2698b4f5a",
1034
- "hash_cont_tokens": "e3a8cd951b6e3469"
1035
- },
1036
- "truncated": 0,
1037
- "non-truncated": 432,
1038
- "padded": 432,
1039
- "non-padded": 0,
1040
- "effective_few_shots": 5.0,
1041
- "num_truncated_few_shots": 0
1042
- },
1043
- "harness|hendrycksTest-logical_fallacies|5": {
1044
- "hashes": {
1045
- "hash_examples": "709128f9926a634c",
1046
- "hash_full_prompts": "98a04b1f8f841069",
1047
- "hash_input_tokens": "b4f08f544f2b7576",
1048
- "hash_cont_tokens": "1e80dbd30f6453d5"
1049
- },
1050
- "truncated": 0,
1051
- "non-truncated": 652,
1052
- "padded": 648,
1053
- "non-padded": 4,
1054
- "effective_few_shots": 5.0,
1055
- "num_truncated_few_shots": 0
1056
- },
1057
- "harness|hendrycksTest-machine_learning|5": {
1058
- "hashes": {
1059
- "hash_examples": "88f22a636029ae47",
1060
- "hash_full_prompts": "2e1c8d4b1e0cc921",
1061
- "hash_input_tokens": "900c2a51f1174b9f",
1062
- "hash_cont_tokens": "9b37da7777378ca9"
1063
- },
1064
- "truncated": 0,
1065
- "non-truncated": 448,
1066
- "padded": 448,
1067
- "non-padded": 0,
1068
- "effective_few_shots": 5.0,
1069
- "num_truncated_few_shots": 0
1070
- },
1071
- "harness|hendrycksTest-management|5": {
1072
- "hashes": {
1073
- "hash_examples": "8c8a1e07a2151dca",
1074
- "hash_full_prompts": "f51611f514b265b0",
1075
- "hash_input_tokens": "6b36efb4689c6eca",
1076
- "hash_cont_tokens": "a01d6d39a83c4597"
1077
- },
1078
- "truncated": 0,
1079
- "non-truncated": 412,
1080
- "padded": 412,
1081
- "non-padded": 0,
1082
- "effective_few_shots": 5.0,
1083
- "num_truncated_few_shots": 0
1084
- },
1085
- "harness|hendrycksTest-marketing|5": {
1086
- "hashes": {
1087
- "hash_examples": "2668953431f91e96",
1088
- "hash_full_prompts": "77562bef997c7650",
1089
- "hash_input_tokens": "2aaac78a0cfed47a",
1090
- "hash_cont_tokens": "6aeaed4d823c98aa"
1091
- },
1092
- "truncated": 0,
1093
- "non-truncated": 936,
1094
- "padded": 936,
1095
- "non-padded": 0,
1096
- "effective_few_shots": 5.0,
1097
- "num_truncated_few_shots": 0
1098
- },
1099
- "harness|hendrycksTest-medical_genetics|5": {
1100
- "hashes": {
1101
- "hash_examples": "9c2dda34a2ea4fd2",
1102
- "hash_full_prompts": "202139046daa118f",
1103
- "hash_input_tokens": "886ca823b41c094a",
1104
- "hash_cont_tokens": "50421e30bef398f9"
1105
- },
1106
- "truncated": 0,
1107
- "non-truncated": 400,
1108
- "padded": 400,
1109
- "non-padded": 0,
1110
- "effective_few_shots": 5.0,
1111
- "num_truncated_few_shots": 0
1112
- },
1113
- "harness|hendrycksTest-miscellaneous|5": {
1114
- "hashes": {
1115
- "hash_examples": "41adb694024809c2",
1116
- "hash_full_prompts": "bffec9fc237bcf93",
1117
- "hash_input_tokens": "72fd71de7675e7d0",
1118
- "hash_cont_tokens": "9b0ab02a64603081"
1119
- },
1120
- "truncated": 0,
1121
- "non-truncated": 3132,
1122
- "padded": 3132,
1123
- "non-padded": 0,
1124
- "effective_few_shots": 5.0,
1125
- "num_truncated_few_shots": 0
1126
- },
1127
- "harness|hendrycksTest-moral_disputes|5": {
1128
- "hashes": {
1129
- "hash_examples": "3171c13ba3c594c4",
1130
- "hash_full_prompts": "170831fc36f1d59e",
1131
- "hash_input_tokens": "f3ca0dd8e7a1eb09",
1132
- "hash_cont_tokens": "8badf768f7b0467a"
1133
- },
1134
- "truncated": 0,
1135
- "non-truncated": 1384,
1136
- "padded": 1354,
1137
- "non-padded": 30,
1138
- "effective_few_shots": 5.0,
1139
- "num_truncated_few_shots": 0
1140
- },
1141
- "harness|hendrycksTest-moral_scenarios|5": {
1142
- "hashes": {
1143
- "hash_examples": "9873e077e83e0546",
1144
- "hash_full_prompts": "08f4ceba3131a068",
1145
- "hash_input_tokens": "3e793631e951f23c",
1146
- "hash_cont_tokens": "32ae620376b2bbba"
1147
- },
1148
- "truncated": 0,
1149
- "non-truncated": 3580,
1150
- "padded": 3580,
1151
- "non-padded": 0,
1152
- "effective_few_shots": 5.0,
1153
- "num_truncated_few_shots": 0
1154
- },
1155
- "harness|hendrycksTest-nutrition|5": {
1156
- "hashes": {
1157
- "hash_examples": "7db1d8142ec14323",
1158
- "hash_full_prompts": "4c0e68e3586cb453",
1159
- "hash_input_tokens": "59753c2144ea93af",
1160
- "hash_cont_tokens": "3071def75bacc404"
1161
- },
1162
- "truncated": 0,
1163
- "non-truncated": 1224,
1164
- "padded": 1224,
1165
- "non-padded": 0,
1166
- "effective_few_shots": 5.0,
1167
- "num_truncated_few_shots": 0
1168
- },
1169
- "harness|hendrycksTest-philosophy|5": {
1170
- "hashes": {
1171
- "hash_examples": "9b455b7d72811cc8",
1172
- "hash_full_prompts": "e467f822d8a0d3ff",
1173
- "hash_input_tokens": "bd8d3dbed15a8c34",
1174
- "hash_cont_tokens": "9f6ff69d23a48783"
1175
- },
1176
- "truncated": 0,
1177
- "non-truncated": 1244,
1178
- "padded": 1244,
1179
- "non-padded": 0,
1180
- "effective_few_shots": 5.0,
1181
- "num_truncated_few_shots": 0
1182
- },
1183
- "harness|hendrycksTest-prehistory|5": {
1184
- "hashes": {
1185
- "hash_examples": "8be90d0f538f1560",
1186
- "hash_full_prompts": "152187949bcd0921",
1187
- "hash_input_tokens": "3573cd87facbb7c5",
1188
- "hash_cont_tokens": "de469d2b981e32a3"
1189
- },
1190
- "truncated": 0,
1191
- "non-truncated": 1296,
1192
- "padded": 1296,
1193
- "non-padded": 0,
1194
- "effective_few_shots": 5.0,
1195
- "num_truncated_few_shots": 0
1196
- },
1197
- "harness|hendrycksTest-professional_accounting|5": {
1198
- "hashes": {
1199
- "hash_examples": "8d377597916cd07e",
1200
- "hash_full_prompts": "0eb7345d6144ee0d",
1201
- "hash_input_tokens": "17e721bc1a7cbb47",
1202
- "hash_cont_tokens": "c46f74d2dfc7b13b"
1203
- },
1204
- "truncated": 0,
1205
- "non-truncated": 1128,
1206
- "padded": 1128,
1207
- "non-padded": 0,
1208
- "effective_few_shots": 5.0,
1209
- "num_truncated_few_shots": 0
1210
- },
1211
- "harness|hendrycksTest-professional_law|5": {
1212
- "hashes": {
1213
- "hash_examples": "cd9dbc52b3c932d6",
1214
- "hash_full_prompts": "36ac764272bfb182",
1215
- "hash_input_tokens": "c9f7583fff66d361",
1216
- "hash_cont_tokens": "2e590029ef41fbcd"
1217
- },
1218
- "truncated": 0,
1219
- "non-truncated": 6136,
1220
- "padded": 6136,
1221
- "non-padded": 0,
1222
- "effective_few_shots": 5.0,
1223
- "num_truncated_few_shots": 0
1224
- },
1225
- "harness|hendrycksTest-professional_medicine|5": {
1226
- "hashes": {
1227
- "hash_examples": "b20e4e816c1e383e",
1228
- "hash_full_prompts": "7b8d69ea2acaf2f7",
1229
- "hash_input_tokens": "40a933f829116f8d",
1230
- "hash_cont_tokens": "fe35cfa9c6ca802e"
1231
- },
1232
- "truncated": 0,
1233
- "non-truncated": 1088,
1234
- "padded": 1088,
1235
- "non-padded": 0,
1236
- "effective_few_shots": 5.0,
1237
- "num_truncated_few_shots": 0
1238
- },
1239
- "harness|hendrycksTest-professional_psychology|5": {
1240
- "hashes": {
1241
- "hash_examples": "d45b73b22f9cc039",
1242
- "hash_full_prompts": "fe8937e9ffc99771",
1243
- "hash_input_tokens": "0dfb73a8eb3f692c",
1244
- "hash_cont_tokens": "f020fbddf72c8652"
1245
- },
1246
- "truncated": 0,
1247
- "non-truncated": 2448,
1248
- "padded": 2448,
1249
- "non-padded": 0,
1250
- "effective_few_shots": 5.0,
1251
- "num_truncated_few_shots": 0
1252
- },
1253
- "harness|hendrycksTest-public_relations|5": {
1254
- "hashes": {
1255
- "hash_examples": "0d25072e1761652a",
1256
- "hash_full_prompts": "f9adc39cfa9f42ba",
1257
- "hash_input_tokens": "1710c6ba4c9f3cbd",
1258
- "hash_cont_tokens": "568f585a259965c1"
1259
- },
1260
- "truncated": 0,
1261
- "non-truncated": 440,
1262
- "padded": 440,
1263
- "non-padded": 0,
1264
- "effective_few_shots": 5.0,
1265
- "num_truncated_few_shots": 0
1266
- },
1267
- "harness|hendrycksTest-security_studies|5": {
1268
- "hashes": {
1269
- "hash_examples": "62bb8197e63d60d4",
1270
- "hash_full_prompts": "869c9c3ae196b7c3",
1271
- "hash_input_tokens": "32a03f1f22a6e103",
1272
- "hash_cont_tokens": "cc6fd7cccd64cd5d"
1273
- },
1274
- "truncated": 0,
1275
- "non-truncated": 980,
1276
- "padded": 980,
1277
- "non-padded": 0,
1278
- "effective_few_shots": 5.0,
1279
- "num_truncated_few_shots": 0
1280
- },
1281
- "harness|hendrycksTest-sociology|5": {
1282
- "hashes": {
1283
- "hash_examples": "e7959df87dea8672",
1284
- "hash_full_prompts": "1a1fc00e17b3a52a",
1285
- "hash_input_tokens": "828999f7624cbe7e",
1286
- "hash_cont_tokens": "c3a3bdfd177eed5b"
1287
- },
1288
- "truncated": 0,
1289
- "non-truncated": 804,
1290
- "padded": 804,
1291
- "non-padded": 0,
1292
- "effective_few_shots": 5.0,
1293
- "num_truncated_few_shots": 0
1294
- },
1295
- "harness|hendrycksTest-us_foreign_policy|5": {
1296
- "hashes": {
1297
- "hash_examples": "4a56a01ddca44dca",
1298
- "hash_full_prompts": "0c7a7081c71c07b6",
1299
- "hash_input_tokens": "42054621e718dbee",
1300
- "hash_cont_tokens": "2568d0e8e36fa959"
1301
- },
1302
- "truncated": 0,
1303
- "non-truncated": 400,
1304
- "padded": 400,
1305
- "non-padded": 0,
1306
- "effective_few_shots": 5.0,
1307
- "num_truncated_few_shots": 0
1308
- },
1309
- "harness|hendrycksTest-virology|5": {
1310
- "hashes": {
1311
- "hash_examples": "451cc86a8c4f4fe9",
1312
- "hash_full_prompts": "01e95325d8b738e4",
1313
- "hash_input_tokens": "6c4f0aa4dc859c04",
1314
- "hash_cont_tokens": "926cf60b0891f374"
1315
- },
1316
- "truncated": 0,
1317
- "non-truncated": 664,
1318
- "padded": 664,
1319
- "non-padded": 0,
1320
- "effective_few_shots": 5.0,
1321
- "num_truncated_few_shots": 0
1322
- },
1323
- "harness|hendrycksTest-world_religions|5": {
1324
- "hashes": {
1325
- "hash_examples": "3b29cfaf1a81c379",
1326
- "hash_full_prompts": "e0d79a15083dfdff",
1327
- "hash_input_tokens": "6c75d44e092ff24f",
1328
- "hash_cont_tokens": "c525a5de974c1ea3"
1329
- },
1330
- "truncated": 0,
1331
- "non-truncated": 684,
1332
- "padded": 684,
1333
- "non-padded": 0,
1334
- "effective_few_shots": 5.0,
1335
- "num_truncated_few_shots": 0
1336
- },
1337
- "harness|truthfulqa:mc|0": {
1338
- "hashes": {
1339
- "hash_examples": "23176c0531c7b867",
1340
- "hash_full_prompts": "36a6d90e75d92d4a",
1341
- "hash_input_tokens": "2738d7ed7075faa7",
1342
- "hash_cont_tokens": "c014154380b74b9e"
1343
- },
1344
- "truncated": 0,
1345
- "non-truncated": 9996,
1346
- "padded": 9996,
1347
- "non-padded": 0,
1348
- "effective_few_shots": 0.0,
1349
- "num_truncated_few_shots": 0
1350
- }
1351
- },
1352
- "summary_general": {
1353
- "hashes": {
1354
- "hash_examples": "d84d18e9a963753d",
1355
- "hash_full_prompts": "12b540783521a8e6",
1356
- "hash_input_tokens": "5c73a7dce6ccf737",
1357
- "hash_cont_tokens": "fb1646e2bdd5fc38"
1358
- },
1359
- "total_evaluation_time_secondes": "36167.03701233864",
1360
- "truncated": 0,
1361
- "non-truncated": 111019,
1362
- "padded": 110926,
1363
- "non-padded": 93,
1364
- "num_truncated_few_shots": 0
1365
- }
1366
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-13b-hf/results_2023-09-07T13-43-41.802129.json DELETED
@@ -1,61 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-13b-hf",
4
- "model_sha": "db6b8eb1feabb38985fdf785a89895959e944936",
5
- "model_size": "24.32 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|winogrande|5": {
16
- "acc": 0.7663772691397001,
17
- "acc_stderr": 0.011892194477183524
18
- },
19
- "all": {
20
- "acc": 0.7663772691397001,
21
- "acc_stderr": 0.011892194477183524
22
- }
23
- },
24
- "versions": {
25
- "harness|winogrande|5": 0,
26
- "all": 0
27
- },
28
- "config_tasks": {
29
- "harness|winogrande": "LM Harness task"
30
- },
31
- "summary_tasks": {
32
- "harness|winogrande|5": {
33
- "hashes": {
34
- "hash_examples": "aada0a176fd81218",
35
- "hash_full_prompts": "c8655cbd12de8409",
36
- "hash_input_tokens": "c0bedf98cb040854",
37
- "hash_cont_tokens": "f08975ad6f2d5864"
38
- },
39
- "truncated": 0,
40
- "non-truncated": 2534,
41
- "padded": 2432,
42
- "non-padded": 102,
43
- "effective_few_shots": 5.0,
44
- "num_truncated_few_shots": 0
45
- }
46
- },
47
- "summary_general": {
48
- "hashes": {
49
- "hash_examples": "42f54c7ae3f28ef3",
50
- "hash_full_prompts": "897c968b27a8c59a",
51
- "hash_input_tokens": "ee5c3cb253d643d1",
52
- "hash_cont_tokens": "273a70958f734c00"
53
- },
54
- "total_evaluation_time_secondes": "172.65713024139404",
55
- "truncated": 0,
56
- "non-truncated": 2534,
57
- "padded": 2432,
58
- "non-padded": 102,
59
- "num_truncated_few_shots": 0
60
- }
61
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-13b-hf/results_2023-09-07T15-27-15.010124.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-13b-hf",
4
- "model_sha": "db6b8eb1feabb38985fdf785a89895959e944936",
5
- "model_size": "24.32 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.0014681208053691276,
17
- "em_stderr": 0.00039210421902982666,
18
- "f1": 0.0607822986577181,
19
- "f1_stderr": 0.0013583957676382913
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.10841546626231995,
23
- "acc_stderr": 0.008563852506627487
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.7663772691397001,
27
- "acc_stderr": 0.011892194477183524
28
- },
29
- "all": {
30
- "em": 0.0014681208053691276,
31
- "em_stderr": 0.00039210421902982666,
32
- "f1": 0.0607822986577181,
33
- "f1_stderr": 0.0013583957676382913,
34
- "acc": 0.43739636770101,
35
- "acc_stderr": 0.010228023491905505
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "c9346ec21b7560de"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "32cafa77d8a3f04e"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "4d8f1e04b1d56e40"
99
- },
100
- "total_evaluation_time_secondes": "6066.877633810043",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-13b-hf/results_2023-09-08T14-32-14.957248.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-13b-hf",
4
- "model_sha": "db6b8eb1feabb38985fdf785a89895959e944936",
5
- "model_size": "24.32 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.0014681208053691276,
17
- "em_stderr": 0.00039210421902982666,
18
- "f1": 0.0607822986577181,
19
- "f1_stderr": 0.0013583957676382913
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.10841546626231995,
23
- "acc_stderr": 0.008563852506627487
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.7663772691397001,
27
- "acc_stderr": 0.011892194477183524
28
- },
29
- "all": {
30
- "em": 0.0014681208053691276,
31
- "em_stderr": 0.00039210421902982666,
32
- "f1": 0.0607822986577181,
33
- "f1_stderr": 0.0013583957676382913,
34
- "acc": 0.43739636770101,
35
- "acc_stderr": 0.010228023491905505
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "c9346ec21b7560de"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "32cafa77d8a3f04e"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "4d8f1e04b1d56e40"
99
- },
100
- "total_evaluation_time_secondes": "6159.0038006305695",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-13b-hf/results_2023-10-14T23-00-26.644553.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-13b-hf",
4
- "model_sha": "99afe33d7eaa87c7fc6ea2594a0e4e7e588ee0a4",
5
- "model_size": "24.32 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.0014681208053691276,
17
- "em_stderr": 0.00039210421902982666,
18
- "f1": 0.0607822986577181,
19
- "f1_stderr": 0.0013583957676382913
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.10841546626231995,
23
- "acc_stderr": 0.008563852506627487
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.7663772691397001,
27
- "acc_stderr": 0.011892194477183524
28
- },
29
- "all": {
30
- "em": 0.0014681208053691276,
31
- "em_stderr": 0.00039210421902982666,
32
- "f1": 0.0607822986577181,
33
- "f1_stderr": 0.0013583957676382913,
34
- "acc": 0.43739636770101,
35
- "acc_stderr": 0.010228023491905505
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "c9346ec21b7560de"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "32cafa77d8a3f04e"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "4d8f1e04b1d56e40"
99
- },
100
- "total_evaluation_time_secondes": "11938.282367944717",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-70b-chat-hf/results.json DELETED
@@ -1,868 +0,0 @@
1
- {
2
- "results": {
3
- "harness|arc:challenge|25": {
4
- "acc": 0.6049488054607508,
5
- "acc_stderr": 0.01428589829293817,
6
- "acc_norm": 0.6459044368600683,
7
- "acc_norm_stderr": 0.013975454122756564
8
- },
9
- "harness|hellaswag|10": {
10
- "acc": 0.6693885680143398,
11
- "acc_stderr": 0.004694718918225751,
12
- "acc_norm": 0.8587930691097391,
13
- "acc_norm_stderr": 0.003475231889452833
14
- },
15
- "harness|truthfulqa:mc|0": {
16
- "mc1": 0.3561811505507956,
17
- "mc1_stderr": 0.016763790728446335,
18
- "mc2": 0.5280473232260097,
19
- "mc2_stderr": 0.01553022126123046
20
- },
21
- "harness|hendrycksTest-abstract_algebra|5": {
22
- "acc": 0.35,
23
- "acc_stderr": 0.04793724854411021,
24
- "acc_norm": 0.35,
25
- "acc_norm_stderr": 0.04793724854411021
26
- },
27
- "harness|hendrycksTest-anatomy|5": {
28
- "acc": 0.5185185185185185,
29
- "acc_stderr": 0.043163785995113245,
30
- "acc_norm": 0.5185185185185185,
31
- "acc_norm_stderr": 0.043163785995113245
32
- },
33
- "harness|hendrycksTest-astronomy|5": {
34
- "acc": 0.7302631578947368,
35
- "acc_stderr": 0.03611780560284898,
36
- "acc_norm": 0.7302631578947368,
37
- "acc_norm_stderr": 0.03611780560284898
38
- },
39
- "harness|hendrycksTest-business_ethics|5": {
40
- "acc": 0.65,
41
- "acc_stderr": 0.0479372485441102,
42
- "acc_norm": 0.65,
43
- "acc_norm_stderr": 0.0479372485441102
44
- },
45
- "harness|hendrycksTest-clinical_knowledge|5": {
46
- "acc": 0.6377358490566037,
47
- "acc_stderr": 0.029582245128384303,
48
- "acc_norm": 0.6377358490566037,
49
- "acc_norm_stderr": 0.029582245128384303
50
- },
51
- "harness|hendrycksTest-college_biology|5": {
52
- "acc": 0.75,
53
- "acc_stderr": 0.03621034121889507,
54
- "acc_norm": 0.75,
55
- "acc_norm_stderr": 0.03621034121889507
56
- },
57
- "harness|hendrycksTest-college_chemistry|5": {
58
- "acc": 0.48,
59
- "acc_stderr": 0.050211673156867795,
60
- "acc_norm": 0.48,
61
- "acc_norm_stderr": 0.050211673156867795
62
- },
63
- "harness|hendrycksTest-college_computer_science|5": {
64
- "acc": 0.59,
65
- "acc_stderr": 0.04943110704237101,
66
- "acc_norm": 0.59,
67
- "acc_norm_stderr": 0.04943110704237101
68
- },
69
- "harness|hendrycksTest-college_mathematics|5": {
70
- "acc": 0.34,
71
- "acc_stderr": 0.04760952285695235,
72
- "acc_norm": 0.34,
73
- "acc_norm_stderr": 0.04760952285695235
74
- },
75
- "harness|hendrycksTest-college_medicine|5": {
76
- "acc": 0.6011560693641619,
77
- "acc_stderr": 0.0373362665538351,
78
- "acc_norm": 0.6011560693641619,
79
- "acc_norm_stderr": 0.0373362665538351
80
- },
81
- "harness|hendrycksTest-college_physics|5": {
82
- "acc": 0.3333333333333333,
83
- "acc_stderr": 0.04690650298201943,
84
- "acc_norm": 0.3333333333333333,
85
- "acc_norm_stderr": 0.04690650298201943
86
- },
87
- "harness|hendrycksTest-computer_security|5": {
88
- "acc": 0.71,
89
- "acc_stderr": 0.045604802157206845,
90
- "acc_norm": 0.71,
91
- "acc_norm_stderr": 0.045604802157206845
92
- },
93
- "harness|hendrycksTest-conceptual_physics|5": {
94
- "acc": 0.5829787234042553,
95
- "acc_stderr": 0.032232762667117124,
96
- "acc_norm": 0.5829787234042553,
97
- "acc_norm_stderr": 0.032232762667117124
98
- },
99
- "harness|hendrycksTest-econometrics|5": {
100
- "acc": 0.41228070175438597,
101
- "acc_stderr": 0.04630653203366595,
102
- "acc_norm": 0.41228070175438597,
103
- "acc_norm_stderr": 0.04630653203366595
104
- },
105
- "harness|hendrycksTest-electrical_engineering|5": {
106
- "acc": 0.5793103448275863,
107
- "acc_stderr": 0.0411391498118926,
108
- "acc_norm": 0.5793103448275863,
109
- "acc_norm_stderr": 0.0411391498118926
110
- },
111
- "harness|hendrycksTest-elementary_mathematics|5": {
112
- "acc": 0.41005291005291006,
113
- "acc_stderr": 0.02533120243894442,
114
- "acc_norm": 0.41005291005291006,
115
- "acc_norm_stderr": 0.02533120243894442
116
- },
117
- "harness|hendrycksTest-formal_logic|5": {
118
- "acc": 0.4126984126984127,
119
- "acc_stderr": 0.04403438954768176,
120
- "acc_norm": 0.4126984126984127,
121
- "acc_norm_stderr": 0.04403438954768176
122
- },
123
- "harness|hendrycksTest-global_facts|5": {
124
- "acc": 0.43,
125
- "acc_stderr": 0.049756985195624284,
126
- "acc_norm": 0.43,
127
- "acc_norm_stderr": 0.049756985195624284
128
- },
129
- "harness|hendrycksTest-high_school_biology|5": {
130
- "acc": 0.7645161290322581,
131
- "acc_stderr": 0.02413763242933771,
132
- "acc_norm": 0.7645161290322581,
133
- "acc_norm_stderr": 0.02413763242933771
134
- },
135
- "harness|hendrycksTest-high_school_chemistry|5": {
136
- "acc": 0.4630541871921182,
137
- "acc_stderr": 0.035083705204426656,
138
- "acc_norm": 0.4630541871921182,
139
- "acc_norm_stderr": 0.035083705204426656
140
- },
141
- "harness|hendrycksTest-high_school_computer_science|5": {
142
- "acc": 0.65,
143
- "acc_stderr": 0.047937248544110196,
144
- "acc_norm": 0.65,
145
- "acc_norm_stderr": 0.047937248544110196
146
- },
147
- "harness|hendrycksTest-high_school_european_history|5": {
148
- "acc": 0.8181818181818182,
149
- "acc_stderr": 0.03011768892950359,
150
- "acc_norm": 0.8181818181818182,
151
- "acc_norm_stderr": 0.03011768892950359
152
- },
153
- "harness|hendrycksTest-high_school_geography|5": {
154
- "acc": 0.8080808080808081,
155
- "acc_stderr": 0.02805779167298902,
156
- "acc_norm": 0.8080808080808081,
157
- "acc_norm_stderr": 0.02805779167298902
158
- },
159
- "harness|hendrycksTest-high_school_government_and_politics|5": {
160
- "acc": 0.8911917098445595,
161
- "acc_stderr": 0.022473253332768783,
162
- "acc_norm": 0.8911917098445595,
163
- "acc_norm_stderr": 0.022473253332768783
164
- },
165
- "harness|hendrycksTest-high_school_macroeconomics|5": {
166
- "acc": 0.6410256410256411,
167
- "acc_stderr": 0.02432173848460235,
168
- "acc_norm": 0.6410256410256411,
169
- "acc_norm_stderr": 0.02432173848460235
170
- },
171
- "harness|hendrycksTest-high_school_mathematics|5": {
172
- "acc": 0.3,
173
- "acc_stderr": 0.027940457136228416,
174
- "acc_norm": 0.3,
175
- "acc_norm_stderr": 0.027940457136228416
176
- },
177
- "harness|hendrycksTest-high_school_microeconomics|5": {
178
- "acc": 0.6596638655462185,
179
- "acc_stderr": 0.030778057422931673,
180
- "acc_norm": 0.6596638655462185,
181
- "acc_norm_stderr": 0.030778057422931673
182
- },
183
- "harness|hendrycksTest-high_school_physics|5": {
184
- "acc": 0.423841059602649,
185
- "acc_stderr": 0.04034846678603397,
186
- "acc_norm": 0.423841059602649,
187
- "acc_norm_stderr": 0.04034846678603397
188
- },
189
- "harness|hendrycksTest-high_school_psychology|5": {
190
- "acc": 0.8385321100917431,
191
- "acc_stderr": 0.015776239256163255,
192
- "acc_norm": 0.8385321100917431,
193
- "acc_norm_stderr": 0.015776239256163255
194
- },
195
- "harness|hendrycksTest-high_school_statistics|5": {
196
- "acc": 0.48148148148148145,
197
- "acc_stderr": 0.03407632093854052,
198
- "acc_norm": 0.48148148148148145,
199
- "acc_norm_stderr": 0.03407632093854052
200
- },
201
- "harness|hendrycksTest-high_school_us_history|5": {
202
- "acc": 0.8578431372549019,
203
- "acc_stderr": 0.024509803921568606,
204
- "acc_norm": 0.8578431372549019,
205
- "acc_norm_stderr": 0.024509803921568606
206
- },
207
- "harness|hendrycksTest-high_school_world_history|5": {
208
- "acc": 0.8438818565400844,
209
- "acc_stderr": 0.02362715946031867,
210
- "acc_norm": 0.8438818565400844,
211
- "acc_norm_stderr": 0.02362715946031867
212
- },
213
- "harness|hendrycksTest-human_aging|5": {
214
- "acc": 0.726457399103139,
215
- "acc_stderr": 0.02991858670779883,
216
- "acc_norm": 0.726457399103139,
217
- "acc_norm_stderr": 0.02991858670779883
218
- },
219
- "harness|hendrycksTest-human_sexuality|5": {
220
- "acc": 0.7099236641221374,
221
- "acc_stderr": 0.039800662464677665,
222
- "acc_norm": 0.7099236641221374,
223
- "acc_norm_stderr": 0.039800662464677665
224
- },
225
- "harness|hendrycksTest-international_law|5": {
226
- "acc": 0.8016528925619835,
227
- "acc_stderr": 0.03640118271990946,
228
- "acc_norm": 0.8016528925619835,
229
- "acc_norm_stderr": 0.03640118271990946
230
- },
231
- "harness|hendrycksTest-jurisprudence|5": {
232
- "acc": 0.8240740740740741,
233
- "acc_stderr": 0.036809181416738807,
234
- "acc_norm": 0.8240740740740741,
235
- "acc_norm_stderr": 0.036809181416738807
236
- },
237
- "harness|hendrycksTest-logical_fallacies|5": {
238
- "acc": 0.7607361963190185,
239
- "acc_stderr": 0.033519538795212696,
240
- "acc_norm": 0.7607361963190185,
241
- "acc_norm_stderr": 0.033519538795212696
242
- },
243
- "harness|hendrycksTest-machine_learning|5": {
244
- "acc": 0.48214285714285715,
245
- "acc_stderr": 0.047427623612430116,
246
- "acc_norm": 0.48214285714285715,
247
- "acc_norm_stderr": 0.047427623612430116
248
- },
249
- "harness|hendrycksTest-management|5": {
250
- "acc": 0.8058252427184466,
251
- "acc_stderr": 0.03916667762822584,
252
- "acc_norm": 0.8058252427184466,
253
- "acc_norm_stderr": 0.03916667762822584
254
- },
255
- "harness|hendrycksTest-marketing|5": {
256
- "acc": 0.8717948717948718,
257
- "acc_stderr": 0.02190190511507332,
258
- "acc_norm": 0.8717948717948718,
259
- "acc_norm_stderr": 0.02190190511507332
260
- },
261
- "harness|hendrycksTest-medical_genetics|5": {
262
- "acc": 0.65,
263
- "acc_stderr": 0.047937248544110196,
264
- "acc_norm": 0.65,
265
- "acc_norm_stderr": 0.047937248544110196
266
- },
267
- "harness|hendrycksTest-miscellaneous|5": {
268
- "acc": 0.8275862068965517,
269
- "acc_stderr": 0.013507943909371798,
270
- "acc_norm": 0.8275862068965517,
271
- "acc_norm_stderr": 0.013507943909371798
272
- },
273
- "harness|hendrycksTest-moral_disputes|5": {
274
- "acc": 0.7167630057803468,
275
- "acc_stderr": 0.02425790170532338,
276
- "acc_norm": 0.7167630057803468,
277
- "acc_norm_stderr": 0.02425790170532338
278
- },
279
- "harness|hendrycksTest-moral_scenarios|5": {
280
- "acc": 0.39553072625698327,
281
- "acc_stderr": 0.01635341541007577,
282
- "acc_norm": 0.39553072625698327,
283
- "acc_norm_stderr": 0.01635341541007577
284
- },
285
- "harness|hendrycksTest-nutrition|5": {
286
- "acc": 0.6993464052287581,
287
- "acc_stderr": 0.026256053835718968,
288
- "acc_norm": 0.6993464052287581,
289
- "acc_norm_stderr": 0.026256053835718968
290
- },
291
- "harness|hendrycksTest-philosophy|5": {
292
- "acc": 0.7041800643086816,
293
- "acc_stderr": 0.02592237178881877,
294
- "acc_norm": 0.7041800643086816,
295
- "acc_norm_stderr": 0.02592237178881877
296
- },
297
- "harness|hendrycksTest-prehistory|5": {
298
- "acc": 0.7098765432098766,
299
- "acc_stderr": 0.025251173936495036,
300
- "acc_norm": 0.7098765432098766,
301
- "acc_norm_stderr": 0.025251173936495036
302
- },
303
- "harness|hendrycksTest-professional_accounting|5": {
304
- "acc": 0.5070921985815603,
305
- "acc_stderr": 0.02982449855912901,
306
- "acc_norm": 0.5070921985815603,
307
- "acc_norm_stderr": 0.02982449855912901
308
- },
309
- "harness|hendrycksTest-professional_law|5": {
310
- "acc": 0.4771838331160365,
311
- "acc_stderr": 0.012756933382823694,
312
- "acc_norm": 0.4771838331160365,
313
- "acc_norm_stderr": 0.012756933382823694
314
- },
315
- "harness|hendrycksTest-professional_medicine|5": {
316
- "acc": 0.5772058823529411,
317
- "acc_stderr": 0.030008562845003476,
318
- "acc_norm": 0.5772058823529411,
319
- "acc_norm_stderr": 0.030008562845003476
320
- },
321
- "harness|hendrycksTest-professional_psychology|5": {
322
- "acc": 0.6699346405228758,
323
- "acc_stderr": 0.019023726160724556,
324
- "acc_norm": 0.6699346405228758,
325
- "acc_norm_stderr": 0.019023726160724556
326
- },
327
- "harness|hendrycksTest-public_relations|5": {
328
- "acc": 0.6909090909090909,
329
- "acc_stderr": 0.044262946482000985,
330
- "acc_norm": 0.6909090909090909,
331
- "acc_norm_stderr": 0.044262946482000985
332
- },
333
- "harness|hendrycksTest-security_studies|5": {
334
- "acc": 0.7877551020408163,
335
- "acc_stderr": 0.026176967197866767,
336
- "acc_norm": 0.7877551020408163,
337
- "acc_norm_stderr": 0.026176967197866767
338
- },
339
- "harness|hendrycksTest-sociology|5": {
340
- "acc": 0.8706467661691543,
341
- "acc_stderr": 0.023729830881018526,
342
- "acc_norm": 0.8706467661691543,
343
- "acc_norm_stderr": 0.023729830881018526
344
- },
345
- "harness|hendrycksTest-us_foreign_policy|5": {
346
- "acc": 0.87,
347
- "acc_stderr": 0.03379976689896309,
348
- "acc_norm": 0.87,
349
- "acc_norm_stderr": 0.03379976689896309
350
- },
351
- "harness|hendrycksTest-virology|5": {
352
- "acc": 0.5120481927710844,
353
- "acc_stderr": 0.03891364495835817,
354
- "acc_norm": 0.5120481927710844,
355
- "acc_norm_stderr": 0.03891364495835817
356
- },
357
- "harness|hendrycksTest-world_religions|5": {
358
- "acc": 0.8187134502923976,
359
- "acc_stderr": 0.029547741687640038,
360
- "acc_norm": 0.8187134502923976,
361
- "acc_norm_stderr": 0.029547741687640038
362
- },
363
- "all": {
364
- "acc": 0.6390701952816291,
365
- "acc_stderr": 0.03365809160773111,
366
- "acc_norm": 0.6390701952816291,
367
- "acc_norm_stderr": 0.03365809160773111
368
- }
369
- },
370
- "versions": {
371
- "harness|arc:challenge|25": 0,
372
- "harness|hellaswag|10": 0,
373
- "harness|truthfulqa:mc|0": 1,
374
- "harness|hendrycksTest-abstract_algebra|5": 1,
375
- "harness|hendrycksTest-anatomy|5": 1,
376
- "harness|hendrycksTest-astronomy|5": 1,
377
- "harness|hendrycksTest-business_ethics|5": 1,
378
- "harness|hendrycksTest-clinical_knowledge|5": 1,
379
- "harness|hendrycksTest-college_biology|5": 1,
380
- "harness|hendrycksTest-college_chemistry|5": 1,
381
- "harness|hendrycksTest-college_computer_science|5": 1,
382
- "harness|hendrycksTest-college_mathematics|5": 1,
383
- "harness|hendrycksTest-college_medicine|5": 1,
384
- "harness|hendrycksTest-college_physics|5": 1,
385
- "harness|hendrycksTest-computer_security|5": 1,
386
- "harness|hendrycksTest-conceptual_physics|5": 1,
387
- "harness|hendrycksTest-econometrics|5": 1,
388
- "harness|hendrycksTest-electrical_engineering|5": 1,
389
- "harness|hendrycksTest-elementary_mathematics|5": 1,
390
- "harness|hendrycksTest-formal_logic|5": 1,
391
- "harness|hendrycksTest-global_facts|5": 1,
392
- "harness|hendrycksTest-high_school_biology|5": 1,
393
- "harness|hendrycksTest-high_school_chemistry|5": 1,
394
- "harness|hendrycksTest-high_school_computer_science|5": 1,
395
- "harness|hendrycksTest-high_school_european_history|5": 1,
396
- "harness|hendrycksTest-high_school_geography|5": 1,
397
- "harness|hendrycksTest-high_school_government_and_politics|5": 1,
398
- "harness|hendrycksTest-high_school_macroeconomics|5": 1,
399
- "harness|hendrycksTest-high_school_mathematics|5": 1,
400
- "harness|hendrycksTest-high_school_microeconomics|5": 1,
401
- "harness|hendrycksTest-high_school_physics|5": 1,
402
- "harness|hendrycksTest-high_school_psychology|5": 1,
403
- "harness|hendrycksTest-high_school_statistics|5": 1,
404
- "harness|hendrycksTest-high_school_us_history|5": 1,
405
- "harness|hendrycksTest-high_school_world_history|5": 1,
406
- "harness|hendrycksTest-human_aging|5": 1,
407
- "harness|hendrycksTest-human_sexuality|5": 1,
408
- "harness|hendrycksTest-international_law|5": 1,
409
- "harness|hendrycksTest-jurisprudence|5": 1,
410
- "harness|hendrycksTest-logical_fallacies|5": 1,
411
- "harness|hendrycksTest-machine_learning|5": 1,
412
- "harness|hendrycksTest-management|5": 1,
413
- "harness|hendrycksTest-marketing|5": 1,
414
- "harness|hendrycksTest-medical_genetics|5": 1,
415
- "harness|hendrycksTest-miscellaneous|5": 1,
416
- "harness|hendrycksTest-moral_disputes|5": 1,
417
- "harness|hendrycksTest-moral_scenarios|5": 1,
418
- "harness|hendrycksTest-nutrition|5": 1,
419
- "harness|hendrycksTest-philosophy|5": 1,
420
- "harness|hendrycksTest-prehistory|5": 1,
421
- "harness|hendrycksTest-professional_accounting|5": 1,
422
- "harness|hendrycksTest-professional_law|5": 1,
423
- "harness|hendrycksTest-professional_medicine|5": 1,
424
- "harness|hendrycksTest-professional_psychology|5": 1,
425
- "harness|hendrycksTest-public_relations|5": 1,
426
- "harness|hendrycksTest-security_studies|5": 1,
427
- "harness|hendrycksTest-sociology|5": 1,
428
- "harness|hendrycksTest-us_foreign_policy|5": 1,
429
- "harness|hendrycksTest-virology|5": 1,
430
- "harness|hendrycksTest-world_religions|5": 1,
431
- "all": 0
432
- },
433
- "config": {
434
- "model_name": "meta-llama/Llama-2-70b-chat-hf",
435
- "model_sha": "7f54101c0fbb67a8143ca23eb8bd09b71f269c74",
436
- "model_dtype": "torch.float16",
437
- "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
438
- "num_few_shot_default": 0,
439
- "num_fewshot_seeds": 1,
440
- "override_batch_size": 1,
441
- "max_samples": null
442
- },
443
- "task_config": {
444
- "harness|arc:challenge": "LM Harness task",
445
- "harness|hellaswag": "LM Harness task",
446
- "harness|truthfulqa:mc": "LM Harness task",
447
- "harness|hendrycksTest-abstract_algebra": "LM Harness task",
448
- "harness|hendrycksTest-anatomy": "LM Harness task",
449
- "harness|hendrycksTest-astronomy": "LM Harness task",
450
- "harness|hendrycksTest-business_ethics": "LM Harness task",
451
- "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
452
- "harness|hendrycksTest-college_biology": "LM Harness task",
453
- "harness|hendrycksTest-college_chemistry": "LM Harness task",
454
- "harness|hendrycksTest-college_computer_science": "LM Harness task",
455
- "harness|hendrycksTest-college_mathematics": "LM Harness task",
456
- "harness|hendrycksTest-college_medicine": "LM Harness task",
457
- "harness|hendrycksTest-college_physics": "LM Harness task",
458
- "harness|hendrycksTest-computer_security": "LM Harness task",
459
- "harness|hendrycksTest-conceptual_physics": "LM Harness task",
460
- "harness|hendrycksTest-econometrics": "LM Harness task",
461
- "harness|hendrycksTest-electrical_engineering": "LM Harness task",
462
- "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
463
- "harness|hendrycksTest-formal_logic": "LM Harness task",
464
- "harness|hendrycksTest-global_facts": "LM Harness task",
465
- "harness|hendrycksTest-high_school_biology": "LM Harness task",
466
- "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
467
- "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
468
- "harness|hendrycksTest-high_school_european_history": "LM Harness task",
469
- "harness|hendrycksTest-high_school_geography": "LM Harness task",
470
- "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
471
- "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
472
- "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
473
- "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
474
- "harness|hendrycksTest-high_school_physics": "LM Harness task",
475
- "harness|hendrycksTest-high_school_psychology": "LM Harness task",
476
- "harness|hendrycksTest-high_school_statistics": "LM Harness task",
477
- "harness|hendrycksTest-high_school_us_history": "LM Harness task",
478
- "harness|hendrycksTest-high_school_world_history": "LM Harness task",
479
- "harness|hendrycksTest-human_aging": "LM Harness task",
480
- "harness|hendrycksTest-human_sexuality": "LM Harness task",
481
- "harness|hendrycksTest-international_law": "LM Harness task",
482
- "harness|hendrycksTest-jurisprudence": "LM Harness task",
483
- "harness|hendrycksTest-logical_fallacies": "LM Harness task",
484
- "harness|hendrycksTest-machine_learning": "LM Harness task",
485
- "harness|hendrycksTest-management": "LM Harness task",
486
- "harness|hendrycksTest-marketing": "LM Harness task",
487
- "harness|hendrycksTest-medical_genetics": "LM Harness task",
488
- "harness|hendrycksTest-miscellaneous": "LM Harness task",
489
- "harness|hendrycksTest-moral_disputes": "LM Harness task",
490
- "harness|hendrycksTest-moral_scenarios": "LM Harness task",
491
- "harness|hendrycksTest-nutrition": "LM Harness task",
492
- "harness|hendrycksTest-philosophy": "LM Harness task",
493
- "harness|hendrycksTest-prehistory": "LM Harness task",
494
- "harness|hendrycksTest-professional_accounting": "LM Harness task",
495
- "harness|hendrycksTest-professional_law": "LM Harness task",
496
- "harness|hendrycksTest-professional_medicine": "LM Harness task",
497
- "harness|hendrycksTest-professional_psychology": "LM Harness task",
498
- "harness|hendrycksTest-public_relations": "LM Harness task",
499
- "harness|hendrycksTest-security_studies": "LM Harness task",
500
- "harness|hendrycksTest-sociology": "LM Harness task",
501
- "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
502
- "harness|hendrycksTest-virology": "LM Harness task",
503
- "harness|hendrycksTest-world_religions": "LM Harness task"
504
-
505
- },
506
- "hashes": {
507
- "harness|arc:challenge|25": {
508
- "hash_examples": "fb8c51b1872daeda",
509
- "hash_full_prompts": "045cbb916e5145c6",
510
- "hash_input_tokens": "fab18a8dbccd885e",
511
- "hash_cont_tokens": "e8abf848493b50f7"
512
- },
513
- "harness|hellaswag|10": {
514
- "hash_examples": "e1768ecb99d7ecf0",
515
- "hash_full_prompts": "0b4c16983130f84f",
516
- "hash_input_tokens": "fd3d11be48664a7e",
517
- "hash_cont_tokens": "9fe0a5c42e1532db"
518
- },
519
- "harness|truthfulqa:mc|0": {
520
- "hash_examples": "23176c0531c7b867",
521
- "hash_full_prompts": "36a6d90e75d92d4a",
522
- "hash_input_tokens": "e3c2231820d87234",
523
- "hash_cont_tokens": "f5da56a132aab151"
524
- },
525
- "harness|hendrycksTest-abstract_algebra|5": {
526
- "hash_examples": "280f9f325b40559a",
527
- "hash_full_prompts": "2f776a367d23aea2",
528
- "hash_input_tokens": "c3792fce2534965f",
529
- "hash_cont_tokens": "50421e30bef398f9"
530
- },
531
- "harness|hendrycksTest-anatomy|5": {
532
- "hash_examples": "2f83a4f1cab4ba18",
533
- "hash_full_prompts": "516f74bef25df620",
534
- "hash_input_tokens": "1bfeea5736b995ee",
535
- "hash_cont_tokens": "f11971a765cb609f"
536
- },
537
- "harness|hendrycksTest-astronomy|5": {
538
- "hash_examples": "7d587b908da4d762",
539
- "hash_full_prompts": "faf4e80f65de93ca",
540
- "hash_input_tokens": "c4b2f1160f746871",
541
- "hash_cont_tokens": "440a970fadecdc7b"
542
- },
543
- "harness|hendrycksTest-business_ethics|5": {
544
- "hash_examples": "33e51740670de686",
545
- "hash_full_prompts": "db01c3ef8e1479d4",
546
- "hash_input_tokens": "b98d6ef1d1e2e17b",
547
- "hash_cont_tokens": "50421e30bef398f9"
548
- },
549
- "harness|hendrycksTest-clinical_knowledge|5": {
550
- "hash_examples": "f3366dbe7eefffa4",
551
- "hash_full_prompts": "49654f71d94b65c3",
552
- "hash_input_tokens": "9851119dacda883c",
553
- "hash_cont_tokens": "7ecd60c25b9bfe5b"
554
- },
555
- "harness|hendrycksTest-college_biology|5": {
556
- "hash_examples": "ca2b6753a0193e7f",
557
- "hash_full_prompts": "2b460b75f1fdfefd",
558
- "hash_input_tokens": "81a92a54cddefc2f",
559
- "hash_cont_tokens": "875cde3af7a0ee14"
560
- },
561
- "harness|hendrycksTest-college_chemistry|5": {
562
- "hash_examples": "22ff85f1d34f42d1",
563
- "hash_full_prompts": "242c9be6da583e95",
564
- "hash_input_tokens": "fd4c0cebdc2c1c3d",
565
- "hash_cont_tokens": "50421e30bef398f9"
566
- },
567
- "harness|hendrycksTest-college_computer_science|5": {
568
- "hash_examples": "30318289d717a5cf",
569
- "hash_full_prompts": "ed2bdb4e87c4b371",
570
- "hash_input_tokens": "49f6021f4c075e0d",
571
- "hash_cont_tokens": "50421e30bef398f9"
572
- },
573
- "harness|hendrycksTest-college_mathematics|5": {
574
- "hash_examples": "4944d1f0b6b5d911",
575
- "hash_full_prompts": "770bc4281c973190",
576
- "hash_input_tokens": "db61bad69399bfe8",
577
- "hash_cont_tokens": "50421e30bef398f9"
578
- },
579
- "harness|hendrycksTest-college_medicine|5": {
580
- "hash_examples": "dd69cc33381275af",
581
- "hash_full_prompts": "ad2a53e5250ab46e",
582
- "hash_input_tokens": "c458392f38424d77",
583
- "hash_cont_tokens": "702fb6d82ff0d6ac"
584
- },
585
- "harness|hendrycksTest-college_physics|5": {
586
- "hash_examples": "875dd26d22655b0d",
587
- "hash_full_prompts": "833a0d7b55aed500",
588
- "hash_input_tokens": "49cf4d8d8696b588",
589
- "hash_cont_tokens": "f7b8097afc16a47c"
590
- },
591
- "harness|hendrycksTest-computer_security|5": {
592
- "hash_examples": "006451eedc0ededb",
593
- "hash_full_prompts": "94034c97e85d8f46",
594
- "hash_input_tokens": "e81d46ca85fa2b7c",
595
- "hash_cont_tokens": "50421e30bef398f9"
596
- },
597
- "harness|hendrycksTest-conceptual_physics|5": {
598
- "hash_examples": "8874ece872d2ca4c",
599
- "hash_full_prompts": "e40d15a34640d6fa",
600
- "hash_input_tokens": "d5e231a26622e7d5",
601
- "hash_cont_tokens": "aa0e8bc655f2f641"
602
- },
603
- "harness|hendrycksTest-econometrics|5": {
604
- "hash_examples": "64d3623b0bfaa43f",
605
- "hash_full_prompts": "612f340fae41338d",
606
- "hash_input_tokens": "afa3603fd1622706",
607
- "hash_cont_tokens": "b1cc6e7e9fcd3827"
608
- },
609
- "harness|hendrycksTest-electrical_engineering|5": {
610
- "hash_examples": "e98f51780c674d7e",
611
- "hash_full_prompts": "10275b312d812ae6",
612
- "hash_input_tokens": "e0c62cf84ed22e7e",
613
- "hash_cont_tokens": "2425a3f084a591ef"
614
- },
615
- "harness|hendrycksTest-elementary_mathematics|5": {
616
- "hash_examples": "fc48208a5ac1c0ce",
617
- "hash_full_prompts": "5ec274c6c82aca23",
618
- "hash_input_tokens": "303123d2b857f30b",
619
- "hash_cont_tokens": "bd87bf0c060fd925"
620
- },
621
- "harness|hendrycksTest-formal_logic|5": {
622
- "hash_examples": "5a6525665f63ea72",
623
- "hash_full_prompts": "07b92638c4a6b500",
624
- "hash_input_tokens": "3fd8073b90b9736d",
625
- "hash_cont_tokens": "eb8932890e0605db"
626
- },
627
- "harness|hendrycksTest-global_facts|5": {
628
- "hash_examples": "371d70d743b2b89b",
629
- "hash_full_prompts": "332fdee50a1921b4",
630
- "hash_input_tokens": "f65051acd3210902",
631
- "hash_cont_tokens": "50421e30bef398f9"
632
- },
633
- "harness|hendrycksTest-high_school_biology|5": {
634
- "hash_examples": "a79e1018b1674052",
635
- "hash_full_prompts": "e624e26ede922561",
636
- "hash_input_tokens": "264263fc8c2123bc",
637
- "hash_cont_tokens": "1ddcb86d28cde266"
638
- },
639
- "harness|hendrycksTest-high_school_chemistry|5": {
640
- "hash_examples": "44bfc25c389f0e03",
641
- "hash_full_prompts": "0e3e5f5d9246482a",
642
- "hash_input_tokens": "42e1a18523b075e7",
643
- "hash_cont_tokens": "176c8dcff38c5f8f"
644
- },
645
- "harness|hendrycksTest-high_school_computer_science|5": {
646
- "hash_examples": "8b8cdb1084f24169",
647
- "hash_full_prompts": "c00487e67c1813cc",
648
- "hash_input_tokens": "6f109fbd505d364b",
649
- "hash_cont_tokens": "50421e30bef398f9"
650
- },
651
- "harness|hendrycksTest-high_school_european_history|5": {
652
- "hash_examples": "11cd32d0ef440171",
653
- "hash_full_prompts": "318f4513c537c6bf",
654
- "hash_input_tokens": "f1f73dd687da18d7",
655
- "hash_cont_tokens": "674fc454bdc5ac93"
656
- },
657
- "harness|hendrycksTest-high_school_geography|5": {
658
- "hash_examples": "b60019b9e80b642f",
659
- "hash_full_prompts": "ee5789fcc1a81b1e",
660
- "hash_input_tokens": "575ea4d290807e79",
661
- "hash_cont_tokens": "03a5012b916274ea"
662
- },
663
- "harness|hendrycksTest-high_school_government_and_politics|5": {
664
- "hash_examples": "d221ec983d143dc3",
665
- "hash_full_prompts": "ac42d888e1ce1155",
666
- "hash_input_tokens": "5954aff17f30959c",
667
- "hash_cont_tokens": "873d2aab226ba1d8"
668
- },
669
- "harness|hendrycksTest-high_school_macroeconomics|5": {
670
- "hash_examples": "59c2915cacfd3fbb",
671
- "hash_full_prompts": "c6bd9d25158abd0e",
672
- "hash_input_tokens": "cc4bb974def176ee",
673
- "hash_cont_tokens": "c583432ad27fcfe0"
674
- },
675
- "harness|hendrycksTest-high_school_mathematics|5": {
676
- "hash_examples": "1f8ac897608de342",
677
- "hash_full_prompts": "5d88f41fc2d643a8",
678
- "hash_input_tokens": "94100bcb23e1a13e",
679
- "hash_cont_tokens": "d7907b61bcb8c123"
680
- },
681
- "harness|hendrycksTest-high_school_microeconomics|5": {
682
- "hash_examples": "ead6a0f2f6c83370",
683
- "hash_full_prompts": "bfc393381298609e",
684
- "hash_input_tokens": "129c79724487131d",
685
- "hash_cont_tokens": "f47f041de50333b9"
686
- },
687
- "harness|hendrycksTest-high_school_physics|5": {
688
- "hash_examples": "c3f2025990afec64",
689
- "hash_full_prompts": "fc78b4997e436734",
690
- "hash_input_tokens": "82c2ac81ad5b141c",
691
- "hash_cont_tokens": "0d56317b3e5eedb5"
692
- },
693
- "harness|hendrycksTest-high_school_psychology|5": {
694
- "hash_examples": "21f8aab618f6d636",
695
- "hash_full_prompts": "d5c76aa40b9dbc43",
696
- "hash_input_tokens": "422b8bb7add88cc5",
697
- "hash_cont_tokens": "09ba1243e7390c0f"
698
- },
699
- "harness|hendrycksTest-high_school_statistics|5": {
700
- "hash_examples": "2386a60a11fc5de3",
701
- "hash_full_prompts": "4c5c8be5aafac432",
702
- "hash_input_tokens": "d3e6f7198120fbdc",
703
- "hash_cont_tokens": "9cc29889c3d3f77d"
704
- },
705
- "harness|hendrycksTest-high_school_us_history|5": {
706
- "hash_examples": "74961543be40f04f",
707
- "hash_full_prompts": "5d5ca4840131ba21",
708
- "hash_input_tokens": "50c9ff438c85a69e",
709
- "hash_cont_tokens": "cdd0b3dc06d933e5"
710
- },
711
- "harness|hendrycksTest-high_school_world_history|5": {
712
- "hash_examples": "2ad2f6b7198b2234",
713
- "hash_full_prompts": "11845057459afd72",
714
- "hash_input_tokens": "054824cc474caef5",
715
- "hash_cont_tokens": "e02816433ff28daf"
716
- },
717
- "harness|hendrycksTest-human_aging|5": {
718
- "hash_examples": "1a7199dc733e779b",
719
- "hash_full_prompts": "756b9096b8eaf892",
720
- "hash_input_tokens": "151f31a573d81257",
721
- "hash_cont_tokens": "142a4a8a1138a214"
722
- },
723
- "harness|hendrycksTest-human_sexuality|5": {
724
- "hash_examples": "7acb8fdad97f88a6",
725
- "hash_full_prompts": "731a52ff15b8cfdb",
726
- "hash_input_tokens": "b77763767fb18cc4",
727
- "hash_cont_tokens": "bc54813e809b796d"
728
- },
729
- "harness|hendrycksTest-international_law|5": {
730
- "hash_examples": "1300bfd0dfc59114",
731
- "hash_full_prompts": "db2aefbff5eec996",
732
- "hash_input_tokens": "a4e52c47400b8bca",
733
- "hash_cont_tokens": "8ea8c5ff76a15bca"
734
- },
735
- "harness|hendrycksTest-jurisprudence|5": {
736
- "hash_examples": "083b1e4904c48dc2",
737
- "hash_full_prompts": "0f89ee3fe03d6a21",
738
- "hash_input_tokens": "69644001a800b0f7",
739
- "hash_cont_tokens": "e3a8cd951b6e3469"
740
- },
741
- "harness|hendrycksTest-logical_fallacies|5": {
742
- "hash_examples": "709128f9926a634c",
743
- "hash_full_prompts": "98a04b1f8f841069",
744
- "hash_input_tokens": "332ca144a888ad7f",
745
- "hash_cont_tokens": "3e9e0bdc248fd88a"
746
- },
747
- "harness|hendrycksTest-machine_learning|5": {
748
- "hash_examples": "88f22a636029ae47",
749
- "hash_full_prompts": "2e1c8d4b1e0cc921",
750
- "hash_input_tokens": "a27f6dd3c2837ded",
751
- "hash_cont_tokens": "55b12fb138c6a064"
752
- },
753
- "harness|hendrycksTest-management|5": {
754
- "hash_examples": "8c8a1e07a2151dca",
755
- "hash_full_prompts": "f51611f514b265b0",
756
- "hash_input_tokens": "9f72696f5f9c4c80",
757
- "hash_cont_tokens": "a01d6d39a83c4597"
758
- },
759
- "harness|hendrycksTest-marketing|5": {
760
- "hash_examples": "2668953431f91e96",
761
- "hash_full_prompts": "77562bef997c7650",
762
- "hash_input_tokens": "0d9707022133f086",
763
- "hash_cont_tokens": "6aeaed4d823c98aa"
764
- },
765
- "harness|hendrycksTest-medical_genetics|5": {
766
- "hash_examples": "9c2dda34a2ea4fd2",
767
- "hash_full_prompts": "202139046daa118f",
768
- "hash_input_tokens": "e957962a583e58a2",
769
- "hash_cont_tokens": "50421e30bef398f9"
770
- },
771
- "harness|hendrycksTest-miscellaneous|5": {
772
- "hash_examples": "41adb694024809c2",
773
- "hash_full_prompts": "bffec9fc237bcf93",
774
- "hash_input_tokens": "46fe4585062aa36a",
775
- "hash_cont_tokens": "9b0ab02a64603081"
776
- },
777
- "harness|hendrycksTest-moral_disputes|5": {
778
- "hash_examples": "3171c13ba3c594c4",
779
- "hash_full_prompts": "170831fc36f1d59e",
780
- "hash_input_tokens": "cf9834b2c07721dc",
781
- "hash_cont_tokens": "3b8bbe9108e55ce9"
782
- },
783
- "harness|hendrycksTest-moral_scenarios|5": {
784
- "hash_examples": "9873e077e83e0546",
785
- "hash_full_prompts": "08f4ceba3131a068",
786
- "hash_input_tokens": "f257b7cce9ddb541",
787
- "hash_cont_tokens": "3e9bfc0362e97330"
788
- },
789
- "harness|hendrycksTest-nutrition|5": {
790
- "hash_examples": "7db1d8142ec14323",
791
- "hash_full_prompts": "4c0e68e3586cb453",
792
- "hash_input_tokens": "8650a7e901b42458",
793
- "hash_cont_tokens": "23b2dc6ee2da4cfc"
794
- },
795
- "harness|hendrycksTest-philosophy|5": {
796
- "hash_examples": "9b455b7d72811cc8",
797
- "hash_full_prompts": "e467f822d8a0d3ff",
798
- "hash_input_tokens": "4ba4c1d13e1040ec",
799
- "hash_cont_tokens": "9f6ff69d23a48783"
800
- },
801
- "harness|hendrycksTest-prehistory|5": {
802
- "hash_examples": "8be90d0f538f1560",
803
- "hash_full_prompts": "152187949bcd0921",
804
- "hash_input_tokens": "7431d7b2d5c13409",
805
- "hash_cont_tokens": "d6458d743d875837"
806
- },
807
- "harness|hendrycksTest-professional_accounting|5": {
808
- "hash_examples": "8d377597916cd07e",
809
- "hash_full_prompts": "0eb7345d6144ee0d",
810
- "hash_input_tokens": "e7bbb4a15e991424",
811
- "hash_cont_tokens": "922a195f53a35662"
812
- },
813
- "harness|hendrycksTest-professional_law|5": {
814
- "hash_examples": "cd9dbc52b3c932d6",
815
- "hash_full_prompts": "36ac764272bfb182",
816
- "hash_input_tokens": "9178e10bd0763ec4",
817
- "hash_cont_tokens": "2e590029ef41fbcd"
818
- },
819
- "harness|hendrycksTest-professional_medicine|5": {
820
- "hash_examples": "b20e4e816c1e383e",
821
- "hash_full_prompts": "7b8d69ea2acaf2f7",
822
- "hash_input_tokens": "f5a22012a54f70ea",
823
- "hash_cont_tokens": "7cfee54dbddd5a98"
824
- },
825
- "harness|hendrycksTest-professional_psychology|5": {
826
- "hash_examples": "d45b73b22f9cc039",
827
- "hash_full_prompts": "fe8937e9ffc99771",
828
- "hash_input_tokens": "8eeb91b3a7cbea0a",
829
- "hash_cont_tokens": "a86677b2a45c20e1"
830
- },
831
- "harness|hendrycksTest-public_relations|5": {
832
- "hash_examples": "0d25072e1761652a",
833
- "hash_full_prompts": "f9adc39cfa9f42ba",
834
- "hash_input_tokens": "bdfc559a40a1e8ec",
835
- "hash_cont_tokens": "0d756ccaae031757"
836
- },
837
- "harness|hendrycksTest-security_studies|5": {
838
- "hash_examples": "62bb8197e63d60d4",
839
- "hash_full_prompts": "869c9c3ae196b7c3",
840
- "hash_input_tokens": "d49711415961ced7",
841
- "hash_cont_tokens": "b2229bc2cfbf594b"
842
- },
843
- "harness|hendrycksTest-sociology|5": {
844
- "hash_examples": "e7959df87dea8672",
845
- "hash_full_prompts": "1a1fc00e17b3a52a",
846
- "hash_input_tokens": "f9a00c6fc5e9cea7",
847
- "hash_cont_tokens": "c3a3bdfd177eed5b"
848
- },
849
- "harness|hendrycksTest-us_foreign_policy|5": {
850
- "hash_examples": "4a56a01ddca44dca",
851
- "hash_full_prompts": "0c7a7081c71c07b6",
852
- "hash_input_tokens": "647f2d7d9075afaa",
853
- "hash_cont_tokens": "50421e30bef398f9"
854
- },
855
- "harness|hendrycksTest-virology|5": {
856
- "hash_examples": "451cc86a8c4f4fe9",
857
- "hash_full_prompts": "01e95325d8b738e4",
858
- "hash_input_tokens": "784f75f0ad6e0698",
859
- "hash_cont_tokens": "af8b3658088cb37f"
860
- },
861
- "harness|hendrycksTest-world_religions|5": {
862
- "hash_examples": "3b29cfaf1a81c379",
863
- "hash_full_prompts": "e0d79a15083dfdff",
864
- "hash_input_tokens": "17766ebe38853371",
865
- "hash_cont_tokens": "060118bef6de4e0a"
866
- }
867
- }
868
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-70b-chat-hf/results_2023-10-17T05-07-42.486452.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-70b-chat-hf",
4
- "model_sha": "cfe96d938c52db7c6d936f99370c0801b24233c4",
5
- "model_size": "128.64 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.040373322147651006,
17
- "em_stderr": 0.0020157564185176837,
18
- "f1": 0.1050272651006715,
19
- "f1_stderr": 0.0023756238577676155
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.266868840030326,
23
- "acc_stderr": 0.012183780551887957
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.8050513022888713,
27
- "acc_stderr": 0.011134099415938268
28
- },
29
- "all": {
30
- "em": 0.040373322147651006,
31
- "em_stderr": 0.0020157564185176837,
32
- "f1": 0.1050272651006715,
33
- "f1_stderr": 0.0023756238577676155,
34
- "acc": 0.5359600711595986,
35
- "acc_stderr": 0.011658939983913113
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "b7f7e4a7d842e431"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "13bcb12a5f7991f1"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "5fa49b6513c85264"
99
- },
100
- "total_evaluation_time_secondes": "42063.55081868172",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-70b-hf/results.json DELETED
@@ -1,447 +0,0 @@
1
- {
2
- "results": {
3
- "harness|arc:challenge|25": {
4
- "acc": 0.6262798634812287,
5
- "acc_stderr": 0.014137708601759091,
6
- "acc_norm": 0.6732081911262798,
7
- "acc_norm_stderr": 0.013706665975587333
8
- },
9
- "harness|hellaswag|10": {
10
- "acc": 0.6760605457080263,
11
- "acc_stderr": 0.00467020812857923,
12
- "acc_norm": 0.8733320055765784,
13
- "acc_norm_stderr": 0.0033192094001351187
14
- },
15
- "harness|hendrycksTest-abstract_algebra|5": {
16
- "acc": 0.33,
17
- "acc_stderr": 0.04725815626252605,
18
- "acc_norm": 0.33,
19
- "acc_norm_stderr": 0.04725815626252605
20
- },
21
- "harness|hendrycksTest-anatomy|5": {
22
- "acc": 0.6296296296296297,
23
- "acc_stderr": 0.04171654161354544,
24
- "acc_norm": 0.6296296296296297,
25
- "acc_norm_stderr": 0.04171654161354544
26
- },
27
- "harness|hendrycksTest-astronomy|5": {
28
- "acc": 0.8092105263157895,
29
- "acc_stderr": 0.031975658210325,
30
- "acc_norm": 0.8092105263157895,
31
- "acc_norm_stderr": 0.031975658210325
32
- },
33
- "harness|hendrycksTest-business_ethics|5": {
34
- "acc": 0.72,
35
- "acc_stderr": 0.04512608598542127,
36
- "acc_norm": 0.72,
37
- "acc_norm_stderr": 0.04512608598542127
38
- },
39
- "harness|hendrycksTest-clinical_knowledge|5": {
40
- "acc": 0.7169811320754716,
41
- "acc_stderr": 0.027724236492700918,
42
- "acc_norm": 0.7169811320754716,
43
- "acc_norm_stderr": 0.027724236492700918
44
- },
45
- "harness|hendrycksTest-college_biology|5": {
46
- "acc": 0.8472222222222222,
47
- "acc_stderr": 0.030085743248565666,
48
- "acc_norm": 0.8472222222222222,
49
- "acc_norm_stderr": 0.030085743248565666
50
- },
51
- "harness|hendrycksTest-college_chemistry|5": {
52
- "acc": 0.51,
53
- "acc_stderr": 0.05024183937956912,
54
- "acc_norm": 0.51,
55
- "acc_norm_stderr": 0.05024183937956912
56
- },
57
- "harness|hendrycksTest-college_computer_science|5": {
58
- "acc": 0.6,
59
- "acc_stderr": 0.049236596391733084,
60
- "acc_norm": 0.6,
61
- "acc_norm_stderr": 0.049236596391733084
62
- },
63
- "harness|hendrycksTest-college_mathematics|5": {
64
- "acc": 0.37,
65
- "acc_stderr": 0.048523658709391,
66
- "acc_norm": 0.37,
67
- "acc_norm_stderr": 0.048523658709391
68
- },
69
- "harness|hendrycksTest-college_medicine|5": {
70
- "acc": 0.6416184971098265,
71
- "acc_stderr": 0.03656343653353159,
72
- "acc_norm": 0.6416184971098265,
73
- "acc_norm_stderr": 0.03656343653353159
74
- },
75
- "harness|hendrycksTest-college_physics|5": {
76
- "acc": 0.37254901960784315,
77
- "acc_stderr": 0.04810840148082635,
78
- "acc_norm": 0.37254901960784315,
79
- "acc_norm_stderr": 0.04810840148082635
80
- },
81
- "harness|hendrycksTest-computer_security|5": {
82
- "acc": 0.77,
83
- "acc_stderr": 0.04229525846816506,
84
- "acc_norm": 0.77,
85
- "acc_norm_stderr": 0.04229525846816506
86
- },
87
- "harness|hendrycksTest-conceptual_physics|5": {
88
- "acc": 0.6638297872340425,
89
- "acc_stderr": 0.030881618520676942,
90
- "acc_norm": 0.6638297872340425,
91
- "acc_norm_stderr": 0.030881618520676942
92
- },
93
- "harness|hendrycksTest-econometrics|5": {
94
- "acc": 0.4473684210526316,
95
- "acc_stderr": 0.04677473004491199,
96
- "acc_norm": 0.4473684210526316,
97
- "acc_norm_stderr": 0.04677473004491199
98
- },
99
- "harness|hendrycksTest-electrical_engineering|5": {
100
- "acc": 0.6551724137931034,
101
- "acc_stderr": 0.03960933549451207,
102
- "acc_norm": 0.6551724137931034,
103
- "acc_norm_stderr": 0.03960933549451207
104
- },
105
- "harness|hendrycksTest-elementary_mathematics|5": {
106
- "acc": 0.43386243386243384,
107
- "acc_stderr": 0.025525034382474894,
108
- "acc_norm": 0.43386243386243384,
109
- "acc_norm_stderr": 0.025525034382474894
110
- },
111
- "harness|hendrycksTest-formal_logic|5": {
112
- "acc": 0.47619047619047616,
113
- "acc_stderr": 0.04467062628403273,
114
- "acc_norm": 0.47619047619047616,
115
- "acc_norm_stderr": 0.04467062628403273
116
- },
117
- "harness|hendrycksTest-global_facts|5": {
118
- "acc": 0.46,
119
- "acc_stderr": 0.05009082659620332,
120
- "acc_norm": 0.46,
121
- "acc_norm_stderr": 0.05009082659620332
122
- },
123
- "harness|hendrycksTest-high_school_biology|5": {
124
- "acc": 0.8193548387096774,
125
- "acc_stderr": 0.02188617856717253,
126
- "acc_norm": 0.8193548387096774,
127
- "acc_norm_stderr": 0.02188617856717253
128
- },
129
- "harness|hendrycksTest-high_school_chemistry|5": {
130
- "acc": 0.5123152709359606,
131
- "acc_stderr": 0.035169204442208966,
132
- "acc_norm": 0.5123152709359606,
133
- "acc_norm_stderr": 0.035169204442208966
134
- },
135
- "harness|hendrycksTest-high_school_computer_science|5": {
136
- "acc": 0.79,
137
- "acc_stderr": 0.040936018074033256,
138
- "acc_norm": 0.79,
139
- "acc_norm_stderr": 0.040936018074033256
140
- },
141
- "harness|hendrycksTest-high_school_european_history|5": {
142
- "acc": 0.8303030303030303,
143
- "acc_stderr": 0.029311188674983134,
144
- "acc_norm": 0.8303030303030303,
145
- "acc_norm_stderr": 0.029311188674983134
146
- },
147
- "harness|hendrycksTest-high_school_geography|5": {
148
- "acc": 0.8787878787878788,
149
- "acc_stderr": 0.023253157951942084,
150
- "acc_norm": 0.8787878787878788,
151
- "acc_norm_stderr": 0.023253157951942084
152
- },
153
- "harness|hendrycksTest-high_school_government_and_politics|5": {
154
- "acc": 0.9430051813471503,
155
- "acc_stderr": 0.016731085293607555,
156
- "acc_norm": 0.9430051813471503,
157
- "acc_norm_stderr": 0.016731085293607555
158
- },
159
- "harness|hendrycksTest-high_school_macroeconomics|5": {
160
- "acc": 0.7410256410256411,
161
- "acc_stderr": 0.02221110681006167,
162
- "acc_norm": 0.7410256410256411,
163
- "acc_norm_stderr": 0.02221110681006167
164
- },
165
- "harness|hendrycksTest-high_school_mathematics|5": {
166
- "acc": 0.35555555555555557,
167
- "acc_stderr": 0.029185714949857403,
168
- "acc_norm": 0.35555555555555557,
169
- "acc_norm_stderr": 0.029185714949857403
170
- },
171
- "harness|hendrycksTest-high_school_microeconomics|5": {
172
- "acc": 0.7647058823529411,
173
- "acc_stderr": 0.02755361446786381,
174
- "acc_norm": 0.7647058823529411,
175
- "acc_norm_stderr": 0.02755361446786381
176
- },
177
- "harness|hendrycksTest-high_school_physics|5": {
178
- "acc": 0.4304635761589404,
179
- "acc_stderr": 0.04042809961395634,
180
- "acc_norm": 0.4304635761589404,
181
- "acc_norm_stderr": 0.04042809961395634
182
- },
183
- "harness|hendrycksTest-high_school_psychology|5": {
184
- "acc": 0.8733944954128441,
185
- "acc_stderr": 0.014257128686165169,
186
- "acc_norm": 0.8733944954128441,
187
- "acc_norm_stderr": 0.014257128686165169
188
- },
189
- "harness|hendrycksTest-high_school_statistics|5": {
190
- "acc": 0.6342592592592593,
191
- "acc_stderr": 0.032847388576472056,
192
- "acc_norm": 0.6342592592592593,
193
- "acc_norm_stderr": 0.032847388576472056
194
- },
195
- "harness|hendrycksTest-high_school_us_history|5": {
196
- "acc": 0.8970588235294118,
197
- "acc_stderr": 0.02132833757080437,
198
- "acc_norm": 0.8970588235294118,
199
- "acc_norm_stderr": 0.02132833757080437
200
- },
201
- "harness|hendrycksTest-high_school_world_history|5": {
202
- "acc": 0.8776371308016878,
203
- "acc_stderr": 0.021331741829746786,
204
- "acc_norm": 0.8776371308016878,
205
- "acc_norm_stderr": 0.021331741829746786
206
- },
207
- "harness|hendrycksTest-human_aging|5": {
208
- "acc": 0.8026905829596412,
209
- "acc_stderr": 0.02670985334496796,
210
- "acc_norm": 0.8026905829596412,
211
- "acc_norm_stderr": 0.02670985334496796
212
- },
213
- "harness|hendrycksTest-human_sexuality|5": {
214
- "acc": 0.8778625954198473,
215
- "acc_stderr": 0.028718776889342344,
216
- "acc_norm": 0.8778625954198473,
217
- "acc_norm_stderr": 0.028718776889342344
218
- },
219
- "harness|hendrycksTest-international_law|5": {
220
- "acc": 0.8760330578512396,
221
- "acc_stderr": 0.03008309871603521,
222
- "acc_norm": 0.8760330578512396,
223
- "acc_norm_stderr": 0.03008309871603521
224
- },
225
- "harness|hendrycksTest-jurisprudence|5": {
226
- "acc": 0.8333333333333334,
227
- "acc_stderr": 0.03602814176392645,
228
- "acc_norm": 0.8333333333333334,
229
- "acc_norm_stderr": 0.03602814176392645
230
- },
231
- "harness|hendrycksTest-logical_fallacies|5": {
232
- "acc": 0.803680981595092,
233
- "acc_stderr": 0.031207970394709218,
234
- "acc_norm": 0.803680981595092,
235
- "acc_norm_stderr": 0.031207970394709218
236
- },
237
- "harness|hendrycksTest-machine_learning|5": {
238
- "acc": 0.5357142857142857,
239
- "acc_stderr": 0.04733667890053756,
240
- "acc_norm": 0.5357142857142857,
241
- "acc_norm_stderr": 0.04733667890053756
242
- },
243
- "harness|hendrycksTest-management|5": {
244
- "acc": 0.8349514563106796,
245
- "acc_stderr": 0.03675668832233188,
246
- "acc_norm": 0.8349514563106796,
247
- "acc_norm_stderr": 0.03675668832233188
248
- },
249
- "harness|hendrycksTest-marketing|5": {
250
- "acc": 0.905982905982906,
251
- "acc_stderr": 0.01911989279892498,
252
- "acc_norm": 0.905982905982906,
253
- "acc_norm_stderr": 0.01911989279892498
254
- },
255
- "harness|hendrycksTest-medical_genetics|5": {
256
- "acc": 0.74,
257
- "acc_stderr": 0.04408440022768077,
258
- "acc_norm": 0.74,
259
- "acc_norm_stderr": 0.04408440022768077
260
- },
261
- "harness|hendrycksTest-miscellaneous|5": {
262
- "acc": 0.8620689655172413,
263
- "acc_stderr": 0.012331009307795656,
264
- "acc_norm": 0.8620689655172413,
265
- "acc_norm_stderr": 0.012331009307795656
266
- },
267
- "harness|hendrycksTest-moral_disputes|5": {
268
- "acc": 0.7774566473988439,
269
- "acc_stderr": 0.02239421566194282,
270
- "acc_norm": 0.7774566473988439,
271
- "acc_norm_stderr": 0.02239421566194282
272
- },
273
- "harness|hendrycksTest-moral_scenarios|5": {
274
- "acc": 0.4547486033519553,
275
- "acc_stderr": 0.016653875777524012,
276
- "acc_norm": 0.4547486033519553,
277
- "acc_norm_stderr": 0.016653875777524012
278
- },
279
- "harness|hendrycksTest-nutrition|5": {
280
- "acc": 0.7810457516339869,
281
- "acc_stderr": 0.02367908986180772,
282
- "acc_norm": 0.7810457516339869,
283
- "acc_norm_stderr": 0.02367908986180772
284
- },
285
- "harness|hendrycksTest-philosophy|5": {
286
- "acc": 0.7877813504823151,
287
- "acc_stderr": 0.023222756797435115,
288
- "acc_norm": 0.7877813504823151,
289
- "acc_norm_stderr": 0.023222756797435115
290
- },
291
- "harness|hendrycksTest-prehistory|5": {
292
- "acc": 0.8364197530864198,
293
- "acc_stderr": 0.020581466138257114,
294
- "acc_norm": 0.8364197530864198,
295
- "acc_norm_stderr": 0.020581466138257114
296
- },
297
- "harness|hendrycksTest-professional_accounting|5": {
298
- "acc": 0.5673758865248227,
299
- "acc_stderr": 0.02955545423677884,
300
- "acc_norm": 0.5673758865248227,
301
- "acc_norm_stderr": 0.02955545423677884
302
- },
303
- "harness|hendrycksTest-professional_law|5": {
304
- "acc": 0.5319426336375489,
305
- "acc_stderr": 0.012744149704869645,
306
- "acc_norm": 0.5319426336375489,
307
- "acc_norm_stderr": 0.012744149704869645
308
- },
309
- "harness|hendrycksTest-professional_medicine|5": {
310
- "acc": 0.75,
311
- "acc_stderr": 0.026303648393696036,
312
- "acc_norm": 0.75,
313
- "acc_norm_stderr": 0.026303648393696036
314
- },
315
- "harness|hendrycksTest-professional_psychology|5": {
316
- "acc": 0.7565359477124183,
317
- "acc_stderr": 0.01736247376214662,
318
- "acc_norm": 0.7565359477124183,
319
- "acc_norm_stderr": 0.01736247376214662
320
- },
321
- "harness|hendrycksTest-public_relations|5": {
322
- "acc": 0.6909090909090909,
323
- "acc_stderr": 0.044262946482000985,
324
- "acc_norm": 0.6909090909090909,
325
- "acc_norm_stderr": 0.044262946482000985
326
- },
327
- "harness|hendrycksTest-security_studies|5": {
328
- "acc": 0.7918367346938775,
329
- "acc_stderr": 0.0259911176728133,
330
- "acc_norm": 0.7918367346938775,
331
- "acc_norm_stderr": 0.0259911176728133
332
- },
333
- "harness|hendrycksTest-sociology|5": {
334
- "acc": 0.900497512437811,
335
- "acc_stderr": 0.021166216304659393,
336
- "acc_norm": 0.900497512437811,
337
- "acc_norm_stderr": 0.021166216304659393
338
- },
339
- "harness|hendrycksTest-us_foreign_policy|5": {
340
- "acc": 0.92,
341
- "acc_stderr": 0.0272659924344291,
342
- "acc_norm": 0.92,
343
- "acc_norm_stderr": 0.0272659924344291
344
- },
345
- "harness|hendrycksTest-virology|5": {
346
- "acc": 0.5301204819277109,
347
- "acc_stderr": 0.03885425420866767,
348
- "acc_norm": 0.5301204819277109,
349
- "acc_norm_stderr": 0.03885425420866767
350
- },
351
- "harness|hendrycksTest-world_religions|5": {
352
- "acc": 0.8538011695906432,
353
- "acc_stderr": 0.027097290118070806,
354
- "acc_norm": 0.8538011695906432,
355
- "acc_norm_stderr": 0.027097290118070806
356
- },
357
- "harness|truthfulqa:mc|0": {
358
- "mc1": 0.3108935128518972,
359
- "mc1_stderr": 0.016203316673559696,
360
- "mc2": 0.44923493721887353,
361
- "mc2_stderr": 0.01390226410719232
362
- },
363
- "all": {
364
- "acc": 0.6967225637378714,
365
- "acc_stderr": 0.030867069907791145,
366
- "acc_norm": 0.7008615431872544,
367
- "acc_norm_stderr": 0.030836865817034945,
368
- "mc1": 0.3108935128518972,
369
- "mc1_stderr": 0.016203316673559696,
370
- "mc2": 0.44923493721887353,
371
- "mc2_stderr": 0.01390226410719232
372
- }
373
- },
374
- "versions": {
375
- "harness|arc:challenge|25": 0,
376
- "harness|hellaswag|10": 0,
377
- "harness|hendrycksTest-abstract_algebra|5": 1,
378
- "harness|hendrycksTest-anatomy|5": 1,
379
- "harness|hendrycksTest-astronomy|5": 1,
380
- "harness|hendrycksTest-business_ethics|5": 1,
381
- "harness|hendrycksTest-clinical_knowledge|5": 1,
382
- "harness|hendrycksTest-college_biology|5": 1,
383
- "harness|hendrycksTest-college_chemistry|5": 1,
384
- "harness|hendrycksTest-college_computer_science|5": 1,
385
- "harness|hendrycksTest-college_mathematics|5": 1,
386
- "harness|hendrycksTest-college_medicine|5": 1,
387
- "harness|hendrycksTest-college_physics|5": 1,
388
- "harness|hendrycksTest-computer_security|5": 1,
389
- "harness|hendrycksTest-conceptual_physics|5": 1,
390
- "harness|hendrycksTest-econometrics|5": 1,
391
- "harness|hendrycksTest-electrical_engineering|5": 1,
392
- "harness|hendrycksTest-elementary_mathematics|5": 1,
393
- "harness|hendrycksTest-formal_logic|5": 1,
394
- "harness|hendrycksTest-global_facts|5": 1,
395
- "harness|hendrycksTest-high_school_biology|5": 1,
396
- "harness|hendrycksTest-high_school_chemistry|5": 1,
397
- "harness|hendrycksTest-high_school_computer_science|5": 1,
398
- "harness|hendrycksTest-high_school_european_history|5": 1,
399
- "harness|hendrycksTest-high_school_geography|5": 1,
400
- "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
- "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
- "harness|hendrycksTest-high_school_mathematics|5": 1,
403
- "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
- "harness|hendrycksTest-high_school_physics|5": 1,
405
- "harness|hendrycksTest-high_school_psychology|5": 1,
406
- "harness|hendrycksTest-high_school_statistics|5": 1,
407
- "harness|hendrycksTest-high_school_us_history|5": 1,
408
- "harness|hendrycksTest-high_school_world_history|5": 1,
409
- "harness|hendrycksTest-human_aging|5": 1,
410
- "harness|hendrycksTest-human_sexuality|5": 1,
411
- "harness|hendrycksTest-international_law|5": 1,
412
- "harness|hendrycksTest-jurisprudence|5": 1,
413
- "harness|hendrycksTest-logical_fallacies|5": 1,
414
- "harness|hendrycksTest-machine_learning|5": 1,
415
- "harness|hendrycksTest-management|5": 1,
416
- "harness|hendrycksTest-marketing|5": 1,
417
- "harness|hendrycksTest-medical_genetics|5": 1,
418
- "harness|hendrycksTest-miscellaneous|5": 1,
419
- "harness|hendrycksTest-moral_disputes|5": 1,
420
- "harness|hendrycksTest-moral_scenarios|5": 1,
421
- "harness|hendrycksTest-nutrition|5": 1,
422
- "harness|hendrycksTest-philosophy|5": 1,
423
- "harness|hendrycksTest-prehistory|5": 1,
424
- "harness|hendrycksTest-professional_accounting|5": 1,
425
- "harness|hendrycksTest-professional_law|5": 1,
426
- "harness|hendrycksTest-professional_medicine|5": 1,
427
- "harness|hendrycksTest-professional_psychology|5": 1,
428
- "harness|hendrycksTest-public_relations|5": 1,
429
- "harness|hendrycksTest-security_studies|5": 1,
430
- "harness|hendrycksTest-sociology|5": 1,
431
- "harness|hendrycksTest-us_foreign_policy|5": 1,
432
- "harness|hendrycksTest-virology|5": 1,
433
- "harness|hendrycksTest-world_religions|5": 1,
434
- "harness|truthfulqa:mc|0": 1,
435
- "all": 0
436
- },
437
- "config": {
438
- "model_name": "meta-llama/Llama-2-70b-hf",
439
- "model_sha": "ed7b07231238f836b99bf45701b9a0063576b194",
440
- "model_dtype": "torch.float16",
441
- "lighteval_sha": "d2e819bc028044e701a13b954d3326ceddb71b98",
442
- "num_few_shot_default": 0,
443
- "num_fewshot_seeds": 1,
444
- "override_batch_size": 1,
445
- "max_samples": null
446
- }
447
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-70b-hf/results_2023-09-08T23-38-08.931556.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-70b-hf",
4
- "model_sha": "cc8aa03a000ff08b4d5c5b39673321a2a396c396",
5
- "model_size": "128.64 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.0017827181208053692,
17
- "em_stderr": 0.00043200973460388544,
18
- "f1": 0.06615562080536916,
19
- "f1_stderr": 0.0013739852117668813
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.33965125094768767,
23
- "acc_stderr": 0.01304504506766526
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.8374112075769534,
27
- "acc_stderr": 0.010370455551343326
28
- },
29
- "all": {
30
- "em": 0.0017827181208053692,
31
- "em_stderr": 0.00043200973460388544,
32
- "f1": 0.06615562080536916,
33
- "f1_stderr": 0.0013739852117668813,
34
- "acc": 0.5885312292623206,
35
- "acc_stderr": 0.011707750309504293
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "32bc149506251e60"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "a95ce63226eb9a2d"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "dff37de5e6c9aeb7"
99
- },
100
- "total_evaluation_time_secondes": "28373.291680336",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-70b-hf/results_2023-09-18T06-46-44.905361.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-70b-hf",
4
- "model_sha": "cc8aa03a000ff08b4d5c5b39673321a2a396c396",
5
- "model_size": "128.64 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.0017827181208053692,
17
- "em_stderr": 0.00043200973460388544,
18
- "f1": 0.06615562080536916,
19
- "f1_stderr": 0.0013739852117668813
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.33965125094768767,
23
- "acc_stderr": 0.01304504506766526
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.8374112075769534,
27
- "acc_stderr": 0.010370455551343326
28
- },
29
- "all": {
30
- "em": 0.0017827181208053692,
31
- "em_stderr": 0.00043200973460388544,
32
- "f1": 0.06615562080536916,
33
- "f1_stderr": 0.0013739852117668813,
34
- "acc": 0.5885312292623206,
35
- "acc_stderr": 0.011707750309504293
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "32bc149506251e60"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "a95ce63226eb9a2d"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "dff37de5e6c9aeb7"
99
- },
100
- "total_evaluation_time_secondes": "45388.11919736862",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-7b-chat-hf/results.json DELETED
@@ -1,871 +0,0 @@
1
- {
2
- "results": {
3
- "harness|arc:challenge|25": {
4
- "acc": 0.4948805460750853,
5
- "acc_stderr": 0.01461062489030916,
6
- "acc_norm": 0.5290102389078498,
7
- "acc_norm_stderr": 0.014586776355294323
8
- },
9
- "harness|hellaswag|10": {
10
- "acc": 0.5978888667596096,
11
- "acc_stderr": 0.004893220635011792,
12
- "acc_norm": 0.7855008962358097,
13
- "acc_norm_stderr": 0.004096355125117511
14
- },
15
- "harness|hendrycksTest-abstract_algebra|5": {
16
- "acc": 0.28,
17
- "acc_stderr": 0.04512608598542129,
18
- "acc_norm": 0.28,
19
- "acc_norm_stderr": 0.04512608598542129
20
- },
21
- "harness|hendrycksTest-anatomy|5": {
22
- "acc": 0.42962962962962964,
23
- "acc_stderr": 0.04276349494376599,
24
- "acc_norm": 0.42962962962962964,
25
- "acc_norm_stderr": 0.04276349494376599
26
- },
27
- "harness|hendrycksTest-astronomy|5": {
28
- "acc": 0.4868421052631579,
29
- "acc_stderr": 0.04067533136309173,
30
- "acc_norm": 0.4868421052631579,
31
- "acc_norm_stderr": 0.04067533136309173
32
- },
33
- "harness|hendrycksTest-business_ethics|5": {
34
- "acc": 0.53,
35
- "acc_stderr": 0.050161355804659205,
36
- "acc_norm": 0.53,
37
- "acc_norm_stderr": 0.050161355804659205
38
- },
39
- "harness|hendrycksTest-clinical_knowledge|5": {
40
- "acc": 0.5358490566037736,
41
- "acc_stderr": 0.030693675018458003,
42
- "acc_norm": 0.5358490566037736,
43
- "acc_norm_stderr": 0.030693675018458003
44
- },
45
- "harness|hendrycksTest-college_biology|5": {
46
- "acc": 0.5208333333333334,
47
- "acc_stderr": 0.041775789507399935,
48
- "acc_norm": 0.5208333333333334,
49
- "acc_norm_stderr": 0.041775789507399935
50
- },
51
- "harness|hendrycksTest-college_chemistry|5": {
52
- "acc": 0.29,
53
- "acc_stderr": 0.04560480215720684,
54
- "acc_norm": 0.29,
55
- "acc_norm_stderr": 0.04560480215720684
56
- },
57
- "harness|hendrycksTest-college_computer_science|5": {
58
- "acc": 0.38,
59
- "acc_stderr": 0.048783173121456316,
60
- "acc_norm": 0.38,
61
- "acc_norm_stderr": 0.048783173121456316
62
- },
63
- "harness|hendrycksTest-college_mathematics|5": {
64
- "acc": 0.36,
65
- "acc_stderr": 0.04824181513244218,
66
- "acc_norm": 0.36,
67
- "acc_norm_stderr": 0.04824181513244218
68
- },
69
- "harness|hendrycksTest-college_medicine|5": {
70
- "acc": 0.3988439306358382,
71
- "acc_stderr": 0.037336266553835096,
72
- "acc_norm": 0.3988439306358382,
73
- "acc_norm_stderr": 0.037336266553835096
74
- },
75
- "harness|hendrycksTest-college_physics|5": {
76
- "acc": 0.22549019607843138,
77
- "acc_stderr": 0.041583075330832865,
78
- "acc_norm": 0.22549019607843138,
79
- "acc_norm_stderr": 0.041583075330832865
80
- },
81
- "harness|hendrycksTest-computer_security|5": {
82
- "acc": 0.58,
83
- "acc_stderr": 0.049604496374885836,
84
- "acc_norm": 0.58,
85
- "acc_norm_stderr": 0.049604496374885836
86
- },
87
- "harness|hendrycksTest-conceptual_physics|5": {
88
- "acc": 0.4085106382978723,
89
- "acc_stderr": 0.03213418026701576,
90
- "acc_norm": 0.4085106382978723,
91
- "acc_norm_stderr": 0.03213418026701576
92
- },
93
- "harness|hendrycksTest-econometrics|5": {
94
- "acc": 0.37719298245614036,
95
- "acc_stderr": 0.045595221419582166,
96
- "acc_norm": 0.37719298245614036,
97
- "acc_norm_stderr": 0.045595221419582166
98
- },
99
- "harness|hendrycksTest-electrical_engineering|5": {
100
- "acc": 0.4896551724137931,
101
- "acc_stderr": 0.04165774775728762,
102
- "acc_norm": 0.4896551724137931,
103
- "acc_norm_stderr": 0.04165774775728762
104
- },
105
- "harness|hendrycksTest-elementary_mathematics|5": {
106
- "acc": 0.29894179894179895,
107
- "acc_stderr": 0.023577604791655805,
108
- "acc_norm": 0.29894179894179895,
109
- "acc_norm_stderr": 0.023577604791655805
110
- },
111
- "harness|hendrycksTest-formal_logic|5": {
112
- "acc": 0.25396825396825395,
113
- "acc_stderr": 0.03893259610604675,
114
- "acc_norm": 0.25396825396825395,
115
- "acc_norm_stderr": 0.03893259610604675
116
- },
117
- "harness|hendrycksTest-global_facts|5": {
118
- "acc": 0.36,
119
- "acc_stderr": 0.048241815132442176,
120
- "acc_norm": 0.36,
121
- "acc_norm_stderr": 0.048241815132442176
122
- },
123
- "harness|hendrycksTest-high_school_biology|5": {
124
- "acc": 0.5225806451612903,
125
- "acc_stderr": 0.02841498501970786,
126
- "acc_norm": 0.5225806451612903,
127
- "acc_norm_stderr": 0.02841498501970786
128
- },
129
- "harness|hendrycksTest-high_school_chemistry|5": {
130
- "acc": 0.3645320197044335,
131
- "acc_stderr": 0.033864057460620905,
132
- "acc_norm": 0.3645320197044335,
133
- "acc_norm_stderr": 0.033864057460620905
134
- },
135
- "harness|hendrycksTest-high_school_computer_science|5": {
136
- "acc": 0.41,
137
- "acc_stderr": 0.04943110704237102,
138
- "acc_norm": 0.41,
139
- "acc_norm_stderr": 0.04943110704237102
140
- },
141
- "harness|hendrycksTest-high_school_european_history|5": {
142
- "acc": 0.5878787878787879,
143
- "acc_stderr": 0.03843566993588718,
144
- "acc_norm": 0.5878787878787879,
145
- "acc_norm_stderr": 0.03843566993588718
146
- },
147
- "harness|hendrycksTest-high_school_geography|5": {
148
- "acc": 0.6060606060606061,
149
- "acc_stderr": 0.034812853382329624,
150
- "acc_norm": 0.6060606060606061,
151
- "acc_norm_stderr": 0.034812853382329624
152
- },
153
- "harness|hendrycksTest-high_school_government_and_politics|5": {
154
- "acc": 0.7150259067357513,
155
- "acc_stderr": 0.032577140777096614,
156
- "acc_norm": 0.7150259067357513,
157
- "acc_norm_stderr": 0.032577140777096614
158
- },
159
- "harness|hendrycksTest-high_school_macroeconomics|5": {
160
- "acc": 0.4256410256410256,
161
- "acc_stderr": 0.02506909438729654,
162
- "acc_norm": 0.4256410256410256,
163
- "acc_norm_stderr": 0.02506909438729654
164
- },
165
- "harness|hendrycksTest-high_school_mathematics|5": {
166
- "acc": 0.25555555555555554,
167
- "acc_stderr": 0.02659393910184408,
168
- "acc_norm": 0.25555555555555554,
169
- "acc_norm_stderr": 0.02659393910184408
170
- },
171
- "harness|hendrycksTest-high_school_microeconomics|5": {
172
- "acc": 0.42436974789915966,
173
- "acc_stderr": 0.03210479051015776,
174
- "acc_norm": 0.42436974789915966,
175
- "acc_norm_stderr": 0.03210479051015776
176
- },
177
- "harness|hendrycksTest-high_school_physics|5": {
178
- "acc": 0.2913907284768212,
179
- "acc_stderr": 0.03710185726119995,
180
- "acc_norm": 0.2913907284768212,
181
- "acc_norm_stderr": 0.03710185726119995
182
- },
183
- "harness|hendrycksTest-high_school_psychology|5": {
184
- "acc": 0.6752293577981652,
185
- "acc_stderr": 0.020077729109310327,
186
- "acc_norm": 0.6752293577981652,
187
- "acc_norm_stderr": 0.020077729109310327
188
- },
189
- "harness|hendrycksTest-high_school_statistics|5": {
190
- "acc": 0.3333333333333333,
191
- "acc_stderr": 0.0321495214780275,
192
- "acc_norm": 0.3333333333333333,
193
- "acc_norm_stderr": 0.0321495214780275
194
- },
195
- "harness|hendrycksTest-high_school_us_history|5": {
196
- "acc": 0.6764705882352942,
197
- "acc_stderr": 0.0328347205610856,
198
- "acc_norm": 0.6764705882352942,
199
- "acc_norm_stderr": 0.0328347205610856
200
- },
201
- "harness|hendrycksTest-high_school_world_history|5": {
202
- "acc": 0.6666666666666666,
203
- "acc_stderr": 0.03068582059661079,
204
- "acc_norm": 0.6666666666666666,
205
- "acc_norm_stderr": 0.03068582059661079
206
- },
207
- "harness|hendrycksTest-human_aging|5": {
208
- "acc": 0.5605381165919282,
209
- "acc_stderr": 0.03331092511038179,
210
- "acc_norm": 0.5605381165919282,
211
- "acc_norm_stderr": 0.03331092511038179
212
- },
213
- "harness|hendrycksTest-human_sexuality|5": {
214
- "acc": 0.5725190839694656,
215
- "acc_stderr": 0.04338920305792401,
216
- "acc_norm": 0.5725190839694656,
217
- "acc_norm_stderr": 0.04338920305792401
218
- },
219
- "harness|hendrycksTest-international_law|5": {
220
- "acc": 0.628099173553719,
221
- "acc_stderr": 0.04412015806624504,
222
- "acc_norm": 0.628099173553719,
223
- "acc_norm_stderr": 0.04412015806624504
224
- },
225
- "harness|hendrycksTest-jurisprudence|5": {
226
- "acc": 0.5833333333333334,
227
- "acc_stderr": 0.04766075165356461,
228
- "acc_norm": 0.5833333333333334,
229
- "acc_norm_stderr": 0.04766075165356461
230
- },
231
- "harness|hendrycksTest-logical_fallacies|5": {
232
- "acc": 0.5521472392638037,
233
- "acc_stderr": 0.03906947479456606,
234
- "acc_norm": 0.5521472392638037,
235
- "acc_norm_stderr": 0.03906947479456606
236
- },
237
- "harness|hendrycksTest-machine_learning|5": {
238
- "acc": 0.30357142857142855,
239
- "acc_stderr": 0.04364226155841044,
240
- "acc_norm": 0.30357142857142855,
241
- "acc_norm_stderr": 0.04364226155841044
242
- },
243
- "harness|hendrycksTest-management|5": {
244
- "acc": 0.6699029126213593,
245
- "acc_stderr": 0.04656147110012351,
246
- "acc_norm": 0.6699029126213593,
247
- "acc_norm_stderr": 0.04656147110012351
248
- },
249
- "harness|hendrycksTest-marketing|5": {
250
- "acc": 0.7094017094017094,
251
- "acc_stderr": 0.029745048572674074,
252
- "acc_norm": 0.7094017094017094,
253
- "acc_norm_stderr": 0.029745048572674074
254
- },
255
- "harness|hendrycksTest-medical_genetics|5": {
256
- "acc": 0.5,
257
- "acc_stderr": 0.050251890762960605,
258
- "acc_norm": 0.5,
259
- "acc_norm_stderr": 0.050251890762960605
260
- },
261
- "harness|hendrycksTest-miscellaneous|5": {
262
- "acc": 0.6756066411238825,
263
- "acc_stderr": 0.0167409290471627,
264
- "acc_norm": 0.6756066411238825,
265
- "acc_norm_stderr": 0.0167409290471627
266
- },
267
- "harness|hendrycksTest-moral_disputes|5": {
268
- "acc": 0.5144508670520231,
269
- "acc_stderr": 0.026907849856282542,
270
- "acc_norm": 0.5144508670520231,
271
- "acc_norm_stderr": 0.026907849856282542
272
- },
273
- "harness|hendrycksTest-moral_scenarios|5": {
274
- "acc": 0.2201117318435754,
275
- "acc_stderr": 0.013856994024227175,
276
- "acc_norm": 0.2201117318435754,
277
- "acc_norm_stderr": 0.013856994024227175
278
- },
279
- "harness|hendrycksTest-nutrition|5": {
280
- "acc": 0.5196078431372549,
281
- "acc_stderr": 0.028607893699576066,
282
- "acc_norm": 0.5196078431372549,
283
- "acc_norm_stderr": 0.028607893699576066
284
- },
285
- "harness|hendrycksTest-philosophy|5": {
286
- "acc": 0.5659163987138264,
287
- "acc_stderr": 0.02815023224453559,
288
- "acc_norm": 0.5659163987138264,
289
- "acc_norm_stderr": 0.02815023224453559
290
- },
291
- "harness|hendrycksTest-prehistory|5": {
292
- "acc": 0.5679012345679012,
293
- "acc_stderr": 0.027563010971606676,
294
- "acc_norm": 0.5679012345679012,
295
- "acc_norm_stderr": 0.027563010971606676
296
- },
297
- "harness|hendrycksTest-professional_accounting|5": {
298
- "acc": 0.3723404255319149,
299
- "acc_stderr": 0.028838921471251458,
300
- "acc_norm": 0.3723404255319149,
301
- "acc_norm_stderr": 0.028838921471251458
302
- },
303
- "harness|hendrycksTest-professional_law|5": {
304
- "acc": 0.3500651890482399,
305
- "acc_stderr": 0.012182552313215175,
306
- "acc_norm": 0.3500651890482399,
307
- "acc_norm_stderr": 0.012182552313215175
308
- },
309
- "harness|hendrycksTest-professional_medicine|5": {
310
- "acc": 0.45588235294117646,
311
- "acc_stderr": 0.030254372573976684,
312
- "acc_norm": 0.45588235294117646,
313
- "acc_norm_stderr": 0.030254372573976684
314
- },
315
- "harness|hendrycksTest-professional_psychology|5": {
316
- "acc": 0.4803921568627451,
317
- "acc_stderr": 0.020212274976302957,
318
- "acc_norm": 0.4803921568627451,
319
- "acc_norm_stderr": 0.020212274976302957
320
- },
321
- "harness|hendrycksTest-public_relations|5": {
322
- "acc": 0.5272727272727272,
323
- "acc_stderr": 0.04782001791380061,
324
- "acc_norm": 0.5272727272727272,
325
- "acc_norm_stderr": 0.04782001791380061
326
- },
327
- "harness|hendrycksTest-security_studies|5": {
328
- "acc": 0.5265306122448979,
329
- "acc_stderr": 0.03196412734523272,
330
- "acc_norm": 0.5265306122448979,
331
- "acc_norm_stderr": 0.03196412734523272
332
- },
333
- "harness|hendrycksTest-sociology|5": {
334
- "acc": 0.6467661691542289,
335
- "acc_stderr": 0.03379790611796777,
336
- "acc_norm": 0.6467661691542289,
337
- "acc_norm_stderr": 0.03379790611796777
338
- },
339
- "harness|hendrycksTest-us_foreign_policy|5": {
340
- "acc": 0.72,
341
- "acc_stderr": 0.045126085985421276,
342
- "acc_norm": 0.72,
343
- "acc_norm_stderr": 0.045126085985421276
344
- },
345
- "harness|hendrycksTest-virology|5": {
346
- "acc": 0.43373493975903615,
347
- "acc_stderr": 0.03858158940685517,
348
- "acc_norm": 0.43373493975903615,
349
- "acc_norm_stderr": 0.03858158940685517
350
- },
351
- "harness|hendrycksTest-world_religions|5": {
352
- "acc": 0.7251461988304093,
353
- "acc_stderr": 0.034240429246915824,
354
- "acc_norm": 0.7251461988304093,
355
- "acc_norm_stderr": 0.034240429246915824
356
- },
357
- "harness|truthfulqa:mc|0": {
358
- "mc1": 0.3011015911872705,
359
- "mc1_stderr": 0.016058999026100616,
360
- "mc2": 0.45570370195101134,
361
- "mc2_stderr": 0.015691038880908878
362
- },
363
- "all": {
364
- "acc": 0.4853305078812575,
365
- "acc_stderr": 0.03506342425063614,
366
- "acc_norm": 0.4890888421576806,
367
- "acc_norm_stderr": 0.03504951384309531,
368
- "mc1": 0.3011015911872705,
369
- "mc1_stderr": 0.016058999026100616,
370
- "mc2": 0.45570370195101134,
371
- "mc2_stderr": 0.015691038880908878
372
- }
373
- },
374
- "versions": {
375
- "harness|arc:challenge|25": 0,
376
- "harness|hellaswag|10": 0,
377
- "harness|hendrycksTest-abstract_algebra|5": 1,
378
- "harness|hendrycksTest-anatomy|5": 1,
379
- "harness|hendrycksTest-astronomy|5": 1,
380
- "harness|hendrycksTest-business_ethics|5": 1,
381
- "harness|hendrycksTest-clinical_knowledge|5": 1,
382
- "harness|hendrycksTest-college_biology|5": 1,
383
- "harness|hendrycksTest-college_chemistry|5": 1,
384
- "harness|hendrycksTest-college_computer_science|5": 1,
385
- "harness|hendrycksTest-college_mathematics|5": 1,
386
- "harness|hendrycksTest-college_medicine|5": 1,
387
- "harness|hendrycksTest-college_physics|5": 1,
388
- "harness|hendrycksTest-computer_security|5": 1,
389
- "harness|hendrycksTest-conceptual_physics|5": 1,
390
- "harness|hendrycksTest-econometrics|5": 1,
391
- "harness|hendrycksTest-electrical_engineering|5": 1,
392
- "harness|hendrycksTest-elementary_mathematics|5": 1,
393
- "harness|hendrycksTest-formal_logic|5": 1,
394
- "harness|hendrycksTest-global_facts|5": 1,
395
- "harness|hendrycksTest-high_school_biology|5": 1,
396
- "harness|hendrycksTest-high_school_chemistry|5": 1,
397
- "harness|hendrycksTest-high_school_computer_science|5": 1,
398
- "harness|hendrycksTest-high_school_european_history|5": 1,
399
- "harness|hendrycksTest-high_school_geography|5": 1,
400
- "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
- "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
- "harness|hendrycksTest-high_school_mathematics|5": 1,
403
- "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
- "harness|hendrycksTest-high_school_physics|5": 1,
405
- "harness|hendrycksTest-high_school_psychology|5": 1,
406
- "harness|hendrycksTest-high_school_statistics|5": 1,
407
- "harness|hendrycksTest-high_school_us_history|5": 1,
408
- "harness|hendrycksTest-high_school_world_history|5": 1,
409
- "harness|hendrycksTest-human_aging|5": 1,
410
- "harness|hendrycksTest-human_sexuality|5": 1,
411
- "harness|hendrycksTest-international_law|5": 1,
412
- "harness|hendrycksTest-jurisprudence|5": 1,
413
- "harness|hendrycksTest-logical_fallacies|5": 1,
414
- "harness|hendrycksTest-machine_learning|5": 1,
415
- "harness|hendrycksTest-management|5": 1,
416
- "harness|hendrycksTest-marketing|5": 1,
417
- "harness|hendrycksTest-medical_genetics|5": 1,
418
- "harness|hendrycksTest-miscellaneous|5": 1,
419
- "harness|hendrycksTest-moral_disputes|5": 1,
420
- "harness|hendrycksTest-moral_scenarios|5": 1,
421
- "harness|hendrycksTest-nutrition|5": 1,
422
- "harness|hendrycksTest-philosophy|5": 1,
423
- "harness|hendrycksTest-prehistory|5": 1,
424
- "harness|hendrycksTest-professional_accounting|5": 1,
425
- "harness|hendrycksTest-professional_law|5": 1,
426
- "harness|hendrycksTest-professional_medicine|5": 1,
427
- "harness|hendrycksTest-professional_psychology|5": 1,
428
- "harness|hendrycksTest-public_relations|5": 1,
429
- "harness|hendrycksTest-security_studies|5": 1,
430
- "harness|hendrycksTest-sociology|5": 1,
431
- "harness|hendrycksTest-us_foreign_policy|5": 1,
432
- "harness|hendrycksTest-virology|5": 1,
433
- "harness|hendrycksTest-world_religions|5": 1,
434
- "harness|truthfulqa:mc|0": 1,
435
- "all": 0
436
- },
437
- "config": {
438
- "model_name": "meta-llama/Llama-2-7b-chat-hf",
439
- "model_sha": "b7701a9e825e79a5ab18b5801be113c2160cc627",
440
- "model_dtype": "torch.float16",
441
- "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
442
- "num_few_shot_default": 0,
443
- "num_fewshot_seeds": 1,
444
- "override_batch_size": 1,
445
- "max_samples": null
446
- },
447
- "task_config": {
448
- "harness|arc:challenge": "LM Harness task",
449
- "harness|hellaswag": "LM Harness task",
450
- "harness|hendrycksTest-abstract_algebra": "LM Harness task",
451
- "harness|hendrycksTest-anatomy": "LM Harness task",
452
- "harness|hendrycksTest-astronomy": "LM Harness task",
453
- "harness|hendrycksTest-business_ethics": "LM Harness task",
454
- "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
455
- "harness|hendrycksTest-college_biology": "LM Harness task",
456
- "harness|hendrycksTest-college_chemistry": "LM Harness task",
457
- "harness|hendrycksTest-college_computer_science": "LM Harness task",
458
- "harness|hendrycksTest-college_mathematics": "LM Harness task",
459
- "harness|hendrycksTest-college_medicine": "LM Harness task",
460
- "harness|hendrycksTest-college_physics": "LM Harness task",
461
- "harness|hendrycksTest-computer_security": "LM Harness task",
462
- "harness|hendrycksTest-conceptual_physics": "LM Harness task",
463
- "harness|hendrycksTest-econometrics": "LM Harness task",
464
- "harness|hendrycksTest-electrical_engineering": "LM Harness task",
465
- "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
466
- "harness|hendrycksTest-formal_logic": "LM Harness task",
467
- "harness|hendrycksTest-global_facts": "LM Harness task",
468
- "harness|hendrycksTest-high_school_biology": "LM Harness task",
469
- "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
470
- "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
471
- "harness|hendrycksTest-high_school_european_history": "LM Harness task",
472
- "harness|hendrycksTest-high_school_geography": "LM Harness task",
473
- "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
474
- "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
475
- "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
476
- "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
477
- "harness|hendrycksTest-high_school_physics": "LM Harness task",
478
- "harness|hendrycksTest-high_school_psychology": "LM Harness task",
479
- "harness|hendrycksTest-high_school_statistics": "LM Harness task",
480
- "harness|hendrycksTest-high_school_us_history": "LM Harness task",
481
- "harness|hendrycksTest-high_school_world_history": "LM Harness task",
482
- "harness|hendrycksTest-human_aging": "LM Harness task",
483
- "harness|hendrycksTest-human_sexuality": "LM Harness task",
484
- "harness|hendrycksTest-international_law": "LM Harness task",
485
- "harness|hendrycksTest-jurisprudence": "LM Harness task",
486
- "harness|hendrycksTest-logical_fallacies": "LM Harness task",
487
- "harness|hendrycksTest-machine_learning": "LM Harness task",
488
- "harness|hendrycksTest-management": "LM Harness task",
489
- "harness|hendrycksTest-marketing": "LM Harness task",
490
- "harness|hendrycksTest-medical_genetics": "LM Harness task",
491
- "harness|hendrycksTest-miscellaneous": "LM Harness task",
492
- "harness|hendrycksTest-moral_disputes": "LM Harness task",
493
- "harness|hendrycksTest-moral_scenarios": "LM Harness task",
494
- "harness|hendrycksTest-nutrition": "LM Harness task",
495
- "harness|hendrycksTest-philosophy": "LM Harness task",
496
- "harness|hendrycksTest-prehistory": "LM Harness task",
497
- "harness|hendrycksTest-professional_accounting": "LM Harness task",
498
- "harness|hendrycksTest-professional_law": "LM Harness task",
499
- "harness|hendrycksTest-professional_medicine": "LM Harness task",
500
- "harness|hendrycksTest-professional_psychology": "LM Harness task",
501
- "harness|hendrycksTest-public_relations": "LM Harness task",
502
- "harness|hendrycksTest-security_studies": "LM Harness task",
503
- "harness|hendrycksTest-sociology": "LM Harness task",
504
- "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
505
- "harness|hendrycksTest-virology": "LM Harness task",
506
- "harness|hendrycksTest-world_religions": "LM Harness task",
507
- "harness|truthfulqa:mc": "LM Harness task"
508
- },
509
- "hashes": {
510
- "harness|arc:challenge|25": {
511
- "hash_examples": "fb8c51b1872daeda",
512
- "hash_full_prompts": "045cbb916e5145c6",
513
- "hash_input_tokens": "61571bf68d6d89aa",
514
- "hash_cont_tokens": "8210decc6ff6f7df"
515
- },
516
- "harness|hellaswag|10": {
517
- "hash_examples": "e1768ecb99d7ecf0",
518
- "hash_full_prompts": "0b4c16983130f84f",
519
- "hash_input_tokens": "29906669b1c7054a",
520
- "hash_cont_tokens": "b3b9e9017afa63af"
521
- },
522
- "harness|hendrycksTest-abstract_algebra|5": {
523
- "hash_examples": "280f9f325b40559a",
524
- "hash_full_prompts": "2f776a367d23aea2",
525
- "hash_input_tokens": "c54ff61ad0273dd7",
526
- "hash_cont_tokens": "50421e30bef398f9"
527
- },
528
- "harness|hendrycksTest-anatomy|5": {
529
- "hash_examples": "2f83a4f1cab4ba18",
530
- "hash_full_prompts": "516f74bef25df620",
531
- "hash_input_tokens": "be31a1e22aef5f90",
532
- "hash_cont_tokens": "f11971a765cb609f"
533
- },
534
- "harness|hendrycksTest-astronomy|5": {
535
- "hash_examples": "7d587b908da4d762",
536
- "hash_full_prompts": "faf4e80f65de93ca",
537
- "hash_input_tokens": "277a7b1fad566940",
538
- "hash_cont_tokens": "bf30e5d3f48250cb"
539
- },
540
- "harness|hendrycksTest-business_ethics|5": {
541
- "hash_examples": "33e51740670de686",
542
- "hash_full_prompts": "db01c3ef8e1479d4",
543
- "hash_input_tokens": "ba552605bc116de5",
544
- "hash_cont_tokens": "bc1dd9b2d995eb61"
545
- },
546
- "harness|hendrycksTest-clinical_knowledge|5": {
547
- "hash_examples": "f3366dbe7eefffa4",
548
- "hash_full_prompts": "49654f71d94b65c3",
549
- "hash_input_tokens": "428c7563d0b98ab9",
550
- "hash_cont_tokens": "890a119624b3b935"
551
- },
552
- "harness|hendrycksTest-college_biology|5": {
553
- "hash_examples": "ca2b6753a0193e7f",
554
- "hash_full_prompts": "2b460b75f1fdfefd",
555
- "hash_input_tokens": "da036601573942e2",
556
- "hash_cont_tokens": "875cde3af7a0ee14"
557
- },
558
- "harness|hendrycksTest-college_chemistry|5": {
559
- "hash_examples": "22ff85f1d34f42d1",
560
- "hash_full_prompts": "242c9be6da583e95",
561
- "hash_input_tokens": "94e0196d6aded13d",
562
- "hash_cont_tokens": "50421e30bef398f9"
563
- },
564
- "harness|hendrycksTest-college_computer_science|5": {
565
- "hash_examples": "30318289d717a5cf",
566
- "hash_full_prompts": "ed2bdb4e87c4b371",
567
- "hash_input_tokens": "6e4d0f4a8d36690b",
568
- "hash_cont_tokens": "ffc0fe414cdc4a83"
569
- },
570
- "harness|hendrycksTest-college_mathematics|5": {
571
- "hash_examples": "4944d1f0b6b5d911",
572
- "hash_full_prompts": "770bc4281c973190",
573
- "hash_input_tokens": "614054d17109a25d",
574
- "hash_cont_tokens": "50421e30bef398f9"
575
- },
576
- "harness|hendrycksTest-college_medicine|5": {
577
- "hash_examples": "dd69cc33381275af",
578
- "hash_full_prompts": "ad2a53e5250ab46e",
579
- "hash_input_tokens": "1d633b3cc0524ba8",
580
- "hash_cont_tokens": "1f88b00d41957d82"
581
- },
582
- "harness|hendrycksTest-college_physics|5": {
583
- "hash_examples": "875dd26d22655b0d",
584
- "hash_full_prompts": "833a0d7b55aed500",
585
- "hash_input_tokens": "5421d9a1af86cbd4",
586
- "hash_cont_tokens": "f7b8097afc16a47c"
587
- },
588
- "harness|hendrycksTest-computer_security|5": {
589
- "hash_examples": "006451eedc0ededb",
590
- "hash_full_prompts": "94034c97e85d8f46",
591
- "hash_input_tokens": "5e6b70ecb333cf18",
592
- "hash_cont_tokens": "50421e30bef398f9"
593
- },
594
- "harness|hendrycksTest-conceptual_physics|5": {
595
- "hash_examples": "8874ece872d2ca4c",
596
- "hash_full_prompts": "e40d15a34640d6fa",
597
- "hash_input_tokens": "c2ef11a87264ceed",
598
- "hash_cont_tokens": "aa0e8bc655f2f641"
599
- },
600
- "harness|hendrycksTest-econometrics|5": {
601
- "hash_examples": "64d3623b0bfaa43f",
602
- "hash_full_prompts": "612f340fae41338d",
603
- "hash_input_tokens": "ecaccd912a4c3978",
604
- "hash_cont_tokens": "bfb7e3c3c88313f1"
605
- },
606
- "harness|hendrycksTest-electrical_engineering|5": {
607
- "hash_examples": "e98f51780c674d7e",
608
- "hash_full_prompts": "10275b312d812ae6",
609
- "hash_input_tokens": "1590c84291399be8",
610
- "hash_cont_tokens": "2425a3f084a591ef"
611
- },
612
- "harness|hendrycksTest-elementary_mathematics|5": {
613
- "hash_examples": "fc48208a5ac1c0ce",
614
- "hash_full_prompts": "5ec274c6c82aca23",
615
- "hash_input_tokens": "3269597f715b0da1",
616
- "hash_cont_tokens": "f52691aef15a407b"
617
- },
618
- "harness|hendrycksTest-formal_logic|5": {
619
- "hash_examples": "5a6525665f63ea72",
620
- "hash_full_prompts": "07b92638c4a6b500",
621
- "hash_input_tokens": "a2800d20f3ab8d7c",
622
- "hash_cont_tokens": "f515d598d9c21263"
623
- },
624
- "harness|hendrycksTest-global_facts|5": {
625
- "hash_examples": "371d70d743b2b89b",
626
- "hash_full_prompts": "332fdee50a1921b4",
627
- "hash_input_tokens": "94ed44b3772505ad",
628
- "hash_cont_tokens": "50421e30bef398f9"
629
- },
630
- "harness|hendrycksTest-high_school_biology|5": {
631
- "hash_examples": "a79e1018b1674052",
632
- "hash_full_prompts": "e624e26ede922561",
633
- "hash_input_tokens": "24423acb928db768",
634
- "hash_cont_tokens": "bd85a4156a3613ee"
635
- },
636
- "harness|hendrycksTest-high_school_chemistry|5": {
637
- "hash_examples": "44bfc25c389f0e03",
638
- "hash_full_prompts": "0e3e5f5d9246482a",
639
- "hash_input_tokens": "831ff35c474e5cef",
640
- "hash_cont_tokens": "a95c97af1c14e068"
641
- },
642
- "harness|hendrycksTest-high_school_computer_science|5": {
643
- "hash_examples": "8b8cdb1084f24169",
644
- "hash_full_prompts": "c00487e67c1813cc",
645
- "hash_input_tokens": "8c34e0f2bda77358",
646
- "hash_cont_tokens": "8abfedef914e33c9"
647
- },
648
- "harness|hendrycksTest-high_school_european_history|5": {
649
- "hash_examples": "11cd32d0ef440171",
650
- "hash_full_prompts": "318f4513c537c6bf",
651
- "hash_input_tokens": "f1f73dd687da18d7",
652
- "hash_cont_tokens": "674fc454bdc5ac93"
653
- },
654
- "harness|hendrycksTest-high_school_geography|5": {
655
- "hash_examples": "b60019b9e80b642f",
656
- "hash_full_prompts": "ee5789fcc1a81b1e",
657
- "hash_input_tokens": "7c5547c7da5bc793",
658
- "hash_cont_tokens": "03a5012b916274ea"
659
- },
660
- "harness|hendrycksTest-high_school_government_and_politics|5": {
661
- "hash_examples": "d221ec983d143dc3",
662
- "hash_full_prompts": "ac42d888e1ce1155",
663
- "hash_input_tokens": "f62991cb6a496b05",
664
- "hash_cont_tokens": "a83effb8f76b7d7c"
665
- },
666
- "harness|hendrycksTest-high_school_macroeconomics|5": {
667
- "hash_examples": "59c2915cacfd3fbb",
668
- "hash_full_prompts": "c6bd9d25158abd0e",
669
- "hash_input_tokens": "4cef2aff6e3d59ed",
670
- "hash_cont_tokens": "c583432ad27fcfe0"
671
- },
672
- "harness|hendrycksTest-high_school_mathematics|5": {
673
- "hash_examples": "1f8ac897608de342",
674
- "hash_full_prompts": "5d88f41fc2d643a8",
675
- "hash_input_tokens": "6e2577ea4082ed2b",
676
- "hash_cont_tokens": "24f5dc613660300b"
677
- },
678
- "harness|hendrycksTest-high_school_microeconomics|5": {
679
- "hash_examples": "ead6a0f2f6c83370",
680
- "hash_full_prompts": "bfc393381298609e",
681
- "hash_input_tokens": "c5fc9aeb1079c8e4",
682
- "hash_cont_tokens": "f47f041de50333b9"
683
- },
684
- "harness|hendrycksTest-high_school_physics|5": {
685
- "hash_examples": "c3f2025990afec64",
686
- "hash_full_prompts": "fc78b4997e436734",
687
- "hash_input_tokens": "555fc385cffa84ca",
688
- "hash_cont_tokens": "ba2efcd283e938cc"
689
- },
690
- "harness|hendrycksTest-high_school_psychology|5": {
691
- "hash_examples": "21f8aab618f6d636",
692
- "hash_full_prompts": "d5c76aa40b9dbc43",
693
- "hash_input_tokens": "febd23cbf9973b7f",
694
- "hash_cont_tokens": "942069cd363844d9"
695
- },
696
- "harness|hendrycksTest-high_school_statistics|5": {
697
- "hash_examples": "2386a60a11fc5de3",
698
- "hash_full_prompts": "4c5c8be5aafac432",
699
- "hash_input_tokens": "424b02981230ee83",
700
- "hash_cont_tokens": "955ed42b6f7fa019"
701
- },
702
- "harness|hendrycksTest-high_school_us_history|5": {
703
- "hash_examples": "74961543be40f04f",
704
- "hash_full_prompts": "5d5ca4840131ba21",
705
- "hash_input_tokens": "50c9ff438c85a69e",
706
- "hash_cont_tokens": "cdd0b3dc06d933e5"
707
- },
708
- "harness|hendrycksTest-high_school_world_history|5": {
709
- "hash_examples": "2ad2f6b7198b2234",
710
- "hash_full_prompts": "11845057459afd72",
711
- "hash_input_tokens": "054824cc474caef5",
712
- "hash_cont_tokens": "9a864184946033ac"
713
- },
714
- "harness|hendrycksTest-human_aging|5": {
715
- "hash_examples": "1a7199dc733e779b",
716
- "hash_full_prompts": "756b9096b8eaf892",
717
- "hash_input_tokens": "541a75f071dcf579",
718
- "hash_cont_tokens": "142a4a8a1138a214"
719
- },
720
- "harness|hendrycksTest-human_sexuality|5": {
721
- "hash_examples": "7acb8fdad97f88a6",
722
- "hash_full_prompts": "731a52ff15b8cfdb",
723
- "hash_input_tokens": "04269e5c5a257dd9",
724
- "hash_cont_tokens": "bc54813e809b796d"
725
- },
726
- "harness|hendrycksTest-international_law|5": {
727
- "hash_examples": "1300bfd0dfc59114",
728
- "hash_full_prompts": "db2aefbff5eec996",
729
- "hash_input_tokens": "d93ba9d9d38e4397",
730
- "hash_cont_tokens": "dc45b45fcda18e5d"
731
- },
732
- "harness|hendrycksTest-jurisprudence|5": {
733
- "hash_examples": "083b1e4904c48dc2",
734
- "hash_full_prompts": "0f89ee3fe03d6a21",
735
- "hash_input_tokens": "9eeaccd2698b4f5a",
736
- "hash_cont_tokens": "e3a8cd951b6e3469"
737
- },
738
- "harness|hendrycksTest-logical_fallacies|5": {
739
- "hash_examples": "709128f9926a634c",
740
- "hash_full_prompts": "98a04b1f8f841069",
741
- "hash_input_tokens": "b4f08f544f2b7576",
742
- "hash_cont_tokens": "1e80dbd30f6453d5"
743
- },
744
- "harness|hendrycksTest-machine_learning|5": {
745
- "hash_examples": "88f22a636029ae47",
746
- "hash_full_prompts": "2e1c8d4b1e0cc921",
747
- "hash_input_tokens": "900c2a51f1174b9f",
748
- "hash_cont_tokens": "9b37da7777378ca9"
749
- },
750
- "harness|hendrycksTest-management|5": {
751
- "hash_examples": "8c8a1e07a2151dca",
752
- "hash_full_prompts": "f51611f514b265b0",
753
- "hash_input_tokens": "6b36efb4689c6eca",
754
- "hash_cont_tokens": "a01d6d39a83c4597"
755
- },
756
- "harness|hendrycksTest-marketing|5": {
757
- "hash_examples": "2668953431f91e96",
758
- "hash_full_prompts": "77562bef997c7650",
759
- "hash_input_tokens": "2aaac78a0cfed47a",
760
- "hash_cont_tokens": "6aeaed4d823c98aa"
761
- },
762
- "harness|hendrycksTest-medical_genetics|5": {
763
- "hash_examples": "9c2dda34a2ea4fd2",
764
- "hash_full_prompts": "202139046daa118f",
765
- "hash_input_tokens": "886ca823b41c094a",
766
- "hash_cont_tokens": "50421e30bef398f9"
767
- },
768
- "harness|hendrycksTest-miscellaneous|5": {
769
- "hash_examples": "41adb694024809c2",
770
- "hash_full_prompts": "bffec9fc237bcf93",
771
- "hash_input_tokens": "72fd71de7675e7d0",
772
- "hash_cont_tokens": "9b0ab02a64603081"
773
- },
774
- "harness|hendrycksTest-moral_disputes|5": {
775
- "hash_examples": "3171c13ba3c594c4",
776
- "hash_full_prompts": "170831fc36f1d59e",
777
- "hash_input_tokens": "f3ca0dd8e7a1eb09",
778
- "hash_cont_tokens": "8badf768f7b0467a"
779
- },
780
- "harness|hendrycksTest-moral_scenarios|5": {
781
- "hash_examples": "9873e077e83e0546",
782
- "hash_full_prompts": "08f4ceba3131a068",
783
- "hash_input_tokens": "3e793631e951f23c",
784
- "hash_cont_tokens": "32ae620376b2bbba"
785
- },
786
- "harness|hendrycksTest-nutrition|5": {
787
- "hash_examples": "7db1d8142ec14323",
788
- "hash_full_prompts": "4c0e68e3586cb453",
789
- "hash_input_tokens": "59753c2144ea93af",
790
- "hash_cont_tokens": "3071def75bacc404"
791
- },
792
- "harness|hendrycksTest-philosophy|5": {
793
- "hash_examples": "9b455b7d72811cc8",
794
- "hash_full_prompts": "e467f822d8a0d3ff",
795
- "hash_input_tokens": "bd8d3dbed15a8c34",
796
- "hash_cont_tokens": "9f6ff69d23a48783"
797
- },
798
- "harness|hendrycksTest-prehistory|5": {
799
- "hash_examples": "8be90d0f538f1560",
800
- "hash_full_prompts": "152187949bcd0921",
801
- "hash_input_tokens": "3573cd87facbb7c5",
802
- "hash_cont_tokens": "de469d2b981e32a3"
803
- },
804
- "harness|hendrycksTest-professional_accounting|5": {
805
- "hash_examples": "8d377597916cd07e",
806
- "hash_full_prompts": "0eb7345d6144ee0d",
807
- "hash_input_tokens": "17e721bc1a7cbb47",
808
- "hash_cont_tokens": "c46f74d2dfc7b13b"
809
- },
810
- "harness|hendrycksTest-professional_law|5": {
811
- "hash_examples": "cd9dbc52b3c932d6",
812
- "hash_full_prompts": "36ac764272bfb182",
813
- "hash_input_tokens": "9178e10bd0763ec4",
814
- "hash_cont_tokens": "2e590029ef41fbcd"
815
- },
816
- "harness|hendrycksTest-professional_medicine|5": {
817
- "hash_examples": "b20e4e816c1e383e",
818
- "hash_full_prompts": "7b8d69ea2acaf2f7",
819
- "hash_input_tokens": "f5a22012a54f70ea",
820
- "hash_cont_tokens": "fe35cfa9c6ca802e"
821
- },
822
- "harness|hendrycksTest-professional_psychology|5": {
823
- "hash_examples": "d45b73b22f9cc039",
824
- "hash_full_prompts": "fe8937e9ffc99771",
825
- "hash_input_tokens": "0dfb73a8eb3f692c",
826
- "hash_cont_tokens": "f020fbddf72c8652"
827
- },
828
- "harness|hendrycksTest-public_relations|5": {
829
- "hash_examples": "0d25072e1761652a",
830
- "hash_full_prompts": "f9adc39cfa9f42ba",
831
- "hash_input_tokens": "1710c6ba4c9f3cbd",
832
- "hash_cont_tokens": "568f585a259965c1"
833
- },
834
- "harness|hendrycksTest-security_studies|5": {
835
- "hash_examples": "62bb8197e63d60d4",
836
- "hash_full_prompts": "869c9c3ae196b7c3",
837
- "hash_input_tokens": "d49711415961ced7",
838
- "hash_cont_tokens": "cc6fd7cccd64cd5d"
839
- },
840
- "harness|hendrycksTest-sociology|5": {
841
- "hash_examples": "e7959df87dea8672",
842
- "hash_full_prompts": "1a1fc00e17b3a52a",
843
- "hash_input_tokens": "828999f7624cbe7e",
844
- "hash_cont_tokens": "c3a3bdfd177eed5b"
845
- },
846
- "harness|hendrycksTest-us_foreign_policy|5": {
847
- "hash_examples": "4a56a01ddca44dca",
848
- "hash_full_prompts": "0c7a7081c71c07b6",
849
- "hash_input_tokens": "42054621e718dbee",
850
- "hash_cont_tokens": "2568d0e8e36fa959"
851
- },
852
- "harness|hendrycksTest-virology|5": {
853
- "hash_examples": "451cc86a8c4f4fe9",
854
- "hash_full_prompts": "01e95325d8b738e4",
855
- "hash_input_tokens": "6c4f0aa4dc859c04",
856
- "hash_cont_tokens": "926cf60b0891f374"
857
- },
858
- "harness|hendrycksTest-world_religions|5": {
859
- "hash_examples": "3b29cfaf1a81c379",
860
- "hash_full_prompts": "e0d79a15083dfdff",
861
- "hash_input_tokens": "6c75d44e092ff24f",
862
- "hash_cont_tokens": "c525a5de974c1ea3"
863
- },
864
- "harness|truthfulqa:mc|0": {
865
- "hash_examples": "23176c0531c7b867",
866
- "hash_full_prompts": "36a6d90e75d92d4a",
867
- "hash_input_tokens": "2738d7ed7075faa7",
868
- "hash_cont_tokens": "c014154380b74b9e"
869
- }
870
- }
871
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-7b-chat-hf/results_2023-10-15T02-34-15.484281.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-7b-chat-hf",
4
- "model_sha": "af6df14e494ef16d69ec55e9a016e900a2dde1c8",
5
- "model_size": "12.61 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.06763842281879194,
17
- "em_stderr": 0.0025717489509556085,
18
- "f1": 0.13085570469798627,
19
- "f1_stderr": 0.0028825856446422905
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.07354056103108415,
23
- "acc_stderr": 0.0071898357543652685
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.7174427782162589,
27
- "acc_stderr": 0.012654062850971384
28
- },
29
- "all": {
30
- "em": 0.06763842281879194,
31
- "em_stderr": 0.0025717489509556085,
32
- "f1": 0.13085570469798627,
33
- "f1_stderr": 0.0028825856446422905,
34
- "acc": 0.39549166962367155,
35
- "acc_stderr": 0.009921949302668327
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "c4b3a30639a21038"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "602d6f5b58c4afb3"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "40836b0405f6a16d"
99
- },
100
- "total_evaluation_time_secondes": "9024.860627889633",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-7b-hf/results_2023-08-20T17-54-59.197645.json DELETED
@@ -1,871 +0,0 @@
1
- {
2
- "results": {
3
- "harness|arc:challenge|25": {
4
- "acc": 0.492320819112628,
5
- "acc_stderr": 0.01460966744089257,
6
- "acc_norm": 0.5307167235494881,
7
- "acc_norm_stderr": 0.014583792546304037
8
- },
9
- "harness|hellaswag|10": {
10
- "acc": 0.5883290181238797,
11
- "acc_stderr": 0.0049113035697697935,
12
- "acc_norm": 0.7858992232622983,
13
- "acc_norm_stderr": 0.0040935874043036904
14
- },
15
- "harness|hendrycksTest-abstract_algebra|5": {
16
- "acc": 0.3,
17
- "acc_stderr": 0.046056618647183814,
18
- "acc_norm": 0.3,
19
- "acc_norm_stderr": 0.046056618647183814
20
- },
21
- "harness|hendrycksTest-anatomy|5": {
22
- "acc": 0.48148148148148145,
23
- "acc_stderr": 0.043163785995113245,
24
- "acc_norm": 0.48148148148148145,
25
- "acc_norm_stderr": 0.043163785995113245
26
- },
27
- "harness|hendrycksTest-astronomy|5": {
28
- "acc": 0.40789473684210525,
29
- "acc_stderr": 0.03999309712777471,
30
- "acc_norm": 0.40789473684210525,
31
- "acc_norm_stderr": 0.03999309712777471
32
- },
33
- "harness|hendrycksTest-business_ethics|5": {
34
- "acc": 0.53,
35
- "acc_stderr": 0.05016135580465919,
36
- "acc_norm": 0.53,
37
- "acc_norm_stderr": 0.05016135580465919
38
- },
39
- "harness|hendrycksTest-clinical_knowledge|5": {
40
- "acc": 0.4641509433962264,
41
- "acc_stderr": 0.030693675018458003,
42
- "acc_norm": 0.4641509433962264,
43
- "acc_norm_stderr": 0.030693675018458003
44
- },
45
- "harness|hendrycksTest-college_biology|5": {
46
- "acc": 0.4652777777777778,
47
- "acc_stderr": 0.04171115858181618,
48
- "acc_norm": 0.4652777777777778,
49
- "acc_norm_stderr": 0.04171115858181618
50
- },
51
- "harness|hendrycksTest-college_chemistry|5": {
52
- "acc": 0.35,
53
- "acc_stderr": 0.047937248544110196,
54
- "acc_norm": 0.35,
55
- "acc_norm_stderr": 0.047937248544110196
56
- },
57
- "harness|hendrycksTest-college_computer_science|5": {
58
- "acc": 0.33,
59
- "acc_stderr": 0.047258156262526045,
60
- "acc_norm": 0.33,
61
- "acc_norm_stderr": 0.047258156262526045
62
- },
63
- "harness|hendrycksTest-college_mathematics|5": {
64
- "acc": 0.35,
65
- "acc_stderr": 0.047937248544110196,
66
- "acc_norm": 0.35,
67
- "acc_norm_stderr": 0.047937248544110196
68
- },
69
- "harness|hendrycksTest-college_medicine|5": {
70
- "acc": 0.4277456647398844,
71
- "acc_stderr": 0.037724468575180255,
72
- "acc_norm": 0.4277456647398844,
73
- "acc_norm_stderr": 0.037724468575180255
74
- },
75
- "harness|hendrycksTest-college_physics|5": {
76
- "acc": 0.23529411764705882,
77
- "acc_stderr": 0.04220773659171453,
78
- "acc_norm": 0.23529411764705882,
79
- "acc_norm_stderr": 0.04220773659171453
80
- },
81
- "harness|hendrycksTest-computer_security|5": {
82
- "acc": 0.6,
83
- "acc_stderr": 0.04923659639173309,
84
- "acc_norm": 0.6,
85
- "acc_norm_stderr": 0.04923659639173309
86
- },
87
- "harness|hendrycksTest-conceptual_physics|5": {
88
- "acc": 0.42127659574468085,
89
- "acc_stderr": 0.03227834510146267,
90
- "acc_norm": 0.42127659574468085,
91
- "acc_norm_stderr": 0.03227834510146267
92
- },
93
- "harness|hendrycksTest-econometrics|5": {
94
- "acc": 0.2719298245614035,
95
- "acc_stderr": 0.04185774424022056,
96
- "acc_norm": 0.2719298245614035,
97
- "acc_norm_stderr": 0.04185774424022056
98
- },
99
- "harness|hendrycksTest-electrical_engineering|5": {
100
- "acc": 0.47586206896551725,
101
- "acc_stderr": 0.041618085035015295,
102
- "acc_norm": 0.47586206896551725,
103
- "acc_norm_stderr": 0.041618085035015295
104
- },
105
- "harness|hendrycksTest-elementary_mathematics|5": {
106
- "acc": 0.2724867724867725,
107
- "acc_stderr": 0.02293097307163336,
108
- "acc_norm": 0.2724867724867725,
109
- "acc_norm_stderr": 0.02293097307163336
110
- },
111
- "harness|hendrycksTest-formal_logic|5": {
112
- "acc": 0.2857142857142857,
113
- "acc_stderr": 0.0404061017820884,
114
- "acc_norm": 0.2857142857142857,
115
- "acc_norm_stderr": 0.0404061017820884
116
- },
117
- "harness|hendrycksTest-global_facts|5": {
118
- "acc": 0.32,
119
- "acc_stderr": 0.04688261722621503,
120
- "acc_norm": 0.32,
121
- "acc_norm_stderr": 0.04688261722621503
122
- },
123
- "harness|hendrycksTest-high_school_biology|5": {
124
- "acc": 0.5,
125
- "acc_stderr": 0.028444006199428714,
126
- "acc_norm": 0.5,
127
- "acc_norm_stderr": 0.028444006199428714
128
- },
129
- "harness|hendrycksTest-high_school_chemistry|5": {
130
- "acc": 0.3694581280788177,
131
- "acc_stderr": 0.033959703819985726,
132
- "acc_norm": 0.3694581280788177,
133
- "acc_norm_stderr": 0.033959703819985726
134
- },
135
- "harness|hendrycksTest-high_school_computer_science|5": {
136
- "acc": 0.4,
137
- "acc_stderr": 0.049236596391733084,
138
- "acc_norm": 0.4,
139
- "acc_norm_stderr": 0.049236596391733084
140
- },
141
- "harness|hendrycksTest-high_school_european_history|5": {
142
- "acc": 0.6424242424242425,
143
- "acc_stderr": 0.03742597043806585,
144
- "acc_norm": 0.6424242424242425,
145
- "acc_norm_stderr": 0.03742597043806585
146
- },
147
- "harness|hendrycksTest-high_school_geography|5": {
148
- "acc": 0.48484848484848486,
149
- "acc_stderr": 0.03560716516531061,
150
- "acc_norm": 0.48484848484848486,
151
- "acc_norm_stderr": 0.03560716516531061
152
- },
153
- "harness|hendrycksTest-high_school_government_and_politics|5": {
154
- "acc": 0.6735751295336787,
155
- "acc_stderr": 0.033840286211432945,
156
- "acc_norm": 0.6735751295336787,
157
- "acc_norm_stderr": 0.033840286211432945
158
- },
159
- "harness|hendrycksTest-high_school_macroeconomics|5": {
160
- "acc": 0.45897435897435895,
161
- "acc_stderr": 0.025265525491284295,
162
- "acc_norm": 0.45897435897435895,
163
- "acc_norm_stderr": 0.025265525491284295
164
- },
165
- "harness|hendrycksTest-high_school_mathematics|5": {
166
- "acc": 0.3037037037037037,
167
- "acc_stderr": 0.02803792996911499,
168
- "acc_norm": 0.3037037037037037,
169
- "acc_norm_stderr": 0.02803792996911499
170
- },
171
- "harness|hendrycksTest-high_school_microeconomics|5": {
172
- "acc": 0.4411764705882353,
173
- "acc_stderr": 0.0322529423239964,
174
- "acc_norm": 0.4411764705882353,
175
- "acc_norm_stderr": 0.0322529423239964
176
- },
177
- "harness|hendrycksTest-high_school_physics|5": {
178
- "acc": 0.31125827814569534,
179
- "acc_stderr": 0.037804458505267334,
180
- "acc_norm": 0.31125827814569534,
181
- "acc_norm_stderr": 0.037804458505267334
182
- },
183
- "harness|hendrycksTest-high_school_psychology|5": {
184
- "acc": 0.6293577981651376,
185
- "acc_stderr": 0.02070745816435298,
186
- "acc_norm": 0.6293577981651376,
187
- "acc_norm_stderr": 0.02070745816435298
188
- },
189
- "harness|hendrycksTest-high_school_statistics|5": {
190
- "acc": 0.27314814814814814,
191
- "acc_stderr": 0.03038805130167812,
192
- "acc_norm": 0.27314814814814814,
193
- "acc_norm_stderr": 0.03038805130167812
194
- },
195
- "harness|hendrycksTest-high_school_us_history|5": {
196
- "acc": 0.5343137254901961,
197
- "acc_stderr": 0.03501038327635897,
198
- "acc_norm": 0.5343137254901961,
199
- "acc_norm_stderr": 0.03501038327635897
200
- },
201
- "harness|hendrycksTest-high_school_world_history|5": {
202
- "acc": 0.6286919831223629,
203
- "acc_stderr": 0.03145068600744859,
204
- "acc_norm": 0.6286919831223629,
205
- "acc_norm_stderr": 0.03145068600744859
206
- },
207
- "harness|hendrycksTest-human_aging|5": {
208
- "acc": 0.5695067264573991,
209
- "acc_stderr": 0.033231973029429394,
210
- "acc_norm": 0.5695067264573991,
211
- "acc_norm_stderr": 0.033231973029429394
212
- },
213
- "harness|hendrycksTest-human_sexuality|5": {
214
- "acc": 0.5648854961832062,
215
- "acc_stderr": 0.04348208051644858,
216
- "acc_norm": 0.5648854961832062,
217
- "acc_norm_stderr": 0.04348208051644858
218
- },
219
- "harness|hendrycksTest-international_law|5": {
220
- "acc": 0.6528925619834711,
221
- "acc_stderr": 0.043457245702925335,
222
- "acc_norm": 0.6528925619834711,
223
- "acc_norm_stderr": 0.043457245702925335
224
- },
225
- "harness|hendrycksTest-jurisprudence|5": {
226
- "acc": 0.5370370370370371,
227
- "acc_stderr": 0.04820403072760628,
228
- "acc_norm": 0.5370370370370371,
229
- "acc_norm_stderr": 0.04820403072760628
230
- },
231
- "harness|hendrycksTest-logical_fallacies|5": {
232
- "acc": 0.50920245398773,
233
- "acc_stderr": 0.03927705600787443,
234
- "acc_norm": 0.50920245398773,
235
- "acc_norm_stderr": 0.03927705600787443
236
- },
237
- "harness|hendrycksTest-machine_learning|5": {
238
- "acc": 0.38392857142857145,
239
- "acc_stderr": 0.04616143075028547,
240
- "acc_norm": 0.38392857142857145,
241
- "acc_norm_stderr": 0.04616143075028547
242
- },
243
- "harness|hendrycksTest-management|5": {
244
- "acc": 0.5533980582524272,
245
- "acc_stderr": 0.04922424153458933,
246
- "acc_norm": 0.5533980582524272,
247
- "acc_norm_stderr": 0.04922424153458933
248
- },
249
- "harness|hendrycksTest-marketing|5": {
250
- "acc": 0.6923076923076923,
251
- "acc_stderr": 0.030236389942173085,
252
- "acc_norm": 0.6923076923076923,
253
- "acc_norm_stderr": 0.030236389942173085
254
- },
255
- "harness|hendrycksTest-medical_genetics|5": {
256
- "acc": 0.55,
257
- "acc_stderr": 0.04999999999999999,
258
- "acc_norm": 0.55,
259
- "acc_norm_stderr": 0.04999999999999999
260
- },
261
- "harness|hendrycksTest-miscellaneous|5": {
262
- "acc": 0.6411238825031929,
263
- "acc_stderr": 0.017152991797501342,
264
- "acc_norm": 0.6411238825031929,
265
- "acc_norm_stderr": 0.017152991797501342
266
- },
267
- "harness|hendrycksTest-moral_disputes|5": {
268
- "acc": 0.49421965317919075,
269
- "acc_stderr": 0.026917296179149116,
270
- "acc_norm": 0.49421965317919075,
271
- "acc_norm_stderr": 0.026917296179149116
272
- },
273
- "harness|hendrycksTest-moral_scenarios|5": {
274
- "acc": 0.23910614525139665,
275
- "acc_stderr": 0.014265554192331144,
276
- "acc_norm": 0.23910614525139665,
277
- "acc_norm_stderr": 0.014265554192331144
278
- },
279
- "harness|hendrycksTest-nutrition|5": {
280
- "acc": 0.4934640522875817,
281
- "acc_stderr": 0.028627470550556047,
282
- "acc_norm": 0.4934640522875817,
283
- "acc_norm_stderr": 0.028627470550556047
284
- },
285
- "harness|hendrycksTest-philosophy|5": {
286
- "acc": 0.6012861736334405,
287
- "acc_stderr": 0.0278093225857745,
288
- "acc_norm": 0.6012861736334405,
289
- "acc_norm_stderr": 0.0278093225857745
290
- },
291
- "harness|hendrycksTest-prehistory|5": {
292
- "acc": 0.49074074074074076,
293
- "acc_stderr": 0.027815973433878014,
294
- "acc_norm": 0.49074074074074076,
295
- "acc_norm_stderr": 0.027815973433878014
296
- },
297
- "harness|hendrycksTest-professional_accounting|5": {
298
- "acc": 0.3617021276595745,
299
- "acc_stderr": 0.028663820147199492,
300
- "acc_norm": 0.3617021276595745,
301
- "acc_norm_stderr": 0.028663820147199492
302
- },
303
- "harness|hendrycksTest-professional_law|5": {
304
- "acc": 0.3617992177314211,
305
- "acc_stderr": 0.01227273623326293,
306
- "acc_norm": 0.3617992177314211,
307
- "acc_norm_stderr": 0.01227273623326293
308
- },
309
- "harness|hendrycksTest-professional_medicine|5": {
310
- "acc": 0.5257352941176471,
311
- "acc_stderr": 0.03033257809455504,
312
- "acc_norm": 0.5257352941176471,
313
- "acc_norm_stderr": 0.03033257809455504
314
- },
315
- "harness|hendrycksTest-professional_psychology|5": {
316
- "acc": 0.4411764705882353,
317
- "acc_stderr": 0.020087362076702857,
318
- "acc_norm": 0.4411764705882353,
319
- "acc_norm_stderr": 0.020087362076702857
320
- },
321
- "harness|hendrycksTest-public_relations|5": {
322
- "acc": 0.5272727272727272,
323
- "acc_stderr": 0.04782001791380061,
324
- "acc_norm": 0.5272727272727272,
325
- "acc_norm_stderr": 0.04782001791380061
326
- },
327
- "harness|hendrycksTest-security_studies|5": {
328
- "acc": 0.4775510204081633,
329
- "acc_stderr": 0.031976941187136725,
330
- "acc_norm": 0.4775510204081633,
331
- "acc_norm_stderr": 0.031976941187136725
332
- },
333
- "harness|hendrycksTest-sociology|5": {
334
- "acc": 0.6318407960199005,
335
- "acc_stderr": 0.03410410565495301,
336
- "acc_norm": 0.6318407960199005,
337
- "acc_norm_stderr": 0.03410410565495301
338
- },
339
- "harness|hendrycksTest-us_foreign_policy|5": {
340
- "acc": 0.65,
341
- "acc_stderr": 0.047937248544110196,
342
- "acc_norm": 0.65,
343
- "acc_norm_stderr": 0.047937248544110196
344
- },
345
- "harness|hendrycksTest-virology|5": {
346
- "acc": 0.42168674698795183,
347
- "acc_stderr": 0.03844453181770917,
348
- "acc_norm": 0.42168674698795183,
349
- "acc_norm_stderr": 0.03844453181770917
350
- },
351
- "harness|hendrycksTest-world_religions|5": {
352
- "acc": 0.7017543859649122,
353
- "acc_stderr": 0.03508771929824563,
354
- "acc_norm": 0.7017543859649122,
355
- "acc_norm_stderr": 0.03508771929824563
356
- },
357
- "harness|truthfulqa:mc|0": {
358
- "mc1": 0.24724602203182375,
359
- "mc1_stderr": 0.01510240479735965,
360
- "mc2": 0.3875703155565465,
361
- "mc2_stderr": 0.013511615953021569
362
- },
363
- "all": {
364
- "acc": 0.4710900438949217,
365
- "acc_stderr": 0.03528130957178532,
366
- "acc_norm": 0.4750894694809433,
367
- "acc_norm_stderr": 0.03526701141822507,
368
- "mc1": 0.24724602203182375,
369
- "mc1_stderr": 0.01510240479735965,
370
- "mc2": 0.3875703155565465,
371
- "mc2_stderr": 0.013511615953021569
372
- }
373
- },
374
- "versions": {
375
- "harness|arc:challenge|25": 0,
376
- "harness|hellaswag|10": 0,
377
- "harness|hendrycksTest-abstract_algebra|5": 1,
378
- "harness|hendrycksTest-anatomy|5": 1,
379
- "harness|hendrycksTest-astronomy|5": 1,
380
- "harness|hendrycksTest-business_ethics|5": 1,
381
- "harness|hendrycksTest-clinical_knowledge|5": 1,
382
- "harness|hendrycksTest-college_biology|5": 1,
383
- "harness|hendrycksTest-college_chemistry|5": 1,
384
- "harness|hendrycksTest-college_computer_science|5": 1,
385
- "harness|hendrycksTest-college_mathematics|5": 1,
386
- "harness|hendrycksTest-college_medicine|5": 1,
387
- "harness|hendrycksTest-college_physics|5": 1,
388
- "harness|hendrycksTest-computer_security|5": 1,
389
- "harness|hendrycksTest-conceptual_physics|5": 1,
390
- "harness|hendrycksTest-econometrics|5": 1,
391
- "harness|hendrycksTest-electrical_engineering|5": 1,
392
- "harness|hendrycksTest-elementary_mathematics|5": 1,
393
- "harness|hendrycksTest-formal_logic|5": 1,
394
- "harness|hendrycksTest-global_facts|5": 1,
395
- "harness|hendrycksTest-high_school_biology|5": 1,
396
- "harness|hendrycksTest-high_school_chemistry|5": 1,
397
- "harness|hendrycksTest-high_school_computer_science|5": 1,
398
- "harness|hendrycksTest-high_school_european_history|5": 1,
399
- "harness|hendrycksTest-high_school_geography|5": 1,
400
- "harness|hendrycksTest-high_school_government_and_politics|5": 1,
401
- "harness|hendrycksTest-high_school_macroeconomics|5": 1,
402
- "harness|hendrycksTest-high_school_mathematics|5": 1,
403
- "harness|hendrycksTest-high_school_microeconomics|5": 1,
404
- "harness|hendrycksTest-high_school_physics|5": 1,
405
- "harness|hendrycksTest-high_school_psychology|5": 1,
406
- "harness|hendrycksTest-high_school_statistics|5": 1,
407
- "harness|hendrycksTest-high_school_us_history|5": 1,
408
- "harness|hendrycksTest-high_school_world_history|5": 1,
409
- "harness|hendrycksTest-human_aging|5": 1,
410
- "harness|hendrycksTest-human_sexuality|5": 1,
411
- "harness|hendrycksTest-international_law|5": 1,
412
- "harness|hendrycksTest-jurisprudence|5": 1,
413
- "harness|hendrycksTest-logical_fallacies|5": 1,
414
- "harness|hendrycksTest-machine_learning|5": 1,
415
- "harness|hendrycksTest-management|5": 1,
416
- "harness|hendrycksTest-marketing|5": 1,
417
- "harness|hendrycksTest-medical_genetics|5": 1,
418
- "harness|hendrycksTest-miscellaneous|5": 1,
419
- "harness|hendrycksTest-moral_disputes|5": 1,
420
- "harness|hendrycksTest-moral_scenarios|5": 1,
421
- "harness|hendrycksTest-nutrition|5": 1,
422
- "harness|hendrycksTest-philosophy|5": 1,
423
- "harness|hendrycksTest-prehistory|5": 1,
424
- "harness|hendrycksTest-professional_accounting|5": 1,
425
- "harness|hendrycksTest-professional_law|5": 1,
426
- "harness|hendrycksTest-professional_medicine|5": 1,
427
- "harness|hendrycksTest-professional_psychology|5": 1,
428
- "harness|hendrycksTest-public_relations|5": 1,
429
- "harness|hendrycksTest-security_studies|5": 1,
430
- "harness|hendrycksTest-sociology|5": 1,
431
- "harness|hendrycksTest-us_foreign_policy|5": 1,
432
- "harness|hendrycksTest-virology|5": 1,
433
- "harness|hendrycksTest-world_religions|5": 1,
434
- "harness|truthfulqa:mc|0": 1,
435
- "all": 0
436
- },
437
- "config": {
438
- "model_name": "meta-llama/Llama-2-7b-hf",
439
- "model_sha": "e8f058fa738b6b308540024e9aa12e274e291f75",
440
- "model_dtype": "torch.float16",
441
- "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937",
442
- "num_few_shot_default": 0,
443
- "num_fewshot_seeds": 1,
444
- "override_batch_size": 1,
445
- "max_samples": null
446
- },
447
- "task_config": {
448
- "harness|arc:challenge": "LM Harness task",
449
- "harness|hellaswag": "LM Harness task",
450
- "harness|hendrycksTest-abstract_algebra": "LM Harness task",
451
- "harness|hendrycksTest-anatomy": "LM Harness task",
452
- "harness|hendrycksTest-astronomy": "LM Harness task",
453
- "harness|hendrycksTest-business_ethics": "LM Harness task",
454
- "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
455
- "harness|hendrycksTest-college_biology": "LM Harness task",
456
- "harness|hendrycksTest-college_chemistry": "LM Harness task",
457
- "harness|hendrycksTest-college_computer_science": "LM Harness task",
458
- "harness|hendrycksTest-college_mathematics": "LM Harness task",
459
- "harness|hendrycksTest-college_medicine": "LM Harness task",
460
- "harness|hendrycksTest-college_physics": "LM Harness task",
461
- "harness|hendrycksTest-computer_security": "LM Harness task",
462
- "harness|hendrycksTest-conceptual_physics": "LM Harness task",
463
- "harness|hendrycksTest-econometrics": "LM Harness task",
464
- "harness|hendrycksTest-electrical_engineering": "LM Harness task",
465
- "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
466
- "harness|hendrycksTest-formal_logic": "LM Harness task",
467
- "harness|hendrycksTest-global_facts": "LM Harness task",
468
- "harness|hendrycksTest-high_school_biology": "LM Harness task",
469
- "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
470
- "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
471
- "harness|hendrycksTest-high_school_european_history": "LM Harness task",
472
- "harness|hendrycksTest-high_school_geography": "LM Harness task",
473
- "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
474
- "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
475
- "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
476
- "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
477
- "harness|hendrycksTest-high_school_physics": "LM Harness task",
478
- "harness|hendrycksTest-high_school_psychology": "LM Harness task",
479
- "harness|hendrycksTest-high_school_statistics": "LM Harness task",
480
- "harness|hendrycksTest-high_school_us_history": "LM Harness task",
481
- "harness|hendrycksTest-high_school_world_history": "LM Harness task",
482
- "harness|hendrycksTest-human_aging": "LM Harness task",
483
- "harness|hendrycksTest-human_sexuality": "LM Harness task",
484
- "harness|hendrycksTest-international_law": "LM Harness task",
485
- "harness|hendrycksTest-jurisprudence": "LM Harness task",
486
- "harness|hendrycksTest-logical_fallacies": "LM Harness task",
487
- "harness|hendrycksTest-machine_learning": "LM Harness task",
488
- "harness|hendrycksTest-management": "LM Harness task",
489
- "harness|hendrycksTest-marketing": "LM Harness task",
490
- "harness|hendrycksTest-medical_genetics": "LM Harness task",
491
- "harness|hendrycksTest-miscellaneous": "LM Harness task",
492
- "harness|hendrycksTest-moral_disputes": "LM Harness task",
493
- "harness|hendrycksTest-moral_scenarios": "LM Harness task",
494
- "harness|hendrycksTest-nutrition": "LM Harness task",
495
- "harness|hendrycksTest-philosophy": "LM Harness task",
496
- "harness|hendrycksTest-prehistory": "LM Harness task",
497
- "harness|hendrycksTest-professional_accounting": "LM Harness task",
498
- "harness|hendrycksTest-professional_law": "LM Harness task",
499
- "harness|hendrycksTest-professional_medicine": "LM Harness task",
500
- "harness|hendrycksTest-professional_psychology": "LM Harness task",
501
- "harness|hendrycksTest-public_relations": "LM Harness task",
502
- "harness|hendrycksTest-security_studies": "LM Harness task",
503
- "harness|hendrycksTest-sociology": "LM Harness task",
504
- "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
505
- "harness|hendrycksTest-virology": "LM Harness task",
506
- "harness|hendrycksTest-world_religions": "LM Harness task",
507
- "harness|truthfulqa:mc": "LM Harness task"
508
- },
509
- "hashes": {
510
- "harness|arc:challenge|25": {
511
- "hash_examples": "fb8c51b1872daeda",
512
- "hash_full_prompts": "045cbb916e5145c6",
513
- "hash_input_tokens": "61571bf68d6d89aa",
514
- "hash_cont_tokens": "8210decc6ff6f7df"
515
- },
516
- "harness|hellaswag|10": {
517
- "hash_examples": "e1768ecb99d7ecf0",
518
- "hash_full_prompts": "0b4c16983130f84f",
519
- "hash_input_tokens": "29906669b1c7054a",
520
- "hash_cont_tokens": "b3b9e9017afa63af"
521
- },
522
- "harness|hendrycksTest-abstract_algebra|5": {
523
- "hash_examples": "280f9f325b40559a",
524
- "hash_full_prompts": "2f776a367d23aea2",
525
- "hash_input_tokens": "c54ff61ad0273dd7",
526
- "hash_cont_tokens": "50421e30bef398f9"
527
- },
528
- "harness|hendrycksTest-anatomy|5": {
529
- "hash_examples": "2f83a4f1cab4ba18",
530
- "hash_full_prompts": "516f74bef25df620",
531
- "hash_input_tokens": "be31a1e22aef5f90",
532
- "hash_cont_tokens": "f11971a765cb609f"
533
- },
534
- "harness|hendrycksTest-astronomy|5": {
535
- "hash_examples": "7d587b908da4d762",
536
- "hash_full_prompts": "faf4e80f65de93ca",
537
- "hash_input_tokens": "277a7b1fad566940",
538
- "hash_cont_tokens": "bf30e5d3f48250cb"
539
- },
540
- "harness|hendrycksTest-business_ethics|5": {
541
- "hash_examples": "33e51740670de686",
542
- "hash_full_prompts": "db01c3ef8e1479d4",
543
- "hash_input_tokens": "ba552605bc116de5",
544
- "hash_cont_tokens": "bc1dd9b2d995eb61"
545
- },
546
- "harness|hendrycksTest-clinical_knowledge|5": {
547
- "hash_examples": "f3366dbe7eefffa4",
548
- "hash_full_prompts": "49654f71d94b65c3",
549
- "hash_input_tokens": "428c7563d0b98ab9",
550
- "hash_cont_tokens": "890a119624b3b935"
551
- },
552
- "harness|hendrycksTest-college_biology|5": {
553
- "hash_examples": "ca2b6753a0193e7f",
554
- "hash_full_prompts": "2b460b75f1fdfefd",
555
- "hash_input_tokens": "da036601573942e2",
556
- "hash_cont_tokens": "875cde3af7a0ee14"
557
- },
558
- "harness|hendrycksTest-college_chemistry|5": {
559
- "hash_examples": "22ff85f1d34f42d1",
560
- "hash_full_prompts": "242c9be6da583e95",
561
- "hash_input_tokens": "94e0196d6aded13d",
562
- "hash_cont_tokens": "50421e30bef398f9"
563
- },
564
- "harness|hendrycksTest-college_computer_science|5": {
565
- "hash_examples": "30318289d717a5cf",
566
- "hash_full_prompts": "ed2bdb4e87c4b371",
567
- "hash_input_tokens": "6e4d0f4a8d36690b",
568
- "hash_cont_tokens": "ffc0fe414cdc4a83"
569
- },
570
- "harness|hendrycksTest-college_mathematics|5": {
571
- "hash_examples": "4944d1f0b6b5d911",
572
- "hash_full_prompts": "770bc4281c973190",
573
- "hash_input_tokens": "614054d17109a25d",
574
- "hash_cont_tokens": "50421e30bef398f9"
575
- },
576
- "harness|hendrycksTest-college_medicine|5": {
577
- "hash_examples": "dd69cc33381275af",
578
- "hash_full_prompts": "ad2a53e5250ab46e",
579
- "hash_input_tokens": "1d633b3cc0524ba8",
580
- "hash_cont_tokens": "1f88b00d41957d82"
581
- },
582
- "harness|hendrycksTest-college_physics|5": {
583
- "hash_examples": "875dd26d22655b0d",
584
- "hash_full_prompts": "833a0d7b55aed500",
585
- "hash_input_tokens": "5421d9a1af86cbd4",
586
- "hash_cont_tokens": "f7b8097afc16a47c"
587
- },
588
- "harness|hendrycksTest-computer_security|5": {
589
- "hash_examples": "006451eedc0ededb",
590
- "hash_full_prompts": "94034c97e85d8f46",
591
- "hash_input_tokens": "5e6b70ecb333cf18",
592
- "hash_cont_tokens": "50421e30bef398f9"
593
- },
594
- "harness|hendrycksTest-conceptual_physics|5": {
595
- "hash_examples": "8874ece872d2ca4c",
596
- "hash_full_prompts": "e40d15a34640d6fa",
597
- "hash_input_tokens": "c2ef11a87264ceed",
598
- "hash_cont_tokens": "aa0e8bc655f2f641"
599
- },
600
- "harness|hendrycksTest-econometrics|5": {
601
- "hash_examples": "64d3623b0bfaa43f",
602
- "hash_full_prompts": "612f340fae41338d",
603
- "hash_input_tokens": "ecaccd912a4c3978",
604
- "hash_cont_tokens": "bfb7e3c3c88313f1"
605
- },
606
- "harness|hendrycksTest-electrical_engineering|5": {
607
- "hash_examples": "e98f51780c674d7e",
608
- "hash_full_prompts": "10275b312d812ae6",
609
- "hash_input_tokens": "1590c84291399be8",
610
- "hash_cont_tokens": "2425a3f084a591ef"
611
- },
612
- "harness|hendrycksTest-elementary_mathematics|5": {
613
- "hash_examples": "fc48208a5ac1c0ce",
614
- "hash_full_prompts": "5ec274c6c82aca23",
615
- "hash_input_tokens": "3269597f715b0da1",
616
- "hash_cont_tokens": "f52691aef15a407b"
617
- },
618
- "harness|hendrycksTest-formal_logic|5": {
619
- "hash_examples": "5a6525665f63ea72",
620
- "hash_full_prompts": "07b92638c4a6b500",
621
- "hash_input_tokens": "a2800d20f3ab8d7c",
622
- "hash_cont_tokens": "f515d598d9c21263"
623
- },
624
- "harness|hendrycksTest-global_facts|5": {
625
- "hash_examples": "371d70d743b2b89b",
626
- "hash_full_prompts": "332fdee50a1921b4",
627
- "hash_input_tokens": "94ed44b3772505ad",
628
- "hash_cont_tokens": "50421e30bef398f9"
629
- },
630
- "harness|hendrycksTest-high_school_biology|5": {
631
- "hash_examples": "a79e1018b1674052",
632
- "hash_full_prompts": "e624e26ede922561",
633
- "hash_input_tokens": "24423acb928db768",
634
- "hash_cont_tokens": "bd85a4156a3613ee"
635
- },
636
- "harness|hendrycksTest-high_school_chemistry|5": {
637
- "hash_examples": "44bfc25c389f0e03",
638
- "hash_full_prompts": "0e3e5f5d9246482a",
639
- "hash_input_tokens": "831ff35c474e5cef",
640
- "hash_cont_tokens": "a95c97af1c14e068"
641
- },
642
- "harness|hendrycksTest-high_school_computer_science|5": {
643
- "hash_examples": "8b8cdb1084f24169",
644
- "hash_full_prompts": "c00487e67c1813cc",
645
- "hash_input_tokens": "8c34e0f2bda77358",
646
- "hash_cont_tokens": "8abfedef914e33c9"
647
- },
648
- "harness|hendrycksTest-high_school_european_history|5": {
649
- "hash_examples": "11cd32d0ef440171",
650
- "hash_full_prompts": "318f4513c537c6bf",
651
- "hash_input_tokens": "f1f73dd687da18d7",
652
- "hash_cont_tokens": "674fc454bdc5ac93"
653
- },
654
- "harness|hendrycksTest-high_school_geography|5": {
655
- "hash_examples": "b60019b9e80b642f",
656
- "hash_full_prompts": "ee5789fcc1a81b1e",
657
- "hash_input_tokens": "7c5547c7da5bc793",
658
- "hash_cont_tokens": "03a5012b916274ea"
659
- },
660
- "harness|hendrycksTest-high_school_government_and_politics|5": {
661
- "hash_examples": "d221ec983d143dc3",
662
- "hash_full_prompts": "ac42d888e1ce1155",
663
- "hash_input_tokens": "f62991cb6a496b05",
664
- "hash_cont_tokens": "a83effb8f76b7d7c"
665
- },
666
- "harness|hendrycksTest-high_school_macroeconomics|5": {
667
- "hash_examples": "59c2915cacfd3fbb",
668
- "hash_full_prompts": "c6bd9d25158abd0e",
669
- "hash_input_tokens": "4cef2aff6e3d59ed",
670
- "hash_cont_tokens": "c583432ad27fcfe0"
671
- },
672
- "harness|hendrycksTest-high_school_mathematics|5": {
673
- "hash_examples": "1f8ac897608de342",
674
- "hash_full_prompts": "5d88f41fc2d643a8",
675
- "hash_input_tokens": "6e2577ea4082ed2b",
676
- "hash_cont_tokens": "24f5dc613660300b"
677
- },
678
- "harness|hendrycksTest-high_school_microeconomics|5": {
679
- "hash_examples": "ead6a0f2f6c83370",
680
- "hash_full_prompts": "bfc393381298609e",
681
- "hash_input_tokens": "c5fc9aeb1079c8e4",
682
- "hash_cont_tokens": "f47f041de50333b9"
683
- },
684
- "harness|hendrycksTest-high_school_physics|5": {
685
- "hash_examples": "c3f2025990afec64",
686
- "hash_full_prompts": "fc78b4997e436734",
687
- "hash_input_tokens": "555fc385cffa84ca",
688
- "hash_cont_tokens": "ba2efcd283e938cc"
689
- },
690
- "harness|hendrycksTest-high_school_psychology|5": {
691
- "hash_examples": "21f8aab618f6d636",
692
- "hash_full_prompts": "d5c76aa40b9dbc43",
693
- "hash_input_tokens": "febd23cbf9973b7f",
694
- "hash_cont_tokens": "942069cd363844d9"
695
- },
696
- "harness|hendrycksTest-high_school_statistics|5": {
697
- "hash_examples": "2386a60a11fc5de3",
698
- "hash_full_prompts": "4c5c8be5aafac432",
699
- "hash_input_tokens": "424b02981230ee83",
700
- "hash_cont_tokens": "955ed42b6f7fa019"
701
- },
702
- "harness|hendrycksTest-high_school_us_history|5": {
703
- "hash_examples": "74961543be40f04f",
704
- "hash_full_prompts": "5d5ca4840131ba21",
705
- "hash_input_tokens": "50c9ff438c85a69e",
706
- "hash_cont_tokens": "cdd0b3dc06d933e5"
707
- },
708
- "harness|hendrycksTest-high_school_world_history|5": {
709
- "hash_examples": "2ad2f6b7198b2234",
710
- "hash_full_prompts": "11845057459afd72",
711
- "hash_input_tokens": "054824cc474caef5",
712
- "hash_cont_tokens": "9a864184946033ac"
713
- },
714
- "harness|hendrycksTest-human_aging|5": {
715
- "hash_examples": "1a7199dc733e779b",
716
- "hash_full_prompts": "756b9096b8eaf892",
717
- "hash_input_tokens": "541a75f071dcf579",
718
- "hash_cont_tokens": "142a4a8a1138a214"
719
- },
720
- "harness|hendrycksTest-human_sexuality|5": {
721
- "hash_examples": "7acb8fdad97f88a6",
722
- "hash_full_prompts": "731a52ff15b8cfdb",
723
- "hash_input_tokens": "04269e5c5a257dd9",
724
- "hash_cont_tokens": "bc54813e809b796d"
725
- },
726
- "harness|hendrycksTest-international_law|5": {
727
- "hash_examples": "1300bfd0dfc59114",
728
- "hash_full_prompts": "db2aefbff5eec996",
729
- "hash_input_tokens": "d93ba9d9d38e4397",
730
- "hash_cont_tokens": "dc45b45fcda18e5d"
731
- },
732
- "harness|hendrycksTest-jurisprudence|5": {
733
- "hash_examples": "083b1e4904c48dc2",
734
- "hash_full_prompts": "0f89ee3fe03d6a21",
735
- "hash_input_tokens": "9eeaccd2698b4f5a",
736
- "hash_cont_tokens": "e3a8cd951b6e3469"
737
- },
738
- "harness|hendrycksTest-logical_fallacies|5": {
739
- "hash_examples": "709128f9926a634c",
740
- "hash_full_prompts": "98a04b1f8f841069",
741
- "hash_input_tokens": "b4f08f544f2b7576",
742
- "hash_cont_tokens": "1e80dbd30f6453d5"
743
- },
744
- "harness|hendrycksTest-machine_learning|5": {
745
- "hash_examples": "88f22a636029ae47",
746
- "hash_full_prompts": "2e1c8d4b1e0cc921",
747
- "hash_input_tokens": "900c2a51f1174b9f",
748
- "hash_cont_tokens": "9b37da7777378ca9"
749
- },
750
- "harness|hendrycksTest-management|5": {
751
- "hash_examples": "8c8a1e07a2151dca",
752
- "hash_full_prompts": "f51611f514b265b0",
753
- "hash_input_tokens": "6b36efb4689c6eca",
754
- "hash_cont_tokens": "a01d6d39a83c4597"
755
- },
756
- "harness|hendrycksTest-marketing|5": {
757
- "hash_examples": "2668953431f91e96",
758
- "hash_full_prompts": "77562bef997c7650",
759
- "hash_input_tokens": "2aaac78a0cfed47a",
760
- "hash_cont_tokens": "6aeaed4d823c98aa"
761
- },
762
- "harness|hendrycksTest-medical_genetics|5": {
763
- "hash_examples": "9c2dda34a2ea4fd2",
764
- "hash_full_prompts": "202139046daa118f",
765
- "hash_input_tokens": "886ca823b41c094a",
766
- "hash_cont_tokens": "50421e30bef398f9"
767
- },
768
- "harness|hendrycksTest-miscellaneous|5": {
769
- "hash_examples": "41adb694024809c2",
770
- "hash_full_prompts": "bffec9fc237bcf93",
771
- "hash_input_tokens": "72fd71de7675e7d0",
772
- "hash_cont_tokens": "9b0ab02a64603081"
773
- },
774
- "harness|hendrycksTest-moral_disputes|5": {
775
- "hash_examples": "3171c13ba3c594c4",
776
- "hash_full_prompts": "170831fc36f1d59e",
777
- "hash_input_tokens": "f3ca0dd8e7a1eb09",
778
- "hash_cont_tokens": "8badf768f7b0467a"
779
- },
780
- "harness|hendrycksTest-moral_scenarios|5": {
781
- "hash_examples": "9873e077e83e0546",
782
- "hash_full_prompts": "08f4ceba3131a068",
783
- "hash_input_tokens": "3e793631e951f23c",
784
- "hash_cont_tokens": "32ae620376b2bbba"
785
- },
786
- "harness|hendrycksTest-nutrition|5": {
787
- "hash_examples": "7db1d8142ec14323",
788
- "hash_full_prompts": "4c0e68e3586cb453",
789
- "hash_input_tokens": "59753c2144ea93af",
790
- "hash_cont_tokens": "3071def75bacc404"
791
- },
792
- "harness|hendrycksTest-philosophy|5": {
793
- "hash_examples": "9b455b7d72811cc8",
794
- "hash_full_prompts": "e467f822d8a0d3ff",
795
- "hash_input_tokens": "bd8d3dbed15a8c34",
796
- "hash_cont_tokens": "9f6ff69d23a48783"
797
- },
798
- "harness|hendrycksTest-prehistory|5": {
799
- "hash_examples": "8be90d0f538f1560",
800
- "hash_full_prompts": "152187949bcd0921",
801
- "hash_input_tokens": "3573cd87facbb7c5",
802
- "hash_cont_tokens": "de469d2b981e32a3"
803
- },
804
- "harness|hendrycksTest-professional_accounting|5": {
805
- "hash_examples": "8d377597916cd07e",
806
- "hash_full_prompts": "0eb7345d6144ee0d",
807
- "hash_input_tokens": "17e721bc1a7cbb47",
808
- "hash_cont_tokens": "c46f74d2dfc7b13b"
809
- },
810
- "harness|hendrycksTest-professional_law|5": {
811
- "hash_examples": "cd9dbc52b3c932d6",
812
- "hash_full_prompts": "36ac764272bfb182",
813
- "hash_input_tokens": "9178e10bd0763ec4",
814
- "hash_cont_tokens": "2e590029ef41fbcd"
815
- },
816
- "harness|hendrycksTest-professional_medicine|5": {
817
- "hash_examples": "b20e4e816c1e383e",
818
- "hash_full_prompts": "7b8d69ea2acaf2f7",
819
- "hash_input_tokens": "f5a22012a54f70ea",
820
- "hash_cont_tokens": "fe35cfa9c6ca802e"
821
- },
822
- "harness|hendrycksTest-professional_psychology|5": {
823
- "hash_examples": "d45b73b22f9cc039",
824
- "hash_full_prompts": "fe8937e9ffc99771",
825
- "hash_input_tokens": "0dfb73a8eb3f692c",
826
- "hash_cont_tokens": "f020fbddf72c8652"
827
- },
828
- "harness|hendrycksTest-public_relations|5": {
829
- "hash_examples": "0d25072e1761652a",
830
- "hash_full_prompts": "f9adc39cfa9f42ba",
831
- "hash_input_tokens": "1710c6ba4c9f3cbd",
832
- "hash_cont_tokens": "568f585a259965c1"
833
- },
834
- "harness|hendrycksTest-security_studies|5": {
835
- "hash_examples": "62bb8197e63d60d4",
836
- "hash_full_prompts": "869c9c3ae196b7c3",
837
- "hash_input_tokens": "d49711415961ced7",
838
- "hash_cont_tokens": "cc6fd7cccd64cd5d"
839
- },
840
- "harness|hendrycksTest-sociology|5": {
841
- "hash_examples": "e7959df87dea8672",
842
- "hash_full_prompts": "1a1fc00e17b3a52a",
843
- "hash_input_tokens": "828999f7624cbe7e",
844
- "hash_cont_tokens": "c3a3bdfd177eed5b"
845
- },
846
- "harness|hendrycksTest-us_foreign_policy|5": {
847
- "hash_examples": "4a56a01ddca44dca",
848
- "hash_full_prompts": "0c7a7081c71c07b6",
849
- "hash_input_tokens": "42054621e718dbee",
850
- "hash_cont_tokens": "2568d0e8e36fa959"
851
- },
852
- "harness|hendrycksTest-virology|5": {
853
- "hash_examples": "451cc86a8c4f4fe9",
854
- "hash_full_prompts": "01e95325d8b738e4",
855
- "hash_input_tokens": "6c4f0aa4dc859c04",
856
- "hash_cont_tokens": "926cf60b0891f374"
857
- },
858
- "harness|hendrycksTest-world_religions|5": {
859
- "hash_examples": "3b29cfaf1a81c379",
860
- "hash_full_prompts": "e0d79a15083dfdff",
861
- "hash_input_tokens": "6c75d44e092ff24f",
862
- "hash_cont_tokens": "c525a5de974c1ea3"
863
- },
864
- "harness|truthfulqa:mc|0": {
865
- "hash_examples": "23176c0531c7b867",
866
- "hash_full_prompts": "36a6d90e75d92d4a",
867
- "hash_input_tokens": "2738d7ed7075faa7",
868
- "hash_cont_tokens": "c014154380b74b9e"
869
- }
870
- }
871
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-7b-hf/results_2023-08-29T17-54-59.197645.json DELETED
@@ -1,1366 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-7b-hf",
4
- "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
5
- "model_dtype": "4bit",
6
- "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63",
7
- "num_few_shot_default": 0,
8
- "num_fewshot_seeds": 1,
9
- "override_batch_size": 1,
10
- "max_samples": null,
11
- "job_id": ""
12
- },
13
- "results": {
14
- "harness|arc:challenge|25": {
15
- "acc": 0.4854948805460751,
16
- "acc_stderr": 0.014605241081370056,
17
- "acc_norm": 0.5307167235494881,
18
- "acc_norm_stderr": 0.014583792546304037
19
- },
20
- "harness|hellaswag|10": {
21
- "acc": 0.5789683330013942,
22
- "acc_stderr": 0.0049271558825981845,
23
- "acc_norm": 0.7774347739494125,
24
- "acc_norm_stderr": 0.004151185615952062
25
- },
26
- "harness|hendrycksTest-abstract_algebra|5": {
27
- "acc": 0.28,
28
- "acc_stderr": 0.04512608598542129,
29
- "acc_norm": 0.28,
30
- "acc_norm_stderr": 0.04512608598542129
31
- },
32
- "harness|hendrycksTest-anatomy|5": {
33
- "acc": 0.42962962962962964,
34
- "acc_stderr": 0.04276349494376599,
35
- "acc_norm": 0.42962962962962964,
36
- "acc_norm_stderr": 0.04276349494376599
37
- },
38
- "harness|hendrycksTest-astronomy|5": {
39
- "acc": 0.40789473684210525,
40
- "acc_stderr": 0.03999309712777471,
41
- "acc_norm": 0.40789473684210525,
42
- "acc_norm_stderr": 0.03999309712777471
43
- },
44
- "harness|hendrycksTest-business_ethics|5": {
45
- "acc": 0.49,
46
- "acc_stderr": 0.05024183937956911,
47
- "acc_norm": 0.49,
48
- "acc_norm_stderr": 0.05024183937956911
49
- },
50
- "harness|hendrycksTest-clinical_knowledge|5": {
51
- "acc": 0.4377358490566038,
52
- "acc_stderr": 0.030533338430467516,
53
- "acc_norm": 0.4377358490566038,
54
- "acc_norm_stderr": 0.030533338430467516
55
- },
56
- "harness|hendrycksTest-college_biology|5": {
57
- "acc": 0.4375,
58
- "acc_stderr": 0.04148415739394154,
59
- "acc_norm": 0.4375,
60
- "acc_norm_stderr": 0.04148415739394154
61
- },
62
- "harness|hendrycksTest-college_chemistry|5": {
63
- "acc": 0.31,
64
- "acc_stderr": 0.04648231987117316,
65
- "acc_norm": 0.31,
66
- "acc_norm_stderr": 0.04648231987117316
67
- },
68
- "harness|hendrycksTest-college_computer_science|5": {
69
- "acc": 0.39,
70
- "acc_stderr": 0.04902071300001975,
71
- "acc_norm": 0.39,
72
- "acc_norm_stderr": 0.04902071300001975
73
- },
74
- "harness|hendrycksTest-college_mathematics|5": {
75
- "acc": 0.32,
76
- "acc_stderr": 0.04688261722621505,
77
- "acc_norm": 0.32,
78
- "acc_norm_stderr": 0.04688261722621505
79
- },
80
- "harness|hendrycksTest-college_medicine|5": {
81
- "acc": 0.37572254335260113,
82
- "acc_stderr": 0.036928207672648664,
83
- "acc_norm": 0.37572254335260113,
84
- "acc_norm_stderr": 0.036928207672648664
85
- },
86
- "harness|hendrycksTest-college_physics|5": {
87
- "acc": 0.18627450980392157,
88
- "acc_stderr": 0.038739587141493524,
89
- "acc_norm": 0.18627450980392157,
90
- "acc_norm_stderr": 0.038739587141493524
91
- },
92
- "harness|hendrycksTest-computer_security|5": {
93
- "acc": 0.58,
94
- "acc_stderr": 0.049604496374885836,
95
- "acc_norm": 0.58,
96
- "acc_norm_stderr": 0.049604496374885836
97
- },
98
- "harness|hendrycksTest-conceptual_physics|5": {
99
- "acc": 0.4425531914893617,
100
- "acc_stderr": 0.03246956919789958,
101
- "acc_norm": 0.4425531914893617,
102
- "acc_norm_stderr": 0.03246956919789958
103
- },
104
- "harness|hendrycksTest-econometrics|5": {
105
- "acc": 0.30701754385964913,
106
- "acc_stderr": 0.04339138322579861,
107
- "acc_norm": 0.30701754385964913,
108
- "acc_norm_stderr": 0.04339138322579861
109
- },
110
- "harness|hendrycksTest-electrical_engineering|5": {
111
- "acc": 0.46206896551724136,
112
- "acc_stderr": 0.041546596717075474,
113
- "acc_norm": 0.46206896551724136,
114
- "acc_norm_stderr": 0.041546596717075474
115
- },
116
- "harness|hendrycksTest-elementary_mathematics|5": {
117
- "acc": 0.24603174603174602,
118
- "acc_stderr": 0.02218203720294836,
119
- "acc_norm": 0.24603174603174602,
120
- "acc_norm_stderr": 0.02218203720294836
121
- },
122
- "harness|hendrycksTest-formal_logic|5": {
123
- "acc": 0.3412698412698413,
124
- "acc_stderr": 0.04240799327574924,
125
- "acc_norm": 0.3412698412698413,
126
- "acc_norm_stderr": 0.04240799327574924
127
- },
128
- "harness|hendrycksTest-global_facts|5": {
129
- "acc": 0.36,
130
- "acc_stderr": 0.04824181513244218,
131
- "acc_norm": 0.36,
132
- "acc_norm_stderr": 0.04824181513244218
133
- },
134
- "harness|hendrycksTest-high_school_biology|5": {
135
- "acc": 0.4290322580645161,
136
- "acc_stderr": 0.02815603653823321,
137
- "acc_norm": 0.4290322580645161,
138
- "acc_norm_stderr": 0.02815603653823321
139
- },
140
- "harness|hendrycksTest-high_school_chemistry|5": {
141
- "acc": 0.35467980295566504,
142
- "acc_stderr": 0.0336612448905145,
143
- "acc_norm": 0.35467980295566504,
144
- "acc_norm_stderr": 0.0336612448905145
145
- },
146
- "harness|hendrycksTest-high_school_computer_science|5": {
147
- "acc": 0.42,
148
- "acc_stderr": 0.049604496374885836,
149
- "acc_norm": 0.42,
150
- "acc_norm_stderr": 0.049604496374885836
151
- },
152
- "harness|hendrycksTest-high_school_european_history|5": {
153
- "acc": 0.5696969696969697,
154
- "acc_stderr": 0.03866225962879077,
155
- "acc_norm": 0.5696969696969697,
156
- "acc_norm_stderr": 0.03866225962879077
157
- },
158
- "harness|hendrycksTest-high_school_geography|5": {
159
- "acc": 0.4797979797979798,
160
- "acc_stderr": 0.0355944356556392,
161
- "acc_norm": 0.4797979797979798,
162
- "acc_norm_stderr": 0.0355944356556392
163
- },
164
- "harness|hendrycksTest-high_school_government_and_politics|5": {
165
- "acc": 0.6321243523316062,
166
- "acc_stderr": 0.034801756684660366,
167
- "acc_norm": 0.6321243523316062,
168
- "acc_norm_stderr": 0.034801756684660366
169
- },
170
- "harness|hendrycksTest-high_school_macroeconomics|5": {
171
- "acc": 0.4,
172
- "acc_stderr": 0.024838811988033158,
173
- "acc_norm": 0.4,
174
- "acc_norm_stderr": 0.024838811988033158
175
- },
176
- "harness|hendrycksTest-high_school_mathematics|5": {
177
- "acc": 0.24444444444444444,
178
- "acc_stderr": 0.026202766534652148,
179
- "acc_norm": 0.24444444444444444,
180
- "acc_norm_stderr": 0.026202766534652148
181
- },
182
- "harness|hendrycksTest-high_school_microeconomics|5": {
183
- "acc": 0.3907563025210084,
184
- "acc_stderr": 0.031693802357129965,
185
- "acc_norm": 0.3907563025210084,
186
- "acc_norm_stderr": 0.031693802357129965
187
- },
188
- "harness|hendrycksTest-high_school_physics|5": {
189
- "acc": 0.304635761589404,
190
- "acc_stderr": 0.03757949922943342,
191
- "acc_norm": 0.304635761589404,
192
- "acc_norm_stderr": 0.03757949922943342
193
- },
194
- "harness|hendrycksTest-high_school_psychology|5": {
195
- "acc": 0.5798165137614679,
196
- "acc_stderr": 0.021162420048273508,
197
- "acc_norm": 0.5798165137614679,
198
- "acc_norm_stderr": 0.021162420048273508
199
- },
200
- "harness|hendrycksTest-high_school_statistics|5": {
201
- "acc": 0.19444444444444445,
202
- "acc_stderr": 0.02699145450203673,
203
- "acc_norm": 0.19444444444444445,
204
- "acc_norm_stderr": 0.02699145450203673
205
- },
206
- "harness|hendrycksTest-high_school_us_history|5": {
207
- "acc": 0.4803921568627451,
208
- "acc_stderr": 0.03506612560524867,
209
- "acc_norm": 0.4803921568627451,
210
- "acc_norm_stderr": 0.03506612560524867
211
- },
212
- "harness|hendrycksTest-high_school_world_history|5": {
213
- "acc": 0.5485232067510548,
214
- "acc_stderr": 0.0323936001739747,
215
- "acc_norm": 0.5485232067510548,
216
- "acc_norm_stderr": 0.0323936001739747
217
- },
218
- "harness|hendrycksTest-human_aging|5": {
219
- "acc": 0.5246636771300448,
220
- "acc_stderr": 0.03351695167652628,
221
- "acc_norm": 0.5246636771300448,
222
- "acc_norm_stderr": 0.03351695167652628
223
- },
224
- "harness|hendrycksTest-human_sexuality|5": {
225
- "acc": 0.45038167938931295,
226
- "acc_stderr": 0.04363643698524779,
227
- "acc_norm": 0.45038167938931295,
228
- "acc_norm_stderr": 0.04363643698524779
229
- },
230
- "harness|hendrycksTest-international_law|5": {
231
- "acc": 0.6198347107438017,
232
- "acc_stderr": 0.04431324501968432,
233
- "acc_norm": 0.6198347107438017,
234
- "acc_norm_stderr": 0.04431324501968432
235
- },
236
- "harness|hendrycksTest-jurisprudence|5": {
237
- "acc": 0.48148148148148145,
238
- "acc_stderr": 0.04830366024635331,
239
- "acc_norm": 0.48148148148148145,
240
- "acc_norm_stderr": 0.04830366024635331
241
- },
242
- "harness|hendrycksTest-logical_fallacies|5": {
243
- "acc": 0.4601226993865031,
244
- "acc_stderr": 0.03915857291436972,
245
- "acc_norm": 0.4601226993865031,
246
- "acc_norm_stderr": 0.03915857291436972
247
- },
248
- "harness|hendrycksTest-machine_learning|5": {
249
- "acc": 0.36607142857142855,
250
- "acc_stderr": 0.0457237235873743,
251
- "acc_norm": 0.36607142857142855,
252
- "acc_norm_stderr": 0.0457237235873743
253
- },
254
- "harness|hendrycksTest-management|5": {
255
- "acc": 0.49514563106796117,
256
- "acc_stderr": 0.049505043821289195,
257
- "acc_norm": 0.49514563106796117,
258
- "acc_norm_stderr": 0.049505043821289195
259
- },
260
- "harness|hendrycksTest-marketing|5": {
261
- "acc": 0.6837606837606838,
262
- "acc_stderr": 0.030463656747340275,
263
- "acc_norm": 0.6837606837606838,
264
- "acc_norm_stderr": 0.030463656747340275
265
- },
266
- "harness|hendrycksTest-medical_genetics|5": {
267
- "acc": 0.52,
268
- "acc_stderr": 0.050211673156867795,
269
- "acc_norm": 0.52,
270
- "acc_norm_stderr": 0.050211673156867795
271
- },
272
- "harness|hendrycksTest-miscellaneous|5": {
273
- "acc": 0.6002554278416348,
274
- "acc_stderr": 0.017516847907053282,
275
- "acc_norm": 0.6002554278416348,
276
- "acc_norm_stderr": 0.017516847907053282
277
- },
278
- "harness|hendrycksTest-moral_disputes|5": {
279
- "acc": 0.48554913294797686,
280
- "acc_stderr": 0.02690784985628254,
281
- "acc_norm": 0.48554913294797686,
282
- "acc_norm_stderr": 0.02690784985628254
283
- },
284
- "harness|hendrycksTest-moral_scenarios|5": {
285
- "acc": 0.23798882681564246,
286
- "acc_stderr": 0.014242630070574915,
287
- "acc_norm": 0.23798882681564246,
288
- "acc_norm_stderr": 0.014242630070574915
289
- },
290
- "harness|hendrycksTest-nutrition|5": {
291
- "acc": 0.49673202614379086,
292
- "acc_stderr": 0.02862930519400354,
293
- "acc_norm": 0.49673202614379086,
294
- "acc_norm_stderr": 0.02862930519400354
295
- },
296
- "harness|hendrycksTest-philosophy|5": {
297
- "acc": 0.5498392282958199,
298
- "acc_stderr": 0.028256660723360177,
299
- "acc_norm": 0.5498392282958199,
300
- "acc_norm_stderr": 0.028256660723360177
301
- },
302
- "harness|hendrycksTest-prehistory|5": {
303
- "acc": 0.5,
304
- "acc_stderr": 0.02782074420373286,
305
- "acc_norm": 0.5,
306
- "acc_norm_stderr": 0.02782074420373286
307
- },
308
- "harness|hendrycksTest-professional_accounting|5": {
309
- "acc": 0.3262411347517731,
310
- "acc_stderr": 0.027968453043563168,
311
- "acc_norm": 0.3262411347517731,
312
- "acc_norm_stderr": 0.027968453043563168
313
- },
314
- "harness|hendrycksTest-professional_law|5": {
315
- "acc": 0.3318122555410691,
316
- "acc_stderr": 0.012026088259897637,
317
- "acc_norm": 0.3318122555410691,
318
- "acc_norm_stderr": 0.012026088259897637
319
- },
320
- "harness|hendrycksTest-professional_medicine|5": {
321
- "acc": 0.4485294117647059,
322
- "acc_stderr": 0.030211479609121593,
323
- "acc_norm": 0.4485294117647059,
324
- "acc_norm_stderr": 0.030211479609121593
325
- },
326
- "harness|hendrycksTest-professional_psychology|5": {
327
- "acc": 0.4215686274509804,
328
- "acc_stderr": 0.019977422600227467,
329
- "acc_norm": 0.4215686274509804,
330
- "acc_norm_stderr": 0.019977422600227467
331
- },
332
- "harness|hendrycksTest-public_relations|5": {
333
- "acc": 0.4727272727272727,
334
- "acc_stderr": 0.04782001791380063,
335
- "acc_norm": 0.4727272727272727,
336
- "acc_norm_stderr": 0.04782001791380063
337
- },
338
- "harness|hendrycksTest-security_studies|5": {
339
- "acc": 0.3673469387755102,
340
- "acc_stderr": 0.030862144921087558,
341
- "acc_norm": 0.3673469387755102,
342
- "acc_norm_stderr": 0.030862144921087558
343
- },
344
- "harness|hendrycksTest-sociology|5": {
345
- "acc": 0.5970149253731343,
346
- "acc_stderr": 0.034683432951111266,
347
- "acc_norm": 0.5970149253731343,
348
- "acc_norm_stderr": 0.034683432951111266
349
- },
350
- "harness|hendrycksTest-us_foreign_policy|5": {
351
- "acc": 0.66,
352
- "acc_stderr": 0.04760952285695237,
353
- "acc_norm": 0.66,
354
- "acc_norm_stderr": 0.04760952285695237
355
- },
356
- "harness|hendrycksTest-virology|5": {
357
- "acc": 0.3855421686746988,
358
- "acc_stderr": 0.037891344246115496,
359
- "acc_norm": 0.3855421686746988,
360
- "acc_norm_stderr": 0.037891344246115496
361
- },
362
- "harness|hendrycksTest-world_religions|5": {
363
- "acc": 0.6491228070175439,
364
- "acc_stderr": 0.03660298834049163,
365
- "acc_norm": 0.6491228070175439,
366
- "acc_norm_stderr": 0.03660298834049163
367
- },
368
- "harness|truthfulqa:mc|0": {
369
- "mc1": 0.2484700122399021,
370
- "mc1_stderr": 0.01512742709652068,
371
- "mc2": 0.38980202801580316,
372
- "mc2_stderr": 0.013645286936347097
373
- },
374
- "all": {
375
- "acc": 0.4411565786317669,
376
- "acc_stderr": 0.03521763310724054,
377
- "acc_norm": 0.44528688852924886,
378
- "acc_norm_stderr": 0.03520411753433017,
379
- "mc1": 0.2484700122399021,
380
- "mc1_stderr": 0.01512742709652068,
381
- "mc2": 0.38980202801580316,
382
- "mc2_stderr": 0.013645286936347097
383
- }
384
- },
385
- "versions": {
386
- "harness|arc:challenge|25": 0,
387
- "harness|hellaswag|10": 0,
388
- "harness|hendrycksTest-abstract_algebra|5": 1,
389
- "harness|hendrycksTest-anatomy|5": 1,
390
- "harness|hendrycksTest-astronomy|5": 1,
391
- "harness|hendrycksTest-business_ethics|5": 1,
392
- "harness|hendrycksTest-clinical_knowledge|5": 1,
393
- "harness|hendrycksTest-college_biology|5": 1,
394
- "harness|hendrycksTest-college_chemistry|5": 1,
395
- "harness|hendrycksTest-college_computer_science|5": 1,
396
- "harness|hendrycksTest-college_mathematics|5": 1,
397
- "harness|hendrycksTest-college_medicine|5": 1,
398
- "harness|hendrycksTest-college_physics|5": 1,
399
- "harness|hendrycksTest-computer_security|5": 1,
400
- "harness|hendrycksTest-conceptual_physics|5": 1,
401
- "harness|hendrycksTest-econometrics|5": 1,
402
- "harness|hendrycksTest-electrical_engineering|5": 1,
403
- "harness|hendrycksTest-elementary_mathematics|5": 1,
404
- "harness|hendrycksTest-formal_logic|5": 1,
405
- "harness|hendrycksTest-global_facts|5": 1,
406
- "harness|hendrycksTest-high_school_biology|5": 1,
407
- "harness|hendrycksTest-high_school_chemistry|5": 1,
408
- "harness|hendrycksTest-high_school_computer_science|5": 1,
409
- "harness|hendrycksTest-high_school_european_history|5": 1,
410
- "harness|hendrycksTest-high_school_geography|5": 1,
411
- "harness|hendrycksTest-high_school_government_and_politics|5": 1,
412
- "harness|hendrycksTest-high_school_macroeconomics|5": 1,
413
- "harness|hendrycksTest-high_school_mathematics|5": 1,
414
- "harness|hendrycksTest-high_school_microeconomics|5": 1,
415
- "harness|hendrycksTest-high_school_physics|5": 1,
416
- "harness|hendrycksTest-high_school_psychology|5": 1,
417
- "harness|hendrycksTest-high_school_statistics|5": 1,
418
- "harness|hendrycksTest-high_school_us_history|5": 1,
419
- "harness|hendrycksTest-high_school_world_history|5": 1,
420
- "harness|hendrycksTest-human_aging|5": 1,
421
- "harness|hendrycksTest-human_sexuality|5": 1,
422
- "harness|hendrycksTest-international_law|5": 1,
423
- "harness|hendrycksTest-jurisprudence|5": 1,
424
- "harness|hendrycksTest-logical_fallacies|5": 1,
425
- "harness|hendrycksTest-machine_learning|5": 1,
426
- "harness|hendrycksTest-management|5": 1,
427
- "harness|hendrycksTest-marketing|5": 1,
428
- "harness|hendrycksTest-medical_genetics|5": 1,
429
- "harness|hendrycksTest-miscellaneous|5": 1,
430
- "harness|hendrycksTest-moral_disputes|5": 1,
431
- "harness|hendrycksTest-moral_scenarios|5": 1,
432
- "harness|hendrycksTest-nutrition|5": 1,
433
- "harness|hendrycksTest-philosophy|5": 1,
434
- "harness|hendrycksTest-prehistory|5": 1,
435
- "harness|hendrycksTest-professional_accounting|5": 1,
436
- "harness|hendrycksTest-professional_law|5": 1,
437
- "harness|hendrycksTest-professional_medicine|5": 1,
438
- "harness|hendrycksTest-professional_psychology|5": 1,
439
- "harness|hendrycksTest-public_relations|5": 1,
440
- "harness|hendrycksTest-security_studies|5": 1,
441
- "harness|hendrycksTest-sociology|5": 1,
442
- "harness|hendrycksTest-us_foreign_policy|5": 1,
443
- "harness|hendrycksTest-virology|5": 1,
444
- "harness|hendrycksTest-world_religions|5": 1,
445
- "harness|truthfulqa:mc|0": 1,
446
- "all": 0
447
- },
448
- "config_tasks": {
449
- "harness|arc:challenge": "LM Harness task",
450
- "harness|hellaswag": "LM Harness task",
451
- "harness|hendrycksTest-abstract_algebra": "LM Harness task",
452
- "harness|hendrycksTest-anatomy": "LM Harness task",
453
- "harness|hendrycksTest-astronomy": "LM Harness task",
454
- "harness|hendrycksTest-business_ethics": "LM Harness task",
455
- "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
456
- "harness|hendrycksTest-college_biology": "LM Harness task",
457
- "harness|hendrycksTest-college_chemistry": "LM Harness task",
458
- "harness|hendrycksTest-college_computer_science": "LM Harness task",
459
- "harness|hendrycksTest-college_mathematics": "LM Harness task",
460
- "harness|hendrycksTest-college_medicine": "LM Harness task",
461
- "harness|hendrycksTest-college_physics": "LM Harness task",
462
- "harness|hendrycksTest-computer_security": "LM Harness task",
463
- "harness|hendrycksTest-conceptual_physics": "LM Harness task",
464
- "harness|hendrycksTest-econometrics": "LM Harness task",
465
- "harness|hendrycksTest-electrical_engineering": "LM Harness task",
466
- "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
467
- "harness|hendrycksTest-formal_logic": "LM Harness task",
468
- "harness|hendrycksTest-global_facts": "LM Harness task",
469
- "harness|hendrycksTest-high_school_biology": "LM Harness task",
470
- "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
471
- "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
472
- "harness|hendrycksTest-high_school_european_history": "LM Harness task",
473
- "harness|hendrycksTest-high_school_geography": "LM Harness task",
474
- "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
475
- "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
476
- "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
477
- "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
478
- "harness|hendrycksTest-high_school_physics": "LM Harness task",
479
- "harness|hendrycksTest-high_school_psychology": "LM Harness task",
480
- "harness|hendrycksTest-high_school_statistics": "LM Harness task",
481
- "harness|hendrycksTest-high_school_us_history": "LM Harness task",
482
- "harness|hendrycksTest-high_school_world_history": "LM Harness task",
483
- "harness|hendrycksTest-human_aging": "LM Harness task",
484
- "harness|hendrycksTest-human_sexuality": "LM Harness task",
485
- "harness|hendrycksTest-international_law": "LM Harness task",
486
- "harness|hendrycksTest-jurisprudence": "LM Harness task",
487
- "harness|hendrycksTest-logical_fallacies": "LM Harness task",
488
- "harness|hendrycksTest-machine_learning": "LM Harness task",
489
- "harness|hendrycksTest-management": "LM Harness task",
490
- "harness|hendrycksTest-marketing": "LM Harness task",
491
- "harness|hendrycksTest-medical_genetics": "LM Harness task",
492
- "harness|hendrycksTest-miscellaneous": "LM Harness task",
493
- "harness|hendrycksTest-moral_disputes": "LM Harness task",
494
- "harness|hendrycksTest-moral_scenarios": "LM Harness task",
495
- "harness|hendrycksTest-nutrition": "LM Harness task",
496
- "harness|hendrycksTest-philosophy": "LM Harness task",
497
- "harness|hendrycksTest-prehistory": "LM Harness task",
498
- "harness|hendrycksTest-professional_accounting": "LM Harness task",
499
- "harness|hendrycksTest-professional_law": "LM Harness task",
500
- "harness|hendrycksTest-professional_medicine": "LM Harness task",
501
- "harness|hendrycksTest-professional_psychology": "LM Harness task",
502
- "harness|hendrycksTest-public_relations": "LM Harness task",
503
- "harness|hendrycksTest-security_studies": "LM Harness task",
504
- "harness|hendrycksTest-sociology": "LM Harness task",
505
- "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
506
- "harness|hendrycksTest-virology": "LM Harness task",
507
- "harness|hendrycksTest-world_religions": "LM Harness task",
508
- "harness|truthfulqa:mc": "LM Harness task"
509
- },
510
- "summary_tasks": {
511
- "harness|arc:challenge|25": {
512
- "hashes": {
513
- "hash_examples": "17b0cae357c0259e",
514
- "hash_full_prompts": "045cbb916e5145c6",
515
- "hash_input_tokens": "3722289b79076c44",
516
- "hash_cont_tokens": "8210decc6ff6f7df"
517
- },
518
- "truncated": 0,
519
- "non-truncated": 4687,
520
- "padded": 4687,
521
- "non-padded": 0,
522
- "effective_few_shots": 25.0,
523
- "num_truncated_few_shots": 0
524
- },
525
- "harness|hellaswag|10": {
526
- "hashes": {
527
- "hash_examples": "e1768ecb99d7ecf0",
528
- "hash_full_prompts": "0b4c16983130f84f",
529
- "hash_input_tokens": "ececd684171f1ef2",
530
- "hash_cont_tokens": "b3b9e9017afa63af"
531
- },
532
- "truncated": 0,
533
- "non-truncated": 40168,
534
- "padded": 40113,
535
- "non-padded": 55,
536
- "effective_few_shots": 10.0,
537
- "num_truncated_few_shots": 0
538
- },
539
- "harness|hendrycksTest-abstract_algebra|5": {
540
- "hashes": {
541
- "hash_examples": "280f9f325b40559a",
542
- "hash_full_prompts": "2f776a367d23aea2",
543
- "hash_input_tokens": "c54ff61ad0273dd7",
544
- "hash_cont_tokens": "50421e30bef398f9"
545
- },
546
- "truncated": 0,
547
- "non-truncated": 400,
548
- "padded": 400,
549
- "non-padded": 0,
550
- "effective_few_shots": 5.0,
551
- "num_truncated_few_shots": 0
552
- },
553
- "harness|hendrycksTest-anatomy|5": {
554
- "hashes": {
555
- "hash_examples": "2f83a4f1cab4ba18",
556
- "hash_full_prompts": "516f74bef25df620",
557
- "hash_input_tokens": "be31a1e22aef5f90",
558
- "hash_cont_tokens": "f11971a765cb609f"
559
- },
560
- "truncated": 0,
561
- "non-truncated": 540,
562
- "padded": 540,
563
- "non-padded": 0,
564
- "effective_few_shots": 5.0,
565
- "num_truncated_few_shots": 0
566
- },
567
- "harness|hendrycksTest-astronomy|5": {
568
- "hashes": {
569
- "hash_examples": "7d587b908da4d762",
570
- "hash_full_prompts": "faf4e80f65de93ca",
571
- "hash_input_tokens": "277a7b1fad566940",
572
- "hash_cont_tokens": "bf30e5d3f48250cb"
573
- },
574
- "truncated": 0,
575
- "non-truncated": 608,
576
- "padded": 608,
577
- "non-padded": 0,
578
- "effective_few_shots": 5.0,
579
- "num_truncated_few_shots": 0
580
- },
581
- "harness|hendrycksTest-business_ethics|5": {
582
- "hashes": {
583
- "hash_examples": "33e51740670de686",
584
- "hash_full_prompts": "db01c3ef8e1479d4",
585
- "hash_input_tokens": "ba552605bc116de5",
586
- "hash_cont_tokens": "bc1dd9b2d995eb61"
587
- },
588
- "truncated": 0,
589
- "non-truncated": 400,
590
- "padded": 400,
591
- "non-padded": 0,
592
- "effective_few_shots": 5.0,
593
- "num_truncated_few_shots": 0
594
- },
595
- "harness|hendrycksTest-clinical_knowledge|5": {
596
- "hashes": {
597
- "hash_examples": "f3366dbe7eefffa4",
598
- "hash_full_prompts": "49654f71d94b65c3",
599
- "hash_input_tokens": "428c7563d0b98ab9",
600
- "hash_cont_tokens": "890a119624b3b935"
601
- },
602
- "truncated": 0,
603
- "non-truncated": 1060,
604
- "padded": 1060,
605
- "non-padded": 0,
606
- "effective_few_shots": 5.0,
607
- "num_truncated_few_shots": 0
608
- },
609
- "harness|hendrycksTest-college_biology|5": {
610
- "hashes": {
611
- "hash_examples": "ca2b6753a0193e7f",
612
- "hash_full_prompts": "2b460b75f1fdfefd",
613
- "hash_input_tokens": "da036601573942e2",
614
- "hash_cont_tokens": "875cde3af7a0ee14"
615
- },
616
- "truncated": 0,
617
- "non-truncated": 576,
618
- "padded": 576,
619
- "non-padded": 0,
620
- "effective_few_shots": 5.0,
621
- "num_truncated_few_shots": 0
622
- },
623
- "harness|hendrycksTest-college_chemistry|5": {
624
- "hashes": {
625
- "hash_examples": "22ff85f1d34f42d1",
626
- "hash_full_prompts": "242c9be6da583e95",
627
- "hash_input_tokens": "94e0196d6aded13d",
628
- "hash_cont_tokens": "50421e30bef398f9"
629
- },
630
- "truncated": 0,
631
- "non-truncated": 400,
632
- "padded": 400,
633
- "non-padded": 0,
634
- "effective_few_shots": 5.0,
635
- "num_truncated_few_shots": 0
636
- },
637
- "harness|hendrycksTest-college_computer_science|5": {
638
- "hashes": {
639
- "hash_examples": "30318289d717a5cf",
640
- "hash_full_prompts": "ed2bdb4e87c4b371",
641
- "hash_input_tokens": "6e4d0f4a8d36690b",
642
- "hash_cont_tokens": "ffc0fe414cdc4a83"
643
- },
644
- "truncated": 0,
645
- "non-truncated": 400,
646
- "padded": 400,
647
- "non-padded": 0,
648
- "effective_few_shots": 5.0,
649
- "num_truncated_few_shots": 0
650
- },
651
- "harness|hendrycksTest-college_mathematics|5": {
652
- "hashes": {
653
- "hash_examples": "4944d1f0b6b5d911",
654
- "hash_full_prompts": "770bc4281c973190",
655
- "hash_input_tokens": "614054d17109a25d",
656
- "hash_cont_tokens": "50421e30bef398f9"
657
- },
658
- "truncated": 0,
659
- "non-truncated": 400,
660
- "padded": 400,
661
- "non-padded": 0,
662
- "effective_few_shots": 5.0,
663
- "num_truncated_few_shots": 0
664
- },
665
- "harness|hendrycksTest-college_medicine|5": {
666
- "hashes": {
667
- "hash_examples": "dd69cc33381275af",
668
- "hash_full_prompts": "ad2a53e5250ab46e",
669
- "hash_input_tokens": "081bb2b524defd1c",
670
- "hash_cont_tokens": "1f88b00d41957d82"
671
- },
672
- "truncated": 0,
673
- "non-truncated": 692,
674
- "padded": 692,
675
- "non-padded": 0,
676
- "effective_few_shots": 5.0,
677
- "num_truncated_few_shots": 0
678
- },
679
- "harness|hendrycksTest-college_physics|5": {
680
- "hashes": {
681
- "hash_examples": "875dd26d22655b0d",
682
- "hash_full_prompts": "833a0d7b55aed500",
683
- "hash_input_tokens": "5421d9a1af86cbd4",
684
- "hash_cont_tokens": "f7b8097afc16a47c"
685
- },
686
- "truncated": 0,
687
- "non-truncated": 408,
688
- "padded": 408,
689
- "non-padded": 0,
690
- "effective_few_shots": 5.0,
691
- "num_truncated_few_shots": 0
692
- },
693
- "harness|hendrycksTest-computer_security|5": {
694
- "hashes": {
695
- "hash_examples": "006451eedc0ededb",
696
- "hash_full_prompts": "94034c97e85d8f46",
697
- "hash_input_tokens": "5e6b70ecb333cf18",
698
- "hash_cont_tokens": "50421e30bef398f9"
699
- },
700
- "truncated": 0,
701
- "non-truncated": 400,
702
- "padded": 400,
703
- "non-padded": 0,
704
- "effective_few_shots": 5.0,
705
- "num_truncated_few_shots": 0
706
- },
707
- "harness|hendrycksTest-conceptual_physics|5": {
708
- "hashes": {
709
- "hash_examples": "8874ece872d2ca4c",
710
- "hash_full_prompts": "e40d15a34640d6fa",
711
- "hash_input_tokens": "c2ef11a87264ceed",
712
- "hash_cont_tokens": "aa0e8bc655f2f641"
713
- },
714
- "truncated": 0,
715
- "non-truncated": 940,
716
- "padded": 940,
717
- "non-padded": 0,
718
- "effective_few_shots": 5.0,
719
- "num_truncated_few_shots": 0
720
- },
721
- "harness|hendrycksTest-econometrics|5": {
722
- "hashes": {
723
- "hash_examples": "64d3623b0bfaa43f",
724
- "hash_full_prompts": "612f340fae41338d",
725
- "hash_input_tokens": "ecaccd912a4c3978",
726
- "hash_cont_tokens": "bfb7e3c3c88313f1"
727
- },
728
- "truncated": 0,
729
- "non-truncated": 456,
730
- "padded": 456,
731
- "non-padded": 0,
732
- "effective_few_shots": 5.0,
733
- "num_truncated_few_shots": 0
734
- },
735
- "harness|hendrycksTest-electrical_engineering|5": {
736
- "hashes": {
737
- "hash_examples": "e98f51780c674d7e",
738
- "hash_full_prompts": "10275b312d812ae6",
739
- "hash_input_tokens": "1590c84291399be8",
740
- "hash_cont_tokens": "2425a3f084a591ef"
741
- },
742
- "truncated": 0,
743
- "non-truncated": 580,
744
- "padded": 580,
745
- "non-padded": 0,
746
- "effective_few_shots": 5.0,
747
- "num_truncated_few_shots": 0
748
- },
749
- "harness|hendrycksTest-elementary_mathematics|5": {
750
- "hashes": {
751
- "hash_examples": "fc48208a5ac1c0ce",
752
- "hash_full_prompts": "5ec274c6c82aca23",
753
- "hash_input_tokens": "3269597f715b0da1",
754
- "hash_cont_tokens": "f52691aef15a407b"
755
- },
756
- "truncated": 0,
757
- "non-truncated": 1512,
758
- "padded": 1512,
759
- "non-padded": 0,
760
- "effective_few_shots": 5.0,
761
- "num_truncated_few_shots": 0
762
- },
763
- "harness|hendrycksTest-formal_logic|5": {
764
- "hashes": {
765
- "hash_examples": "5a6525665f63ea72",
766
- "hash_full_prompts": "07b92638c4a6b500",
767
- "hash_input_tokens": "a2800d20f3ab8d7c",
768
- "hash_cont_tokens": "f515d598d9c21263"
769
- },
770
- "truncated": 0,
771
- "non-truncated": 504,
772
- "padded": 504,
773
- "non-padded": 0,
774
- "effective_few_shots": 5.0,
775
- "num_truncated_few_shots": 0
776
- },
777
- "harness|hendrycksTest-global_facts|5": {
778
- "hashes": {
779
- "hash_examples": "371d70d743b2b89b",
780
- "hash_full_prompts": "332fdee50a1921b4",
781
- "hash_input_tokens": "94ed44b3772505ad",
782
- "hash_cont_tokens": "50421e30bef398f9"
783
- },
784
- "truncated": 0,
785
- "non-truncated": 400,
786
- "padded": 400,
787
- "non-padded": 0,
788
- "effective_few_shots": 5.0,
789
- "num_truncated_few_shots": 0
790
- },
791
- "harness|hendrycksTest-high_school_biology|5": {
792
- "hashes": {
793
- "hash_examples": "a79e1018b1674052",
794
- "hash_full_prompts": "e624e26ede922561",
795
- "hash_input_tokens": "24423acb928db768",
796
- "hash_cont_tokens": "bd85a4156a3613ee"
797
- },
798
- "truncated": 0,
799
- "non-truncated": 1240,
800
- "padded": 1240,
801
- "non-padded": 0,
802
- "effective_few_shots": 5.0,
803
- "num_truncated_few_shots": 0
804
- },
805
- "harness|hendrycksTest-high_school_chemistry|5": {
806
- "hashes": {
807
- "hash_examples": "44bfc25c389f0e03",
808
- "hash_full_prompts": "0e3e5f5d9246482a",
809
- "hash_input_tokens": "831ff35c474e5cef",
810
- "hash_cont_tokens": "a95c97af1c14e068"
811
- },
812
- "truncated": 0,
813
- "non-truncated": 812,
814
- "padded": 812,
815
- "non-padded": 0,
816
- "effective_few_shots": 5.0,
817
- "num_truncated_few_shots": 0
818
- },
819
- "harness|hendrycksTest-high_school_computer_science|5": {
820
- "hashes": {
821
- "hash_examples": "8b8cdb1084f24169",
822
- "hash_full_prompts": "c00487e67c1813cc",
823
- "hash_input_tokens": "a20a96b44dcc5b30",
824
- "hash_cont_tokens": "8abfedef914e33c9"
825
- },
826
- "truncated": 0,
827
- "non-truncated": 400,
828
- "padded": 400,
829
- "non-padded": 0,
830
- "effective_few_shots": 5.0,
831
- "num_truncated_few_shots": 0
832
- },
833
- "harness|hendrycksTest-high_school_european_history|5": {
834
- "hashes": {
835
- "hash_examples": "11cd32d0ef440171",
836
- "hash_full_prompts": "318f4513c537c6bf",
837
- "hash_input_tokens": "5002f4ac8b1562ca",
838
- "hash_cont_tokens": "674fc454bdc5ac93"
839
- },
840
- "truncated": 0,
841
- "non-truncated": 660,
842
- "padded": 656,
843
- "non-padded": 4,
844
- "effective_few_shots": 5.0,
845
- "num_truncated_few_shots": 0
846
- },
847
- "harness|hendrycksTest-high_school_geography|5": {
848
- "hashes": {
849
- "hash_examples": "b60019b9e80b642f",
850
- "hash_full_prompts": "ee5789fcc1a81b1e",
851
- "hash_input_tokens": "7c5547c7da5bc793",
852
- "hash_cont_tokens": "03a5012b916274ea"
853
- },
854
- "truncated": 0,
855
- "non-truncated": 792,
856
- "padded": 792,
857
- "non-padded": 0,
858
- "effective_few_shots": 5.0,
859
- "num_truncated_few_shots": 0
860
- },
861
- "harness|hendrycksTest-high_school_government_and_politics|5": {
862
- "hashes": {
863
- "hash_examples": "d221ec983d143dc3",
864
- "hash_full_prompts": "ac42d888e1ce1155",
865
- "hash_input_tokens": "f62991cb6a496b05",
866
- "hash_cont_tokens": "a83effb8f76b7d7c"
867
- },
868
- "truncated": 0,
869
- "non-truncated": 772,
870
- "padded": 772,
871
- "non-padded": 0,
872
- "effective_few_shots": 5.0,
873
- "num_truncated_few_shots": 0
874
- },
875
- "harness|hendrycksTest-high_school_macroeconomics|5": {
876
- "hashes": {
877
- "hash_examples": "59c2915cacfd3fbb",
878
- "hash_full_prompts": "c6bd9d25158abd0e",
879
- "hash_input_tokens": "4cef2aff6e3d59ed",
880
- "hash_cont_tokens": "c583432ad27fcfe0"
881
- },
882
- "truncated": 0,
883
- "non-truncated": 1560,
884
- "padded": 1560,
885
- "non-padded": 0,
886
- "effective_few_shots": 5.0,
887
- "num_truncated_few_shots": 0
888
- },
889
- "harness|hendrycksTest-high_school_mathematics|5": {
890
- "hashes": {
891
- "hash_examples": "1f8ac897608de342",
892
- "hash_full_prompts": "5d88f41fc2d643a8",
893
- "hash_input_tokens": "6e2577ea4082ed2b",
894
- "hash_cont_tokens": "24f5dc613660300b"
895
- },
896
- "truncated": 0,
897
- "non-truncated": 1080,
898
- "padded": 1080,
899
- "non-padded": 0,
900
- "effective_few_shots": 5.0,
901
- "num_truncated_few_shots": 0
902
- },
903
- "harness|hendrycksTest-high_school_microeconomics|5": {
904
- "hashes": {
905
- "hash_examples": "ead6a0f2f6c83370",
906
- "hash_full_prompts": "bfc393381298609e",
907
- "hash_input_tokens": "c5fc9aeb1079c8e4",
908
- "hash_cont_tokens": "f47f041de50333b9"
909
- },
910
- "truncated": 0,
911
- "non-truncated": 952,
912
- "padded": 952,
913
- "non-padded": 0,
914
- "effective_few_shots": 5.0,
915
- "num_truncated_few_shots": 0
916
- },
917
- "harness|hendrycksTest-high_school_physics|5": {
918
- "hashes": {
919
- "hash_examples": "c3f2025990afec64",
920
- "hash_full_prompts": "fc78b4997e436734",
921
- "hash_input_tokens": "555fc385cffa84ca",
922
- "hash_cont_tokens": "ba2efcd283e938cc"
923
- },
924
- "truncated": 0,
925
- "non-truncated": 604,
926
- "padded": 604,
927
- "non-padded": 0,
928
- "effective_few_shots": 5.0,
929
- "num_truncated_few_shots": 0
930
- },
931
- "harness|hendrycksTest-high_school_psychology|5": {
932
- "hashes": {
933
- "hash_examples": "21f8aab618f6d636",
934
- "hash_full_prompts": "d5c76aa40b9dbc43",
935
- "hash_input_tokens": "febd23cbf9973b7f",
936
- "hash_cont_tokens": "942069cd363844d9"
937
- },
938
- "truncated": 0,
939
- "non-truncated": 2180,
940
- "padded": 2180,
941
- "non-padded": 0,
942
- "effective_few_shots": 5.0,
943
- "num_truncated_few_shots": 0
944
- },
945
- "harness|hendrycksTest-high_school_statistics|5": {
946
- "hashes": {
947
- "hash_examples": "2386a60a11fc5de3",
948
- "hash_full_prompts": "4c5c8be5aafac432",
949
- "hash_input_tokens": "400e55b56ee6fbd7",
950
- "hash_cont_tokens": "955ed42b6f7fa019"
951
- },
952
- "truncated": 0,
953
- "non-truncated": 864,
954
- "padded": 864,
955
- "non-padded": 0,
956
- "effective_few_shots": 5.0,
957
- "num_truncated_few_shots": 0
958
- },
959
- "harness|hendrycksTest-high_school_us_history|5": {
960
- "hashes": {
961
- "hash_examples": "74961543be40f04f",
962
- "hash_full_prompts": "5d5ca4840131ba21",
963
- "hash_input_tokens": "c639cce12a46ebad",
964
- "hash_cont_tokens": "cdd0b3dc06d933e5"
965
- },
966
- "truncated": 0,
967
- "non-truncated": 816,
968
- "padded": 816,
969
- "non-padded": 0,
970
- "effective_few_shots": 5.0,
971
- "num_truncated_few_shots": 0
972
- },
973
- "harness|hendrycksTest-high_school_world_history|5": {
974
- "hashes": {
975
- "hash_examples": "2ad2f6b7198b2234",
976
- "hash_full_prompts": "11845057459afd72",
977
- "hash_input_tokens": "b9762065cce6f3a6",
978
- "hash_cont_tokens": "9a864184946033ac"
979
- },
980
- "truncated": 0,
981
- "non-truncated": 948,
982
- "padded": 948,
983
- "non-padded": 0,
984
- "effective_few_shots": 5.0,
985
- "num_truncated_few_shots": 0
986
- },
987
- "harness|hendrycksTest-human_aging|5": {
988
- "hashes": {
989
- "hash_examples": "1a7199dc733e779b",
990
- "hash_full_prompts": "756b9096b8eaf892",
991
- "hash_input_tokens": "541a75f071dcf579",
992
- "hash_cont_tokens": "142a4a8a1138a214"
993
- },
994
- "truncated": 0,
995
- "non-truncated": 892,
996
- "padded": 892,
997
- "non-padded": 0,
998
- "effective_few_shots": 5.0,
999
- "num_truncated_few_shots": 0
1000
- },
1001
- "harness|hendrycksTest-human_sexuality|5": {
1002
- "hashes": {
1003
- "hash_examples": "7acb8fdad97f88a6",
1004
- "hash_full_prompts": "731a52ff15b8cfdb",
1005
- "hash_input_tokens": "04269e5c5a257dd9",
1006
- "hash_cont_tokens": "bc54813e809b796d"
1007
- },
1008
- "truncated": 0,
1009
- "non-truncated": 524,
1010
- "padded": 524,
1011
- "non-padded": 0,
1012
- "effective_few_shots": 5.0,
1013
- "num_truncated_few_shots": 0
1014
- },
1015
- "harness|hendrycksTest-international_law|5": {
1016
- "hashes": {
1017
- "hash_examples": "1300bfd0dfc59114",
1018
- "hash_full_prompts": "db2aefbff5eec996",
1019
- "hash_input_tokens": "d93ba9d9d38e4397",
1020
- "hash_cont_tokens": "dc45b45fcda18e5d"
1021
- },
1022
- "truncated": 0,
1023
- "non-truncated": 484,
1024
- "padded": 484,
1025
- "non-padded": 0,
1026
- "effective_few_shots": 5.0,
1027
- "num_truncated_few_shots": 0
1028
- },
1029
- "harness|hendrycksTest-jurisprudence|5": {
1030
- "hashes": {
1031
- "hash_examples": "083b1e4904c48dc2",
1032
- "hash_full_prompts": "0f89ee3fe03d6a21",
1033
- "hash_input_tokens": "9eeaccd2698b4f5a",
1034
- "hash_cont_tokens": "e3a8cd951b6e3469"
1035
- },
1036
- "truncated": 0,
1037
- "non-truncated": 432,
1038
- "padded": 432,
1039
- "non-padded": 0,
1040
- "effective_few_shots": 5.0,
1041
- "num_truncated_few_shots": 0
1042
- },
1043
- "harness|hendrycksTest-logical_fallacies|5": {
1044
- "hashes": {
1045
- "hash_examples": "709128f9926a634c",
1046
- "hash_full_prompts": "98a04b1f8f841069",
1047
- "hash_input_tokens": "b4f08f544f2b7576",
1048
- "hash_cont_tokens": "1e80dbd30f6453d5"
1049
- },
1050
- "truncated": 0,
1051
- "non-truncated": 652,
1052
- "padded": 648,
1053
- "non-padded": 4,
1054
- "effective_few_shots": 5.0,
1055
- "num_truncated_few_shots": 0
1056
- },
1057
- "harness|hendrycksTest-machine_learning|5": {
1058
- "hashes": {
1059
- "hash_examples": "88f22a636029ae47",
1060
- "hash_full_prompts": "2e1c8d4b1e0cc921",
1061
- "hash_input_tokens": "900c2a51f1174b9f",
1062
- "hash_cont_tokens": "9b37da7777378ca9"
1063
- },
1064
- "truncated": 0,
1065
- "non-truncated": 448,
1066
- "padded": 448,
1067
- "non-padded": 0,
1068
- "effective_few_shots": 5.0,
1069
- "num_truncated_few_shots": 0
1070
- },
1071
- "harness|hendrycksTest-management|5": {
1072
- "hashes": {
1073
- "hash_examples": "8c8a1e07a2151dca",
1074
- "hash_full_prompts": "f51611f514b265b0",
1075
- "hash_input_tokens": "6b36efb4689c6eca",
1076
- "hash_cont_tokens": "a01d6d39a83c4597"
1077
- },
1078
- "truncated": 0,
1079
- "non-truncated": 412,
1080
- "padded": 412,
1081
- "non-padded": 0,
1082
- "effective_few_shots": 5.0,
1083
- "num_truncated_few_shots": 0
1084
- },
1085
- "harness|hendrycksTest-marketing|5": {
1086
- "hashes": {
1087
- "hash_examples": "2668953431f91e96",
1088
- "hash_full_prompts": "77562bef997c7650",
1089
- "hash_input_tokens": "2aaac78a0cfed47a",
1090
- "hash_cont_tokens": "6aeaed4d823c98aa"
1091
- },
1092
- "truncated": 0,
1093
- "non-truncated": 936,
1094
- "padded": 936,
1095
- "non-padded": 0,
1096
- "effective_few_shots": 5.0,
1097
- "num_truncated_few_shots": 0
1098
- },
1099
- "harness|hendrycksTest-medical_genetics|5": {
1100
- "hashes": {
1101
- "hash_examples": "9c2dda34a2ea4fd2",
1102
- "hash_full_prompts": "202139046daa118f",
1103
- "hash_input_tokens": "886ca823b41c094a",
1104
- "hash_cont_tokens": "50421e30bef398f9"
1105
- },
1106
- "truncated": 0,
1107
- "non-truncated": 400,
1108
- "padded": 400,
1109
- "non-padded": 0,
1110
- "effective_few_shots": 5.0,
1111
- "num_truncated_few_shots": 0
1112
- },
1113
- "harness|hendrycksTest-miscellaneous|5": {
1114
- "hashes": {
1115
- "hash_examples": "41adb694024809c2",
1116
- "hash_full_prompts": "bffec9fc237bcf93",
1117
- "hash_input_tokens": "72fd71de7675e7d0",
1118
- "hash_cont_tokens": "9b0ab02a64603081"
1119
- },
1120
- "truncated": 0,
1121
- "non-truncated": 3132,
1122
- "padded": 3132,
1123
- "non-padded": 0,
1124
- "effective_few_shots": 5.0,
1125
- "num_truncated_few_shots": 0
1126
- },
1127
- "harness|hendrycksTest-moral_disputes|5": {
1128
- "hashes": {
1129
- "hash_examples": "3171c13ba3c594c4",
1130
- "hash_full_prompts": "170831fc36f1d59e",
1131
- "hash_input_tokens": "f3ca0dd8e7a1eb09",
1132
- "hash_cont_tokens": "8badf768f7b0467a"
1133
- },
1134
- "truncated": 0,
1135
- "non-truncated": 1384,
1136
- "padded": 1354,
1137
- "non-padded": 30,
1138
- "effective_few_shots": 5.0,
1139
- "num_truncated_few_shots": 0
1140
- },
1141
- "harness|hendrycksTest-moral_scenarios|5": {
1142
- "hashes": {
1143
- "hash_examples": "9873e077e83e0546",
1144
- "hash_full_prompts": "08f4ceba3131a068",
1145
- "hash_input_tokens": "3e793631e951f23c",
1146
- "hash_cont_tokens": "32ae620376b2bbba"
1147
- },
1148
- "truncated": 0,
1149
- "non-truncated": 3580,
1150
- "padded": 3580,
1151
- "non-padded": 0,
1152
- "effective_few_shots": 5.0,
1153
- "num_truncated_few_shots": 0
1154
- },
1155
- "harness|hendrycksTest-nutrition|5": {
1156
- "hashes": {
1157
- "hash_examples": "7db1d8142ec14323",
1158
- "hash_full_prompts": "4c0e68e3586cb453",
1159
- "hash_input_tokens": "59753c2144ea93af",
1160
- "hash_cont_tokens": "3071def75bacc404"
1161
- },
1162
- "truncated": 0,
1163
- "non-truncated": 1224,
1164
- "padded": 1224,
1165
- "non-padded": 0,
1166
- "effective_few_shots": 5.0,
1167
- "num_truncated_few_shots": 0
1168
- },
1169
- "harness|hendrycksTest-philosophy|5": {
1170
- "hashes": {
1171
- "hash_examples": "9b455b7d72811cc8",
1172
- "hash_full_prompts": "e467f822d8a0d3ff",
1173
- "hash_input_tokens": "bd8d3dbed15a8c34",
1174
- "hash_cont_tokens": "9f6ff69d23a48783"
1175
- },
1176
- "truncated": 0,
1177
- "non-truncated": 1244,
1178
- "padded": 1244,
1179
- "non-padded": 0,
1180
- "effective_few_shots": 5.0,
1181
- "num_truncated_few_shots": 0
1182
- },
1183
- "harness|hendrycksTest-prehistory|5": {
1184
- "hashes": {
1185
- "hash_examples": "8be90d0f538f1560",
1186
- "hash_full_prompts": "152187949bcd0921",
1187
- "hash_input_tokens": "3573cd87facbb7c5",
1188
- "hash_cont_tokens": "de469d2b981e32a3"
1189
- },
1190
- "truncated": 0,
1191
- "non-truncated": 1296,
1192
- "padded": 1296,
1193
- "non-padded": 0,
1194
- "effective_few_shots": 5.0,
1195
- "num_truncated_few_shots": 0
1196
- },
1197
- "harness|hendrycksTest-professional_accounting|5": {
1198
- "hashes": {
1199
- "hash_examples": "8d377597916cd07e",
1200
- "hash_full_prompts": "0eb7345d6144ee0d",
1201
- "hash_input_tokens": "17e721bc1a7cbb47",
1202
- "hash_cont_tokens": "c46f74d2dfc7b13b"
1203
- },
1204
- "truncated": 0,
1205
- "non-truncated": 1128,
1206
- "padded": 1128,
1207
- "non-padded": 0,
1208
- "effective_few_shots": 5.0,
1209
- "num_truncated_few_shots": 0
1210
- },
1211
- "harness|hendrycksTest-professional_law|5": {
1212
- "hashes": {
1213
- "hash_examples": "cd9dbc52b3c932d6",
1214
- "hash_full_prompts": "36ac764272bfb182",
1215
- "hash_input_tokens": "c9f7583fff66d361",
1216
- "hash_cont_tokens": "2e590029ef41fbcd"
1217
- },
1218
- "truncated": 0,
1219
- "non-truncated": 6136,
1220
- "padded": 6136,
1221
- "non-padded": 0,
1222
- "effective_few_shots": 5.0,
1223
- "num_truncated_few_shots": 0
1224
- },
1225
- "harness|hendrycksTest-professional_medicine|5": {
1226
- "hashes": {
1227
- "hash_examples": "b20e4e816c1e383e",
1228
- "hash_full_prompts": "7b8d69ea2acaf2f7",
1229
- "hash_input_tokens": "40a933f829116f8d",
1230
- "hash_cont_tokens": "fe35cfa9c6ca802e"
1231
- },
1232
- "truncated": 0,
1233
- "non-truncated": 1088,
1234
- "padded": 1088,
1235
- "non-padded": 0,
1236
- "effective_few_shots": 5.0,
1237
- "num_truncated_few_shots": 0
1238
- },
1239
- "harness|hendrycksTest-professional_psychology|5": {
1240
- "hashes": {
1241
- "hash_examples": "d45b73b22f9cc039",
1242
- "hash_full_prompts": "fe8937e9ffc99771",
1243
- "hash_input_tokens": "0dfb73a8eb3f692c",
1244
- "hash_cont_tokens": "f020fbddf72c8652"
1245
- },
1246
- "truncated": 0,
1247
- "non-truncated": 2448,
1248
- "padded": 2448,
1249
- "non-padded": 0,
1250
- "effective_few_shots": 5.0,
1251
- "num_truncated_few_shots": 0
1252
- },
1253
- "harness|hendrycksTest-public_relations|5": {
1254
- "hashes": {
1255
- "hash_examples": "0d25072e1761652a",
1256
- "hash_full_prompts": "f9adc39cfa9f42ba",
1257
- "hash_input_tokens": "1710c6ba4c9f3cbd",
1258
- "hash_cont_tokens": "568f585a259965c1"
1259
- },
1260
- "truncated": 0,
1261
- "non-truncated": 440,
1262
- "padded": 440,
1263
- "non-padded": 0,
1264
- "effective_few_shots": 5.0,
1265
- "num_truncated_few_shots": 0
1266
- },
1267
- "harness|hendrycksTest-security_studies|5": {
1268
- "hashes": {
1269
- "hash_examples": "62bb8197e63d60d4",
1270
- "hash_full_prompts": "869c9c3ae196b7c3",
1271
- "hash_input_tokens": "32a03f1f22a6e103",
1272
- "hash_cont_tokens": "cc6fd7cccd64cd5d"
1273
- },
1274
- "truncated": 0,
1275
- "non-truncated": 980,
1276
- "padded": 980,
1277
- "non-padded": 0,
1278
- "effective_few_shots": 5.0,
1279
- "num_truncated_few_shots": 0
1280
- },
1281
- "harness|hendrycksTest-sociology|5": {
1282
- "hashes": {
1283
- "hash_examples": "e7959df87dea8672",
1284
- "hash_full_prompts": "1a1fc00e17b3a52a",
1285
- "hash_input_tokens": "828999f7624cbe7e",
1286
- "hash_cont_tokens": "c3a3bdfd177eed5b"
1287
- },
1288
- "truncated": 0,
1289
- "non-truncated": 804,
1290
- "padded": 804,
1291
- "non-padded": 0,
1292
- "effective_few_shots": 5.0,
1293
- "num_truncated_few_shots": 0
1294
- },
1295
- "harness|hendrycksTest-us_foreign_policy|5": {
1296
- "hashes": {
1297
- "hash_examples": "4a56a01ddca44dca",
1298
- "hash_full_prompts": "0c7a7081c71c07b6",
1299
- "hash_input_tokens": "42054621e718dbee",
1300
- "hash_cont_tokens": "2568d0e8e36fa959"
1301
- },
1302
- "truncated": 0,
1303
- "non-truncated": 400,
1304
- "padded": 400,
1305
- "non-padded": 0,
1306
- "effective_few_shots": 5.0,
1307
- "num_truncated_few_shots": 0
1308
- },
1309
- "harness|hendrycksTest-virology|5": {
1310
- "hashes": {
1311
- "hash_examples": "451cc86a8c4f4fe9",
1312
- "hash_full_prompts": "01e95325d8b738e4",
1313
- "hash_input_tokens": "6c4f0aa4dc859c04",
1314
- "hash_cont_tokens": "926cf60b0891f374"
1315
- },
1316
- "truncated": 0,
1317
- "non-truncated": 664,
1318
- "padded": 664,
1319
- "non-padded": 0,
1320
- "effective_few_shots": 5.0,
1321
- "num_truncated_few_shots": 0
1322
- },
1323
- "harness|hendrycksTest-world_religions|5": {
1324
- "hashes": {
1325
- "hash_examples": "3b29cfaf1a81c379",
1326
- "hash_full_prompts": "e0d79a15083dfdff",
1327
- "hash_input_tokens": "6c75d44e092ff24f",
1328
- "hash_cont_tokens": "c525a5de974c1ea3"
1329
- },
1330
- "truncated": 0,
1331
- "non-truncated": 684,
1332
- "padded": 684,
1333
- "non-padded": 0,
1334
- "effective_few_shots": 5.0,
1335
- "num_truncated_few_shots": 0
1336
- },
1337
- "harness|truthfulqa:mc|0": {
1338
- "hashes": {
1339
- "hash_examples": "23176c0531c7b867",
1340
- "hash_full_prompts": "36a6d90e75d92d4a",
1341
- "hash_input_tokens": "2738d7ed7075faa7",
1342
- "hash_cont_tokens": "c014154380b74b9e"
1343
- },
1344
- "truncated": 0,
1345
- "non-truncated": 9996,
1346
- "padded": 9996,
1347
- "non-padded": 0,
1348
- "effective_few_shots": 0.0,
1349
- "num_truncated_few_shots": 0
1350
- }
1351
- },
1352
- "summary_general": {
1353
- "hashes": {
1354
- "hash_examples": "d84d18e9a963753d",
1355
- "hash_full_prompts": "12b540783521a8e6",
1356
- "hash_input_tokens": "5c73a7dce6ccf737",
1357
- "hash_cont_tokens": "fb1646e2bdd5fc38"
1358
- },
1359
- "total_evaluation_time_secondes": "19912.24178814888",
1360
- "truncated": 0,
1361
- "non-truncated": 111019,
1362
- "padded": 110926,
1363
- "non-padded": 93,
1364
- "num_truncated_few_shots": 0
1365
- }
1366
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-7b-hf/results_2023-09-07T13-40-06.600532.json DELETED
@@ -1,61 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-7b-hf",
4
- "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
5
- "model_size": "12.61 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|winogrande|5": {
16
- "acc": 0.7403314917127072,
17
- "acc_stderr": 0.012322700705552667
18
- },
19
- "all": {
20
- "acc": 0.7403314917127072,
21
- "acc_stderr": 0.012322700705552667
22
- }
23
- },
24
- "versions": {
25
- "harness|winogrande|5": 0,
26
- "all": 0
27
- },
28
- "config_tasks": {
29
- "harness|winogrande": "LM Harness task"
30
- },
31
- "summary_tasks": {
32
- "harness|winogrande|5": {
33
- "hashes": {
34
- "hash_examples": "aada0a176fd81218",
35
- "hash_full_prompts": "c8655cbd12de8409",
36
- "hash_input_tokens": "c0bedf98cb040854",
37
- "hash_cont_tokens": "f08975ad6f2d5864"
38
- },
39
- "truncated": 0,
40
- "non-truncated": 2534,
41
- "padded": 2432,
42
- "non-padded": 102,
43
- "effective_few_shots": 5.0,
44
- "num_truncated_few_shots": 0
45
- }
46
- },
47
- "summary_general": {
48
- "hashes": {
49
- "hash_examples": "42f54c7ae3f28ef3",
50
- "hash_full_prompts": "897c968b27a8c59a",
51
- "hash_input_tokens": "ee5c3cb253d643d1",
52
- "hash_cont_tokens": "273a70958f734c00"
53
- },
54
- "total_evaluation_time_secondes": "91.96187782287598",
55
- "truncated": 0,
56
- "non-truncated": 2534,
57
- "padded": 2432,
58
- "non-padded": 102,
59
- "num_truncated_few_shots": 0
60
- }
61
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-7b-hf/results_2023-09-08T17-00-44.389859.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-7b-hf",
4
- "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
5
- "model_size": "12.61 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "457ac5672c5fdebfd6bc95bb94bda825c148eccf",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.0012583892617449664,
17
- "em_stderr": 0.00036305608931194434,
18
- "f1": 0.055925964765100665,
19
- "f1_stderr": 0.0013181664771628632
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.0712661106899166,
23
- "acc_stderr": 0.007086462127954491
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.7403314917127072,
27
- "acc_stderr": 0.012322700705552667
28
- },
29
- "all": {
30
- "em": 0.0012583892617449664,
31
- "em_stderr": 0.00036305608931194434,
32
- "f1": 0.055925964765100665,
33
- "f1_stderr": 0.0013181664771628632,
34
- "acc": 0.4057988012013119,
35
- "acc_stderr": 0.00970458141675358
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "ef74ade15eb78da6"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "542d7b742ca594d0"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "58a2c19976e6dde8"
99
- },
100
- "total_evaluation_time_secondes": "4621.534999847412",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-7b-hf/results_2023-09-09T12-32-30.613622.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-7b-hf",
4
- "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
5
- "model_size": "3.57 GB",
6
- "model_dtype": "4bit",
7
- "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.0010486577181208054,
17
- "em_stderr": 0.00033145814652191404,
18
- "f1": 0.05131291946308739,
19
- "f1_stderr": 0.0012542058656851648
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.053828658074298714,
23
- "acc_stderr": 0.006216328640238123
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.7458563535911602,
27
- "acc_stderr": 0.012236307219708262
28
- },
29
- "all": {
30
- "em": 0.0010486577181208054,
31
- "em_stderr": 0.00033145814652191404,
32
- "f1": 0.05131291946308739,
33
- "f1_stderr": 0.0012542058656851648,
34
- "acc": 0.39984250583272946,
35
- "acc_stderr": 0.009226317929973193
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "586467d69620d89a"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "d10e414c90dc2a07"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "a4df4eaec2b09094"
99
- },
100
- "total_evaluation_time_secondes": "8584.129543542862",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
meta-llama/Llama-2-7b-hf/results_2023-09-20T14-39-46.791628.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "config_general": {
3
- "model_name": "meta-llama/Llama-2-7b-hf",
4
- "model_sha": "6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
5
- "model_size": "12.61 GB",
6
- "model_dtype": "torch.float16",
7
- "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374",
8
- "num_few_shot_default": 0,
9
- "num_fewshot_seeds": 1,
10
- "override_batch_size": 1,
11
- "max_samples": null,
12
- "job_id": ""
13
- },
14
- "results": {
15
- "harness|drop|3": {
16
- "em": 0.0012583892617449664,
17
- "em_stderr": 0.00036305608931194434,
18
- "f1": 0.055925964765100665,
19
- "f1_stderr": 0.0013181664771628632
20
- },
21
- "harness|gsm8k|5": {
22
- "acc": 0.0712661106899166,
23
- "acc_stderr": 0.007086462127954491
24
- },
25
- "harness|winogrande|5": {
26
- "acc": 0.7403314917127072,
27
- "acc_stderr": 0.012322700705552667
28
- },
29
- "all": {
30
- "em": 0.0012583892617449664,
31
- "em_stderr": 0.00036305608931194434,
32
- "f1": 0.055925964765100665,
33
- "f1_stderr": 0.0013181664771628632,
34
- "acc": 0.4057988012013119,
35
- "acc_stderr": 0.00970458141675358
36
- }
37
- },
38
- "versions": {
39
- "harness|drop|3": 1,
40
- "harness|gsm8k|5": 0,
41
- "harness|winogrande|5": 0,
42
- "all": 0
43
- },
44
- "config_tasks": {
45
- "harness|drop": "LM Harness task",
46
- "harness|gsm8k": "LM Harness task",
47
- "harness|winogrande": "LM Harness task"
48
- },
49
- "summary_tasks": {
50
- "harness|drop|3": {
51
- "hashes": {
52
- "hash_examples": "1d27416e8324e9a3",
53
- "hash_full_prompts": "a5513ff9a741b385",
54
- "hash_input_tokens": "42076f0efbb50aa6",
55
- "hash_cont_tokens": "ef74ade15eb78da6"
56
- },
57
- "truncated": 3,
58
- "non-truncated": 9533,
59
- "padded": 0,
60
- "non-padded": 9536,
61
- "effective_few_shots": 3.0,
62
- "num_truncated_few_shots": 0
63
- },
64
- "harness|gsm8k|5": {
65
- "hashes": {
66
- "hash_examples": "4c0843a5d99bcfdc",
67
- "hash_full_prompts": "41d55e83abc0e02d",
68
- "hash_input_tokens": "bda342e47b5099b2",
69
- "hash_cont_tokens": "542d7b742ca594d0"
70
- },
71
- "truncated": 0,
72
- "non-truncated": 1319,
73
- "padded": 0,
74
- "non-padded": 1319,
75
- "effective_few_shots": 5.0,
76
- "num_truncated_few_shots": 0
77
- },
78
- "harness|winogrande|5": {
79
- "hashes": {
80
- "hash_examples": "aada0a176fd81218",
81
- "hash_full_prompts": "c8655cbd12de8409",
82
- "hash_input_tokens": "c0bedf98cb040854",
83
- "hash_cont_tokens": "f08975ad6f2d5864"
84
- },
85
- "truncated": 0,
86
- "non-truncated": 2534,
87
- "padded": 2432,
88
- "non-padded": 102,
89
- "effective_few_shots": 5.0,
90
- "num_truncated_few_shots": 0
91
- }
92
- },
93
- "summary_general": {
94
- "hashes": {
95
- "hash_examples": "9b4d8993161e637d",
96
- "hash_full_prompts": "08215e527b7e60a5",
97
- "hash_input_tokens": "a12f3e3c934bd78b",
98
- "hash_cont_tokens": "58a2c19976e6dde8"
99
- },
100
- "total_evaluation_time_secondes": "9738.608674764633",
101
- "truncated": 3,
102
- "non-truncated": 13386,
103
- "padded": 2432,
104
- "non-padded": 10957,
105
- "num_truncated_few_shots": 0
106
- }
107
- }