File size: 16,385 Bytes
4d9b54f
{
  "results": {
    "harness|arc:challenge|25": {
      "acc": 0.6262798634812287,
      "acc_stderr": 0.014137708601759091,
      "acc_norm": 0.6732081911262798,
      "acc_norm_stderr": 0.013706665975587333
    },
    "harness|hellaswag|10": {
      "acc": 0.6760605457080263,
      "acc_stderr": 0.00467020812857923,
      "acc_norm": 0.8733320055765784,
      "acc_norm_stderr": 0.0033192094001351187
    },
    "harness|hendrycksTest-abstract_algebra|5": {
      "acc": 0.33,
      "acc_stderr": 0.04725815626252605,
      "acc_norm": 0.33,
      "acc_norm_stderr": 0.04725815626252605
    },
    "harness|hendrycksTest-anatomy|5": {
      "acc": 0.6296296296296297,
      "acc_stderr": 0.04171654161354544,
      "acc_norm": 0.6296296296296297,
      "acc_norm_stderr": 0.04171654161354544
    },
    "harness|hendrycksTest-astronomy|5": {
      "acc": 0.8092105263157895,
      "acc_stderr": 0.031975658210325,
      "acc_norm": 0.8092105263157895,
      "acc_norm_stderr": 0.031975658210325
    },
    "harness|hendrycksTest-business_ethics|5": {
      "acc": 0.72,
      "acc_stderr": 0.04512608598542127,
      "acc_norm": 0.72,
      "acc_norm_stderr": 0.04512608598542127
    },
    "harness|hendrycksTest-clinical_knowledge|5": {
      "acc": 0.7169811320754716,
      "acc_stderr": 0.027724236492700918,
      "acc_norm": 0.7169811320754716,
      "acc_norm_stderr": 0.027724236492700918
    },
    "harness|hendrycksTest-college_biology|5": {
      "acc": 0.8472222222222222,
      "acc_stderr": 0.030085743248565666,
      "acc_norm": 0.8472222222222222,
      "acc_norm_stderr": 0.030085743248565666
    },
    "harness|hendrycksTest-college_chemistry|5": {
      "acc": 0.51,
      "acc_stderr": 0.05024183937956912,
      "acc_norm": 0.51,
      "acc_norm_stderr": 0.05024183937956912
    },
    "harness|hendrycksTest-college_computer_science|5": {
      "acc": 0.6,
      "acc_stderr": 0.049236596391733084,
      "acc_norm": 0.6,
      "acc_norm_stderr": 0.049236596391733084
    },
    "harness|hendrycksTest-college_mathematics|5": {
      "acc": 0.37,
      "acc_stderr": 0.048523658709391,
      "acc_norm": 0.37,
      "acc_norm_stderr": 0.048523658709391
    },
    "harness|hendrycksTest-college_medicine|5": {
      "acc": 0.6416184971098265,
      "acc_stderr": 0.03656343653353159,
      "acc_norm": 0.6416184971098265,
      "acc_norm_stderr": 0.03656343653353159
    },
    "harness|hendrycksTest-college_physics|5": {
      "acc": 0.37254901960784315,
      "acc_stderr": 0.04810840148082635,
      "acc_norm": 0.37254901960784315,
      "acc_norm_stderr": 0.04810840148082635
    },
    "harness|hendrycksTest-computer_security|5": {
      "acc": 0.77,
      "acc_stderr": 0.04229525846816506,
      "acc_norm": 0.77,
      "acc_norm_stderr": 0.04229525846816506
    },
    "harness|hendrycksTest-conceptual_physics|5": {
      "acc": 0.6638297872340425,
      "acc_stderr": 0.030881618520676942,
      "acc_norm": 0.6638297872340425,
      "acc_norm_stderr": 0.030881618520676942
    },
    "harness|hendrycksTest-econometrics|5": {
      "acc": 0.4473684210526316,
      "acc_stderr": 0.04677473004491199,
      "acc_norm": 0.4473684210526316,
      "acc_norm_stderr": 0.04677473004491199
    },
    "harness|hendrycksTest-electrical_engineering|5": {
      "acc": 0.6551724137931034,
      "acc_stderr": 0.03960933549451207,
      "acc_norm": 0.6551724137931034,
      "acc_norm_stderr": 0.03960933549451207
    },
    "harness|hendrycksTest-elementary_mathematics|5": {
      "acc": 0.43386243386243384,
      "acc_stderr": 0.025525034382474894,
      "acc_norm": 0.43386243386243384,
      "acc_norm_stderr": 0.025525034382474894
    },
    "harness|hendrycksTest-formal_logic|5": {
      "acc": 0.47619047619047616,
      "acc_stderr": 0.04467062628403273,
      "acc_norm": 0.47619047619047616,
      "acc_norm_stderr": 0.04467062628403273
    },
    "harness|hendrycksTest-global_facts|5": {
      "acc": 0.46,
      "acc_stderr": 0.05009082659620332,
      "acc_norm": 0.46,
      "acc_norm_stderr": 0.05009082659620332
    },
    "harness|hendrycksTest-high_school_biology|5": {
      "acc": 0.8193548387096774,
      "acc_stderr": 0.02188617856717253,
      "acc_norm": 0.8193548387096774,
      "acc_norm_stderr": 0.02188617856717253
    },
    "harness|hendrycksTest-high_school_chemistry|5": {
      "acc": 0.5123152709359606,
      "acc_stderr": 0.035169204442208966,
      "acc_norm": 0.5123152709359606,
      "acc_norm_stderr": 0.035169204442208966
    },
    "harness|hendrycksTest-high_school_computer_science|5": {
      "acc": 0.79,
      "acc_stderr": 0.040936018074033256,
      "acc_norm": 0.79,
      "acc_norm_stderr": 0.040936018074033256
    },
    "harness|hendrycksTest-high_school_european_history|5": {
      "acc": 0.8303030303030303,
      "acc_stderr": 0.029311188674983134,
      "acc_norm": 0.8303030303030303,
      "acc_norm_stderr": 0.029311188674983134
    },
    "harness|hendrycksTest-high_school_geography|5": {
      "acc": 0.8787878787878788,
      "acc_stderr": 0.023253157951942084,
      "acc_norm": 0.8787878787878788,
      "acc_norm_stderr": 0.023253157951942084
    },
    "harness|hendrycksTest-high_school_government_and_politics|5": {
      "acc": 0.9430051813471503,
      "acc_stderr": 0.016731085293607555,
      "acc_norm": 0.9430051813471503,
      "acc_norm_stderr": 0.016731085293607555
    },
    "harness|hendrycksTest-high_school_macroeconomics|5": {
      "acc": 0.7410256410256411,
      "acc_stderr": 0.02221110681006167,
      "acc_norm": 0.7410256410256411,
      "acc_norm_stderr": 0.02221110681006167
    },
    "harness|hendrycksTest-high_school_mathematics|5": {
      "acc": 0.35555555555555557,
      "acc_stderr": 0.029185714949857403,
      "acc_norm": 0.35555555555555557,
      "acc_norm_stderr": 0.029185714949857403
    },
    "harness|hendrycksTest-high_school_microeconomics|5": {
      "acc": 0.7647058823529411,
      "acc_stderr": 0.02755361446786381,
      "acc_norm": 0.7647058823529411,
      "acc_norm_stderr": 0.02755361446786381
    },
    "harness|hendrycksTest-high_school_physics|5": {
      "acc": 0.4304635761589404,
      "acc_stderr": 0.04042809961395634,
      "acc_norm": 0.4304635761589404,
      "acc_norm_stderr": 0.04042809961395634
    },
    "harness|hendrycksTest-high_school_psychology|5": {
      "acc": 0.8733944954128441,
      "acc_stderr": 0.014257128686165169,
      "acc_norm": 0.8733944954128441,
      "acc_norm_stderr": 0.014257128686165169
    },
    "harness|hendrycksTest-high_school_statistics|5": {
      "acc": 0.6342592592592593,
      "acc_stderr": 0.032847388576472056,
      "acc_norm": 0.6342592592592593,
      "acc_norm_stderr": 0.032847388576472056
    },
    "harness|hendrycksTest-high_school_us_history|5": {
      "acc": 0.8970588235294118,
      "acc_stderr": 0.02132833757080437,
      "acc_norm": 0.8970588235294118,
      "acc_norm_stderr": 0.02132833757080437
    },
    "harness|hendrycksTest-high_school_world_history|5": {
      "acc": 0.8776371308016878,
      "acc_stderr": 0.021331741829746786,
      "acc_norm": 0.8776371308016878,
      "acc_norm_stderr": 0.021331741829746786
    },
    "harness|hendrycksTest-human_aging|5": {
      "acc": 0.8026905829596412,
      "acc_stderr": 0.02670985334496796,
      "acc_norm": 0.8026905829596412,
      "acc_norm_stderr": 0.02670985334496796
    },
    "harness|hendrycksTest-human_sexuality|5": {
      "acc": 0.8778625954198473,
      "acc_stderr": 0.028718776889342344,
      "acc_norm": 0.8778625954198473,
      "acc_norm_stderr": 0.028718776889342344
    },
    "harness|hendrycksTest-international_law|5": {
      "acc": 0.8760330578512396,
      "acc_stderr": 0.03008309871603521,
      "acc_norm": 0.8760330578512396,
      "acc_norm_stderr": 0.03008309871603521
    },
    "harness|hendrycksTest-jurisprudence|5": {
      "acc": 0.8333333333333334,
      "acc_stderr": 0.03602814176392645,
      "acc_norm": 0.8333333333333334,
      "acc_norm_stderr": 0.03602814176392645
    },
    "harness|hendrycksTest-logical_fallacies|5": {
      "acc": 0.803680981595092,
      "acc_stderr": 0.031207970394709218,
      "acc_norm": 0.803680981595092,
      "acc_norm_stderr": 0.031207970394709218
    },
    "harness|hendrycksTest-machine_learning|5": {
      "acc": 0.5357142857142857,
      "acc_stderr": 0.04733667890053756,
      "acc_norm": 0.5357142857142857,
      "acc_norm_stderr": 0.04733667890053756
    },
    "harness|hendrycksTest-management|5": {
      "acc": 0.8349514563106796,
      "acc_stderr": 0.03675668832233188,
      "acc_norm": 0.8349514563106796,
      "acc_norm_stderr": 0.03675668832233188
    },
    "harness|hendrycksTest-marketing|5": {
      "acc": 0.905982905982906,
      "acc_stderr": 0.01911989279892498,
      "acc_norm": 0.905982905982906,
      "acc_norm_stderr": 0.01911989279892498
    },
    "harness|hendrycksTest-medical_genetics|5": {
      "acc": 0.74,
      "acc_stderr": 0.04408440022768077,
      "acc_norm": 0.74,
      "acc_norm_stderr": 0.04408440022768077
    },
    "harness|hendrycksTest-miscellaneous|5": {
      "acc": 0.8620689655172413,
      "acc_stderr": 0.012331009307795656,
      "acc_norm": 0.8620689655172413,
      "acc_norm_stderr": 0.012331009307795656
    },
    "harness|hendrycksTest-moral_disputes|5": {
      "acc": 0.7774566473988439,
      "acc_stderr": 0.02239421566194282,
      "acc_norm": 0.7774566473988439,
      "acc_norm_stderr": 0.02239421566194282
    },
    "harness|hendrycksTest-moral_scenarios|5": {
      "acc": 0.4547486033519553,
      "acc_stderr": 0.016653875777524012,
      "acc_norm": 0.4547486033519553,
      "acc_norm_stderr": 0.016653875777524012
    },
    "harness|hendrycksTest-nutrition|5": {
      "acc": 0.7810457516339869,
      "acc_stderr": 0.02367908986180772,
      "acc_norm": 0.7810457516339869,
      "acc_norm_stderr": 0.02367908986180772
    },
    "harness|hendrycksTest-philosophy|5": {
      "acc": 0.7877813504823151,
      "acc_stderr": 0.023222756797435115,
      "acc_norm": 0.7877813504823151,
      "acc_norm_stderr": 0.023222756797435115
    },
    "harness|hendrycksTest-prehistory|5": {
      "acc": 0.8364197530864198,
      "acc_stderr": 0.020581466138257114,
      "acc_norm": 0.8364197530864198,
      "acc_norm_stderr": 0.020581466138257114
    },
    "harness|hendrycksTest-professional_accounting|5": {
      "acc": 0.5673758865248227,
      "acc_stderr": 0.02955545423677884,
      "acc_norm": 0.5673758865248227,
      "acc_norm_stderr": 0.02955545423677884
    },
    "harness|hendrycksTest-professional_law|5": {
      "acc": 0.5319426336375489,
      "acc_stderr": 0.012744149704869645,
      "acc_norm": 0.5319426336375489,
      "acc_norm_stderr": 0.012744149704869645
    },
    "harness|hendrycksTest-professional_medicine|5": {
      "acc": 0.75,
      "acc_stderr": 0.026303648393696036,
      "acc_norm": 0.75,
      "acc_norm_stderr": 0.026303648393696036
    },
    "harness|hendrycksTest-professional_psychology|5": {
      "acc": 0.7565359477124183,
      "acc_stderr": 0.01736247376214662,
      "acc_norm": 0.7565359477124183,
      "acc_norm_stderr": 0.01736247376214662
    },
    "harness|hendrycksTest-public_relations|5": {
      "acc": 0.6909090909090909,
      "acc_stderr": 0.044262946482000985,
      "acc_norm": 0.6909090909090909,
      "acc_norm_stderr": 0.044262946482000985
    },
    "harness|hendrycksTest-security_studies|5": {
      "acc": 0.7918367346938775,
      "acc_stderr": 0.0259911176728133,
      "acc_norm": 0.7918367346938775,
      "acc_norm_stderr": 0.0259911176728133
    },
    "harness|hendrycksTest-sociology|5": {
      "acc": 0.900497512437811,
      "acc_stderr": 0.021166216304659393,
      "acc_norm": 0.900497512437811,
      "acc_norm_stderr": 0.021166216304659393
    },
    "harness|hendrycksTest-us_foreign_policy|5": {
      "acc": 0.92,
      "acc_stderr": 0.0272659924344291,
      "acc_norm": 0.92,
      "acc_norm_stderr": 0.0272659924344291
    },
    "harness|hendrycksTest-virology|5": {
      "acc": 0.5301204819277109,
      "acc_stderr": 0.03885425420866767,
      "acc_norm": 0.5301204819277109,
      "acc_norm_stderr": 0.03885425420866767
    },
    "harness|hendrycksTest-world_religions|5": {
      "acc": 0.8538011695906432,
      "acc_stderr": 0.027097290118070806,
      "acc_norm": 0.8538011695906432,
      "acc_norm_stderr": 0.027097290118070806
    },
    "harness|truthfulqa:mc|0": {
      "mc1": 0.3108935128518972,
      "mc1_stderr": 0.016203316673559696,
      "mc2": 0.44923493721887353,
      "mc2_stderr": 0.01390226410719232
    },
    "all": {
      "acc": 0.6967225637378714,
      "acc_stderr": 0.030867069907791145,
      "acc_norm": 0.7008615431872544,
      "acc_norm_stderr": 0.030836865817034945,
      "mc1": 0.3108935128518972,
      "mc1_stderr": 0.016203316673559696,
      "mc2": 0.44923493721887353,
      "mc2_stderr": 0.01390226410719232
    }
  },
  "versions": {
    "harness|arc:challenge|25": 0,
    "harness|hellaswag|10": 0,
    "harness|hendrycksTest-abstract_algebra|5": 1,
    "harness|hendrycksTest-anatomy|5": 1,
    "harness|hendrycksTest-astronomy|5": 1,
    "harness|hendrycksTest-business_ethics|5": 1,
    "harness|hendrycksTest-clinical_knowledge|5": 1,
    "harness|hendrycksTest-college_biology|5": 1,
    "harness|hendrycksTest-college_chemistry|5": 1,
    "harness|hendrycksTest-college_computer_science|5": 1,
    "harness|hendrycksTest-college_mathematics|5": 1,
    "harness|hendrycksTest-college_medicine|5": 1,
    "harness|hendrycksTest-college_physics|5": 1,
    "harness|hendrycksTest-computer_security|5": 1,
    "harness|hendrycksTest-conceptual_physics|5": 1,
    "harness|hendrycksTest-econometrics|5": 1,
    "harness|hendrycksTest-electrical_engineering|5": 1,
    "harness|hendrycksTest-elementary_mathematics|5": 1,
    "harness|hendrycksTest-formal_logic|5": 1,
    "harness|hendrycksTest-global_facts|5": 1,
    "harness|hendrycksTest-high_school_biology|5": 1,
    "harness|hendrycksTest-high_school_chemistry|5": 1,
    "harness|hendrycksTest-high_school_computer_science|5": 1,
    "harness|hendrycksTest-high_school_european_history|5": 1,
    "harness|hendrycksTest-high_school_geography|5": 1,
    "harness|hendrycksTest-high_school_government_and_politics|5": 1,
    "harness|hendrycksTest-high_school_macroeconomics|5": 1,
    "harness|hendrycksTest-high_school_mathematics|5": 1,
    "harness|hendrycksTest-high_school_microeconomics|5": 1,
    "harness|hendrycksTest-high_school_physics|5": 1,
    "harness|hendrycksTest-high_school_psychology|5": 1,
    "harness|hendrycksTest-high_school_statistics|5": 1,
    "harness|hendrycksTest-high_school_us_history|5": 1,
    "harness|hendrycksTest-high_school_world_history|5": 1,
    "harness|hendrycksTest-human_aging|5": 1,
    "harness|hendrycksTest-human_sexuality|5": 1,
    "harness|hendrycksTest-international_law|5": 1,
    "harness|hendrycksTest-jurisprudence|5": 1,
    "harness|hendrycksTest-logical_fallacies|5": 1,
    "harness|hendrycksTest-machine_learning|5": 1,
    "harness|hendrycksTest-management|5": 1,
    "harness|hendrycksTest-marketing|5": 1,
    "harness|hendrycksTest-medical_genetics|5": 1,
    "harness|hendrycksTest-miscellaneous|5": 1,
    "harness|hendrycksTest-moral_disputes|5": 1,
    "harness|hendrycksTest-moral_scenarios|5": 1,
    "harness|hendrycksTest-nutrition|5": 1,
    "harness|hendrycksTest-philosophy|5": 1,
    "harness|hendrycksTest-prehistory|5": 1,
    "harness|hendrycksTest-professional_accounting|5": 1,
    "harness|hendrycksTest-professional_law|5": 1,
    "harness|hendrycksTest-professional_medicine|5": 1,
    "harness|hendrycksTest-professional_psychology|5": 1,
    "harness|hendrycksTest-public_relations|5": 1,
    "harness|hendrycksTest-security_studies|5": 1,
    "harness|hendrycksTest-sociology|5": 1,
    "harness|hendrycksTest-us_foreign_policy|5": 1,
    "harness|hendrycksTest-virology|5": 1,
    "harness|hendrycksTest-world_religions|5": 1,
    "harness|truthfulqa:mc|0": 1,
    "all": 0
  },
  "config": {
    "model_name": "meta-llama/Llama-2-70b-hf",
    "model_sha": "ed7b07231238f836b99bf45701b9a0063576b194",
    "model_dtype": "torch.float16",
    "lighteval_sha": "d2e819bc028044e701a13b954d3326ceddb71b98",
    "num_few_shot_default": 0,
    "num_fewshot_seeds": 1,
    "override_batch_size": 1,
    "max_samples": null
  }
}
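
For anyone who wants to consume this file programmatically, below is a minimal Python sketch that loads the JSON above and prints one score per task. It is not part of the original file: the filename "results.json" and the choice to prefer acc_norm over acc (and mc2 for TruthfulQA) are assumptions for illustration only.

import json

# Load the results file (filename is an assumption, not given in the original).
with open("results.json") as f:
    data = json.load(f)

# Keys under "results" follow the pattern "harness|<task>|<num_fewshot>".
for task, metrics in data["results"].items():
    # Prefer normalized accuracy when present; fall back to acc, then mc2
    # (TruthfulQA reports only mc1/mc2).
    score = metrics.get("acc_norm", metrics.get("acc", metrics.get("mc2")))
    print(f"{task}: {score:.4f}")

# The "all" entry holds the aggregate over the individual tasks.
print("aggregate acc_norm:", data["results"]["all"]["acc_norm"])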