Files changed (1) hide show
  1. README.md +385 -0
README.md CHANGED
@@ -124,4 +124,389 @@ or for CoT (❗For the simple math questions, we do NOT recommend to use the CoT
124
 
125
  ```
126
  "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  ```
 
124
 
125
  ```
126
  "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
127
+ ```
128
+
129
+ ## Eval
130
+
131
+ ```json
132
+ {
133
+ "all": {
134
+ "acc": 0.6868282613819305,
135
+ "acc_stderr": 0.030371866427473967,
136
+ "acc_norm": 0.695311288530275,
137
+ "acc_norm_stderr": 0.030984285786669577,
138
+ "mc1": 0.36964504283965727,
139
+ "mc1_stderr": 0.01689818070697388,
140
+ "mc2": 0.5360987678643523,
141
+ "mc2_stderr": 0.014938153988985473
142
+ },
143
+ "harness|arc:challenge|25": {
144
+ "acc": 0.6424914675767918,
145
+ "acc_stderr": 0.014005494275916573,
146
+ "acc_norm": 0.6851535836177475,
147
+ "acc_norm_stderr": 0.01357265770308495
148
+ },
149
+ "harness|hellaswag|10": {
150
+ "acc": 0.6836287592113125,
151
+ "acc_stderr": 0.004641092001425294,
152
+ "acc_norm": 0.8686516630153356,
153
+ "acc_norm_stderr": 0.0033709059327855567
154
+ },
155
+ "harness|hendrycksTest-abstract_algebra|5": {
156
+ "acc": 0.33,
157
+ "acc_stderr": 0.04725815626252605,
158
+ "acc_norm": 0.33,
159
+ "acc_norm_stderr": 0.04725815626252605
160
+ },
161
+ "harness|hendrycksTest-anatomy|5": {
162
+ "acc": 0.6222222222222222,
163
+ "acc_stderr": 0.04188307537595852,
164
+ "acc_norm": 0.6222222222222222,
165
+ "acc_norm_stderr": 0.04188307537595852
166
+ },
167
+ "harness|hendrycksTest-astronomy|5": {
168
+ "acc": 0.8026315789473685,
169
+ "acc_stderr": 0.03238981601699397,
170
+ "acc_norm": 0.8026315789473685,
171
+ "acc_norm_stderr": 0.03238981601699397
172
+ },
173
+ "harness|hendrycksTest-business_ethics|5": {
174
+ "acc": 0.72,
175
+ "acc_stderr": 0.04512608598542127,
176
+ "acc_norm": 0.72,
177
+ "acc_norm_stderr": 0.04512608598542127
178
+ },
179
+ "harness|hendrycksTest-clinical_knowledge|5": {
180
+ "acc": 0.7471698113207547,
181
+ "acc_stderr": 0.026749899771241214,
182
+ "acc_norm": 0.7471698113207547,
183
+ "acc_norm_stderr": 0.026749899771241214
184
+ },
185
+ "harness|hendrycksTest-college_biology|5": {
186
+ "acc": 0.8194444444444444,
187
+ "acc_stderr": 0.032166008088022675,
188
+ "acc_norm": 0.8194444444444444,
189
+ "acc_norm_stderr": 0.032166008088022675
190
+ },
191
+ "harness|hendrycksTest-college_chemistry|5": {
192
+ "acc": 0.48,
193
+ "acc_stderr": 0.050211673156867795,
194
+ "acc_norm": 0.48,
195
+ "acc_norm_stderr": 0.050211673156867795
196
+ },
197
+ "harness|hendrycksTest-college_computer_science|5": {
198
+ "acc": 0.57,
199
+ "acc_stderr": 0.04975698519562428,
200
+ "acc_norm": 0.57,
201
+ "acc_norm_stderr": 0.04975698519562428
202
+ },
203
+ "harness|hendrycksTest-college_mathematics|5": {
204
+ "acc": 0.33,
205
+ "acc_stderr": 0.04725815626252604,
206
+ "acc_norm": 0.33,
207
+ "acc_norm_stderr": 0.04725815626252604
208
+ },
209
+ "harness|hendrycksTest-college_medicine|5": {
210
+ "acc": 0.6647398843930635,
211
+ "acc_stderr": 0.03599586301247077,
212
+ "acc_norm": 0.6647398843930635,
213
+ "acc_norm_stderr": 0.03599586301247077
214
+ },
215
+ "harness|hendrycksTest-college_physics|5": {
216
+ "acc": 0.35294117647058826,
217
+ "acc_stderr": 0.047551296160629475,
218
+ "acc_norm": 0.35294117647058826,
219
+ "acc_norm_stderr": 0.047551296160629475
220
+ },
221
+ "harness|hendrycksTest-computer_security|5": {
222
+ "acc": 0.74,
223
+ "acc_stderr": 0.04408440022768079,
224
+ "acc_norm": 0.74,
225
+ "acc_norm_stderr": 0.04408440022768079
226
+ },
227
+ "harness|hendrycksTest-conceptual_physics|5": {
228
+ "acc": 0.6893617021276596,
229
+ "acc_stderr": 0.03025123757921317,
230
+ "acc_norm": 0.6893617021276596,
231
+ "acc_norm_stderr": 0.03025123757921317
232
+ },
233
+ "harness|hendrycksTest-econometrics|5": {
234
+ "acc": 0.40350877192982454,
235
+ "acc_stderr": 0.046151869625837026,
236
+ "acc_norm": 0.40350877192982454,
237
+ "acc_norm_stderr": 0.046151869625837026
238
+ },
239
+ "harness|hendrycksTest-electrical_engineering|5": {
240
+ "acc": 0.6068965517241379,
241
+ "acc_stderr": 0.040703290137070705,
242
+ "acc_norm": 0.6068965517241379,
243
+ "acc_norm_stderr": 0.040703290137070705
244
+ },
245
+ "harness|hendrycksTest-elementary_mathematics|5": {
246
+ "acc": 0.4312169312169312,
247
+ "acc_stderr": 0.0255064816981382,
248
+ "acc_norm": 0.4312169312169312,
249
+ "acc_norm_stderr": 0.0255064816981382
250
+ },
251
+ "harness|hendrycksTest-formal_logic|5": {
252
+ "acc": 0.5,
253
+ "acc_stderr": 0.04472135954999579,
254
+ "acc_norm": 0.5,
255
+ "acc_norm_stderr": 0.04472135954999579
256
+ },
257
+ "harness|hendrycksTest-global_facts|5": {
258
+ "acc": 0.43,
259
+ "acc_stderr": 0.049756985195624284,
260
+ "acc_norm": 0.43,
261
+ "acc_norm_stderr": 0.049756985195624284
262
+ },
263
+ "harness|hendrycksTest-high_school_biology|5": {
264
+ "acc": 0.8193548387096774,
265
+ "acc_stderr": 0.021886178567172527,
266
+ "acc_norm": 0.8193548387096774,
267
+ "acc_norm_stderr": 0.021886178567172527
268
+ },
269
+ "harness|hendrycksTest-high_school_chemistry|5": {
270
+ "acc": 0.5320197044334976,
271
+ "acc_stderr": 0.03510766597959217,
272
+ "acc_norm": 0.5320197044334976,
273
+ "acc_norm_stderr": 0.03510766597959217
274
+ },
275
+ "harness|hendrycksTest-high_school_computer_science|5": {
276
+ "acc": 0.71,
277
+ "acc_stderr": 0.045604802157206845,
278
+ "acc_norm": 0.71,
279
+ "acc_norm_stderr": 0.045604802157206845
280
+ },
281
+ "harness|hendrycksTest-high_school_european_history|5": {
282
+ "acc": 0.8121212121212121,
283
+ "acc_stderr": 0.03050193405942914,
284
+ "acc_norm": 0.8121212121212121,
285
+ "acc_norm_stderr": 0.03050193405942914
286
+ },
287
+ "harness|hendrycksTest-high_school_geography|5": {
288
+ "acc": 0.8888888888888888,
289
+ "acc_stderr": 0.022390787638216773,
290
+ "acc_norm": 0.8888888888888888,
291
+ "acc_norm_stderr": 0.022390787638216773
292
+ },
293
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
294
+ "acc": 0.927461139896373,
295
+ "acc_stderr": 0.018718998520678185,
296
+ "acc_norm": 0.927461139896373,
297
+ "acc_norm_stderr": 0.018718998520678185
298
+ },
299
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
300
+ "acc": 0.7230769230769231,
301
+ "acc_stderr": 0.022688042352424994,
302
+ "acc_norm": 0.7230769230769231,
303
+ "acc_norm_stderr": 0.022688042352424994
304
+ },
305
+ "harness|hendrycksTest-high_school_mathematics|5": {
306
+ "acc": 0.3333333333333333,
307
+ "acc_stderr": 0.028742040903948492,
308
+ "acc_norm": 0.3333333333333333,
309
+ "acc_norm_stderr": 0.028742040903948492
310
+ },
311
+ "harness|hendrycksTest-high_school_microeconomics|5": {
312
+ "acc": 0.8109243697478992,
313
+ "acc_stderr": 0.02543511943810537,
314
+ "acc_norm": 0.8109243697478992,
315
+ "acc_norm_stderr": 0.02543511943810537
316
+ },
317
+ "harness|hendrycksTest-high_school_physics|5": {
318
+ "acc": 0.4304635761589404,
319
+ "acc_stderr": 0.04042809961395634,
320
+ "acc_norm": 0.4304635761589404,
321
+ "acc_norm_stderr": 0.04042809961395634
322
+ },
323
+ "harness|hendrycksTest-high_school_psychology|5": {
324
+ "acc": 0.8862385321100917,
325
+ "acc_stderr": 0.0136136148002328,
326
+ "acc_norm": 0.8862385321100917,
327
+ "acc_norm_stderr": 0.0136136148002328
328
+ },
329
+ "harness|hendrycksTest-high_school_statistics|5": {
330
+ "acc": 0.5879629629629629,
331
+ "acc_stderr": 0.03356787758160831,
332
+ "acc_norm": 0.5879629629629629,
333
+ "acc_norm_stderr": 0.03356787758160831
334
+ },
335
+ "harness|hendrycksTest-high_school_us_history|5": {
336
+ "acc": 0.9166666666666666,
337
+ "acc_stderr": 0.019398452135813895,
338
+ "acc_norm": 0.9166666666666666,
339
+ "acc_norm_stderr": 0.019398452135813895
340
+ },
341
+ "harness|hendrycksTest-high_school_world_history|5": {
342
+ "acc": 0.8776371308016878,
343
+ "acc_stderr": 0.02133174182974679,
344
+ "acc_norm": 0.8776371308016878,
345
+ "acc_norm_stderr": 0.02133174182974679
346
+ },
347
+ "harness|hendrycksTest-human_aging|5": {
348
+ "acc": 0.8161434977578476,
349
+ "acc_stderr": 0.025998379092356513,
350
+ "acc_norm": 0.8161434977578476,
351
+ "acc_norm_stderr": 0.025998379092356513
352
+ },
353
+ "harness|hendrycksTest-human_sexuality|5": {
354
+ "acc": 0.8473282442748091,
355
+ "acc_stderr": 0.03154521672005472,
356
+ "acc_norm": 0.8473282442748091,
357
+ "acc_norm_stderr": 0.03154521672005472
358
+ },
359
+ "harness|hendrycksTest-international_law|5": {
360
+ "acc": 0.8512396694214877,
361
+ "acc_stderr": 0.03248470083807194,
362
+ "acc_norm": 0.8512396694214877,
363
+ "acc_norm_stderr": 0.03248470083807194
364
+ },
365
+ "harness|hendrycksTest-jurisprudence|5": {
366
+ "acc": 0.8148148148148148,
367
+ "acc_stderr": 0.03755265865037181,
368
+ "acc_norm": 0.8148148148148148,
369
+ "acc_norm_stderr": 0.03755265865037181
370
+ },
371
+ "harness|hendrycksTest-logical_fallacies|5": {
372
+ "acc": 0.803680981595092,
373
+ "acc_stderr": 0.031207970394709225,
374
+ "acc_norm": 0.803680981595092,
375
+ "acc_norm_stderr": 0.031207970394709225
376
+ },
377
+ "harness|hendrycksTest-machine_learning|5": {
378
+ "acc": 0.48214285714285715,
379
+ "acc_stderr": 0.047427623612430116,
380
+ "acc_norm": 0.48214285714285715,
381
+ "acc_norm_stderr": 0.047427623612430116
382
+ },
383
+ "harness|hendrycksTest-management|5": {
384
+ "acc": 0.8446601941747572,
385
+ "acc_stderr": 0.03586594738573975,
386
+ "acc_norm": 0.8446601941747572,
387
+ "acc_norm_stderr": 0.03586594738573975
388
+ },
389
+ "harness|hendrycksTest-marketing|5": {
390
+ "acc": 0.8846153846153846,
391
+ "acc_stderr": 0.020930193185179333,
392
+ "acc_norm": 0.8846153846153846,
393
+ "acc_norm_stderr": 0.020930193185179333
394
+ },
395
+ "harness|hendrycksTest-medical_genetics|5": {
396
+ "acc": 0.72,
397
+ "acc_stderr": 0.045126085985421276,
398
+ "acc_norm": 0.72,
399
+ "acc_norm_stderr": 0.045126085985421276
400
+ },
401
+ "harness|hendrycksTest-miscellaneous|5": {
402
+ "acc": 0.8620689655172413,
403
+ "acc_stderr": 0.012331009307795663,
404
+ "acc_norm": 0.8620689655172413,
405
+ "acc_norm_stderr": 0.012331009307795663
406
+ },
407
+ "harness|hendrycksTest-moral_disputes|5": {
408
+ "acc": 0.7745664739884393,
409
+ "acc_stderr": 0.022497230190967558,
410
+ "acc_norm": 0.7745664739884393,
411
+ "acc_norm_stderr": 0.022497230190967558
412
+ },
413
+ "harness|hendrycksTest-moral_scenarios|5": {
414
+ "acc": 0.5452513966480447,
415
+ "acc_stderr": 0.016653875777523995,
416
+ "acc_norm": 0.5452513966480447,
417
+ "acc_norm_stderr": 0.016653875777523995
418
+ },
419
+ "harness|hendrycksTest-nutrition|5": {
420
+ "acc": 0.7581699346405228,
421
+ "acc_stderr": 0.024518195641879334,
422
+ "acc_norm": 0.7581699346405228,
423
+ "acc_norm_stderr": 0.024518195641879334
424
+ },
425
+ "harness|hendrycksTest-philosophy|5": {
426
+ "acc": 0.7845659163987139,
427
+ "acc_stderr": 0.023350225475471442,
428
+ "acc_norm": 0.7845659163987139,
429
+ "acc_norm_stderr": 0.023350225475471442
430
+ },
431
+ "harness|hendrycksTest-prehistory|5": {
432
+ "acc": 0.8364197530864198,
433
+ "acc_stderr": 0.02058146613825712,
434
+ "acc_norm": 0.8364197530864198,
435
+ "acc_norm_stderr": 0.02058146613825712
436
+ },
437
+ "harness|hendrycksTest-professional_accounting|5": {
438
+ "acc": 0.5177304964539007,
439
+ "acc_stderr": 0.02980873964223777,
440
+ "acc_norm": 0.5177304964539007,
441
+ "acc_norm_stderr": 0.02980873964223777
442
+ },
443
+ "harness|hendrycksTest-professional_law|5": {
444
+ "acc": 0.5658409387222947,
445
+ "acc_stderr": 0.012659033237067253,
446
+ "acc_norm": 0.5658409387222947,
447
+ "acc_norm_stderr": 0.012659033237067253
448
+ },
449
+ "harness|hendrycksTest-professional_medicine|5": {
450
+ "acc": 0.7683823529411765,
451
+ "acc_stderr": 0.025626533803777562,
452
+ "acc_norm": 0.7683823529411765,
453
+ "acc_norm_stderr": 0.025626533803777562
454
+ },
455
+ "harness|hendrycksTest-professional_psychology|5": {
456
+ "acc": 0.7532679738562091,
457
+ "acc_stderr": 0.0174408203674025,
458
+ "acc_norm": 0.7532679738562091,
459
+ "acc_norm_stderr": 0.0174408203674025
460
+ },
461
+ "harness|hendrycksTest-public_relations|5": {
462
+ "acc": 0.7272727272727273,
463
+ "acc_stderr": 0.04265792110940588,
464
+ "acc_norm": 0.7272727272727273,
465
+ "acc_norm_stderr": 0.04265792110940588
466
+ },
467
+ "harness|hendrycksTest-security_studies|5": {
468
+ "acc": 0.8,
469
+ "acc_stderr": 0.02560737598657916,
470
+ "acc_norm": 0.8,
471
+ "acc_norm_stderr": 0.02560737598657916
472
+ },
473
+ "harness|hendrycksTest-sociology|5": {
474
+ "acc": 0.8955223880597015,
475
+ "acc_stderr": 0.021628920516700643,
476
+ "acc_norm": 0.8955223880597015,
477
+ "acc_norm_stderr": 0.021628920516700643
478
+ },
479
+ "harness|hendrycksTest-us_foreign_policy|5": {
480
+ "acc": 0.93,
481
+ "acc_stderr": 0.0256432399976243,
482
+ "acc_norm": 0.93,
483
+ "acc_norm_stderr": 0.0256432399976243
484
+ },
485
+ "harness|hendrycksTest-virology|5": {
486
+ "acc": 0.5240963855421686,
487
+ "acc_stderr": 0.03887971849597264,
488
+ "acc_norm": 0.5240963855421686,
489
+ "acc_norm_stderr": 0.03887971849597264
490
+ },
491
+ "harness|hendrycksTest-world_religions|5": {
492
+ "acc": 0.8654970760233918,
493
+ "acc_stderr": 0.026168221344662297,
494
+ "acc_norm": 0.8654970760233918,
495
+ "acc_norm_stderr": 0.026168221344662297
496
+ },
497
+ "harness|truthfulqa:mc|0": {
498
+ "mc1": 0.36964504283965727,
499
+ "mc1_stderr": 0.01689818070697388,
500
+ "mc2": 0.5360987678643523,
501
+ "mc2_stderr": 0.014938153988985473
502
+ },
503
+ "harness|winogrande|5": {
504
+ "acc": 0.8271507498026835,
505
+ "acc_stderr": 0.010626964529971855
506
+ },
507
+ "harness|gsm8k|5": {
508
+ "acc": 0.27369219105382864,
509
+ "acc_stderr": 0.012281003490963456
510
+ }
511
+ }
512
  ```