Files changed (1)
  1. README.md +385 -0
README.md CHANGED
@@ -67,3 +67,388 @@ Step 3: Finally, perform the remaining addition operation.
  So, 25-4*2+3 equals 20.
  ```
 
+ ## Eval
+
+
+ ```json
+ {
+ "all": {
+ "acc": 0.6914116069568377,
+ "acc_stderr": 0.03063431437342948,
+ "acc_norm": 0.6938613221179539,
+ "acc_norm_stderr": 0.031238741076549784,
+ "mc1": 0.40269277845777235,
+ "mc1_stderr": 0.01716883093518722,
+ "mc2": 0.5707095526544473,
+ "mc2_stderr": 0.01525040450448649
+ },
+ "harness|arc:challenge|25": {
+ "acc": 0.6322525597269625,
+ "acc_stderr": 0.014090995618168482,
+ "acc_norm": 0.6706484641638225,
+ "acc_norm_stderr": 0.013734057652635474
+ },
+ "harness|hellaswag|10": {
+ "acc": 0.6746664011153157,
+ "acc_stderr": 0.0046754187743142306,
+ "acc_norm": 0.8600876319458275,
+ "acc_norm_stderr": 0.0034618713240671846
+ },
+ "harness|hendrycksTest-abstract_algebra|5": {
+ "acc": 0.34,
+ "acc_stderr": 0.04760952285695236,
+ "acc_norm": 0.34,
+ "acc_norm_stderr": 0.04760952285695236
+ },
+ "harness|hendrycksTest-anatomy|5": {
+ "acc": 0.6518518518518519,
+ "acc_stderr": 0.041153246103369526,
+ "acc_norm": 0.6518518518518519,
+ "acc_norm_stderr": 0.041153246103369526
+ },
+ "harness|hendrycksTest-astronomy|5": {
+ "acc": 0.7894736842105263,
+ "acc_stderr": 0.03317672787533157,
+ "acc_norm": 0.7894736842105263,
+ "acc_norm_stderr": 0.03317672787533157
+ },
+ "harness|hendrycksTest-business_ethics|5": {
+ "acc": 0.73,
+ "acc_stderr": 0.04461960433384741,
+ "acc_norm": 0.73,
+ "acc_norm_stderr": 0.04461960433384741
+ },
+ "harness|hendrycksTest-clinical_knowledge|5": {
+ "acc": 0.7283018867924528,
+ "acc_stderr": 0.027377706624670713,
+ "acc_norm": 0.7283018867924528,
+ "acc_norm_stderr": 0.027377706624670713
+ },
+ "harness|hendrycksTest-college_biology|5": {
+ "acc": 0.8194444444444444,
+ "acc_stderr": 0.032166008088022675,
+ "acc_norm": 0.8194444444444444,
+ "acc_norm_stderr": 0.032166008088022675
+ },
+ "harness|hendrycksTest-college_chemistry|5": {
+ "acc": 0.5,
+ "acc_stderr": 0.050251890762960605,
+ "acc_norm": 0.5,
+ "acc_norm_stderr": 0.050251890762960605
+ },
+ "harness|hendrycksTest-college_computer_science|5": {
+ "acc": 0.57,
+ "acc_stderr": 0.049756985195624284,
+ "acc_norm": 0.57,
+ "acc_norm_stderr": 0.049756985195624284
+ },
+ "harness|hendrycksTest-college_mathematics|5": {
+ "acc": 0.37,
+ "acc_stderr": 0.04852365870939099,
+ "acc_norm": 0.37,
+ "acc_norm_stderr": 0.04852365870939099
+ },
+ "harness|hendrycksTest-college_medicine|5": {
+ "acc": 0.6878612716763006,
+ "acc_stderr": 0.035331333893236574,
+ "acc_norm": 0.6878612716763006,
+ "acc_norm_stderr": 0.035331333893236574
+ },
+ "harness|hendrycksTest-college_physics|5": {
+ "acc": 0.35294117647058826,
+ "acc_stderr": 0.047551296160629475,
+ "acc_norm": 0.35294117647058826,
+ "acc_norm_stderr": 0.047551296160629475
+ },
+ "harness|hendrycksTest-computer_security|5": {
+ "acc": 0.7,
+ "acc_stderr": 0.046056618647183814,
+ "acc_norm": 0.7,
+ "acc_norm_stderr": 0.046056618647183814
+ },
+ "harness|hendrycksTest-conceptual_physics|5": {
+ "acc": 0.676595744680851,
+ "acc_stderr": 0.030579442773610337,
+ "acc_norm": 0.676595744680851,
+ "acc_norm_stderr": 0.030579442773610337
+ },
+ "harness|hendrycksTest-econometrics|5": {
+ "acc": 0.40350877192982454,
+ "acc_stderr": 0.046151869625837026,
+ "acc_norm": 0.40350877192982454,
+ "acc_norm_stderr": 0.046151869625837026
+ },
+ "harness|hendrycksTest-electrical_engineering|5": {
+ "acc": 0.5793103448275863,
+ "acc_stderr": 0.04113914981189261,
+ "acc_norm": 0.5793103448275863,
+ "acc_norm_stderr": 0.04113914981189261
+ },
+ "harness|hendrycksTest-elementary_mathematics|5": {
+ "acc": 0.4497354497354497,
+ "acc_stderr": 0.02562085704293665,
+ "acc_norm": 0.4497354497354497,
+ "acc_norm_stderr": 0.02562085704293665
+ },
+ "harness|hendrycksTest-formal_logic|5": {
+ "acc": 0.46825396825396826,
+ "acc_stderr": 0.04463112720677172,
+ "acc_norm": 0.46825396825396826,
+ "acc_norm_stderr": 0.04463112720677172
+ },
+ "harness|hendrycksTest-global_facts|5": {
+ "acc": 0.46,
+ "acc_stderr": 0.05009082659620332,
+ "acc_norm": 0.46,
+ "acc_norm_stderr": 0.05009082659620332
+ },
+ "harness|hendrycksTest-high_school_biology|5": {
+ "acc": 0.8129032258064516,
+ "acc_stderr": 0.022185710092252252,
+ "acc_norm": 0.8129032258064516,
+ "acc_norm_stderr": 0.022185710092252252
+ },
+ "harness|hendrycksTest-high_school_chemistry|5": {
+ "acc": 0.5369458128078818,
+ "acc_stderr": 0.035083705204426656,
+ "acc_norm": 0.5369458128078818,
+ "acc_norm_stderr": 0.035083705204426656
+ },
+ "harness|hendrycksTest-high_school_computer_science|5": {
+ "acc": 0.79,
+ "acc_stderr": 0.040936018074033256,
+ "acc_norm": 0.79,
+ "acc_norm_stderr": 0.040936018074033256
+ },
+ "harness|hendrycksTest-high_school_european_history|5": {
+ "acc": 0.8363636363636363,
+ "acc_stderr": 0.02888787239548795,
+ "acc_norm": 0.8363636363636363,
+ "acc_norm_stderr": 0.02888787239548795
+ },
+ "harness|hendrycksTest-high_school_geography|5": {
+ "acc": 0.8686868686868687,
+ "acc_stderr": 0.024063156416822502,
+ "acc_norm": 0.8686868686868687,
+ "acc_norm_stderr": 0.024063156416822502
+ },
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
+ "acc": 0.927461139896373,
+ "acc_stderr": 0.018718998520678178,
+ "acc_norm": 0.927461139896373,
+ "acc_norm_stderr": 0.018718998520678178
+ },
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
+ "acc": 0.7025641025641025,
+ "acc_stderr": 0.023177408131465953,
+ "acc_norm": 0.7025641025641025,
+ "acc_norm_stderr": 0.023177408131465953
+ },
+ "harness|hendrycksTest-high_school_mathematics|5": {
+ "acc": 0.34814814814814815,
+ "acc_stderr": 0.02904560029061626,
+ "acc_norm": 0.34814814814814815,
+ "acc_norm_stderr": 0.02904560029061626
+ },
+ "harness|hendrycksTest-high_school_microeconomics|5": {
+ "acc": 0.7941176470588235,
+ "acc_stderr": 0.02626502460827588,
+ "acc_norm": 0.7941176470588235,
+ "acc_norm_stderr": 0.02626502460827588
+ },
+ "harness|hendrycksTest-high_school_physics|5": {
+ "acc": 0.4503311258278146,
+ "acc_stderr": 0.04062290018683776,
+ "acc_norm": 0.4503311258278146,
+ "acc_norm_stderr": 0.04062290018683776
+ },
+ "harness|hendrycksTest-high_school_psychology|5": {
+ "acc": 0.8954128440366973,
+ "acc_stderr": 0.013120530245265593,
+ "acc_norm": 0.8954128440366973,
+ "acc_norm_stderr": 0.013120530245265593
+ },
+ "harness|hendrycksTest-high_school_statistics|5": {
+ "acc": 0.5787037037037037,
+ "acc_stderr": 0.03367462138896078,
+ "acc_norm": 0.5787037037037037,
+ "acc_norm_stderr": 0.03367462138896078
+ },
+ "harness|hendrycksTest-high_school_us_history|5": {
+ "acc": 0.9166666666666666,
+ "acc_stderr": 0.019398452135813905,
+ "acc_norm": 0.9166666666666666,
+ "acc_norm_stderr": 0.019398452135813905
+ },
+ "harness|hendrycksTest-high_school_world_history|5": {
+ "acc": 0.8860759493670886,
+ "acc_stderr": 0.020681745135884565,
+ "acc_norm": 0.8860759493670886,
+ "acc_norm_stderr": 0.020681745135884565
+ },
+ "harness|hendrycksTest-human_aging|5": {
+ "acc": 0.757847533632287,
+ "acc_stderr": 0.028751392398694755,
+ "acc_norm": 0.757847533632287,
+ "acc_norm_stderr": 0.028751392398694755
+ },
+ "harness|hendrycksTest-human_sexuality|5": {
+ "acc": 0.8702290076335878,
+ "acc_stderr": 0.029473649496907065,
+ "acc_norm": 0.8702290076335878,
+ "acc_norm_stderr": 0.029473649496907065
+ },
+ "harness|hendrycksTest-international_law|5": {
+ "acc": 0.8181818181818182,
+ "acc_stderr": 0.03520893951097655,
+ "acc_norm": 0.8181818181818182,
+ "acc_norm_stderr": 0.03520893951097655
+ },
+ "harness|hendrycksTest-jurisprudence|5": {
+ "acc": 0.8148148148148148,
+ "acc_stderr": 0.03755265865037181,
+ "acc_norm": 0.8148148148148148,
+ "acc_norm_stderr": 0.03755265865037181
+ },
+ "harness|hendrycksTest-logical_fallacies|5": {
+ "acc": 0.7791411042944786,
+ "acc_stderr": 0.03259177392742179,
+ "acc_norm": 0.7791411042944786,
+ "acc_norm_stderr": 0.03259177392742179
+ },
+ "harness|hendrycksTest-machine_learning|5": {
+ "acc": 0.48214285714285715,
+ "acc_stderr": 0.047427623612430116,
+ "acc_norm": 0.48214285714285715,
+ "acc_norm_stderr": 0.047427623612430116
+ },
+ "harness|hendrycksTest-management|5": {
+ "acc": 0.8446601941747572,
+ "acc_stderr": 0.03586594738573974,
+ "acc_norm": 0.8446601941747572,
+ "acc_norm_stderr": 0.03586594738573974
+ },
+ "harness|hendrycksTest-marketing|5": {
+ "acc": 0.905982905982906,
+ "acc_stderr": 0.019119892798924974,
+ "acc_norm": 0.905982905982906,
+ "acc_norm_stderr": 0.019119892798924974
+ },
+ "harness|hendrycksTest-medical_genetics|5": {
+ "acc": 0.67,
+ "acc_stderr": 0.047258156262526066,
+ "acc_norm": 0.67,
+ "acc_norm_stderr": 0.047258156262526066
+ },
+ "harness|hendrycksTest-miscellaneous|5": {
+ "acc": 0.8697318007662835,
+ "acc_stderr": 0.012036729568216054,
+ "acc_norm": 0.8697318007662835,
+ "acc_norm_stderr": 0.012036729568216054
+ },
+ "harness|hendrycksTest-moral_disputes|5": {
+ "acc": 0.7774566473988439,
+ "acc_stderr": 0.02239421566194282,
+ "acc_norm": 0.7774566473988439,
+ "acc_norm_stderr": 0.02239421566194282
+ },
+ "harness|hendrycksTest-moral_scenarios|5": {
+ "acc": 0.5553072625698324,
+ "acc_stderr": 0.016619881988177012,
+ "acc_norm": 0.5553072625698324,
+ "acc_norm_stderr": 0.016619881988177012
+ },
+ "harness|hendrycksTest-nutrition|5": {
+ "acc": 0.7516339869281046,
+ "acc_stderr": 0.024739981355113592,
+ "acc_norm": 0.7516339869281046,
+ "acc_norm_stderr": 0.024739981355113592
+ },
+ "harness|hendrycksTest-philosophy|5": {
+ "acc": 0.77491961414791,
+ "acc_stderr": 0.023720088516179027,
+ "acc_norm": 0.77491961414791,
+ "acc_norm_stderr": 0.023720088516179027
+ },
+ "harness|hendrycksTest-prehistory|5": {
+ "acc": 0.7962962962962963,
+ "acc_stderr": 0.02240967454730417,
+ "acc_norm": 0.7962962962962963,
+ "acc_norm_stderr": 0.02240967454730417
+ },
+ "harness|hendrycksTest-professional_accounting|5": {
+ "acc": 0.5390070921985816,
+ "acc_stderr": 0.029736592526424445,
+ "acc_norm": 0.5390070921985816,
+ "acc_norm_stderr": 0.029736592526424445
+ },
+ "harness|hendrycksTest-professional_law|5": {
+ "acc": 0.5586701434159062,
+ "acc_stderr": 0.012682016335646683,
+ "acc_norm": 0.5586701434159062,
+ "acc_norm_stderr": 0.012682016335646683
+ },
+ "harness|hendrycksTest-professional_medicine|5": {
+ "acc": 0.7242647058823529,
+ "acc_stderr": 0.027146271936625162,
+ "acc_norm": 0.7242647058823529,
+ "acc_norm_stderr": 0.027146271936625162
+ },
+ "harness|hendrycksTest-professional_psychology|5": {
+ "acc": 0.761437908496732,
+ "acc_stderr": 0.017242385828779627,
+ "acc_norm": 0.761437908496732,
+ "acc_norm_stderr": 0.017242385828779627
+ },
+ "harness|hendrycksTest-public_relations|5": {
+ "acc": 0.7454545454545455,
+ "acc_stderr": 0.041723430387053825,
+ "acc_norm": 0.7454545454545455,
+ "acc_norm_stderr": 0.041723430387053825
+ },
+ "harness|hendrycksTest-security_studies|5": {
+ "acc": 0.7877551020408163,
+ "acc_stderr": 0.026176967197866767,
+ "acc_norm": 0.7877551020408163,
+ "acc_norm_stderr": 0.026176967197866767
+ },
+ "harness|hendrycksTest-sociology|5": {
+ "acc": 0.8805970149253731,
+ "acc_stderr": 0.02292879327721974,
+ "acc_norm": 0.8805970149253731,
+ "acc_norm_stderr": 0.02292879327721974
+ },
+ "harness|hendrycksTest-us_foreign_policy|5": {
+ "acc": 0.9,
+ "acc_stderr": 0.030151134457776334,
+ "acc_norm": 0.9,
+ "acc_norm_stderr": 0.030151134457776334
+ },
+ "harness|hendrycksTest-virology|5": {
+ "acc": 0.5602409638554217,
+ "acc_stderr": 0.03864139923699122,
+ "acc_norm": 0.5602409638554217,
+ "acc_norm_stderr": 0.03864139923699122
+ },
+ "harness|hendrycksTest-world_religions|5": {
+ "acc": 0.8596491228070176,
+ "acc_stderr": 0.0266405825391332,
+ "acc_norm": 0.8596491228070176,
+ "acc_norm_stderr": 0.0266405825391332
+ },
+ "harness|truthfulqa:mc|0": {
+ "mc1": 0.40269277845777235,
+ "mc1_stderr": 0.01716883093518722,
+ "mc2": 0.5707095526544473,
+ "mc2_stderr": 0.01525040450448649
+ },
+ "harness|winogrande|5": {
+ "acc": 0.8176795580110497,
+ "acc_stderr": 0.010851565594267207
+ },
+ "harness|gsm8k|5": {
+ "acc": 0.6444275966641395,
+ "acc_stderr": 0.013185402252713852
+ }
+ }
+ ```
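
The task keys above appear to follow the EleutherAI lm-evaluation-harness / Open LLM Leaderboard format (`harness|<task>|<n_shot>`, with `acc`, `acc_norm`, `mc1`, `mc2`, and their standard errors). As a minimal sketch of how this block could be post-processed, assuming it is saved locally as `results.json` (a hypothetical filename), the snippet below computes a macro-average over the MMLU (`hendrycksTest-*`) subjects and reads out the other headline scores:

```python
import json

# Load the eval results shown above ("results.json" is a hypothetical local path).
with open("results.json") as f:
    results = json.load(f)

# Macro-average accuracy over the MMLU subjects (the "hendrycksTest-*" tasks).
mmlu_accs = [
    metrics["acc"]
    for task, metrics in results.items()
    if task.startswith("harness|hendrycksTest-")
]
print(f"MMLU subjects: {len(mmlu_accs)}, macro-average acc: {sum(mmlu_accs) / len(mmlu_accs):.4f}")

# Headline scores, read straight from the per-task entries.
print(f"ARC-Challenge (25-shot) acc_norm: {results['harness|arc:challenge|25']['acc_norm']:.4f}")
print(f"HellaSwag (10-shot) acc_norm:     {results['harness|hellaswag|10']['acc_norm']:.4f}")
print(f"TruthfulQA (0-shot) mc2:          {results['harness|truthfulqa:mc|0']['mc2']:.4f}")
print(f"Winogrande (5-shot) acc:          {results['harness|winogrande|5']['acc']:.4f}")
print(f"GSM8K (5-shot) acc:               {results['harness|gsm8k|5']['acc']:.4f}")
```

Note that the `"all"` entry appears to be the harness's own aggregate over every reported task, so it will not coincide with a single-benchmark average such as the MMLU figure computed here.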