Sharathhebbar24 committed
Commit f9930b0
1 Parent(s): 3ed7a2e

Update README.md

Files changed (1)
  1. README.md +391 -0
README.md CHANGED
@@ -49,4 +49,395 @@ prompt.
  >>> prompt = "Gracie and Joe are choosing numbers on the complex plane. Joe chooses the point $1+2i$. Gracie chooses $-1+i$. How far apart are Gracie and Joe's points?"
  >>> res = generate_text(prompt)
  >>> res
+ ```
+
+ # Benchmark / Evaluation
+
+ | Model | Average | ARC | HellaSwag | MMLU | TruthfulQA | Winogrande | GSM8k |
+ | ------- | -------- | -------- | ------- | -------- | -------- | ------- | -------- |
+ | Sharathhebbar24/math_gpt2_sft | 28.503 | 22.87 | 30.41 | 25.06 | 37.62 | 51.54 | 0.68 |
+
+ ```python
+ {
+ "all": {
+ "acc": 0.25082189621988066,
+ "acc_stderr": 0.030526589726831692,
+ "acc_norm": 0.25112870356236633,
+ "acc_norm_stderr": 0.03129390389566968,
+ "mc1": 0.24112607099143207,
+ "mc1_stderr": 0.014974827279752334,
+ "mc2": 0.3762297840067963,
+ "mc2_stderr": 0.01445991036363257
+ },
+ "harness|arc:challenge|25": {
+ "acc": 0.20563139931740615,
+ "acc_stderr": 0.01181074526074258,
+ "acc_norm": 0.22866894197952217,
+ "acc_norm_stderr": 0.012272853582540799
+ },
+ "harness|hellaswag|10": {
+ "acc": 0.2884883489344752,
+ "acc_stderr": 0.004521334761709224,
+ "acc_norm": 0.30412268472415854,
+ "acc_norm_stderr": 0.00459094683972719
+ },
+ "harness|hendrycksTest-abstract_algebra|5": {
+ "acc": 0.19,
+ "acc_stderr": 0.03942772444036625,
+ "acc_norm": 0.19,
+ "acc_norm_stderr": 0.03942772444036625
+ },
+ "harness|hendrycksTest-anatomy|5": {
+ "acc": 0.2074074074074074,
+ "acc_stderr": 0.03502553170678319,
+ "acc_norm": 0.2074074074074074,
+ "acc_norm_stderr": 0.03502553170678319
+ },
+ "harness|hendrycksTest-astronomy|5": {
+ "acc": 0.17763157894736842,
+ "acc_stderr": 0.031103182383123398,
+ "acc_norm": 0.17763157894736842,
+ "acc_norm_stderr": 0.031103182383123398
+ },
+ "harness|hendrycksTest-business_ethics|5": {
+ "acc": 0.19,
+ "acc_stderr": 0.03942772444036622,
+ "acc_norm": 0.19,
+ "acc_norm_stderr": 0.03942772444036622
+ },
+ "harness|hendrycksTest-clinical_knowledge|5": {
+ "acc": 0.2188679245283019,
+ "acc_stderr": 0.025447863825108618,
+ "acc_norm": 0.2188679245283019,
+ "acc_norm_stderr": 0.025447863825108618
+ },
+ "harness|hendrycksTest-college_biology|5": {
+ "acc": 0.25,
+ "acc_stderr": 0.03621034121889507,
+ "acc_norm": 0.25,
+ "acc_norm_stderr": 0.03621034121889507
+ },
+ "harness|hendrycksTest-college_chemistry|5": {
+ "acc": 0.2,
+ "acc_stderr": 0.04020151261036845,
+ "acc_norm": 0.2,
+ "acc_norm_stderr": 0.04020151261036845
+ },
+ "harness|hendrycksTest-college_computer_science|5": {
+ "acc": 0.32,
+ "acc_stderr": 0.046882617226215034,
+ "acc_norm": 0.32,
+ "acc_norm_stderr": 0.046882617226215034
+ },
+ "harness|hendrycksTest-college_mathematics|5": {
+ "acc": 0.24,
+ "acc_stderr": 0.042923469599092816,
+ "acc_norm": 0.24,
+ "acc_norm_stderr": 0.042923469599092816
+ },
+ "harness|hendrycksTest-college_medicine|5": {
+ "acc": 0.21965317919075145,
+ "acc_stderr": 0.031568093627031744,
+ "acc_norm": 0.21965317919075145,
+ "acc_norm_stderr": 0.031568093627031744
+ },
+ "harness|hendrycksTest-college_physics|5": {
+ "acc": 0.23529411764705882,
+ "acc_stderr": 0.04220773659171453,
+ "acc_norm": 0.23529411764705882,
+ "acc_norm_stderr": 0.04220773659171453
+ },
+ "harness|hendrycksTest-computer_security|5": {
+ "acc": 0.23,
+ "acc_stderr": 0.04229525846816505,
+ "acc_norm": 0.23,
+ "acc_norm_stderr": 0.04229525846816505
+ },
+ "harness|hendrycksTest-conceptual_physics|5": {
+ "acc": 0.2680851063829787,
+ "acc_stderr": 0.028957342788342347,
+ "acc_norm": 0.2680851063829787,
+ "acc_norm_stderr": 0.028957342788342347
+ },
+ "harness|hendrycksTest-econometrics|5": {
+ "acc": 0.24561403508771928,
+ "acc_stderr": 0.040493392977481404,
+ "acc_norm": 0.24561403508771928,
+ "acc_norm_stderr": 0.040493392977481404
+ },
+ "harness|hendrycksTest-electrical_engineering|5": {
+ "acc": 0.2482758620689655,
+ "acc_stderr": 0.036001056927277716,
+ "acc_norm": 0.2482758620689655,
+ "acc_norm_stderr": 0.036001056927277716
+ },
+ "harness|hendrycksTest-elementary_mathematics|5": {
+ "acc": 0.24074074074074073,
+ "acc_stderr": 0.0220190800122179,
+ "acc_norm": 0.24074074074074073,
+ "acc_norm_stderr": 0.0220190800122179
+ },
+ "harness|hendrycksTest-formal_logic|5": {
+ "acc": 0.23015873015873015,
+ "acc_stderr": 0.03764950879790605,
+ "acc_norm": 0.23015873015873015,
+ "acc_norm_stderr": 0.03764950879790605
+ },
+ "harness|hendrycksTest-global_facts|5": {
+ "acc": 0.18,
+ "acc_stderr": 0.038612291966536934,
+ "acc_norm": 0.18,
+ "acc_norm_stderr": 0.038612291966536934
+ },
+ "harness|hendrycksTest-high_school_biology|5": {
+ "acc": 0.25483870967741934,
+ "acc_stderr": 0.024790118459332208,
+ "acc_norm": 0.25483870967741934,
+ "acc_norm_stderr": 0.024790118459332208
+ },
+ "harness|hendrycksTest-high_school_chemistry|5": {
+ "acc": 0.19704433497536947,
+ "acc_stderr": 0.02798672466673622,
+ "acc_norm": 0.19704433497536947,
+ "acc_norm_stderr": 0.02798672466673622
+ },
+ "harness|hendrycksTest-high_school_computer_science|5": {
+ "acc": 0.22,
+ "acc_stderr": 0.041633319989322695,
+ "acc_norm": 0.22,
+ "acc_norm_stderr": 0.041633319989322695
+ },
+ "harness|hendrycksTest-high_school_european_history|5": {
+ "acc": 0.19393939393939394,
+ "acc_stderr": 0.0308741451365621,
+ "acc_norm": 0.19393939393939394,
+ "acc_norm_stderr": 0.0308741451365621
+ },
+ "harness|hendrycksTest-high_school_geography|5": {
+ "acc": 0.3484848484848485,
+ "acc_stderr": 0.033948539651564025,
+ "acc_norm": 0.3484848484848485,
+ "acc_norm_stderr": 0.033948539651564025
+ },
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
+ "acc": 0.32124352331606215,
+ "acc_stderr": 0.033699508685490674,
+ "acc_norm": 0.32124352331606215,
+ "acc_norm_stderr": 0.033699508685490674
+ },
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
+ "acc": 0.23333333333333334,
+ "acc_stderr": 0.021444547301560476,
+ "acc_norm": 0.23333333333333334,
+ "acc_norm_stderr": 0.021444547301560476
+ },
+ "harness|hendrycksTest-high_school_mathematics|5": {
+ "acc": 0.2851851851851852,
+ "acc_stderr": 0.027528599210340492,
+ "acc_norm": 0.2851851851851852,
+ "acc_norm_stderr": 0.027528599210340492
+ },
+ "harness|hendrycksTest-high_school_microeconomics|5": {
+ "acc": 0.29831932773109243,
+ "acc_stderr": 0.029719142876342856,
+ "acc_norm": 0.29831932773109243,
+ "acc_norm_stderr": 0.029719142876342856
+ },
+ "harness|hendrycksTest-high_school_physics|5": {
+ "acc": 0.2781456953642384,
+ "acc_stderr": 0.03658603262763744,
+ "acc_norm": 0.2781456953642384,
+ "acc_norm_stderr": 0.03658603262763744
+ },
+ "harness|hendrycksTest-high_school_psychology|5": {
+ "acc": 0.26788990825688075,
+ "acc_stderr": 0.018987462257978652,
+ "acc_norm": 0.26788990825688075,
+ "acc_norm_stderr": 0.018987462257978652
+ },
+ "harness|hendrycksTest-high_school_statistics|5": {
+ "acc": 0.4351851851851852,
+ "acc_stderr": 0.03381200005643525,
+ "acc_norm": 0.4351851851851852,
+ "acc_norm_stderr": 0.03381200005643525
+ },
+ "harness|hendrycksTest-high_school_us_history|5": {
+ "acc": 0.2647058823529412,
+ "acc_stderr": 0.0309645179269234,
+ "acc_norm": 0.2647058823529412,
+ "acc_norm_stderr": 0.0309645179269234
+ },
+ "harness|hendrycksTest-high_school_world_history|5": {
+ "acc": 0.28270042194092826,
+ "acc_stderr": 0.029312814153955927,
+ "acc_norm": 0.28270042194092826,
+ "acc_norm_stderr": 0.029312814153955927
+ },
+ "harness|hendrycksTest-human_aging|5": {
+ "acc": 0.31390134529147984,
+ "acc_stderr": 0.031146796482972465,
+ "acc_norm": 0.31390134529147984,
+ "acc_norm_stderr": 0.031146796482972465
+ },
+ "harness|hendrycksTest-human_sexuality|5": {
+ "acc": 0.2595419847328244,
+ "acc_stderr": 0.03844876139785271,
+ "acc_norm": 0.2595419847328244,
+ "acc_norm_stderr": 0.03844876139785271
+ },
+ "harness|hendrycksTest-international_law|5": {
+ "acc": 0.2231404958677686,
+ "acc_stderr": 0.03800754475228733,
+ "acc_norm": 0.2231404958677686,
+ "acc_norm_stderr": 0.03800754475228733
+ },
+ "harness|hendrycksTest-jurisprudence|5": {
+ "acc": 0.25925925925925924,
+ "acc_stderr": 0.042365112580946336,
+ "acc_norm": 0.25925925925925924,
+ "acc_norm_stderr": 0.042365112580946336
+ },
+ "harness|hendrycksTest-logical_fallacies|5": {
+ "acc": 0.25153374233128833,
+ "acc_stderr": 0.03408997886857529,
+ "acc_norm": 0.25153374233128833,
+ "acc_norm_stderr": 0.03408997886857529
+ },
+ "harness|hendrycksTest-machine_learning|5": {
+ "acc": 0.29464285714285715,
+ "acc_stderr": 0.043270409325787296,
+ "acc_norm": 0.29464285714285715,
+ "acc_norm_stderr": 0.043270409325787296
+ },
+ "harness|hendrycksTest-management|5": {
+ "acc": 0.17475728155339806,
+ "acc_stderr": 0.037601780060266224,
+ "acc_norm": 0.17475728155339806,
+ "acc_norm_stderr": 0.037601780060266224
+ },
+ "harness|hendrycksTest-marketing|5": {
+ "acc": 0.20085470085470086,
+ "acc_stderr": 0.026246772946890488,
+ "acc_norm": 0.20085470085470086,
+ "acc_norm_stderr": 0.026246772946890488
+ },
+ "harness|hendrycksTest-medical_genetics|5": {
+ "acc": 0.3,
+ "acc_stderr": 0.046056618647183814,
+ "acc_norm": 0.3,
+ "acc_norm_stderr": 0.046056618647183814
+ },
+ "harness|hendrycksTest-miscellaneous|5": {
+ "acc": 0.23499361430395913,
+ "acc_stderr": 0.01516202415227844,
+ "acc_norm": 0.23499361430395913,
+ "acc_norm_stderr": 0.01516202415227844
+ },
+ "harness|hendrycksTest-moral_disputes|5": {
+ "acc": 0.23699421965317918,
+ "acc_stderr": 0.02289408248992599,
+ "acc_norm": 0.23699421965317918,
+ "acc_norm_stderr": 0.02289408248992599
+ },
+ "harness|hendrycksTest-moral_scenarios|5": {
+ "acc": 0.23798882681564246,
+ "acc_stderr": 0.014242630070574915,
+ "acc_norm": 0.23798882681564246,
+ "acc_norm_stderr": 0.014242630070574915
+ },
+ "harness|hendrycksTest-nutrition|5": {
+ "acc": 0.23202614379084968,
+ "acc_stderr": 0.024170840879341005,
+ "acc_norm": 0.23202614379084968,
+ "acc_norm_stderr": 0.024170840879341005
+ },
+ "harness|hendrycksTest-philosophy|5": {
+ "acc": 0.1864951768488746,
+ "acc_stderr": 0.02212243977248077,
+ "acc_norm": 0.1864951768488746,
+ "acc_norm_stderr": 0.02212243977248077
+ },
+ "harness|hendrycksTest-prehistory|5": {
+ "acc": 0.24074074074074073,
+ "acc_stderr": 0.02378858355165854,
+ "acc_norm": 0.24074074074074073,
+ "acc_norm_stderr": 0.02378858355165854
+ },
+ "harness|hendrycksTest-professional_accounting|5": {
+ "acc": 0.2695035460992908,
+ "acc_stderr": 0.026469036818590627,
+ "acc_norm": 0.2695035460992908,
+ "acc_norm_stderr": 0.026469036818590627
+ },
+ "harness|hendrycksTest-professional_law|5": {
+ "acc": 0.2529335071707953,
+ "acc_stderr": 0.011102268713839989,
+ "acc_norm": 0.2529335071707953,
+ "acc_norm_stderr": 0.011102268713839989
+ },
+ "harness|hendrycksTest-professional_medicine|5": {
+ "acc": 0.4411764705882353,
+ "acc_stderr": 0.030161911930767102,
+ "acc_norm": 0.4411764705882353,
+ "acc_norm_stderr": 0.030161911930767102
+ },
+ "harness|hendrycksTest-professional_psychology|5": {
+ "acc": 0.25,
+ "acc_stderr": 0.01751781884501444,
+ "acc_norm": 0.25,
+ "acc_norm_stderr": 0.01751781884501444
+ },
+ "harness|hendrycksTest-public_relations|5": {
+ "acc": 0.21818181818181817,
+ "acc_stderr": 0.03955932861795833,
+ "acc_norm": 0.21818181818181817,
+ "acc_norm_stderr": 0.03955932861795833
+ },
+ "harness|hendrycksTest-security_studies|5": {
+ "acc": 0.20408163265306123,
+ "acc_stderr": 0.025801283475090506,
+ "acc_norm": 0.20408163265306123,
+ "acc_norm_stderr": 0.025801283475090506
+ },
+ "harness|hendrycksTest-sociology|5": {
+ "acc": 0.24378109452736318,
+ "acc_stderr": 0.03036049015401465,
+ "acc_norm": 0.24378109452736318,
+ "acc_norm_stderr": 0.03036049015401465
+ },
+ "harness|hendrycksTest-us_foreign_policy|5": {
+ "acc": 0.24,
+ "acc_stderr": 0.04292346959909281,
+ "acc_norm": 0.24,
+ "acc_norm_stderr": 0.04292346959909281
+ },
+ "harness|hendrycksTest-virology|5": {
+ "acc": 0.22289156626506024,
+ "acc_stderr": 0.03240004825594687,
+ "acc_norm": 0.22289156626506024,
+ "acc_norm_stderr": 0.03240004825594687
+ },
+ "harness|hendrycksTest-world_religions|5": {
+ "acc": 0.3216374269005848,
+ "acc_stderr": 0.03582529442573122,
+ "acc_norm": 0.3216374269005848,
+ "acc_norm_stderr": 0.03582529442573122
+ },
+ "harness|truthfulqa:mc|0": {
+ "mc1": 0.24112607099143207,
+ "mc1_stderr": 0.014974827279752334,
+ "mc2": 0.3762297840067963,
+ "mc2_stderr": 0.01445991036363257
+ },
+ "harness|winogrande|5": {
+ "acc": 0.5153906866614049,
+ "acc_stderr": 0.014045826789783668
+ },
+ "harness|gsm8k|5": {
+ "acc": 0.006823351023502654,
+ "acc_stderr": 0.0022675371022544823
+ }
+ }
+
+
  ```
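
Note: the `>>> res = generate_text(prompt)` lines in the diff context above rely on a `generate_text` helper that is defined earlier in the README and is not part of this hunk. Below is a minimal sketch of what such a helper could look like, assuming the model is loaded from the Hugging Face Hub via the `transformers` text-generation pipeline; the decoding settings and `max_new_tokens` value are illustrative assumptions, not the author's exact configuration.

```python
# Minimal sketch (assumption): a generate_text() helper built on the
# transformers text-generation pipeline. The helper actually used in the
# README is defined earlier in the file and may differ in its settings.
from transformers import pipeline

# Load the fine-tuned GPT-2 model whose benchmark results are shown above.
pipe = pipeline("text-generation", model="Sharathhebbar24/math_gpt2_sft")

def generate_text(prompt: str) -> str:
    # Greedy decoding with a modest token budget; tune as needed.
    out = pipe(prompt, max_new_tokens=128, do_sample=False)
    # The pipeline output includes the prompt followed by the continuation.
    return out[0]["generated_text"]

prompt = "Gracie and Joe are choosing numbers on the complex plane. Joe chooses the point $1+2i$. Gracie chooses $-1+i$. How far apart are Gracie and Joe's points?"
res = generate_text(prompt)
print(res)
```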