HenryJJ committed
Commit: 6a6febd (1 parent: c1d0441)

Update README.md

Files changed (1):
  1. README.md (+387, -0)
README.md CHANGED
@@ -32,4 +32,391 @@ Output:
 Who was the second president of the United States?
 
 Output:
+```
+
+## Latest results
+
+These are the [latest results from run 2024-01-04T13:27:32.660899](https://huggingface.co/datasets/open-llm-leaderboard/details_HenryJJ__Instruct_Mistral-7B-v0.1_Dolly15K/blob/main/results_2024-01-04T13-27-32.660899.json) (note that there might be results for other tasks in these repos if successive evals didn't cover the same tasks; you can find each one in the results and in the "latest" split of each eval):
+
+```python
+{
+    "all": {
+        "acc": 0.6241143484289186,
+        "acc_stderr": 0.032689663124831826,
+        "acc_norm": 0.6299031400315822,
+        "acc_norm_stderr": 0.033361474961048916,
+        "mc1": 0.2802937576499388,
+        "mc1_stderr": 0.015723139524608767,
+        "mc2": 0.435601924823795,
+        "mc2_stderr": 0.014179199089974604
+    },
+    "harness|arc:challenge|25": {
+        "acc": 0.5571672354948806,
+        "acc_stderr": 0.014515573873348906,
+        "acc_norm": 0.5938566552901023,
+        "acc_norm_stderr": 0.014351656690097862
+    },
+    "harness|hellaswag|10": {
+        "acc": 0.6253734315873332,
+        "acc_stderr": 0.004830371317841054,
+        "acc_norm": 0.826229834694284,
+        "acc_norm_stderr": 0.00378137335887
+    },
+    "harness|hendrycksTest-abstract_algebra|5": {
+        "acc": 0.31,
+        "acc_stderr": 0.04648231987117316,
+        "acc_norm": 0.31,
+        "acc_norm_stderr": 0.04648231987117316
+    },
+    "harness|hendrycksTest-anatomy|5": {
+        "acc": 0.6148148148148148,
+        "acc_stderr": 0.04203921040156279,
+        "acc_norm": 0.6148148148148148,
+        "acc_norm_stderr": 0.04203921040156279
+    },
+    "harness|hendrycksTest-astronomy|5": {
+        "acc": 0.6513157894736842,
+        "acc_stderr": 0.03878139888797611,
+        "acc_norm": 0.6513157894736842,
+        "acc_norm_stderr": 0.03878139888797611
+    },
+    "harness|hendrycksTest-business_ethics|5": {
+        "acc": 0.57,
+        "acc_stderr": 0.04975698519562428,
+        "acc_norm": 0.57,
+        "acc_norm_stderr": 0.04975698519562428
+    },
+    "harness|hendrycksTest-clinical_knowledge|5": {
+        "acc": 0.660377358490566,
+        "acc_stderr": 0.029146904747798328,
+        "acc_norm": 0.660377358490566,
+        "acc_norm_stderr": 0.029146904747798328
+    },
+    "harness|hendrycksTest-college_biology|5": {
+        "acc": 0.7291666666666666,
+        "acc_stderr": 0.03716177437566017,
+        "acc_norm": 0.7291666666666666,
+        "acc_norm_stderr": 0.03716177437566017
+    },
+    "harness|hendrycksTest-college_chemistry|5": {
+        "acc": 0.46,
+        "acc_stderr": 0.05009082659620332,
+        "acc_norm": 0.46,
+        "acc_norm_stderr": 0.05009082659620332
+    },
+    "harness|hendrycksTest-college_computer_science|5": {
+        "acc": 0.54,
+        "acc_stderr": 0.05009082659620333,
+        "acc_norm": 0.54,
+        "acc_norm_stderr": 0.05009082659620333
+    },
+    "harness|hendrycksTest-college_mathematics|5": {
+        "acc": 0.38,
+        "acc_stderr": 0.04878317312145632,
+        "acc_norm": 0.38,
+        "acc_norm_stderr": 0.04878317312145632
+    },
+    "harness|hendrycksTest-college_medicine|5": {
+        "acc": 0.5838150289017341,
+        "acc_stderr": 0.03758517775404947,
+        "acc_norm": 0.5838150289017341,
+        "acc_norm_stderr": 0.03758517775404947
+    },
+    "harness|hendrycksTest-college_physics|5": {
+        "acc": 0.35294117647058826,
+        "acc_stderr": 0.04755129616062946,
+        "acc_norm": 0.35294117647058826,
+        "acc_norm_stderr": 0.04755129616062946
+    },
+    "harness|hendrycksTest-computer_security|5": {
+        "acc": 0.77,
+        "acc_stderr": 0.04229525846816505,
+        "acc_norm": 0.77,
+        "acc_norm_stderr": 0.04229525846816505
+    },
+    "harness|hendrycksTest-conceptual_physics|5": {
+        "acc": 0.5574468085106383,
+        "acc_stderr": 0.032469569197899575,
+        "acc_norm": 0.5574468085106383,
+        "acc_norm_stderr": 0.032469569197899575
+    },
+    "harness|hendrycksTest-econometrics|5": {
+        "acc": 0.5,
+        "acc_stderr": 0.047036043419179864,
+        "acc_norm": 0.5,
+        "acc_norm_stderr": 0.047036043419179864
+    },
+    "harness|hendrycksTest-electrical_engineering|5": {
+        "acc": 0.5724137931034483,
+        "acc_stderr": 0.041227371113703316,
+        "acc_norm": 0.5724137931034483,
+        "acc_norm_stderr": 0.041227371113703316
+    },
+    "harness|hendrycksTest-elementary_mathematics|5": {
+        "acc": 0.3994708994708995,
+        "acc_stderr": 0.02522545028406788,
+        "acc_norm": 0.3994708994708995,
+        "acc_norm_stderr": 0.02522545028406788
+    },
+    "harness|hendrycksTest-formal_logic|5": {
+        "acc": 0.3968253968253968,
+        "acc_stderr": 0.04375888492727061,
+        "acc_norm": 0.3968253968253968,
+        "acc_norm_stderr": 0.04375888492727061
+    },
+    "harness|hendrycksTest-global_facts|5": {
+        "acc": 0.35,
+        "acc_stderr": 0.0479372485441102,
+        "acc_norm": 0.35,
+        "acc_norm_stderr": 0.0479372485441102
+    },
+    "harness|hendrycksTest-high_school_biology|5": {
+        "acc": 0.7483870967741936,
+        "acc_stderr": 0.024685979286239956,
+        "acc_norm": 0.7483870967741936,
+        "acc_norm_stderr": 0.024685979286239956
+    },
+    "harness|hendrycksTest-high_school_chemistry|5": {
+        "acc": 0.5221674876847291,
+        "acc_stderr": 0.03514528562175008,
+        "acc_norm": 0.5221674876847291,
+        "acc_norm_stderr": 0.03514528562175008
+    },
+    "harness|hendrycksTest-high_school_computer_science|5": {
+        "acc": 0.67,
+        "acc_stderr": 0.04725815626252607,
+        "acc_norm": 0.67,
+        "acc_norm_stderr": 0.04725815626252607
+    },
+    "harness|hendrycksTest-high_school_european_history|5": {
+        "acc": 0.7636363636363637,
+        "acc_stderr": 0.03317505930009182,
+        "acc_norm": 0.7636363636363637,
+        "acc_norm_stderr": 0.03317505930009182
+    },
+    "harness|hendrycksTest-high_school_geography|5": {
+        "acc": 0.7525252525252525,
+        "acc_stderr": 0.030746300742124498,
+        "acc_norm": 0.7525252525252525,
+        "acc_norm_stderr": 0.030746300742124498
+    },
+    "harness|hendrycksTest-high_school_government_and_politics|5": {
+        "acc": 0.844559585492228,
+        "acc_stderr": 0.026148483469153314,
+        "acc_norm": 0.844559585492228,
+        "acc_norm_stderr": 0.026148483469153314
+    },
+    "harness|hendrycksTest-high_school_macroeconomics|5": {
+        "acc": 0.6205128205128205,
+        "acc_stderr": 0.024603626924097417,
+        "acc_norm": 0.6205128205128205,
+        "acc_norm_stderr": 0.024603626924097417
+    },
+    "harness|hendrycksTest-high_school_mathematics|5": {
+        "acc": 0.337037037037037,
+        "acc_stderr": 0.028820884666253252,
+        "acc_norm": 0.337037037037037,
+        "acc_norm_stderr": 0.028820884666253252
+    },
+    "harness|hendrycksTest-high_school_microeconomics|5": {
+        "acc": 0.6260504201680672,
+        "acc_stderr": 0.031429466378837076,
+        "acc_norm": 0.6260504201680672,
+        "acc_norm_stderr": 0.031429466378837076
+    },
+    "harness|hendrycksTest-high_school_physics|5": {
+        "acc": 0.33774834437086093,
+        "acc_stderr": 0.03861557546255169,
+        "acc_norm": 0.33774834437086093,
+        "acc_norm_stderr": 0.03861557546255169
+    },
+    "harness|hendrycksTest-high_school_psychology|5": {
+        "acc": 0.7944954128440367,
+        "acc_stderr": 0.01732435232501601,
+        "acc_norm": 0.7944954128440367,
+        "acc_norm_stderr": 0.01732435232501601
+    },
+    "harness|hendrycksTest-high_school_statistics|5": {
+        "acc": 0.5046296296296297,
+        "acc_stderr": 0.03409825519163572,
+        "acc_norm": 0.5046296296296297,
+        "acc_norm_stderr": 0.03409825519163572
+    },
+    "harness|hendrycksTest-high_school_us_history|5": {
+        "acc": 0.8137254901960784,
+        "acc_stderr": 0.027325470966716312,
+        "acc_norm": 0.8137254901960784,
+        "acc_norm_stderr": 0.027325470966716312
+    },
+    "harness|hendrycksTest-high_school_world_history|5": {
+        "acc": 0.7763713080168776,
+        "acc_stderr": 0.027123298205229966,
+        "acc_norm": 0.7763713080168776,
+        "acc_norm_stderr": 0.027123298205229966
+    },
+    "harness|hendrycksTest-human_aging|5": {
+        "acc": 0.6860986547085202,
+        "acc_stderr": 0.031146796482972465,
+        "acc_norm": 0.6860986547085202,
+        "acc_norm_stderr": 0.031146796482972465
+    },
+    "harness|hendrycksTest-human_sexuality|5": {
+        "acc": 0.7557251908396947,
+        "acc_stderr": 0.037683359597287434,
+        "acc_norm": 0.7557251908396947,
+        "acc_norm_stderr": 0.037683359597287434
+    },
+    "harness|hendrycksTest-international_law|5": {
+        "acc": 0.7851239669421488,
+        "acc_stderr": 0.037494924487096966,
+        "acc_norm": 0.7851239669421488,
+        "acc_norm_stderr": 0.037494924487096966
+    },
+    "harness|hendrycksTest-jurisprudence|5": {
+        "acc": 0.75,
+        "acc_stderr": 0.04186091791394607,
+        "acc_norm": 0.75,
+        "acc_norm_stderr": 0.04186091791394607
+    },
+    "harness|hendrycksTest-logical_fallacies|5": {
+        "acc": 0.7791411042944786,
+        "acc_stderr": 0.03259177392742178,
+        "acc_norm": 0.7791411042944786,
+        "acc_norm_stderr": 0.03259177392742178
+    },
+    "harness|hendrycksTest-machine_learning|5": {
+        "acc": 0.41964285714285715,
+        "acc_stderr": 0.04684099321077106,
+        "acc_norm": 0.41964285714285715,
+        "acc_norm_stderr": 0.04684099321077106
+    },
+    "harness|hendrycksTest-management|5": {
+        "acc": 0.7961165048543689,
+        "acc_stderr": 0.039891398595317706,
+        "acc_norm": 0.7961165048543689,
+        "acc_norm_stderr": 0.039891398595317706
+    },
+    "harness|hendrycksTest-marketing|5": {
+        "acc": 0.8589743589743589,
+        "acc_stderr": 0.022801382534597528,
+        "acc_norm": 0.8589743589743589,
+        "acc_norm_stderr": 0.022801382534597528
+    },
+    "harness|hendrycksTest-medical_genetics|5": {
+        "acc": 0.73,
+        "acc_stderr": 0.044619604333847394,
+        "acc_norm": 0.73,
+        "acc_norm_stderr": 0.044619604333847394
+    },
+    "harness|hendrycksTest-miscellaneous|5": {
+        "acc": 0.8135376756066411,
+        "acc_stderr": 0.013927751372001501,
+        "acc_norm": 0.8135376756066411,
+        "acc_norm_stderr": 0.013927751372001501
+    },
+    "harness|hendrycksTest-moral_disputes|5": {
+        "acc": 0.6994219653179191,
+        "acc_stderr": 0.0246853168672578,
+        "acc_norm": 0.6994219653179191,
+        "acc_norm_stderr": 0.0246853168672578
+    },
+    "harness|hendrycksTest-moral_scenarios|5": {
+        "acc": 0.4033519553072626,
+        "acc_stderr": 0.01640712303219525,
+        "acc_norm": 0.4033519553072626,
+        "acc_norm_stderr": 0.01640712303219525
+    },
+    "harness|hendrycksTest-nutrition|5": {
+        "acc": 0.7320261437908496,
+        "acc_stderr": 0.02536060379624255,
+        "acc_norm": 0.7320261437908496,
+        "acc_norm_stderr": 0.02536060379624255
+    },
+    "harness|hendrycksTest-philosophy|5": {
+        "acc": 0.7009646302250804,
+        "acc_stderr": 0.02600330111788514,
+        "acc_norm": 0.7009646302250804,
+        "acc_norm_stderr": 0.02600330111788514
+    },
+    "harness|hendrycksTest-prehistory|5": {
+        "acc": 0.7067901234567902,
+        "acc_stderr": 0.025329888171900926,
+        "acc_norm": 0.7067901234567902,
+        "acc_norm_stderr": 0.025329888171900926
+    },
+    "harness|hendrycksTest-professional_accounting|5": {
+        "acc": 0.49645390070921985,
+        "acc_stderr": 0.02982674915328092,
+        "acc_norm": 0.49645390070921985,
+        "acc_norm_stderr": 0.02982674915328092
+    },
+    "harness|hendrycksTest-professional_law|5": {
+        "acc": 0.44784876140808344,
+        "acc_stderr": 0.01270058240476822,
+        "acc_norm": 0.44784876140808344,
+        "acc_norm_stderr": 0.01270058240476822
+    },
+    "harness|hendrycksTest-professional_medicine|5": {
+        "acc": 0.6397058823529411,
+        "acc_stderr": 0.029163128570670733,
+        "acc_norm": 0.6397058823529411,
+        "acc_norm_stderr": 0.029163128570670733
+    },
+    "harness|hendrycksTest-professional_psychology|5": {
+        "acc": 0.6666666666666666,
+        "acc_stderr": 0.019070985589687495,
+        "acc_norm": 0.6666666666666666,
+        "acc_norm_stderr": 0.019070985589687495
+    },
+    "harness|hendrycksTest-public_relations|5": {
+        "acc": 0.6727272727272727,
+        "acc_stderr": 0.0449429086625209,
+        "acc_norm": 0.6727272727272727,
+        "acc_norm_stderr": 0.0449429086625209
+    },
+    "harness|hendrycksTest-security_studies|5": {
+        "acc": 0.7020408163265306,
+        "acc_stderr": 0.029279567411065677,
+        "acc_norm": 0.7020408163265306,
+        "acc_norm_stderr": 0.029279567411065677
+    },
+    "harness|hendrycksTest-sociology|5": {
+        "acc": 0.7960199004975125,
+        "acc_stderr": 0.02849317624532607,
+        "acc_norm": 0.7960199004975125,
+        "acc_norm_stderr": 0.02849317624532607
+    },
+    "harness|hendrycksTest-us_foreign_policy|5": {
+        "acc": 0.84,
+        "acc_stderr": 0.03684529491774709,
+        "acc_norm": 0.84,
+        "acc_norm_stderr": 0.03684529491774709
+    },
+    "harness|hendrycksTest-virology|5": {
+        "acc": 0.5542168674698795,
+        "acc_stderr": 0.03869543323472101,
+        "acc_norm": 0.5542168674698795,
+        "acc_norm_stderr": 0.03869543323472101
+    },
+    "harness|hendrycksTest-world_religions|5": {
+        "acc": 0.8011695906432749,
+        "acc_stderr": 0.03061111655743253,
+        "acc_norm": 0.8011695906432749,
+        "acc_norm_stderr": 0.03061111655743253
+    },
+    "harness|truthfulqa:mc|0": {
+        "mc1": 0.2802937576499388,
+        "mc1_stderr": 0.015723139524608767,
+        "mc2": 0.435601924823795,
+        "mc2_stderr": 0.014179199089974604
+    },
+    "harness|winogrande|5": {
+        "acc": 0.7932123125493291,
+        "acc_stderr": 0.011382566829235805
+    },
+    "harness|gsm8k|5": {
+        "acc": 0.3510235026535254,
+        "acc_stderr": 0.01314694594139722
+    }
+}
 ```
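
The note in the added README points readers to the per-eval "latest" splits of this results repository. Below is a minimal sketch, not part of the commit itself, of how one such detail split could be loaded with the `datasets` library; the config name `harness_winogrande_5` and the `latest` split name follow the usual Open LLM Leaderboard layout and are assumptions to verify against the configs actually published in this repo.

```python
# Minimal sketch: pull one task's per-sample details from the results repository.
# The config name ("harness_winogrande_5") and the "latest" split are assumptions
# based on the usual Open LLM Leaderboard naming; check the repo for exact names.
from datasets import load_dataset

details = load_dataset(
    "open-llm-leaderboard/details_HenryJJ__Instruct_Mistral-7B-v0.1_Dolly15K",
    "harness_winogrande_5",
    split="latest",
)

print(details)      # features and row count for this eval
print(details[0])   # one evaluated example with its per-sample record
```

The aggregated numbers in the diff above (the "all" block and the per-task entries) are what the leaderboard reports for this run; the detail splits should hold the per-sample records they were computed from.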