mychen76 committed
Commit 3b74bfb
1 Parent(s): c1a0afc

Update README.md


updated evaluation result

Files changed (1): README.md (+386 -1)
README.md CHANGED
@@ -41,4 +41,389 @@ experts:
   - "solve"
   - "count"
   tokenizer_source: union
- ```
+ ```
+
+ Evaluation Result:
+ https://huggingface.co/datasets/open-llm-leaderboard/details_mychen76__openmixtral-4x7b-merged
+ ```
+ {
+     "all": {
+         "acc": 0.657144834577193,
+         "acc_stderr": 0.03198053543647407,
+         "acc_norm": 0.6572006879598793,
+         "acc_norm_stderr": 0.0326392415851668,
+         "mc1": 0.44430844553243576,
+         "mc1_stderr": 0.01739458625074317,
+         "mc2": 0.6132594486430695,
+         "mc2_stderr": 0.015532509494332434
+     },
+     "harness|arc:challenge|25": {
+         "acc": 0.6604095563139932,
+         "acc_stderr": 0.013839039762820166,
+         "acc_norm": 0.6945392491467577,
+         "acc_norm_stderr": 0.01346008047800251
+     },
+     "harness|hellaswag|10": {
+         "acc": 0.6901015733917546,
+         "acc_stderr": 0.004615063817741861,
+         "acc_norm": 0.8674566819358693,
+         "acc_norm_stderr": 0.0033838751726700217
+     },
+     "harness|hendrycksTest-abstract_algebra|5": {
+         "acc": 0.38,
+         "acc_stderr": 0.048783173121456316,
+         "acc_norm": 0.38,
+         "acc_norm_stderr": 0.048783173121456316
+     },
+     "harness|hendrycksTest-anatomy|5": {
+         "acc": 0.6444444444444445,
+         "acc_stderr": 0.04135176749720385,
+         "acc_norm": 0.6444444444444445,
+         "acc_norm_stderr": 0.04135176749720385
+     },
+     "harness|hendrycksTest-astronomy|5": {
+         "acc": 0.6776315789473685,
+         "acc_stderr": 0.03803510248351585,
+         "acc_norm": 0.6776315789473685,
+         "acc_norm_stderr": 0.03803510248351585
+     },
+     "harness|hendrycksTest-business_ethics|5": {
+         "acc": 0.63,
+         "acc_stderr": 0.04852365870939099,
+         "acc_norm": 0.63,
+         "acc_norm_stderr": 0.04852365870939099
+     },
+     "harness|hendrycksTest-clinical_knowledge|5": {
+         "acc": 0.7358490566037735,
+         "acc_stderr": 0.027134291628741713,
+         "acc_norm": 0.7358490566037735,
+         "acc_norm_stderr": 0.027134291628741713
+     },
+     "harness|hendrycksTest-college_biology|5": {
+         "acc": 0.7777777777777778,
+         "acc_stderr": 0.03476590104304134,
+         "acc_norm": 0.7777777777777778,
+         "acc_norm_stderr": 0.03476590104304134
+     },
+     "harness|hendrycksTest-college_chemistry|5": {
+         "acc": 0.49,
+         "acc_stderr": 0.05024183937956911,
+         "acc_norm": 0.49,
+         "acc_norm_stderr": 0.05024183937956911
+     },
+     "harness|hendrycksTest-college_computer_science|5": {
+         "acc": 0.57,
+         "acc_stderr": 0.04975698519562428,
+         "acc_norm": 0.57,
+         "acc_norm_stderr": 0.04975698519562428
+     },
+     "harness|hendrycksTest-college_mathematics|5": {
+         "acc": 0.35,
+         "acc_stderr": 0.047937248544110196,
+         "acc_norm": 0.35,
+         "acc_norm_stderr": 0.047937248544110196
+     },
+     "harness|hendrycksTest-college_medicine|5": {
+         "acc": 0.6763005780346821,
+         "acc_stderr": 0.0356760379963917,
+         "acc_norm": 0.6763005780346821,
+         "acc_norm_stderr": 0.0356760379963917
+     },
+     "harness|hendrycksTest-college_physics|5": {
+         "acc": 0.4117647058823529,
+         "acc_stderr": 0.048971049527263666,
+         "acc_norm": 0.4117647058823529,
+         "acc_norm_stderr": 0.048971049527263666
+     },
+     "harness|hendrycksTest-computer_security|5": {
+         "acc": 0.76,
+         "acc_stderr": 0.04292346959909283,
+         "acc_norm": 0.76,
+         "acc_norm_stderr": 0.04292346959909283
+     },
+     "harness|hendrycksTest-conceptual_physics|5": {
+         "acc": 0.5531914893617021,
+         "acc_stderr": 0.0325005368436584,
+         "acc_norm": 0.5531914893617021,
+         "acc_norm_stderr": 0.0325005368436584
+     },
+     "harness|hendrycksTest-econometrics|5": {
+         "acc": 0.5,
+         "acc_stderr": 0.047036043419179864,
+         "acc_norm": 0.5,
+         "acc_norm_stderr": 0.047036043419179864
+     },
+     "harness|hendrycksTest-electrical_engineering|5": {
+         "acc": 0.5655172413793104,
+         "acc_stderr": 0.04130740879555497,
+         "acc_norm": 0.5655172413793104,
+         "acc_norm_stderr": 0.04130740879555497
+     },
+     "harness|hendrycksTest-elementary_mathematics|5": {
+         "acc": 0.42328042328042326,
+         "acc_stderr": 0.025446365634406793,
+         "acc_norm": 0.42328042328042326,
+         "acc_norm_stderr": 0.025446365634406793
+     },
+     "harness|hendrycksTest-formal_logic|5": {
+         "acc": 0.47619047619047616,
+         "acc_stderr": 0.04467062628403273,
+         "acc_norm": 0.47619047619047616,
+         "acc_norm_stderr": 0.04467062628403273
+     },
+     "harness|hendrycksTest-global_facts|5": {
+         "acc": 0.33,
+         "acc_stderr": 0.047258156262526045,
+         "acc_norm": 0.33,
+         "acc_norm_stderr": 0.047258156262526045
+     },
+     "harness|hendrycksTest-high_school_biology|5": {
+         "acc": 0.8064516129032258,
+         "acc_stderr": 0.022475258525536057,
+         "acc_norm": 0.8064516129032258,
+         "acc_norm_stderr": 0.022475258525536057
+     },
+     "harness|hendrycksTest-high_school_chemistry|5": {
+         "acc": 0.5123152709359606,
+         "acc_stderr": 0.035169204442208966,
+         "acc_norm": 0.5123152709359606,
+         "acc_norm_stderr": 0.035169204442208966
+     },
+     "harness|hendrycksTest-high_school_computer_science|5": {
+         "acc": 0.72,
+         "acc_stderr": 0.04512608598542127,
+         "acc_norm": 0.72,
+         "acc_norm_stderr": 0.04512608598542127
+     },
+     "harness|hendrycksTest-high_school_european_history|5": {
+         "acc": 0.7757575757575758,
+         "acc_stderr": 0.03256866661681102,
+         "acc_norm": 0.7757575757575758,
+         "acc_norm_stderr": 0.03256866661681102
+     },
+     "harness|hendrycksTest-high_school_geography|5": {
+         "acc": 0.8080808080808081,
+         "acc_stderr": 0.028057791672989017,
+         "acc_norm": 0.8080808080808081,
+         "acc_norm_stderr": 0.028057791672989017
+     },
+     "harness|hendrycksTest-high_school_government_and_politics|5": {
+         "acc": 0.8963730569948186,
+         "acc_stderr": 0.02199531196364424,
+         "acc_norm": 0.8963730569948186,
+         "acc_norm_stderr": 0.02199531196364424
+     },
+     "harness|hendrycksTest-high_school_macroeconomics|5": {
+         "acc": 0.6666666666666666,
+         "acc_stderr": 0.023901157979402534,
+         "acc_norm": 0.6666666666666666,
+         "acc_norm_stderr": 0.023901157979402534
+     },
+     "harness|hendrycksTest-high_school_mathematics|5": {
+         "acc": 0.34814814814814815,
+         "acc_stderr": 0.029045600290616255,
+         "acc_norm": 0.34814814814814815,
+         "acc_norm_stderr": 0.029045600290616255
+     },
+     "harness|hendrycksTest-high_school_microeconomics|5": {
+         "acc": 0.7142857142857143,
+         "acc_stderr": 0.029344572500634335,
+         "acc_norm": 0.7142857142857143,
+         "acc_norm_stderr": 0.029344572500634335
+     },
+     "harness|hendrycksTest-high_school_physics|5": {
+         "acc": 0.31788079470198677,
+         "acc_stderr": 0.038020397601079024,
+         "acc_norm": 0.31788079470198677,
+         "acc_norm_stderr": 0.038020397601079024
+     },
+     "harness|hendrycksTest-high_school_psychology|5": {
+         "acc": 0.8495412844036697,
+         "acc_stderr": 0.015328563932669237,
+         "acc_norm": 0.8495412844036697,
+         "acc_norm_stderr": 0.015328563932669237
+     },
+     "harness|hendrycksTest-high_school_statistics|5": {
+         "acc": 0.5231481481481481,
+         "acc_stderr": 0.03406315360711507,
+         "acc_norm": 0.5231481481481481,
+         "acc_norm_stderr": 0.03406315360711507
+     },
+     "harness|hendrycksTest-high_school_us_history|5": {
+         "acc": 0.8480392156862745,
+         "acc_stderr": 0.025195658428931796,
+         "acc_norm": 0.8480392156862745,
+         "acc_norm_stderr": 0.025195658428931796
+     },
+     "harness|hendrycksTest-high_school_world_history|5": {
+         "acc": 0.810126582278481,
+         "acc_stderr": 0.025530100460233504,
+         "acc_norm": 0.810126582278481,
+         "acc_norm_stderr": 0.025530100460233504
+     },
+     "harness|hendrycksTest-human_aging|5": {
+         "acc": 0.695067264573991,
+         "acc_stderr": 0.030898610882477515,
+         "acc_norm": 0.695067264573991,
+         "acc_norm_stderr": 0.030898610882477515
+     },
+     "harness|hendrycksTest-human_sexuality|5": {
+         "acc": 0.7709923664122137,
+         "acc_stderr": 0.036853466317118506,
+         "acc_norm": 0.7709923664122137,
+         "acc_norm_stderr": 0.036853466317118506
+     },
+     "harness|hendrycksTest-international_law|5": {
+         "acc": 0.8016528925619835,
+         "acc_stderr": 0.03640118271990947,
+         "acc_norm": 0.8016528925619835,
+         "acc_norm_stderr": 0.03640118271990947
+     },
+     "harness|hendrycksTest-jurisprudence|5": {
+         "acc": 0.8240740740740741,
+         "acc_stderr": 0.036809181416738807,
+         "acc_norm": 0.8240740740740741,
+         "acc_norm_stderr": 0.036809181416738807
+     },
+     "harness|hendrycksTest-logical_fallacies|5": {
+         "acc": 0.754601226993865,
+         "acc_stderr": 0.03380939813943354,
+         "acc_norm": 0.754601226993865,
+         "acc_norm_stderr": 0.03380939813943354
+     },
+     "harness|hendrycksTest-machine_learning|5": {
+         "acc": 0.4642857142857143,
+         "acc_stderr": 0.04733667890053756,
+         "acc_norm": 0.4642857142857143,
+         "acc_norm_stderr": 0.04733667890053756
+     },
+     "harness|hendrycksTest-management|5": {
+         "acc": 0.7961165048543689,
+         "acc_stderr": 0.039891398595317706,
+         "acc_norm": 0.7961165048543689,
+         "acc_norm_stderr": 0.039891398595317706
+     },
+     "harness|hendrycksTest-marketing|5": {
+         "acc": 0.8846153846153846,
+         "acc_stderr": 0.020930193185179326,
+         "acc_norm": 0.8846153846153846,
+         "acc_norm_stderr": 0.020930193185179326
+     },
+     "harness|hendrycksTest-medical_genetics|5": {
+         "acc": 0.72,
+         "acc_stderr": 0.045126085985421276,
+         "acc_norm": 0.72,
+         "acc_norm_stderr": 0.045126085985421276
+     },
+     "harness|hendrycksTest-miscellaneous|5": {
+         "acc": 0.8288633461047255,
+         "acc_stderr": 0.0134682016140663,
+         "acc_norm": 0.8288633461047255,
+         "acc_norm_stderr": 0.0134682016140663
+     },
+     "harness|hendrycksTest-moral_disputes|5": {
+         "acc": 0.7369942196531792,
+         "acc_stderr": 0.023703099525258172,
+         "acc_norm": 0.7369942196531792,
+         "acc_norm_stderr": 0.023703099525258172
+     },
+     "harness|hendrycksTest-moral_scenarios|5": {
+         "acc": 0.41564245810055866,
+         "acc_stderr": 0.016482782187500676,
+         "acc_norm": 0.41564245810055866,
+         "acc_norm_stderr": 0.016482782187500676
+     },
+     "harness|hendrycksTest-nutrition|5": {
+         "acc": 0.7320261437908496,
+         "acc_stderr": 0.025360603796242557,
+         "acc_norm": 0.7320261437908496,
+         "acc_norm_stderr": 0.025360603796242557
+     },
+     "harness|hendrycksTest-philosophy|5": {
+         "acc": 0.7363344051446945,
+         "acc_stderr": 0.02502553850053234,
+         "acc_norm": 0.7363344051446945,
+         "acc_norm_stderr": 0.02502553850053234
+     },
+     "harness|hendrycksTest-prehistory|5": {
+         "acc": 0.7376543209876543,
+         "acc_stderr": 0.024477222856135107,
+         "acc_norm": 0.7376543209876543,
+         "acc_norm_stderr": 0.024477222856135107
+     },
+     "harness|hendrycksTest-professional_accounting|5": {
+         "acc": 0.5141843971631206,
+         "acc_stderr": 0.02981549448368206,
+         "acc_norm": 0.5141843971631206,
+         "acc_norm_stderr": 0.02981549448368206
+     },
+     "harness|hendrycksTest-professional_law|5": {
+         "acc": 0.46740547588005216,
+         "acc_stderr": 0.012743072942653349,
+         "acc_norm": 0.46740547588005216,
+         "acc_norm_stderr": 0.012743072942653349
+     },
+     "harness|hendrycksTest-professional_medicine|5": {
+         "acc": 0.6801470588235294,
+         "acc_stderr": 0.028332959514031218,
+         "acc_norm": 0.6801470588235294,
+         "acc_norm_stderr": 0.028332959514031218
+     },
+     "harness|hendrycksTest-professional_psychology|5": {
+         "acc": 0.6699346405228758,
+         "acc_stderr": 0.019023726160724553,
+         "acc_norm": 0.6699346405228758,
+         "acc_norm_stderr": 0.019023726160724553
+     },
+     "harness|hendrycksTest-public_relations|5": {
+         "acc": 0.6727272727272727,
+         "acc_stderr": 0.0449429086625209,
+         "acc_norm": 0.6727272727272727,
+         "acc_norm_stderr": 0.0449429086625209
+     },
+     "harness|hendrycksTest-security_studies|5": {
+         "acc": 0.7142857142857143,
+         "acc_stderr": 0.0289205832206756,
+         "acc_norm": 0.7142857142857143,
+         "acc_norm_stderr": 0.0289205832206756
+     },
+     "harness|hendrycksTest-sociology|5": {
+         "acc": 0.845771144278607,
+         "acc_stderr": 0.025538433368578337,
+         "acc_norm": 0.845771144278607,
+         "acc_norm_stderr": 0.025538433368578337
+     },
+     "harness|hendrycksTest-us_foreign_policy|5": {
+         "acc": 0.84,
+         "acc_stderr": 0.03684529491774708,
+         "acc_norm": 0.84,
+         "acc_norm_stderr": 0.03684529491774708
+     },
+     "harness|hendrycksTest-virology|5": {
+         "acc": 0.536144578313253,
+         "acc_stderr": 0.038823108508905954,
+         "acc_norm": 0.536144578313253,
+         "acc_norm_stderr": 0.038823108508905954
+     },
+     "harness|hendrycksTest-world_religions|5": {
+         "acc": 0.8245614035087719,
+         "acc_stderr": 0.02917088550072767,
+         "acc_norm": 0.8245614035087719,
+         "acc_norm_stderr": 0.02917088550072767
+     },
+     "harness|truthfulqa:mc|0": {
+         "mc1": 0.44430844553243576,
+         "mc1_stderr": 0.01739458625074317,
+         "mc2": 0.6132594486430695,
+         "mc2_stderr": 0.015532509494332434
+     },
+     "harness|winogrande|5": {
+         "acc": 0.8105761641673244,
+         "acc_stderr": 0.011012790432989245
+     },
+     "harness|gsm8k|5": {
+         "acc": 0.7119029567854435,
+         "acc_stderr": 0.012474469737197916
+     }
+ }
+ ```
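
For readers who want to try the merged model behind the scores above, here is a minimal sketch (not part of the commit) that loads it with the Transformers library. The repo id `mychen76/openmixtral-4x7b-merged` is inferred from the leaderboard dataset name, and the dtype/device settings are illustrative assumptions rather than the author's exact setup.

```
# Minimal sketch: load the merged 4x7B MoE model and run a short generation.
# Assumes a GPU with enough memory for the model in half precision.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "mychen76/openmixtral-4x7b-merged"  # inferred from the leaderboard link above

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # half precision to reduce memory use
    device_map="auto",          # spread layers across available devices
)

prompt = "How many prime numbers are there between 1 and 20?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```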