mychen76 committed on
Commit
44dff50
1 Parent(s): 36c42c6

Update README.md

Files changed (1)
  1. README.md +390 -1
README.md CHANGED
@@ -31,4 +31,393 @@ parameters:
  normalize: true
  dtype: bfloat16

- ```
+ ```
+ ## Evaluation
+ https://huggingface.co/datasets/open-llm-leaderboard/details_mychen76__mistral-7b-merged-ties
+
+ Latest Result:
+ https://huggingface.co/datasets/open-llm-leaderboard/details_mychen76__mistral-7b-merged-ties/blob/main/results_2024-03-10T11-05-18.535141.json
+
+ ```
+ {
+     "all": {
+         "acc": 0.6445924072176131,
+         "acc_stderr": 0.03213293328697562,
+         "acc_norm": 0.6450342620069291,
+         "acc_norm_stderr": 0.032788565108750604,
+         "mc1": 0.4455324357405141,
+         "mc1_stderr": 0.017399335280140357,
+         "mc2": 0.6131109579182783,
+         "mc2_stderr": 0.015351738756398125
+     },
+     "harness|arc:challenge|25": {
+         "acc": 0.6390784982935154,
+         "acc_stderr": 0.014034761386175452,
+         "acc_norm": 0.6791808873720137,
+         "acc_norm_stderr": 0.013640943091946531
+     },
+     "harness|hellaswag|10": {
+         "acc": 0.6722764389563832,
+         "acc_stderr": 0.004684241685200317,
+         "acc_norm": 0.85929097789285,
+         "acc_norm_stderr": 0.00347010499020439
+     },
+     "harness|hendrycksTest-abstract_algebra|5": {
+         "acc": 0.28,
+         "acc_stderr": 0.04512608598542128,
+         "acc_norm": 0.28,
+         "acc_norm_stderr": 0.04512608598542128
+     },
+     "harness|hendrycksTest-anatomy|5": {
+         "acc": 0.6074074074074074,
+         "acc_stderr": 0.0421850621536888,
+         "acc_norm": 0.6074074074074074,
+         "acc_norm_stderr": 0.0421850621536888
+     },
+     "harness|hendrycksTest-astronomy|5": {
+         "acc": 0.743421052631579,
+         "acc_stderr": 0.0355418036802569,
+         "acc_norm": 0.743421052631579,
+         "acc_norm_stderr": 0.0355418036802569
+     },
+     "harness|hendrycksTest-business_ethics|5": {
+         "acc": 0.61,
+         "acc_stderr": 0.04902071300001975,
+         "acc_norm": 0.61,
+         "acc_norm_stderr": 0.04902071300001975
+     },
+     "harness|hendrycksTest-clinical_knowledge|5": {
+         "acc": 0.6867924528301886,
+         "acc_stderr": 0.028544793319055326,
+         "acc_norm": 0.6867924528301886,
+         "acc_norm_stderr": 0.028544793319055326
+     },
+     "harness|hendrycksTest-college_biology|5": {
+         "acc": 0.7777777777777778,
+         "acc_stderr": 0.03476590104304134,
+         "acc_norm": 0.7777777777777778,
+         "acc_norm_stderr": 0.03476590104304134
+     },
+     "harness|hendrycksTest-college_chemistry|5": {
+         "acc": 0.48,
+         "acc_stderr": 0.050211673156867795,
+         "acc_norm": 0.48,
+         "acc_norm_stderr": 0.050211673156867795
+     },
+     "harness|hendrycksTest-college_computer_science|5": {
+         "acc": 0.48,
+         "acc_stderr": 0.050211673156867795,
+         "acc_norm": 0.48,
+         "acc_norm_stderr": 0.050211673156867795
+     },
+     "harness|hendrycksTest-college_mathematics|5": {
+         "acc": 0.32,
+         "acc_stderr": 0.04688261722621504,
+         "acc_norm": 0.32,
+         "acc_norm_stderr": 0.04688261722621504
+     },
+     "harness|hendrycksTest-college_medicine|5": {
+         "acc": 0.630057803468208,
+         "acc_stderr": 0.036812296333943194,
+         "acc_norm": 0.630057803468208,
+         "acc_norm_stderr": 0.036812296333943194
+     },
+     "harness|hendrycksTest-college_physics|5": {
+         "acc": 0.4117647058823529,
+         "acc_stderr": 0.048971049527263666,
+         "acc_norm": 0.4117647058823529,
+         "acc_norm_stderr": 0.048971049527263666
+     },
+     "harness|hendrycksTest-computer_security|5": {
+         "acc": 0.76,
+         "acc_stderr": 0.042923469599092816,
+         "acc_norm": 0.76,
+         "acc_norm_stderr": 0.042923469599092816
+     },
+     "harness|hendrycksTest-conceptual_physics|5": {
+         "acc": 0.574468085106383,
+         "acc_stderr": 0.03232146916224468,
+         "acc_norm": 0.574468085106383,
+         "acc_norm_stderr": 0.03232146916224468
+     },
+     "harness|hendrycksTest-econometrics|5": {
+         "acc": 0.5175438596491229,
+         "acc_stderr": 0.04700708033551038,
+         "acc_norm": 0.5175438596491229,
+         "acc_norm_stderr": 0.04700708033551038
+     },
+     "harness|hendrycksTest-electrical_engineering|5": {
+         "acc": 0.5448275862068965,
+         "acc_stderr": 0.04149886942192117,
+         "acc_norm": 0.5448275862068965,
+         "acc_norm_stderr": 0.04149886942192117
+     },
+     "harness|hendrycksTest-elementary_mathematics|5": {
+         "acc": 0.4126984126984127,
+         "acc_stderr": 0.025355741263055263,
+         "acc_norm": 0.4126984126984127,
+         "acc_norm_stderr": 0.025355741263055263
+     },
+     "harness|hendrycksTest-formal_logic|5": {
+         "acc": 0.4365079365079365,
+         "acc_stderr": 0.04435932892851466,
+         "acc_norm": 0.4365079365079365,
+         "acc_norm_stderr": 0.04435932892851466
+     },
+     "harness|hendrycksTest-global_facts|5": {
+         "acc": 0.35,
+         "acc_stderr": 0.047937248544110196,
+         "acc_norm": 0.35,
+         "acc_norm_stderr": 0.047937248544110196
+     },
+     "harness|hendrycksTest-high_school_biology|5": {
+         "acc": 0.7645161290322581,
+         "acc_stderr": 0.02413763242933771,
+         "acc_norm": 0.7645161290322581,
+         "acc_norm_stderr": 0.02413763242933771
+     },
+     "harness|hendrycksTest-high_school_chemistry|5": {
+         "acc": 0.49261083743842365,
+         "acc_stderr": 0.035176035403610084,
+         "acc_norm": 0.49261083743842365,
+         "acc_norm_stderr": 0.035176035403610084
+     },
+     "harness|hendrycksTest-high_school_computer_science|5": {
+         "acc": 0.67,
+         "acc_stderr": 0.04725815626252607,
+         "acc_norm": 0.67,
+         "acc_norm_stderr": 0.04725815626252607
+     },
+     "harness|hendrycksTest-high_school_european_history|5": {
+         "acc": 0.7757575757575758,
+         "acc_stderr": 0.03256866661681102,
+         "acc_norm": 0.7757575757575758,
+         "acc_norm_stderr": 0.03256866661681102
+     },
+     "harness|hendrycksTest-high_school_geography|5": {
+         "acc": 0.7828282828282829,
+         "acc_stderr": 0.02937661648494563,
+         "acc_norm": 0.7828282828282829,
+         "acc_norm_stderr": 0.02937661648494563
+     },
+     "harness|hendrycksTest-high_school_government_and_politics|5": {
+         "acc": 0.8963730569948186,
+         "acc_stderr": 0.02199531196364424,
+         "acc_norm": 0.8963730569948186,
+         "acc_norm_stderr": 0.02199531196364424
+     },
+     "harness|hendrycksTest-high_school_macroeconomics|5": {
+         "acc": 0.6410256410256411,
+         "acc_stderr": 0.024321738484602354,
+         "acc_norm": 0.6410256410256411,
+         "acc_norm_stderr": 0.024321738484602354
+     },
+     "harness|hendrycksTest-high_school_mathematics|5": {
+         "acc": 0.34814814814814815,
+         "acc_stderr": 0.029045600290616255,
+         "acc_norm": 0.34814814814814815,
+         "acc_norm_stderr": 0.029045600290616255
+     },
+     "harness|hendrycksTest-high_school_microeconomics|5": {
+         "acc": 0.6890756302521008,
+         "acc_stderr": 0.03006676158297793,
+         "acc_norm": 0.6890756302521008,
+         "acc_norm_stderr": 0.03006676158297793
+     },
+     "harness|hendrycksTest-high_school_physics|5": {
+         "acc": 0.2980132450331126,
+         "acc_stderr": 0.037345356767871984,
+         "acc_norm": 0.2980132450331126,
+         "acc_norm_stderr": 0.037345356767871984
+     },
+     "harness|hendrycksTest-high_school_psychology|5": {
+         "acc": 0.8495412844036697,
+         "acc_stderr": 0.015328563932669237,
+         "acc_norm": 0.8495412844036697,
+         "acc_norm_stderr": 0.015328563932669237
+     },
+     "harness|hendrycksTest-high_school_statistics|5": {
+         "acc": 0.5231481481481481,
+         "acc_stderr": 0.03406315360711507,
+         "acc_norm": 0.5231481481481481,
+         "acc_norm_stderr": 0.03406315360711507
+     },
+     "harness|hendrycksTest-high_school_us_history|5": {
+         "acc": 0.8186274509803921,
+         "acc_stderr": 0.027044621719474086,
+         "acc_norm": 0.8186274509803921,
+         "acc_norm_stderr": 0.027044621719474086
+     },
+     "harness|hendrycksTest-high_school_world_history|5": {
+         "acc": 0.8185654008438819,
+         "acc_stderr": 0.025085961144579665,
+         "acc_norm": 0.8185654008438819,
+         "acc_norm_stderr": 0.025085961144579665
+     },
+     "harness|hendrycksTest-human_aging|5": {
+         "acc": 0.6860986547085202,
+         "acc_stderr": 0.031146796482972465,
+         "acc_norm": 0.6860986547085202,
+         "acc_norm_stderr": 0.031146796482972465
+     },
+     "harness|hendrycksTest-human_sexuality|5": {
+         "acc": 0.7862595419847328,
+         "acc_stderr": 0.0359546161177469,
+         "acc_norm": 0.7862595419847328,
+         "acc_norm_stderr": 0.0359546161177469
+     },
+     "harness|hendrycksTest-international_law|5": {
+         "acc": 0.7851239669421488,
+         "acc_stderr": 0.037494924487096966,
+         "acc_norm": 0.7851239669421488,
+         "acc_norm_stderr": 0.037494924487096966
+     },
+     "harness|hendrycksTest-jurisprudence|5": {
+         "acc": 0.7962962962962963,
+         "acc_stderr": 0.03893542518824847,
+         "acc_norm": 0.7962962962962963,
+         "acc_norm_stderr": 0.03893542518824847
+     },
+     "harness|hendrycksTest-logical_fallacies|5": {
+         "acc": 0.7607361963190185,
+         "acc_stderr": 0.033519538795212696,
+         "acc_norm": 0.7607361963190185,
+         "acc_norm_stderr": 0.033519538795212696
+     },
+     "harness|hendrycksTest-machine_learning|5": {
+         "acc": 0.4642857142857143,
+         "acc_stderr": 0.04733667890053756,
+         "acc_norm": 0.4642857142857143,
+         "acc_norm_stderr": 0.04733667890053756
+     },
+     "harness|hendrycksTest-management|5": {
+         "acc": 0.7766990291262136,
+         "acc_stderr": 0.04123553189891431,
+         "acc_norm": 0.7766990291262136,
+         "acc_norm_stderr": 0.04123553189891431
+     },
+     "harness|hendrycksTest-marketing|5": {
+         "acc": 0.8547008547008547,
+         "acc_stderr": 0.023086635086841407,
+         "acc_norm": 0.8547008547008547,
+         "acc_norm_stderr": 0.023086635086841407
+     },
+     "harness|hendrycksTest-medical_genetics|5": {
+         "acc": 0.71,
+         "acc_stderr": 0.045604802157206845,
+         "acc_norm": 0.71,
+         "acc_norm_stderr": 0.045604802157206845
+     },
+     "harness|hendrycksTest-miscellaneous|5": {
+         "acc": 0.8301404853128991,
+         "acc_stderr": 0.013428186370608304,
+         "acc_norm": 0.8301404853128991,
+         "acc_norm_stderr": 0.013428186370608304
+     },
+     "harness|hendrycksTest-moral_disputes|5": {
+         "acc": 0.7369942196531792,
+         "acc_stderr": 0.023703099525258172,
+         "acc_norm": 0.7369942196531792,
+         "acc_norm_stderr": 0.023703099525258172
+     },
+     "harness|hendrycksTest-moral_scenarios|5": {
+         "acc": 0.3664804469273743,
+         "acc_stderr": 0.016115235504865467,
+         "acc_norm": 0.3664804469273743,
+         "acc_norm_stderr": 0.016115235504865467
+     },
+     "harness|hendrycksTest-nutrition|5": {
+         "acc": 0.7320261437908496,
+         "acc_stderr": 0.025360603796242553,
+         "acc_norm": 0.7320261437908496,
+         "acc_norm_stderr": 0.025360603796242553
+     },
+     "harness|hendrycksTest-philosophy|5": {
+         "acc": 0.7170418006430869,
+         "acc_stderr": 0.02558306248998481,
+         "acc_norm": 0.7170418006430869,
+         "acc_norm_stderr": 0.02558306248998481
+     },
+     "harness|hendrycksTest-prehistory|5": {
+         "acc": 0.7376543209876543,
+         "acc_stderr": 0.024477222856135114,
+         "acc_norm": 0.7376543209876543,
+         "acc_norm_stderr": 0.024477222856135114
+     },
+     "harness|hendrycksTest-professional_accounting|5": {
+         "acc": 0.5070921985815603,
+         "acc_stderr": 0.02982449855912901,
+         "acc_norm": 0.5070921985815603,
+         "acc_norm_stderr": 0.02982449855912901
+     },
+     "harness|hendrycksTest-professional_law|5": {
+         "acc": 0.4667535853976532,
+         "acc_stderr": 0.01274197433389723,
+         "acc_norm": 0.4667535853976532,
+         "acc_norm_stderr": 0.01274197433389723
+     },
+     "harness|hendrycksTest-professional_medicine|5": {
+         "acc": 0.6764705882352942,
+         "acc_stderr": 0.02841820861940676,
+         "acc_norm": 0.6764705882352942,
+         "acc_norm_stderr": 0.02841820861940676
+     },
+     "harness|hendrycksTest-professional_psychology|5": {
+         "acc": 0.6666666666666666,
+         "acc_stderr": 0.019070985589687495,
+         "acc_norm": 0.6666666666666666,
+         "acc_norm_stderr": 0.019070985589687495
+     },
+     "harness|hendrycksTest-public_relations|5": {
+         "acc": 0.6545454545454545,
+         "acc_stderr": 0.04554619617541054,
+         "acc_norm": 0.6545454545454545,
+         "acc_norm_stderr": 0.04554619617541054
+     },
+     "harness|hendrycksTest-security_studies|5": {
+         "acc": 0.7306122448979592,
+         "acc_stderr": 0.02840125202902294,
+         "acc_norm": 0.7306122448979592,
+         "acc_norm_stderr": 0.02840125202902294
+     },
+     "harness|hendrycksTest-sociology|5": {
+         "acc": 0.845771144278607,
+         "acc_stderr": 0.025538433368578337,
+         "acc_norm": 0.845771144278607,
+         "acc_norm_stderr": 0.025538433368578337
+     },
+     "harness|hendrycksTest-us_foreign_policy|5": {
+         "acc": 0.86,
+         "acc_stderr": 0.0348735088019777,
+         "acc_norm": 0.86,
+         "acc_norm_stderr": 0.0348735088019777
+     },
+     "harness|hendrycksTest-virology|5": {
+         "acc": 0.5481927710843374,
+         "acc_stderr": 0.03874371556587953,
+         "acc_norm": 0.5481927710843374,
+         "acc_norm_stderr": 0.03874371556587953
+     },
+     "harness|hendrycksTest-world_religions|5": {
+         "acc": 0.8304093567251462,
+         "acc_stderr": 0.02878210810540171,
+         "acc_norm": 0.8304093567251462,
+         "acc_norm_stderr": 0.02878210810540171
+     },
+     "harness|truthfulqa:mc|0": {
+         "mc1": 0.4455324357405141,
+         "mc1_stderr": 0.017399335280140357,
+         "mc2": 0.6131109579182783,
+         "mc2_stderr": 0.015351738756398125
+     },
+     "harness|winogrande|5": {
+         "acc": 0.8003157063930545,
+         "acc_stderr": 0.011235328382625842
+     },
+     "harness|gsm8k|5": {
+         "acc": 0.6899166034874905,
+         "acc_stderr": 0.01274030571737627
+     }
+ }
+
+ ```
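For readers who only want the headline numbers, here is a minimal Python sketch for fetching the results file linked in the Evaluation section above and printing a leaderboard-style summary. It is not part of the commit itself; it assumes the raw file is reachable via the dataset's `resolve/main` path and that it exposes a top-level `results` object keyed by harness task name (matching the JSON shown above). Adjust the URL or keys if the layout differs.

```python
# Sketch: fetch the Open LLM Leaderboard results JSON linked above and print
# the headline metrics. The "resolve/main" URL and the top-level "results"
# key are assumptions about the file layout; adjust if they differ.
import json
import urllib.request

RESULTS_URL = (
    "https://huggingface.co/datasets/open-llm-leaderboard/"
    "details_mychen76__mistral-7b-merged-ties/resolve/main/"
    "results_2024-03-10T11-05-18.535141.json"
)

with urllib.request.urlopen(RESULTS_URL) as resp:
    data = json.load(resp)
# Fall back to the whole document if it is already the per-task results dict.
results = data.get("results", data)

# MMLU is reported as the mean accuracy over the hendrycksTest subtasks.
mmlu_accs = [v["acc"] for k, v in results.items() if "hendrycksTest" in k]

scores = {
    "ARC (25-shot, acc_norm)": results["harness|arc:challenge|25"]["acc_norm"],
    "HellaSwag (10-shot, acc_norm)": results["harness|hellaswag|10"]["acc_norm"],
    "MMLU (5-shot, acc)": sum(mmlu_accs) / len(mmlu_accs),
    "TruthfulQA (0-shot, mc2)": results["harness|truthfulqa:mc|0"]["mc2"],
    "Winogrande (5-shot, acc)": results["harness|winogrande|5"]["acc"],
    "GSM8K (5-shot, acc)": results["harness|gsm8k|5"]["acc"],
}

for name, value in scores.items():
    print(f"{name}: {value:.4f}")
print(f"Average: {sum(scores.values()) / len(scores):.4f}")
```

Run with plain Python 3; it prints the six benchmark scores and their unweighted average.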