MaziyarPanahi committed
Commit d331b89
1 Parent(s): 1e5a203

Update README.md

Files changed (1)
  1. README.md +388 -0
README.md CHANGED
@@ -124,4 +124,392 @@ or for CoT (❗For the simple math questions, we do NOT recommend using the CoT
124
 
125
  ```
126
  "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
127
+ ```
128
+
129
+
130
+ ## Latest results
131
+
132
+ These are the [latest results from run 2024-02-18T06:49:50.553009](https://huggingface.co/datasets/open-llm-leaderboard/details_MaziyarPanahi__WizardLM-Math-70B-TIES-v0.1/blob/main/results_2024-02-18T06-49-50.553009.json) (note that there may be results for other tasks in this repo if successive evals did not cover the same tasks; you can find each of them in the results and in the "latest" split for each eval):
133
+
134
+ ```python
135
+ {
136
+ "all": {
137
+ "acc": 0.6868282613819305,
138
+ "acc_stderr": 0.030371866427473967,
139
+ "acc_norm": 0.695311288530275,
140
+ "acc_norm_stderr": 0.030984285786669577,
141
+ "mc1": 0.36964504283965727,
142
+ "mc1_stderr": 0.01689818070697388,
143
+ "mc2": 0.5360987678643523,
144
+ "mc2_stderr": 0.014938153988985473
145
+ },
146
+ "harness|arc:challenge|25": {
147
+ "acc": 0.6424914675767918,
148
+ "acc_stderr": 0.014005494275916573,
149
+ "acc_norm": 0.6851535836177475,
150
+ "acc_norm_stderr": 0.01357265770308495
151
+ },
152
+ "harness|hellaswag|10": {
153
+ "acc": 0.6836287592113125,
154
+ "acc_stderr": 0.004641092001425294,
155
+ "acc_norm": 0.8686516630153356,
156
+ "acc_norm_stderr": 0.0033709059327855567
157
+ },
158
+ "harness|hendrycksTest-abstract_algebra|5": {
159
+ "acc": 0.33,
160
+ "acc_stderr": 0.04725815626252605,
161
+ "acc_norm": 0.33,
162
+ "acc_norm_stderr": 0.04725815626252605
163
+ },
164
+ "harness|hendrycksTest-anatomy|5": {
165
+ "acc": 0.6222222222222222,
166
+ "acc_stderr": 0.04188307537595852,
167
+ "acc_norm": 0.6222222222222222,
168
+ "acc_norm_stderr": 0.04188307537595852
169
+ },
170
+ "harness|hendrycksTest-astronomy|5": {
171
+ "acc": 0.8026315789473685,
172
+ "acc_stderr": 0.03238981601699397,
173
+ "acc_norm": 0.8026315789473685,
174
+ "acc_norm_stderr": 0.03238981601699397
175
+ },
176
+ "harness|hendrycksTest-business_ethics|5": {
177
+ "acc": 0.72,
178
+ "acc_stderr": 0.04512608598542127,
179
+ "acc_norm": 0.72,
180
+ "acc_norm_stderr": 0.04512608598542127
181
+ },
182
+ "harness|hendrycksTest-clinical_knowledge|5": {
183
+ "acc": 0.7471698113207547,
184
+ "acc_stderr": 0.026749899771241214,
185
+ "acc_norm": 0.7471698113207547,
186
+ "acc_norm_stderr": 0.026749899771241214
187
+ },
188
+ "harness|hendrycksTest-college_biology|5": {
189
+ "acc": 0.8194444444444444,
190
+ "acc_stderr": 0.032166008088022675,
191
+ "acc_norm": 0.8194444444444444,
192
+ "acc_norm_stderr": 0.032166008088022675
193
+ },
194
+ "harness|hendrycksTest-college_chemistry|5": {
195
+ "acc": 0.48,
196
+ "acc_stderr": 0.050211673156867795,
197
+ "acc_norm": 0.48,
198
+ "acc_norm_stderr": 0.050211673156867795
199
+ },
200
+ "harness|hendrycksTest-college_computer_science|5": {
201
+ "acc": 0.57,
202
+ "acc_stderr": 0.04975698519562428,
203
+ "acc_norm": 0.57,
204
+ "acc_norm_stderr": 0.04975698519562428
205
+ },
206
+ "harness|hendrycksTest-college_mathematics|5": {
207
+ "acc": 0.33,
208
+ "acc_stderr": 0.04725815626252604,
209
+ "acc_norm": 0.33,
210
+ "acc_norm_stderr": 0.04725815626252604
211
+ },
212
+ "harness|hendrycksTest-college_medicine|5": {
213
+ "acc": 0.6647398843930635,
214
+ "acc_stderr": 0.03599586301247077,
215
+ "acc_norm": 0.6647398843930635,
216
+ "acc_norm_stderr": 0.03599586301247077
217
+ },
218
+ "harness|hendrycksTest-college_physics|5": {
219
+ "acc": 0.35294117647058826,
220
+ "acc_stderr": 0.047551296160629475,
221
+ "acc_norm": 0.35294117647058826,
222
+ "acc_norm_stderr": 0.047551296160629475
223
+ },
224
+ "harness|hendrycksTest-computer_security|5": {
225
+ "acc": 0.74,
226
+ "acc_stderr": 0.04408440022768079,
227
+ "acc_norm": 0.74,
228
+ "acc_norm_stderr": 0.04408440022768079
229
+ },
230
+ "harness|hendrycksTest-conceptual_physics|5": {
231
+ "acc": 0.6893617021276596,
232
+ "acc_stderr": 0.03025123757921317,
233
+ "acc_norm": 0.6893617021276596,
234
+ "acc_norm_stderr": 0.03025123757921317
235
+ },
236
+ "harness|hendrycksTest-econometrics|5": {
237
+ "acc": 0.40350877192982454,
238
+ "acc_stderr": 0.046151869625837026,
239
+ "acc_norm": 0.40350877192982454,
240
+ "acc_norm_stderr": 0.046151869625837026
241
+ },
242
+ "harness|hendrycksTest-electrical_engineering|5": {
243
+ "acc": 0.6068965517241379,
244
+ "acc_stderr": 0.040703290137070705,
245
+ "acc_norm": 0.6068965517241379,
246
+ "acc_norm_stderr": 0.040703290137070705
247
+ },
248
+ "harness|hendrycksTest-elementary_mathematics|5": {
249
+ "acc": 0.4312169312169312,
250
+ "acc_stderr": 0.0255064816981382,
251
+ "acc_norm": 0.4312169312169312,
252
+ "acc_norm_stderr": 0.0255064816981382
253
+ },
254
+ "harness|hendrycksTest-formal_logic|5": {
255
+ "acc": 0.5,
256
+ "acc_stderr": 0.04472135954999579,
257
+ "acc_norm": 0.5,
258
+ "acc_norm_stderr": 0.04472135954999579
259
+ },
260
+ "harness|hendrycksTest-global_facts|5": {
261
+ "acc": 0.43,
262
+ "acc_stderr": 0.049756985195624284,
263
+ "acc_norm": 0.43,
264
+ "acc_norm_stderr": 0.049756985195624284
265
+ },
266
+ "harness|hendrycksTest-high_school_biology|5": {
267
+ "acc": 0.8193548387096774,
268
+ "acc_stderr": 0.021886178567172527,
269
+ "acc_norm": 0.8193548387096774,
270
+ "acc_norm_stderr": 0.021886178567172527
271
+ },
272
+ "harness|hendrycksTest-high_school_chemistry|5": {
273
+ "acc": 0.5320197044334976,
274
+ "acc_stderr": 0.03510766597959217,
275
+ "acc_norm": 0.5320197044334976,
276
+ "acc_norm_stderr": 0.03510766597959217
277
+ },
278
+ "harness|hendrycksTest-high_school_computer_science|5": {
279
+ "acc": 0.71,
280
+ "acc_stderr": 0.045604802157206845,
281
+ "acc_norm": 0.71,
282
+ "acc_norm_stderr": 0.045604802157206845
283
+ },
284
+ "harness|hendrycksTest-high_school_european_history|5": {
285
+ "acc": 0.8121212121212121,
286
+ "acc_stderr": 0.03050193405942914,
287
+ "acc_norm": 0.8121212121212121,
288
+ "acc_norm_stderr": 0.03050193405942914
289
+ },
290
+ "harness|hendrycksTest-high_school_geography|5": {
291
+ "acc": 0.8888888888888888,
292
+ "acc_stderr": 0.022390787638216773,
293
+ "acc_norm": 0.8888888888888888,
294
+ "acc_norm_stderr": 0.022390787638216773
295
+ },
296
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
297
+ "acc": 0.927461139896373,
298
+ "acc_stderr": 0.018718998520678185,
299
+ "acc_norm": 0.927461139896373,
300
+ "acc_norm_stderr": 0.018718998520678185
301
+ },
302
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
303
+ "acc": 0.7230769230769231,
304
+ "acc_stderr": 0.022688042352424994,
305
+ "acc_norm": 0.7230769230769231,
306
+ "acc_norm_stderr": 0.022688042352424994
307
+ },
308
+ "harness|hendrycksTest-high_school_mathematics|5": {
309
+ "acc": 0.3333333333333333,
310
+ "acc_stderr": 0.028742040903948492,
311
+ "acc_norm": 0.3333333333333333,
312
+ "acc_norm_stderr": 0.028742040903948492
313
+ },
314
+ "harness|hendrycksTest-high_school_microeconomics|5": {
315
+ "acc": 0.8109243697478992,
316
+ "acc_stderr": 0.02543511943810537,
317
+ "acc_norm": 0.8109243697478992,
318
+ "acc_norm_stderr": 0.02543511943810537
319
+ },
320
+ "harness|hendrycksTest-high_school_physics|5": {
321
+ "acc": 0.4304635761589404,
322
+ "acc_stderr": 0.04042809961395634,
323
+ "acc_norm": 0.4304635761589404,
324
+ "acc_norm_stderr": 0.04042809961395634
325
+ },
326
+ "harness|hendrycksTest-high_school_psychology|5": {
327
+ "acc": 0.8862385321100917,
328
+ "acc_stderr": 0.0136136148002328,
329
+ "acc_norm": 0.8862385321100917,
330
+ "acc_norm_stderr": 0.0136136148002328
331
+ },
332
+ "harness|hendrycksTest-high_school_statistics|5": {
333
+ "acc": 0.5879629629629629,
334
+ "acc_stderr": 0.03356787758160831,
335
+ "acc_norm": 0.5879629629629629,
336
+ "acc_norm_stderr": 0.03356787758160831
337
+ },
338
+ "harness|hendrycksTest-high_school_us_history|5": {
339
+ "acc": 0.9166666666666666,
340
+ "acc_stderr": 0.019398452135813895,
341
+ "acc_norm": 0.9166666666666666,
342
+ "acc_norm_stderr": 0.019398452135813895
343
+ },
344
+ "harness|hendrycksTest-high_school_world_history|5": {
345
+ "acc": 0.8776371308016878,
346
+ "acc_stderr": 0.02133174182974679,
347
+ "acc_norm": 0.8776371308016878,
348
+ "acc_norm_stderr": 0.02133174182974679
349
+ },
350
+ "harness|hendrycksTest-human_aging|5": {
351
+ "acc": 0.8161434977578476,
352
+ "acc_stderr": 0.025998379092356513,
353
+ "acc_norm": 0.8161434977578476,
354
+ "acc_norm_stderr": 0.025998379092356513
355
+ },
356
+ "harness|hendrycksTest-human_sexuality|5": {
357
+ "acc": 0.8473282442748091,
358
+ "acc_stderr": 0.03154521672005472,
359
+ "acc_norm": 0.8473282442748091,
360
+ "acc_norm_stderr": 0.03154521672005472
361
+ },
362
+ "harness|hendrycksTest-international_law|5": {
363
+ "acc": 0.8512396694214877,
364
+ "acc_stderr": 0.03248470083807194,
365
+ "acc_norm": 0.8512396694214877,
366
+ "acc_norm_stderr": 0.03248470083807194
367
+ },
368
+ "harness|hendrycksTest-jurisprudence|5": {
369
+ "acc": 0.8148148148148148,
370
+ "acc_stderr": 0.03755265865037181,
371
+ "acc_norm": 0.8148148148148148,
372
+ "acc_norm_stderr": 0.03755265865037181
373
+ },
374
+ "harness|hendrycksTest-logical_fallacies|5": {
375
+ "acc": 0.803680981595092,
376
+ "acc_stderr": 0.031207970394709225,
377
+ "acc_norm": 0.803680981595092,
378
+ "acc_norm_stderr": 0.031207970394709225
379
+ },
380
+ "harness|hendrycksTest-machine_learning|5": {
381
+ "acc": 0.48214285714285715,
382
+ "acc_stderr": 0.047427623612430116,
383
+ "acc_norm": 0.48214285714285715,
384
+ "acc_norm_stderr": 0.047427623612430116
385
+ },
386
+ "harness|hendrycksTest-management|5": {
387
+ "acc": 0.8446601941747572,
388
+ "acc_stderr": 0.03586594738573975,
389
+ "acc_norm": 0.8446601941747572,
390
+ "acc_norm_stderr": 0.03586594738573975
391
+ },
392
+ "harness|hendrycksTest-marketing|5": {
393
+ "acc": 0.8846153846153846,
394
+ "acc_stderr": 0.020930193185179333,
395
+ "acc_norm": 0.8846153846153846,
396
+ "acc_norm_stderr": 0.020930193185179333
397
+ },
398
+ "harness|hendrycksTest-medical_genetics|5": {
399
+ "acc": 0.72,
400
+ "acc_stderr": 0.045126085985421276,
401
+ "acc_norm": 0.72,
402
+ "acc_norm_stderr": 0.045126085985421276
403
+ },
404
+ "harness|hendrycksTest-miscellaneous|5": {
405
+ "acc": 0.8620689655172413,
406
+ "acc_stderr": 0.012331009307795663,
407
+ "acc_norm": 0.8620689655172413,
408
+ "acc_norm_stderr": 0.012331009307795663
409
+ },
410
+ "harness|hendrycksTest-moral_disputes|5": {
411
+ "acc": 0.7745664739884393,
412
+ "acc_stderr": 0.022497230190967558,
413
+ "acc_norm": 0.7745664739884393,
414
+ "acc_norm_stderr": 0.022497230190967558
415
+ },
416
+ "harness|hendrycksTest-moral_scenarios|5": {
417
+ "acc": 0.5452513966480447,
418
+ "acc_stderr": 0.016653875777523995,
419
+ "acc_norm": 0.5452513966480447,
420
+ "acc_norm_stderr": 0.016653875777523995
421
+ },
422
+ "harness|hendrycksTest-nutrition|5": {
423
+ "acc": 0.7581699346405228,
424
+ "acc_stderr": 0.024518195641879334,
425
+ "acc_norm": 0.7581699346405228,
426
+ "acc_norm_stderr": 0.024518195641879334
427
+ },
428
+ "harness|hendrycksTest-philosophy|5": {
429
+ "acc": 0.7845659163987139,
430
+ "acc_stderr": 0.023350225475471442,
431
+ "acc_norm": 0.7845659163987139,
432
+ "acc_norm_stderr": 0.023350225475471442
433
+ },
434
+ "harness|hendrycksTest-prehistory|5": {
435
+ "acc": 0.8364197530864198,
436
+ "acc_stderr": 0.02058146613825712,
437
+ "acc_norm": 0.8364197530864198,
438
+ "acc_norm_stderr": 0.02058146613825712
439
+ },
440
+ "harness|hendrycksTest-professional_accounting|5": {
441
+ "acc": 0.5177304964539007,
442
+ "acc_stderr": 0.02980873964223777,
443
+ "acc_norm": 0.5177304964539007,
444
+ "acc_norm_stderr": 0.02980873964223777
445
+ },
446
+ "harness|hendrycksTest-professional_law|5": {
447
+ "acc": 0.5658409387222947,
448
+ "acc_stderr": 0.012659033237067253,
449
+ "acc_norm": 0.5658409387222947,
450
+ "acc_norm_stderr": 0.012659033237067253
451
+ },
452
+ "harness|hendrycksTest-professional_medicine|5": {
453
+ "acc": 0.7683823529411765,
454
+ "acc_stderr": 0.025626533803777562,
455
+ "acc_norm": 0.7683823529411765,
456
+ "acc_norm_stderr": 0.025626533803777562
457
+ },
458
+ "harness|hendrycksTest-professional_psychology|5": {
459
+ "acc": 0.7532679738562091,
460
+ "acc_stderr": 0.0174408203674025,
461
+ "acc_norm": 0.7532679738562091,
462
+ "acc_norm_stderr": 0.0174408203674025
463
+ },
464
+ "harness|hendrycksTest-public_relations|5": {
465
+ "acc": 0.7272727272727273,
466
+ "acc_stderr": 0.04265792110940588,
467
+ "acc_norm": 0.7272727272727273,
468
+ "acc_norm_stderr": 0.04265792110940588
469
+ },
470
+ "harness|hendrycksTest-security_studies|5": {
471
+ "acc": 0.8,
472
+ "acc_stderr": 0.02560737598657916,
473
+ "acc_norm": 0.8,
474
+ "acc_norm_stderr": 0.02560737598657916
475
+ },
476
+ "harness|hendrycksTest-sociology|5": {
477
+ "acc": 0.8955223880597015,
478
+ "acc_stderr": 0.021628920516700643,
479
+ "acc_norm": 0.8955223880597015,
480
+ "acc_norm_stderr": 0.021628920516700643
481
+ },
482
+ "harness|hendrycksTest-us_foreign_policy|5": {
483
+ "acc": 0.93,
484
+ "acc_stderr": 0.0256432399976243,
485
+ "acc_norm": 0.93,
486
+ "acc_norm_stderr": 0.0256432399976243
487
+ },
488
+ "harness|hendrycksTest-virology|5": {
489
+ "acc": 0.5240963855421686,
490
+ "acc_stderr": 0.03887971849597264,
491
+ "acc_norm": 0.5240963855421686,
492
+ "acc_norm_stderr": 0.03887971849597264
493
+ },
494
+ "harness|hendrycksTest-world_religions|5": {
495
+ "acc": 0.8654970760233918,
496
+ "acc_stderr": 0.026168221344662297,
497
+ "acc_norm": 0.8654970760233918,
498
+ "acc_norm_stderr": 0.026168221344662297
499
+ },
500
+ "harness|truthfulqa:mc|0": {
501
+ "mc1": 0.36964504283965727,
502
+ "mc1_stderr": 0.01689818070697388,
503
+ "mc2": 0.5360987678643523,
504
+ "mc2_stderr": 0.014938153988985473
505
+ },
506
+ "harness|winogrande|5": {
507
+ "acc": 0.8271507498026835,
508
+ "acc_stderr": 0.010626964529971855
509
+ },
510
+ "harness|gsm8k|5": {
511
+ "acc": 0.27369219105382864,
512
+ "acc_stderr": 0.012281003490963456
513
+ }
514
+ }
515
  ```
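
The block above is the raw per-task output from the Open LLM Leaderboard evaluation harness. As a minimal sketch (not part of the original card), the results file linked above could be downloaded and summarized with `huggingface_hub`; the repo id and filename are copied from that link, and the assumption that the per-task metrics sit under a top-level `"results"` key is illustrative.

```python
import json

from huggingface_hub import hf_hub_download

# Dataset repo and filename copied from the "latest results" link above.
REPO_ID = "open-llm-leaderboard/details_MaziyarPanahi__WizardLM-Math-70B-TIES-v0.1"
FILENAME = "results_2024-02-18T06-49-50.553009.json"

# Download (and cache) the results file from the Hugging Face Hub.
path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type="dataset")

with open(path) as f:
    data = json.load(f)

# Assumption: the per-task metrics are nested under a "results" key;
# fall back to the top level if the file is already flat like the block above.
results = data.get("results", data)

# Print the aggregate metrics first, then normalized accuracy per task.
print("aggregate:", results["all"])
for task, metrics in sorted(results.items()):
    if task == "all":
        continue
    acc = metrics.get("acc_norm", metrics.get("acc"))
    if acc is not None:
        print(f"{task}: {acc:.4f}")
```

This sketch only reads the published results file; it does not rerun any evaluation.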