mychen76 committed
Commit 865c225
1 Parent(s): c640d09

update result

Files changed (1)
  1. README.md +35 -384
README.md CHANGED
@@ -44,390 +44,41 @@ parameters:
  dtype: bfloat16

  ```
- ## Evaluation
- https://huggingface.co/datasets/open-llm-leaderboard/details_mychen76__mistral-7b-merged-dare_6x7
-
- ## Result
- ```
- {
-     "all": {
-         "acc": 0.6563139414530638,
-         "acc_stderr": 0.031967569574421976,
-         "acc_norm": 0.6562534942043537,
-         "acc_norm_stderr": 0.03262528877791407,
-         "mc1": 0.5067319461444308,
-         "mc1_stderr": 0.017501914492655396,
-         "mc2": 0.6698288226697681,
-         "mc2_stderr": 0.015121056875692264
-     },
-     "harness|arc:challenge|25": {
-         "acc": 0.6689419795221843,
-         "acc_stderr": 0.013752062419817837,
-         "acc_norm": 0.6962457337883959,
-         "acc_norm_stderr": 0.013438909184778768
-     },
-     "harness|hellaswag|10": {
-         "acc": 0.6946823341963753,
-         "acc_stderr": 0.004596006250433551,
-         "acc_norm": 0.870444134634535,
-         "acc_norm_stderr": 0.003351278403392407
-     },
-     "harness|hendrycksTest-abstract_algebra|5": {
-         "acc": 0.32,
-         "acc_stderr": 0.046882617226215034,
-         "acc_norm": 0.32,
-         "acc_norm_stderr": 0.046882617226215034
-     },
-     "harness|hendrycksTest-anatomy|5": {
-         "acc": 0.6370370370370371,
-         "acc_stderr": 0.041539484047423976,
-         "acc_norm": 0.6370370370370371,
-         "acc_norm_stderr": 0.041539484047423976
-     },
-     "harness|hendrycksTest-astronomy|5": {
-         "acc": 0.6907894736842105,
-         "acc_stderr": 0.037610708698674805,
-         "acc_norm": 0.6907894736842105,
-         "acc_norm_stderr": 0.037610708698674805
-     },
-     "harness|hendrycksTest-business_ethics|5": {
-         "acc": 0.64,
-         "acc_stderr": 0.04824181513244218,
-         "acc_norm": 0.64,
-         "acc_norm_stderr": 0.04824181513244218
-     },
-     "harness|hendrycksTest-clinical_knowledge|5": {
-         "acc": 0.7018867924528301,
-         "acc_stderr": 0.028152837942493864,
-         "acc_norm": 0.7018867924528301,
-         "acc_norm_stderr": 0.028152837942493864
-     },
-     "harness|hendrycksTest-college_biology|5": {
-         "acc": 0.7638888888888888,
-         "acc_stderr": 0.03551446610810826,
-         "acc_norm": 0.7638888888888888,
-         "acc_norm_stderr": 0.03551446610810826
-     },
-     "harness|hendrycksTest-college_chemistry|5": {
-         "acc": 0.47,
-         "acc_stderr": 0.050161355804659205,
-         "acc_norm": 0.47,
-         "acc_norm_stderr": 0.050161355804659205
-     },
-     "harness|hendrycksTest-college_computer_science|5": {
-         "acc": 0.54,
-         "acc_stderr": 0.05009082659620333,
-         "acc_norm": 0.54,
-         "acc_norm_stderr": 0.05009082659620333
-     },
-     "harness|hendrycksTest-college_mathematics|5": {
-         "acc": 0.32,
-         "acc_stderr": 0.04688261722621504,
-         "acc_norm": 0.32,
-         "acc_norm_stderr": 0.04688261722621504
-     },
-     "harness|hendrycksTest-college_medicine|5": {
-         "acc": 0.6589595375722543,
-         "acc_stderr": 0.036146654241808254,
-         "acc_norm": 0.6589595375722543,
-         "acc_norm_stderr": 0.036146654241808254
-     },
-     "harness|hendrycksTest-college_physics|5": {
-         "acc": 0.4019607843137255,
-         "acc_stderr": 0.048786087144669955,
-         "acc_norm": 0.4019607843137255,
-         "acc_norm_stderr": 0.048786087144669955
-     },
-     "harness|hendrycksTest-computer_security|5": {
-         "acc": 0.77,
-         "acc_stderr": 0.04229525846816507,
-         "acc_norm": 0.77,
-         "acc_norm_stderr": 0.04229525846816507
-     },
-     "harness|hendrycksTest-conceptual_physics|5": {
-         "acc": 0.5829787234042553,
-         "acc_stderr": 0.03223276266711712,
-         "acc_norm": 0.5829787234042553,
-         "acc_norm_stderr": 0.03223276266711712
-     },
-     "harness|hendrycksTest-econometrics|5": {
-         "acc": 0.45614035087719296,
-         "acc_stderr": 0.046854730419077895,
-         "acc_norm": 0.45614035087719296,
-         "acc_norm_stderr": 0.046854730419077895
-     },
-     "harness|hendrycksTest-electrical_engineering|5": {
-         "acc": 0.5724137931034483,
-         "acc_stderr": 0.04122737111370333,
-         "acc_norm": 0.5724137931034483,
-         "acc_norm_stderr": 0.04122737111370333
-     },
-     "harness|hendrycksTest-elementary_mathematics|5": {
-         "acc": 0.42063492063492064,
-         "acc_stderr": 0.025424835086924,
-         "acc_norm": 0.42063492063492064,
-         "acc_norm_stderr": 0.025424835086924
-     },
-     "harness|hendrycksTest-formal_logic|5": {
-         "acc": 0.49206349206349204,
-         "acc_stderr": 0.044715725362943486,
-         "acc_norm": 0.49206349206349204,
-         "acc_norm_stderr": 0.044715725362943486
-     },
-     "harness|hendrycksTest-global_facts|5": {
-         "acc": 0.39,
-         "acc_stderr": 0.04902071300001974,
-         "acc_norm": 0.39,
-         "acc_norm_stderr": 0.04902071300001974
-     },
-     "harness|hendrycksTest-high_school_biology|5": {
-         "acc": 0.7838709677419354,
-         "acc_stderr": 0.02341529343356853,
-         "acc_norm": 0.7838709677419354,
-         "acc_norm_stderr": 0.02341529343356853
-     },
-     "harness|hendrycksTest-high_school_chemistry|5": {
-         "acc": 0.5024630541871922,
-         "acc_stderr": 0.03517945038691063,
-         "acc_norm": 0.5024630541871922,
-         "acc_norm_stderr": 0.03517945038691063
-     },
-     "harness|hendrycksTest-high_school_computer_science|5": {
-         "acc": 0.72,
-         "acc_stderr": 0.04512608598542127,
-         "acc_norm": 0.72,
-         "acc_norm_stderr": 0.04512608598542127
-     },
-     "harness|hendrycksTest-high_school_european_history|5": {
-         "acc": 0.7696969696969697,
-         "acc_stderr": 0.0328766675860349,
-         "acc_norm": 0.7696969696969697,
-         "acc_norm_stderr": 0.0328766675860349
-     },
-     "harness|hendrycksTest-high_school_geography|5": {
-         "acc": 0.7929292929292929,
-         "acc_stderr": 0.028869778460267042,
-         "acc_norm": 0.7929292929292929,
-         "acc_norm_stderr": 0.028869778460267042
-     },
-     "harness|hendrycksTest-high_school_government_and_politics|5": {
-         "acc": 0.8963730569948186,
-         "acc_stderr": 0.02199531196364424,
-         "acc_norm": 0.8963730569948186,
-         "acc_norm_stderr": 0.02199531196364424
-     },
-     "harness|hendrycksTest-high_school_macroeconomics|5": {
-         "acc": 0.6641025641025641,
-         "acc_stderr": 0.023946724741563976,
-         "acc_norm": 0.6641025641025641,
-         "acc_norm_stderr": 0.023946724741563976
-     },
-     "harness|hendrycksTest-high_school_mathematics|5": {
-         "acc": 0.35185185185185186,
-         "acc_stderr": 0.02911661760608301,
-         "acc_norm": 0.35185185185185186,
-         "acc_norm_stderr": 0.02911661760608301
-     },
-     "harness|hendrycksTest-high_school_microeconomics|5": {
-         "acc": 0.7016806722689075,
-         "acc_stderr": 0.02971914287634286,
-         "acc_norm": 0.7016806722689075,
-         "acc_norm_stderr": 0.02971914287634286
-     },
-     "harness|hendrycksTest-high_school_physics|5": {
-         "acc": 0.37748344370860926,
-         "acc_stderr": 0.03958027231121569,
-         "acc_norm": 0.37748344370860926,
-         "acc_norm_stderr": 0.03958027231121569
-     },
-     "harness|hendrycksTest-high_school_psychology|5": {
-         "acc": 0.8587155963302753,
-         "acc_stderr": 0.014933868987028075,
-         "acc_norm": 0.8587155963302753,
-         "acc_norm_stderr": 0.014933868987028075
-     },
-     "harness|hendrycksTest-high_school_statistics|5": {
-         "acc": 0.5324074074074074,
-         "acc_stderr": 0.03402801581358966,
-         "acc_norm": 0.5324074074074074,
-         "acc_norm_stderr": 0.03402801581358966
-     },
-     "harness|hendrycksTest-high_school_us_history|5": {
-         "acc": 0.8480392156862745,
-         "acc_stderr": 0.025195658428931792,
-         "acc_norm": 0.8480392156862745,
-         "acc_norm_stderr": 0.025195658428931792
-     },
-     "harness|hendrycksTest-high_school_world_history|5": {
-         "acc": 0.8016877637130801,
-         "acc_stderr": 0.02595502084162113,
-         "acc_norm": 0.8016877637130801,
-         "acc_norm_stderr": 0.02595502084162113
-     },
-     "harness|hendrycksTest-human_aging|5": {
-         "acc": 0.6905829596412556,
-         "acc_stderr": 0.03102441174057221,
-         "acc_norm": 0.6905829596412556,
-         "acc_norm_stderr": 0.03102441174057221
-     },
-     "harness|hendrycksTest-human_sexuality|5": {
-         "acc": 0.8091603053435115,
-         "acc_stderr": 0.03446513350752598,
-         "acc_norm": 0.8091603053435115,
-         "acc_norm_stderr": 0.03446513350752598
-     },
-     "harness|hendrycksTest-international_law|5": {
-         "acc": 0.7933884297520661,
-         "acc_stderr": 0.03695980128098824,
-         "acc_norm": 0.7933884297520661,
-         "acc_norm_stderr": 0.03695980128098824
-     },
-     "harness|hendrycksTest-jurisprudence|5": {
-         "acc": 0.8055555555555556,
-         "acc_stderr": 0.038260763248848646,
-         "acc_norm": 0.8055555555555556,
-         "acc_norm_stderr": 0.038260763248848646
-     },
-     "harness|hendrycksTest-logical_fallacies|5": {
-         "acc": 0.7423312883435583,
-         "acc_stderr": 0.03436150827846917,
-         "acc_norm": 0.7423312883435583,
-         "acc_norm_stderr": 0.03436150827846917
-     },
-     "harness|hendrycksTest-machine_learning|5": {
-         "acc": 0.45535714285714285,
-         "acc_stderr": 0.047268355537191,
-         "acc_norm": 0.45535714285714285,
-         "acc_norm_stderr": 0.047268355537191
-     },
-     "harness|hendrycksTest-management|5": {
-         "acc": 0.8058252427184466,
-         "acc_stderr": 0.039166677628225836,
-         "acc_norm": 0.8058252427184466,
-         "acc_norm_stderr": 0.039166677628225836
-     },
-     "harness|hendrycksTest-marketing|5": {
-         "acc": 0.8717948717948718,
-         "acc_stderr": 0.021901905115073325,
-         "acc_norm": 0.8717948717948718,
-         "acc_norm_stderr": 0.021901905115073325
-     },
-     "harness|hendrycksTest-medical_genetics|5": {
-         "acc": 0.76,
-         "acc_stderr": 0.042923469599092816,
-         "acc_norm": 0.76,
-         "acc_norm_stderr": 0.042923469599092816
-     },
-     "harness|hendrycksTest-miscellaneous|5": {
-         "acc": 0.8326947637292464,
-         "acc_stderr": 0.013347327202920332,
-         "acc_norm": 0.8326947637292464,
-         "acc_norm_stderr": 0.013347327202920332
-     },
-     "harness|hendrycksTest-moral_disputes|5": {
-         "acc": 0.7283236994219653,
-         "acc_stderr": 0.023948512905468365,
-         "acc_norm": 0.7283236994219653,
-         "acc_norm_stderr": 0.023948512905468365
-     },
-     "harness|hendrycksTest-moral_scenarios|5": {
-         "acc": 0.4770949720670391,
-         "acc_stderr": 0.016704945740326188,
-         "acc_norm": 0.4770949720670391,
-         "acc_norm_stderr": 0.016704945740326188
-     },
-     "harness|hendrycksTest-nutrition|5": {
-         "acc": 0.7450980392156863,
-         "acc_stderr": 0.02495418432487991,
-         "acc_norm": 0.7450980392156863,
-         "acc_norm_stderr": 0.02495418432487991
-     },
-     "harness|hendrycksTest-philosophy|5": {
-         "acc": 0.7106109324758842,
-         "acc_stderr": 0.025755865922632952,
-         "acc_norm": 0.7106109324758842,
-         "acc_norm_stderr": 0.025755865922632952
-     },
-     "harness|hendrycksTest-prehistory|5": {
-         "acc": 0.75,
-         "acc_stderr": 0.02409347123262133,
-         "acc_norm": 0.75,
-         "acc_norm_stderr": 0.02409347123262133
-     },
-     "harness|hendrycksTest-professional_accounting|5": {
-         "acc": 0.475177304964539,
-         "acc_stderr": 0.02979071924382972,
-         "acc_norm": 0.475177304964539,
-         "acc_norm_stderr": 0.02979071924382972
-     },
-     "harness|hendrycksTest-professional_law|5": {
-         "acc": 0.46284224250325945,
-         "acc_stderr": 0.012734923579532069,
-         "acc_norm": 0.46284224250325945,
-         "acc_norm_stderr": 0.012734923579532069
-     },
-     "harness|hendrycksTest-professional_medicine|5": {
-         "acc": 0.6985294117647058,
-         "acc_stderr": 0.027875982114273168,
-         "acc_norm": 0.6985294117647058,
-         "acc_norm_stderr": 0.027875982114273168
-     },
-     "harness|hendrycksTest-professional_psychology|5": {
-         "acc": 0.6666666666666666,
-         "acc_stderr": 0.0190709855896875,
-         "acc_norm": 0.6666666666666666,
-         "acc_norm_stderr": 0.0190709855896875
-     },
-     "harness|hendrycksTest-public_relations|5": {
-         "acc": 0.6545454545454545,
-         "acc_stderr": 0.04554619617541054,
-         "acc_norm": 0.6545454545454545,
-         "acc_norm_stderr": 0.04554619617541054
-     },
-     "harness|hendrycksTest-security_studies|5": {
-         "acc": 0.726530612244898,
-         "acc_stderr": 0.02853556033712844,
-         "acc_norm": 0.726530612244898,
-         "acc_norm_stderr": 0.02853556033712844
-     },
-     "harness|hendrycksTest-sociology|5": {
-         "acc": 0.845771144278607,
-         "acc_stderr": 0.025538433368578337,
-         "acc_norm": 0.845771144278607,
-         "acc_norm_stderr": 0.025538433368578337
-     },
-     "harness|hendrycksTest-us_foreign_policy|5": {
-         "acc": 0.86,
-         "acc_stderr": 0.0348735088019777,
-         "acc_norm": 0.86,
-         "acc_norm_stderr": 0.0348735088019777
-     },
-     "harness|hendrycksTest-virology|5": {
-         "acc": 0.5180722891566265,
-         "acc_stderr": 0.03889951252827216,
-         "acc_norm": 0.5180722891566265,
-         "acc_norm_stderr": 0.03889951252827216
-     },
-     "harness|hendrycksTest-world_religions|5": {
-         "acc": 0.8362573099415205,
-         "acc_stderr": 0.028380919596145866,
-         "acc_norm": 0.8362573099415205,
-         "acc_norm_stderr": 0.028380919596145866
-     },
-     "harness|truthfulqa:mc|0": {
-         "mc1": 0.5067319461444308,
-         "mc1_stderr": 0.017501914492655396,
-         "mc2": 0.6698288226697681,
-         "mc2_stderr": 0.015121056875692264
-     },
-     "harness|winogrande|5": {
-         "acc": 0.8058405682715075,
-         "acc_stderr": 0.01111698339239267
-     },
-     "harness|gsm8k|5": {
-         "acc": 0.7134192570128886,
-         "acc_stderr": 0.0124548416683377
-     }
- }
  ```
+ ## 💻 Usage
+ ```python
+ !pip install -qU transformers bitsandbytes accelerate
+
+ from transformers import AutoTokenizer
+ import transformers
+ import torch
+
+ model = "mistral-7b-merged-dare_6x7"
+
+ tokenizer = AutoTokenizer.from_pretrained(model)
+ pipeline = transformers.pipeline(
+     "text-generation",
+     model=model,
+     model_kwargs={"torch_dtype": torch.float16, "load_in_4bit": True},
+ )
+
+ messages = [{"role": "user", "content": "Why the sky is blue"}]
+ prompt = pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ outputs = pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
+ print(outputs[0]["generated_text"])
  ```
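For reference, the `load_in_4bit` flag passed through `model_kwargs` above can also be expressed as an explicit quantization config. A minimal sketch, assuming the model's full Hub repo id is `mychen76/mistral-7b-merged-dare_6x7` (the snippet above uses a short local name):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Assumed full repo id; adjust if loading from a local directory instead.
model_id = "mychen76/mistral-7b-merged-dare_6x7"

# 4-bit quantization with fp16 compute, same effect as load_in_4bit=True above.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",  # requires accelerate
)
```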
+
+ # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_mychen76__mistral-7b-merged-dare_6x7).
+
+ | Metric                          |Value|
+ |---------------------------------|----:|
+ |Avg.                             |73.46|
+ |AI2 Reasoning Challenge (25-Shot)|69.62|
+ |HellaSwag (10-Shot)              |87.04|
+ |MMLU (5-Shot)                    |65.18|
+ |TruthfulQA (0-shot)              |66.98|
+ |Winogrande (5-shot)              |80.58|
+ |GSM8k (5-shot)                   |71.34|
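The Avg. row is consistent with the arithmetic mean of the six benchmark scores: (69.62 + 87.04 + 65.18 + 66.98 + 80.58 + 71.34) / 6 ≈ 73.46.

The per-task numbers removed above live in the linked details dataset. A minimal sketch of pulling one task's records with `datasets`; the config name `harness_gsm8k_5` and the `latest` split follow the usual layout of these leaderboard detail repos and are assumptions here:

```python
from datasets import load_dataset

# Config and split names below are assumptions based on the typical layout
# of open-llm-leaderboard "details_*" repositories.
details = load_dataset(
    "open-llm-leaderboard/details_mychen76__mistral-7b-merged-dare_6x7",
    "harness_gsm8k_5",
    split="latest",
)
print(details[0])
```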