mychen76 committed
Commit 0ff0dfb
1 Parent(s): 44dff50

Update README.md

Files changed (1)
  1. README.md +31 -385
README.md CHANGED
@@ -32,392 +32,38 @@ parameters:
32
  dtype: bfloat16
33
 
34
  ```
35
- ## Evaluation
36
- https://huggingface.co/datasets/open-llm-leaderboard/details_mychen76__mistral-7b-merged-ties
 
37
 
38
- Latest Result:
39
- https://huggingface.co/datasets/open-llm-leaderboard/details_mychen76__mistral-7b-merged-ties/blob/main/results_2024-03-10T11-05-18.535141.json
 
40
 
41
- ```
42
- {
43
- "all": {
44
- "acc": 0.6445924072176131,
45
- "acc_stderr": 0.03213293328697562,
46
- "acc_norm": 0.6450342620069291,
47
- "acc_norm_stderr": 0.032788565108750604,
48
- "mc1": 0.4455324357405141,
49
- "mc1_stderr": 0.017399335280140357,
50
- "mc2": 0.6131109579182783,
51
- "mc2_stderr": 0.015351738756398125
52
- },
53
- "harness|arc:challenge|25": {
54
- "acc": 0.6390784982935154,
55
- "acc_stderr": 0.014034761386175452,
56
- "acc_norm": 0.6791808873720137,
57
- "acc_norm_stderr": 0.013640943091946531
58
- },
59
- "harness|hellaswag|10": {
60
- "acc": 0.6722764389563832,
61
- "acc_stderr": 0.004684241685200317,
62
- "acc_norm": 0.85929097789285,
63
- "acc_norm_stderr": 0.00347010499020439
64
- },
65
- "harness|hendrycksTest-abstract_algebra|5": {
66
- "acc": 0.28,
67
- "acc_stderr": 0.04512608598542128,
68
- "acc_norm": 0.28,
69
- "acc_norm_stderr": 0.04512608598542128
70
- },
71
- "harness|hendrycksTest-anatomy|5": {
72
- "acc": 0.6074074074074074,
73
- "acc_stderr": 0.0421850621536888,
74
- "acc_norm": 0.6074074074074074,
75
- "acc_norm_stderr": 0.0421850621536888
76
- },
77
- "harness|hendrycksTest-astronomy|5": {
78
- "acc": 0.743421052631579,
79
- "acc_stderr": 0.0355418036802569,
80
- "acc_norm": 0.743421052631579,
81
- "acc_norm_stderr": 0.0355418036802569
82
- },
83
- "harness|hendrycksTest-business_ethics|5": {
84
- "acc": 0.61,
85
- "acc_stderr": 0.04902071300001975,
86
- "acc_norm": 0.61,
87
- "acc_norm_stderr": 0.04902071300001975
88
- },
89
- "harness|hendrycksTest-clinical_knowledge|5": {
90
- "acc": 0.6867924528301886,
91
- "acc_stderr": 0.028544793319055326,
92
- "acc_norm": 0.6867924528301886,
93
- "acc_norm_stderr": 0.028544793319055326
94
- },
95
- "harness|hendrycksTest-college_biology|5": {
96
- "acc": 0.7777777777777778,
97
- "acc_stderr": 0.03476590104304134,
98
- "acc_norm": 0.7777777777777778,
99
- "acc_norm_stderr": 0.03476590104304134
100
- },
101
- "harness|hendrycksTest-college_chemistry|5": {
102
- "acc": 0.48,
103
- "acc_stderr": 0.050211673156867795,
104
- "acc_norm": 0.48,
105
- "acc_norm_stderr": 0.050211673156867795
106
- },
107
- "harness|hendrycksTest-college_computer_science|5": {
108
- "acc": 0.48,
109
- "acc_stderr": 0.050211673156867795,
110
- "acc_norm": 0.48,
111
- "acc_norm_stderr": 0.050211673156867795
112
- },
113
- "harness|hendrycksTest-college_mathematics|5": {
114
- "acc": 0.32,
115
- "acc_stderr": 0.04688261722621504,
116
- "acc_norm": 0.32,
117
- "acc_norm_stderr": 0.04688261722621504
118
- },
119
- "harness|hendrycksTest-college_medicine|5": {
120
- "acc": 0.630057803468208,
121
- "acc_stderr": 0.036812296333943194,
122
- "acc_norm": 0.630057803468208,
123
- "acc_norm_stderr": 0.036812296333943194
124
- },
125
- "harness|hendrycksTest-college_physics|5": {
126
- "acc": 0.4117647058823529,
127
- "acc_stderr": 0.048971049527263666,
128
- "acc_norm": 0.4117647058823529,
129
- "acc_norm_stderr": 0.048971049527263666
130
- },
131
- "harness|hendrycksTest-computer_security|5": {
132
- "acc": 0.76,
133
- "acc_stderr": 0.042923469599092816,
134
- "acc_norm": 0.76,
135
- "acc_norm_stderr": 0.042923469599092816
136
- },
137
- "harness|hendrycksTest-conceptual_physics|5": {
138
- "acc": 0.574468085106383,
139
- "acc_stderr": 0.03232146916224468,
140
- "acc_norm": 0.574468085106383,
141
- "acc_norm_stderr": 0.03232146916224468
142
- },
143
- "harness|hendrycksTest-econometrics|5": {
144
- "acc": 0.5175438596491229,
145
- "acc_stderr": 0.04700708033551038,
146
- "acc_norm": 0.5175438596491229,
147
- "acc_norm_stderr": 0.04700708033551038
148
- },
149
- "harness|hendrycksTest-electrical_engineering|5": {
150
- "acc": 0.5448275862068965,
151
- "acc_stderr": 0.04149886942192117,
152
- "acc_norm": 0.5448275862068965,
153
- "acc_norm_stderr": 0.04149886942192117
154
- },
155
- "harness|hendrycksTest-elementary_mathematics|5": {
156
- "acc": 0.4126984126984127,
157
- "acc_stderr": 0.025355741263055263,
158
- "acc_norm": 0.4126984126984127,
159
- "acc_norm_stderr": 0.025355741263055263
160
- },
161
- "harness|hendrycksTest-formal_logic|5": {
162
- "acc": 0.4365079365079365,
163
- "acc_stderr": 0.04435932892851466,
164
- "acc_norm": 0.4365079365079365,
165
- "acc_norm_stderr": 0.04435932892851466
166
- },
167
- "harness|hendrycksTest-global_facts|5": {
168
- "acc": 0.35,
169
- "acc_stderr": 0.047937248544110196,
170
- "acc_norm": 0.35,
171
- "acc_norm_stderr": 0.047937248544110196
172
- },
173
- "harness|hendrycksTest-high_school_biology|5": {
174
- "acc": 0.7645161290322581,
175
- "acc_stderr": 0.02413763242933771,
176
- "acc_norm": 0.7645161290322581,
177
- "acc_norm_stderr": 0.02413763242933771
178
- },
179
- "harness|hendrycksTest-high_school_chemistry|5": {
180
- "acc": 0.49261083743842365,
181
- "acc_stderr": 0.035176035403610084,
182
- "acc_norm": 0.49261083743842365,
183
- "acc_norm_stderr": 0.035176035403610084
184
- },
185
- "harness|hendrycksTest-high_school_computer_science|5": {
186
- "acc": 0.67,
187
- "acc_stderr": 0.04725815626252607,
188
- "acc_norm": 0.67,
189
- "acc_norm_stderr": 0.04725815626252607
190
- },
191
- "harness|hendrycksTest-high_school_european_history|5": {
192
- "acc": 0.7757575757575758,
193
- "acc_stderr": 0.03256866661681102,
194
- "acc_norm": 0.7757575757575758,
195
- "acc_norm_stderr": 0.03256866661681102
196
- },
197
- "harness|hendrycksTest-high_school_geography|5": {
198
- "acc": 0.7828282828282829,
199
- "acc_stderr": 0.02937661648494563,
200
- "acc_norm": 0.7828282828282829,
201
- "acc_norm_stderr": 0.02937661648494563
202
- },
203
- "harness|hendrycksTest-high_school_government_and_politics|5": {
204
- "acc": 0.8963730569948186,
205
- "acc_stderr": 0.02199531196364424,
206
- "acc_norm": 0.8963730569948186,
207
- "acc_norm_stderr": 0.02199531196364424
208
- },
209
- "harness|hendrycksTest-high_school_macroeconomics|5": {
210
- "acc": 0.6410256410256411,
211
- "acc_stderr": 0.024321738484602354,
212
- "acc_norm": 0.6410256410256411,
213
- "acc_norm_stderr": 0.024321738484602354
214
- },
215
- "harness|hendrycksTest-high_school_mathematics|5": {
216
- "acc": 0.34814814814814815,
217
- "acc_stderr": 0.029045600290616255,
218
- "acc_norm": 0.34814814814814815,
219
- "acc_norm_stderr": 0.029045600290616255
220
- },
221
- "harness|hendrycksTest-high_school_microeconomics|5": {
222
- "acc": 0.6890756302521008,
223
- "acc_stderr": 0.03006676158297793,
224
- "acc_norm": 0.6890756302521008,
225
- "acc_norm_stderr": 0.03006676158297793
226
- },
227
- "harness|hendrycksTest-high_school_physics|5": {
228
- "acc": 0.2980132450331126,
229
- "acc_stderr": 0.037345356767871984,
230
- "acc_norm": 0.2980132450331126,
231
- "acc_norm_stderr": 0.037345356767871984
232
- },
233
- "harness|hendrycksTest-high_school_psychology|5": {
234
- "acc": 0.8495412844036697,
235
- "acc_stderr": 0.015328563932669237,
236
- "acc_norm": 0.8495412844036697,
237
- "acc_norm_stderr": 0.015328563932669237
238
- },
239
- "harness|hendrycksTest-high_school_statistics|5": {
240
- "acc": 0.5231481481481481,
241
- "acc_stderr": 0.03406315360711507,
242
- "acc_norm": 0.5231481481481481,
243
- "acc_norm_stderr": 0.03406315360711507
244
- },
245
- "harness|hendrycksTest-high_school_us_history|5": {
246
- "acc": 0.8186274509803921,
247
- "acc_stderr": 0.027044621719474086,
248
- "acc_norm": 0.8186274509803921,
249
- "acc_norm_stderr": 0.027044621719474086
250
- },
251
- "harness|hendrycksTest-high_school_world_history|5": {
252
- "acc": 0.8185654008438819,
253
- "acc_stderr": 0.025085961144579665,
254
- "acc_norm": 0.8185654008438819,
255
- "acc_norm_stderr": 0.025085961144579665
256
- },
257
- "harness|hendrycksTest-human_aging|5": {
258
- "acc": 0.6860986547085202,
259
- "acc_stderr": 0.031146796482972465,
260
- "acc_norm": 0.6860986547085202,
261
- "acc_norm_stderr": 0.031146796482972465
262
- },
263
- "harness|hendrycksTest-human_sexuality|5": {
264
- "acc": 0.7862595419847328,
265
- "acc_stderr": 0.0359546161177469,
266
- "acc_norm": 0.7862595419847328,
267
- "acc_norm_stderr": 0.0359546161177469
268
- },
269
- "harness|hendrycksTest-international_law|5": {
270
- "acc": 0.7851239669421488,
271
- "acc_stderr": 0.037494924487096966,
272
- "acc_norm": 0.7851239669421488,
273
- "acc_norm_stderr": 0.037494924487096966
274
- },
275
- "harness|hendrycksTest-jurisprudence|5": {
276
- "acc": 0.7962962962962963,
277
- "acc_stderr": 0.03893542518824847,
278
- "acc_norm": 0.7962962962962963,
279
- "acc_norm_stderr": 0.03893542518824847
280
- },
281
- "harness|hendrycksTest-logical_fallacies|5": {
282
- "acc": 0.7607361963190185,
283
- "acc_stderr": 0.033519538795212696,
284
- "acc_norm": 0.7607361963190185,
285
- "acc_norm_stderr": 0.033519538795212696
286
- },
287
- "harness|hendrycksTest-machine_learning|5": {
288
- "acc": 0.4642857142857143,
289
- "acc_stderr": 0.04733667890053756,
290
- "acc_norm": 0.4642857142857143,
291
- "acc_norm_stderr": 0.04733667890053756
292
- },
293
- "harness|hendrycksTest-management|5": {
294
- "acc": 0.7766990291262136,
295
- "acc_stderr": 0.04123553189891431,
296
- "acc_norm": 0.7766990291262136,
297
- "acc_norm_stderr": 0.04123553189891431
298
- },
299
- "harness|hendrycksTest-marketing|5": {
300
- "acc": 0.8547008547008547,
301
- "acc_stderr": 0.023086635086841407,
302
- "acc_norm": 0.8547008547008547,
303
- "acc_norm_stderr": 0.023086635086841407
304
- },
305
- "harness|hendrycksTest-medical_genetics|5": {
306
- "acc": 0.71,
307
- "acc_stderr": 0.045604802157206845,
308
- "acc_norm": 0.71,
309
- "acc_norm_stderr": 0.045604802157206845
310
- },
311
- "harness|hendrycksTest-miscellaneous|5": {
312
- "acc": 0.8301404853128991,
313
- "acc_stderr": 0.013428186370608304,
314
- "acc_norm": 0.8301404853128991,
315
- "acc_norm_stderr": 0.013428186370608304
316
- },
317
- "harness|hendrycksTest-moral_disputes|5": {
318
- "acc": 0.7369942196531792,
319
- "acc_stderr": 0.023703099525258172,
320
- "acc_norm": 0.7369942196531792,
321
- "acc_norm_stderr": 0.023703099525258172
322
- },
323
- "harness|hendrycksTest-moral_scenarios|5": {
324
- "acc": 0.3664804469273743,
325
- "acc_stderr": 0.016115235504865467,
326
- "acc_norm": 0.3664804469273743,
327
- "acc_norm_stderr": 0.016115235504865467
328
- },
329
- "harness|hendrycksTest-nutrition|5": {
330
- "acc": 0.7320261437908496,
331
- "acc_stderr": 0.025360603796242553,
332
- "acc_norm": 0.7320261437908496,
333
- "acc_norm_stderr": 0.025360603796242553
334
- },
335
- "harness|hendrycksTest-philosophy|5": {
336
- "acc": 0.7170418006430869,
337
- "acc_stderr": 0.02558306248998481,
338
- "acc_norm": 0.7170418006430869,
339
- "acc_norm_stderr": 0.02558306248998481
340
- },
341
- "harness|hendrycksTest-prehistory|5": {
342
- "acc": 0.7376543209876543,
343
- "acc_stderr": 0.024477222856135114,
344
- "acc_norm": 0.7376543209876543,
345
- "acc_norm_stderr": 0.024477222856135114
346
- },
347
- "harness|hendrycksTest-professional_accounting|5": {
348
- "acc": 0.5070921985815603,
349
- "acc_stderr": 0.02982449855912901,
350
- "acc_norm": 0.5070921985815603,
351
- "acc_norm_stderr": 0.02982449855912901
352
- },
353
- "harness|hendrycksTest-professional_law|5": {
354
- "acc": 0.4667535853976532,
355
- "acc_stderr": 0.01274197433389723,
356
- "acc_norm": 0.4667535853976532,
357
- "acc_norm_stderr": 0.01274197433389723
358
- },
359
- "harness|hendrycksTest-professional_medicine|5": {
360
- "acc": 0.6764705882352942,
361
- "acc_stderr": 0.02841820861940676,
362
- "acc_norm": 0.6764705882352942,
363
- "acc_norm_stderr": 0.02841820861940676
364
- },
365
- "harness|hendrycksTest-professional_psychology|5": {
366
- "acc": 0.6666666666666666,
367
- "acc_stderr": 0.019070985589687495,
368
- "acc_norm": 0.6666666666666666,
369
- "acc_norm_stderr": 0.019070985589687495
370
- },
371
- "harness|hendrycksTest-public_relations|5": {
372
- "acc": 0.6545454545454545,
373
- "acc_stderr": 0.04554619617541054,
374
- "acc_norm": 0.6545454545454545,
375
- "acc_norm_stderr": 0.04554619617541054
376
- },
377
- "harness|hendrycksTest-security_studies|5": {
378
- "acc": 0.7306122448979592,
379
- "acc_stderr": 0.02840125202902294,
380
- "acc_norm": 0.7306122448979592,
381
- "acc_norm_stderr": 0.02840125202902294
382
- },
383
- "harness|hendrycksTest-sociology|5": {
384
- "acc": 0.845771144278607,
385
- "acc_stderr": 0.025538433368578337,
386
- "acc_norm": 0.845771144278607,
387
- "acc_norm_stderr": 0.025538433368578337
388
- },
389
- "harness|hendrycksTest-us_foreign_policy|5": {
390
- "acc": 0.86,
391
- "acc_stderr": 0.0348735088019777,
392
- "acc_norm": 0.86,
393
- "acc_norm_stderr": 0.0348735088019777
394
- },
395
- "harness|hendrycksTest-virology|5": {
396
- "acc": 0.5481927710843374,
397
- "acc_stderr": 0.03874371556587953,
398
- "acc_norm": 0.5481927710843374,
399
- "acc_norm_stderr": 0.03874371556587953
400
- },
401
- "harness|hendrycksTest-world_religions|5": {
402
- "acc": 0.8304093567251462,
403
- "acc_stderr": 0.02878210810540171,
404
- "acc_norm": 0.8304093567251462,
405
- "acc_norm_stderr": 0.02878210810540171
406
- },
407
- "harness|truthfulqa:mc|0": {
408
- "mc1": 0.4455324357405141,
409
- "mc1_stderr": 0.017399335280140357,
410
- "mc2": 0.6131109579182783,
411
- "mc2_stderr": 0.015351738756398125
412
- },
413
- "harness|winogrande|5": {
414
- "acc": 0.8003157063930545,
415
- "acc_stderr": 0.011235328382625842
416
- },
417
- "harness|gsm8k|5": {
418
- "acc": 0.6899166034874905,
419
- "acc_stderr": 0.01274030571737627
420
- }
421
- }
422
 
423
  ```
35
+ ## 💻 Usage
36
+ ```python
37
+ !pip install -qU transformers bitsandbytes accelerate
38
 
39
+ from transformers import AutoTokenizer
40
+ import transformers
41
+ import torch
42
 
43
+ model = "mychen76/mistral-7b-merged-ties"
44
+
45
+ tokenizer = AutoTokenizer.from_pretrained(model)
46
+ pipeline = transformers.pipeline(
47
+ "text-generation",
48
+ model=model,
49
+ model_kwargs={"torch_dtype": torch.float16, "load_in_4bit": True},
50
+ )
51
 
52
+ messages = [{"role": "user", "content": "Why the sky is blue"}]
53
+ prompt = pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
54
+ outputs = pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
55
+ print(outputs[0]["generated_text"])
56
  ```
57
+
58
+ # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
59
+ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_mychen76__mistral-7b-merged-ties)
60
+
61
+ | Metric |Value|
62
+ |---------------------------------|----:|
63
+ |Avg. |71.37|
64
+ |AI2 Reasoning Challenge (25-Shot)|67.92|
65
+ |HellaSwag (10-Shot) |85.93|
66
+ |MMLU (5-Shot) |64.07|
67
+ |TruthfulQA (0-shot) |61.31|
68
+ |Winogrande (5-shot) |80.03|
69
+ |GSM8k (5-shot) |68.54|
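The usage snippet above loads the model through `transformers.pipeline` with `load_in_4bit` passed via `model_kwargs`. A minimal alternative sketch, assuming `bitsandbytes` and `accelerate` are installed and a CUDA GPU is available, loads the same checkpoint with an explicit `BitsAndBytesConfig` and generates with the same sampling settings:

```python
# Sketch: assumes transformers, accelerate, and bitsandbytes are installed
# and a CUDA-capable GPU is available.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "mychen76/mistral-7b-merged-ties"

# Explicit 4-bit quantization config instead of passing load_in_4bit via model_kwargs.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# Same chat-template prompting and sampling settings as the README snippet.
messages = [{"role": "user", "content": "Why is the sky blue?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Declaring the quantization settings in a `BitsAndBytesConfig` keeps them explicit and easy to adjust, for example switching the compute dtype to bfloat16 to match the merge's `dtype: bfloat16`.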