lex-hue committed
Commit 883ed9e
1 Parent(s): 95bef32

Update README.md

Files changed (1)
  1. README.md +1 -382
README.md CHANGED
@@ -19,385 +19,4 @@ As this was a test run, we just tested it and here's the data; the model hasn't I
  | vicuna-13b-v1.3 | 6.812500 | 5.962500 | 6.387500 |

  Open-LLM Leaderboard Results:
- '''
- {
-     "all": {
-         "acc": 0.647154984215818,
-         "acc_stderr": 0.03221441224437104,
-         "acc_norm": 0.6487599114885558,
-         "acc_norm_stderr": 0.032860268812293904,
-         "mc1": 0.4283965728274174,
-         "mc1_stderr": 0.017323088597314757,
-         "mc2": 0.5998074537794252,
-         "mc2_stderr": 0.015494960379071198
-     },
-     "harness|arc:challenge|25": {
-         "acc": 0.64419795221843,
-         "acc_stderr": 0.01399057113791876,
-         "acc_norm": 0.6646757679180887,
-         "acc_norm_stderr": 0.013796182947785562
-     },
-     "harness|hellaswag|10": {
-         "acc": 0.6782513443537144,
-         "acc_stderr": 0.004661924314756093,
-         "acc_norm": 0.8590918143796057,
-         "acc_norm_stderr": 0.003472157511639361
-     },
-     "harness|hendrycksTest-abstract_algebra|5": {
-         "acc": 0.29,
-         "acc_stderr": 0.045604802157206845,
-         "acc_norm": 0.29,
-         "acc_norm_stderr": 0.045604802157206845
-     },
-     "harness|hendrycksTest-anatomy|5": {
-         "acc": 0.5925925925925926,
-         "acc_stderr": 0.04244633238353227,
-         "acc_norm": 0.5925925925925926,
-         "acc_norm_stderr": 0.04244633238353227
-     },
-     "harness|hendrycksTest-astronomy|5": {
-         "acc": 0.6973684210526315,
-         "acc_stderr": 0.03738520676119667,
-         "acc_norm": 0.6973684210526315,
-         "acc_norm_stderr": 0.03738520676119667
-     },
-     "harness|hendrycksTest-business_ethics|5": {
-         "acc": 0.59,
-         "acc_stderr": 0.04943110704237102,
-         "acc_norm": 0.59,
-         "acc_norm_stderr": 0.04943110704237102
-     },
-     "harness|hendrycksTest-clinical_knowledge|5": {
-         "acc": 0.7132075471698113,
-         "acc_stderr": 0.027834912527544057,
-         "acc_norm": 0.7132075471698113,
-         "acc_norm_stderr": 0.027834912527544057
-     },
-     "harness|hendrycksTest-college_biology|5": {
-         "acc": 0.7847222222222222,
-         "acc_stderr": 0.03437079344106135,
-         "acc_norm": 0.7847222222222222,
-         "acc_norm_stderr": 0.03437079344106135
-     },
-     "harness|hendrycksTest-college_chemistry|5": {
-         "acc": 0.49,
-         "acc_stderr": 0.05024183937956912,
-         "acc_norm": 0.49,
-         "acc_norm_stderr": 0.05024183937956912
-     },
-     "harness|hendrycksTest-college_computer_science|5": {
-         "acc": 0.54,
-         "acc_stderr": 0.05009082659620332,
-         "acc_norm": 0.54,
-         "acc_norm_stderr": 0.05009082659620332
-     },
-     "harness|hendrycksTest-college_mathematics|5": {
-         "acc": 0.34,
-         "acc_stderr": 0.04760952285695235,
-         "acc_norm": 0.34,
-         "acc_norm_stderr": 0.04760952285695235
-     },
-     "harness|hendrycksTest-college_medicine|5": {
-         "acc": 0.653179190751445,
-         "acc_stderr": 0.036291466701596636,
-         "acc_norm": 0.653179190751445,
-         "acc_norm_stderr": 0.036291466701596636
-     },
-     "harness|hendrycksTest-college_physics|5": {
-         "acc": 0.4019607843137255,
-         "acc_stderr": 0.04878608714466996,
-         "acc_norm": 0.4019607843137255,
-         "acc_norm_stderr": 0.04878608714466996
-     },
-     "harness|hendrycksTest-computer_security|5": {
-         "acc": 0.76,
-         "acc_stderr": 0.042923469599092816,
-         "acc_norm": 0.76,
-         "acc_norm_stderr": 0.042923469599092816
-     },
-     "harness|hendrycksTest-conceptual_physics|5": {
-         "acc": 0.5914893617021276,
-         "acc_stderr": 0.032134180267015755,
-         "acc_norm": 0.5914893617021276,
-         "acc_norm_stderr": 0.032134180267015755
-     },
-     "harness|hendrycksTest-econometrics|5": {
-         "acc": 0.45614035087719296,
-         "acc_stderr": 0.04685473041907789,
-         "acc_norm": 0.45614035087719296,
-         "acc_norm_stderr": 0.04685473041907789
-     },
-     "harness|hendrycksTest-electrical_engineering|5": {
-         "acc": 0.5379310344827586,
-         "acc_stderr": 0.041546596717075474,
-         "acc_norm": 0.5379310344827586,
-         "acc_norm_stderr": 0.041546596717075474
-     },
-     "harness|hendrycksTest-elementary_mathematics|5": {
-         "acc": 0.4126984126984127,
-         "acc_stderr": 0.02535574126305527,
-         "acc_norm": 0.4126984126984127,
-         "acc_norm_stderr": 0.02535574126305527
-     },
-     "harness|hendrycksTest-formal_logic|5": {
-         "acc": 0.4603174603174603,
-         "acc_stderr": 0.04458029125470973,
-         "acc_norm": 0.4603174603174603,
-         "acc_norm_stderr": 0.04458029125470973
-     },
-     "harness|hendrycksTest-global_facts|5": {
-         "acc": 0.39,
-         "acc_stderr": 0.04902071300001975,
-         "acc_norm": 0.39,
-         "acc_norm_stderr": 0.04902071300001975
-     },
-     "harness|hendrycksTest-high_school_biology|5": {
-         "acc": 0.7967741935483871,
-         "acc_stderr": 0.02289168798455496,
-         "acc_norm": 0.7967741935483871,
-         "acc_norm_stderr": 0.02289168798455496
-     },
-     "harness|hendrycksTest-high_school_chemistry|5": {
-         "acc": 0.5073891625615764,
-         "acc_stderr": 0.035176035403610105,
-         "acc_norm": 0.5073891625615764,
-         "acc_norm_stderr": 0.035176035403610105
-     },
-     "harness|hendrycksTest-high_school_computer_science|5": {
-         "acc": 0.71,
-         "acc_stderr": 0.045604802157206845,
-         "acc_norm": 0.71,
-         "acc_norm_stderr": 0.045604802157206845
-     },
-     "harness|hendrycksTest-high_school_european_history|5": {
-         "acc": 0.7878787878787878,
-         "acc_stderr": 0.031922715695483,
-         "acc_norm": 0.7878787878787878,
-         "acc_norm_stderr": 0.031922715695483
-     },
-     "harness|hendrycksTest-high_school_geography|5": {
-         "acc": 0.7878787878787878,
-         "acc_stderr": 0.029126522834586815,
-         "acc_norm": 0.7878787878787878,
-         "acc_norm_stderr": 0.029126522834586815
-     },
-     "harness|hendrycksTest-high_school_government_and_politics|5": {
-         "acc": 0.8963730569948186,
-         "acc_stderr": 0.021995311963644237,
-         "acc_norm": 0.8963730569948186,
-         "acc_norm_stderr": 0.021995311963644237
-     },
-     "harness|hendrycksTest-high_school_macroeconomics|5": {
-         "acc": 0.6846153846153846,
-         "acc_stderr": 0.02355964698318994,
-         "acc_norm": 0.6846153846153846,
-         "acc_norm_stderr": 0.02355964698318994
-     },
-     "harness|hendrycksTest-high_school_mathematics|5": {
-         "acc": 0.35185185185185186,
-         "acc_stderr": 0.02911661760608301,
-         "acc_norm": 0.35185185185185186,
-         "acc_norm_stderr": 0.02911661760608301
-     },
-     "harness|hendrycksTest-high_school_microeconomics|5": {
-         "acc": 0.7100840336134454,
-         "acc_stderr": 0.029472485833136094,
-         "acc_norm": 0.7100840336134454,
-         "acc_norm_stderr": 0.029472485833136094
-     },
-     "harness|hendrycksTest-high_school_physics|5": {
-         "acc": 0.3708609271523179,
-         "acc_stderr": 0.03943966699183629,
-         "acc_norm": 0.3708609271523179,
-         "acc_norm_stderr": 0.03943966699183629
-     },
-     "harness|hendrycksTest-high_school_psychology|5": {
-         "acc": 0.8293577981651377,
-         "acc_stderr": 0.016129271025099857,
-         "acc_norm": 0.8293577981651377,
-         "acc_norm_stderr": 0.016129271025099857
-     },
-     "harness|hendrycksTest-high_school_statistics|5": {
-         "acc": 0.5462962962962963,
-         "acc_stderr": 0.033953227263757976,
-         "acc_norm": 0.5462962962962963,
-         "acc_norm_stderr": 0.033953227263757976
-     },
-     "harness|hendrycksTest-high_school_us_history|5": {
-         "acc": 0.8333333333333334,
-         "acc_stderr": 0.026156867523931045,
-         "acc_norm": 0.8333333333333334,
-         "acc_norm_stderr": 0.026156867523931045
-     },
-     "harness|hendrycksTest-high_school_world_history|5": {
-         "acc": 0.8227848101265823,
-         "acc_stderr": 0.024856364184503224,
-         "acc_norm": 0.8227848101265823,
-         "acc_norm_stderr": 0.024856364184503224
-     },
-     "harness|hendrycksTest-human_aging|5": {
-         "acc": 0.7130044843049327,
-         "acc_stderr": 0.03036037971029195,
-         "acc_norm": 0.7130044843049327,
-         "acc_norm_stderr": 0.03036037971029195
-     },
-     "harness|hendrycksTest-human_sexuality|5": {
-         "acc": 0.7709923664122137,
-         "acc_stderr": 0.036853466317118506,
-         "acc_norm": 0.7709923664122137,
-         "acc_norm_stderr": 0.036853466317118506
-     },
-     "harness|hendrycksTest-international_law|5": {
-         "acc": 0.7520661157024794,
-         "acc_stderr": 0.03941897526516302,
-         "acc_norm": 0.7520661157024794,
-         "acc_norm_stderr": 0.03941897526516302
-     },
-     "harness|hendrycksTest-jurisprudence|5": {
-         "acc": 0.7592592592592593,
-         "acc_stderr": 0.04133119440243839,
-         "acc_norm": 0.7592592592592593,
-         "acc_norm_stderr": 0.04133119440243839
-     },
-     "harness|hendrycksTest-logical_fallacies|5": {
-         "acc": 0.754601226993865,
-         "acc_stderr": 0.03380939813943354,
-         "acc_norm": 0.754601226993865,
-         "acc_norm_stderr": 0.03380939813943354
-     },
-     "harness|hendrycksTest-machine_learning|5": {
-         "acc": 0.5089285714285714,
-         "acc_stderr": 0.04745033255489123,
-         "acc_norm": 0.5089285714285714,
-         "acc_norm_stderr": 0.04745033255489123
-     },
-     "harness|hendrycksTest-management|5": {
-         "acc": 0.7378640776699029,
-         "acc_stderr": 0.043546310772605956,
-         "acc_norm": 0.7378640776699029,
-         "acc_norm_stderr": 0.043546310772605956
-     },
-     "harness|hendrycksTest-marketing|5": {
-         "acc": 0.8547008547008547,
-         "acc_stderr": 0.023086635086841407,
-         "acc_norm": 0.8547008547008547,
-         "acc_norm_stderr": 0.023086635086841407
-     },
-     "harness|hendrycksTest-medical_genetics|5": {
-         "acc": 0.72,
-         "acc_stderr": 0.045126085985421276,
-         "acc_norm": 0.72,
-         "acc_norm_stderr": 0.045126085985421276
-     },
-     "harness|hendrycksTest-miscellaneous|5": {
-         "acc": 0.8186462324393359,
-         "acc_stderr": 0.01377869377846408,
-         "acc_norm": 0.8186462324393359,
-         "acc_norm_stderr": 0.01377869377846408
-     },
-     "harness|hendrycksTest-moral_disputes|5": {
-         "acc": 0.7341040462427746,
-         "acc_stderr": 0.023786203255508283,
-         "acc_norm": 0.7341040462427746,
-         "acc_norm_stderr": 0.023786203255508283
-     },
-     "harness|hendrycksTest-moral_scenarios|5": {
-         "acc": 0.329608938547486,
-         "acc_stderr": 0.01572153107518388,
-         "acc_norm": 0.329608938547486,
-         "acc_norm_stderr": 0.01572153107518388
-     },
-     "harness|hendrycksTest-nutrition|5": {
-         "acc": 0.7450980392156863,
-         "acc_stderr": 0.02495418432487991,
-         "acc_norm": 0.7450980392156863,
-         "acc_norm_stderr": 0.02495418432487991
-     },
-     "harness|hendrycksTest-philosophy|5": {
-         "acc": 0.6913183279742765,
-         "acc_stderr": 0.026236965881153266,
-         "acc_norm": 0.6913183279742765,
-         "acc_norm_stderr": 0.026236965881153266
-     },
-     "harness|hendrycksTest-prehistory|5": {
-         "acc": 0.7314814814814815,
-         "acc_stderr": 0.02465968518596728,
-         "acc_norm": 0.7314814814814815,
-         "acc_norm_stderr": 0.02465968518596728
-     },
-     "harness|hendrycksTest-professional_accounting|5": {
-         "acc": 0.49645390070921985,
-         "acc_stderr": 0.02982674915328092,
-         "acc_norm": 0.49645390070921985,
-         "acc_norm_stderr": 0.02982674915328092
-     },
-     "harness|hendrycksTest-professional_law|5": {
-         "acc": 0.4817470664928292,
-         "acc_stderr": 0.012761723960595472,
-         "acc_norm": 0.4817470664928292,
-         "acc_norm_stderr": 0.012761723960595472
-     },
-     "harness|hendrycksTest-professional_medicine|5": {
-         "acc": 0.6875,
-         "acc_stderr": 0.02815637344037142,
-         "acc_norm": 0.6875,
-         "acc_norm_stderr": 0.02815637344037142
-     },
-     "harness|hendrycksTest-professional_psychology|5": {
-         "acc": 0.6454248366013072,
-         "acc_stderr": 0.0193533605475537,
-         "acc_norm": 0.6454248366013072,
-         "acc_norm_stderr": 0.0193533605475537
-     },
-     "harness|hendrycksTest-public_relations|5": {
-         "acc": 0.6727272727272727,
-         "acc_stderr": 0.0449429086625209,
-         "acc_norm": 0.6727272727272727,
-         "acc_norm_stderr": 0.0449429086625209
-     },
-     "harness|hendrycksTest-security_studies|5": {
-         "acc": 0.7306122448979592,
-         "acc_stderr": 0.02840125202902294,
-         "acc_norm": 0.7306122448979592,
-         "acc_norm_stderr": 0.02840125202902294
-     },
-     "harness|hendrycksTest-sociology|5": {
-         "acc": 0.8159203980099502,
-         "acc_stderr": 0.027403859410786845,
-         "acc_norm": 0.8159203980099502,
-         "acc_norm_stderr": 0.027403859410786845
-     },
-     "harness|hendrycksTest-us_foreign_policy|5": {
-         "acc": 0.88,
-         "acc_stderr": 0.03265986323710906,
-         "acc_norm": 0.88,
-         "acc_norm_stderr": 0.03265986323710906
-     },
-     "harness|hendrycksTest-virology|5": {
-         "acc": 0.5301204819277109,
-         "acc_stderr": 0.03885425420866767,
-         "acc_norm": 0.5301204819277109,
-         "acc_norm_stderr": 0.03885425420866767
-     },
-     "harness|hendrycksTest-world_religions|5": {
-         "acc": 0.8538011695906432,
-         "acc_stderr": 0.02709729011807081,
-         "acc_norm": 0.8538011695906432,
-         "acc_norm_stderr": 0.02709729011807081
-     },
-     "harness|truthfulqa:mc|0": {
-         "mc1": 0.4283965728274174,
-         "mc1_stderr": 0.017323088597314757,
-         "mc2": 0.5998074537794252,
-         "mc2_stderr": 0.015494960379071198
-     },
-     "harness|winogrande|5": {
-         "acc": 0.7853196527229677,
-         "acc_stderr": 0.011539912734345403
-     },
-     "harness|gsm8k|5": {
-         "acc": 0.6156178923426838,
-         "acc_stderr": 0.013399219253698186
-     }
- }
- '''
+ [Results](https://huggingface.co/datasets/open-llm-leaderboard/details_lex-hue__LexGPT-V3)
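
For quick reference, the Open LLM Leaderboard headline number behind these results is the unweighted mean of six scores: ARC acc_norm, HellaSwag acc_norm, MMLU accuracy, TruthfulQA mc2, Winogrande accuracy, and GSM8K accuracy. A minimal Python sketch using the figures quoted in the removed block above; note it uses the aggregate "all" accuracy as a stand-in for MMLU (the leaderboard itself averages only the 57 hendrycksTest subtasks, so the exact number differs slightly):

```python
# Minimal sketch: recompute a leaderboard-style average from the scores
# quoted in the removed README block above (rounded to 4 decimals).
scores = {
    "ARC (25-shot, acc_norm)": 0.6647,
    "HellaSwag (10-shot, acc_norm)": 0.8591,
    "MMLU (5-shot, acc)": 0.6472,  # aggregate "all" acc, stand-in for the hendrycksTest mean
    "TruthfulQA (0-shot, mc2)": 0.5998,
    "Winogrande (5-shot, acc)": 0.7853,
    "GSM8K (5-shot, acc)": 0.6156,
}

average = sum(scores.values()) / len(scores)
for name, value in scores.items():
    print(f"{name:32} {value * 100:6.2f}")
print(f"{'Average':32} {average * 100:6.2f}")
```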
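The per-sample predictions behind these aggregates live in the linked details dataset. A hedged sketch of loading one task's details with the `datasets` library; the config name `harness_gsm8k_5` and the `latest` split follow the leaderboard's usual naming scheme and are assumptions, not verified against this repository:

```python
from datasets import load_dataset

# Assumption: leaderboard "details" repos expose one config per task run
# (e.g. "harness_gsm8k_5") and a "latest" split; neither name has been
# verified against this specific repository.
details = load_dataset(
    "open-llm-leaderboard/details_lex-hue__LexGPT-V3",
    "harness_gsm8k_5",
    split="latest",
)
print(details[0])  # one record: prompt, model prediction, and per-example metric
```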