lex-hue committed on
Commit
be49537
1 Parent(s): fe4898f

Update README.md

Files changed (1)
  1. README.md +383 -1
README.md CHANGED
@@ -16,4 +16,386 @@ As this was a test run, we just tested it and here's the data; the model hasn't I
  | gpt-3.5-turbo | 8.075000 | 7.943750 | 7.943750 |
  | claude-v1 | 8.150000 | 7.900000 | 8.025000 |
  | **LexGPT-V3** | **8.14375** | 7.719355 | 7.926667 |
- | vicuna-13b-v1.3 | 6.812500 | 5.962500 | 6.387500 |
+ | vicuna-13b-v1.3 | 6.812500 | 5.962500 | 6.387500 |
+
+ Open-LLM Leaderboard Results:
+ {
+   "all": {
+     "acc": 0.647154984215818,
+     "acc_stderr": 0.03221441224437104,
+     "acc_norm": 0.6487599114885558,
+     "acc_norm_stderr": 0.032860268812293904,
+     "mc1": 0.4283965728274174,
+     "mc1_stderr": 0.017323088597314757,
+     "mc2": 0.5998074537794252,
+     "mc2_stderr": 0.015494960379071198
+   },
+   "harness|arc:challenge|25": {
+     "acc": 0.64419795221843,
+     "acc_stderr": 0.01399057113791876,
+     "acc_norm": 0.6646757679180887,
+     "acc_norm_stderr": 0.013796182947785562
+   },
+   "harness|hellaswag|10": {
+     "acc": 0.6782513443537144,
+     "acc_stderr": 0.004661924314756093,
+     "acc_norm": 0.8590918143796057,
+     "acc_norm_stderr": 0.003472157511639361
+   },
+   "harness|hendrycksTest-abstract_algebra|5": {
+     "acc": 0.29,
+     "acc_stderr": 0.045604802157206845,
+     "acc_norm": 0.29,
+     "acc_norm_stderr": 0.045604802157206845
+   },
+   "harness|hendrycksTest-anatomy|5": {
+     "acc": 0.5925925925925926,
+     "acc_stderr": 0.04244633238353227,
+     "acc_norm": 0.5925925925925926,
+     "acc_norm_stderr": 0.04244633238353227
+   },
+   "harness|hendrycksTest-astronomy|5": {
+     "acc": 0.6973684210526315,
+     "acc_stderr": 0.03738520676119667,
+     "acc_norm": 0.6973684210526315,
+     "acc_norm_stderr": 0.03738520676119667
+   },
+   "harness|hendrycksTest-business_ethics|5": {
+     "acc": 0.59,
+     "acc_stderr": 0.04943110704237102,
+     "acc_norm": 0.59,
+     "acc_norm_stderr": 0.04943110704237102
+   },
+   "harness|hendrycksTest-clinical_knowledge|5": {
+     "acc": 0.7132075471698113,
+     "acc_stderr": 0.027834912527544057,
+     "acc_norm": 0.7132075471698113,
+     "acc_norm_stderr": 0.027834912527544057
+   },
+   "harness|hendrycksTest-college_biology|5": {
+     "acc": 0.7847222222222222,
+     "acc_stderr": 0.03437079344106135,
+     "acc_norm": 0.7847222222222222,
+     "acc_norm_stderr": 0.03437079344106135
+   },
+   "harness|hendrycksTest-college_chemistry|5": {
+     "acc": 0.49,
+     "acc_stderr": 0.05024183937956912,
+     "acc_norm": 0.49,
+     "acc_norm_stderr": 0.05024183937956912
+   },
+   "harness|hendrycksTest-college_computer_science|5": {
+     "acc": 0.54,
+     "acc_stderr": 0.05009082659620332,
+     "acc_norm": 0.54,
+     "acc_norm_stderr": 0.05009082659620332
+   },
+   "harness|hendrycksTest-college_mathematics|5": {
+     "acc": 0.34,
+     "acc_stderr": 0.04760952285695235,
+     "acc_norm": 0.34,
+     "acc_norm_stderr": 0.04760952285695235
+   },
+   "harness|hendrycksTest-college_medicine|5": {
+     "acc": 0.653179190751445,
+     "acc_stderr": 0.036291466701596636,
+     "acc_norm": 0.653179190751445,
+     "acc_norm_stderr": 0.036291466701596636
+   },
+   "harness|hendrycksTest-college_physics|5": {
+     "acc": 0.4019607843137255,
+     "acc_stderr": 0.04878608714466996,
+     "acc_norm": 0.4019607843137255,
+     "acc_norm_stderr": 0.04878608714466996
+   },
+   "harness|hendrycksTest-computer_security|5": {
+     "acc": 0.76,
+     "acc_stderr": 0.042923469599092816,
+     "acc_norm": 0.76,
+     "acc_norm_stderr": 0.042923469599092816
+   },
+   "harness|hendrycksTest-conceptual_physics|5": {
+     "acc": 0.5914893617021276,
+     "acc_stderr": 0.032134180267015755,
+     "acc_norm": 0.5914893617021276,
+     "acc_norm_stderr": 0.032134180267015755
+   },
+   "harness|hendrycksTest-econometrics|5": {
+     "acc": 0.45614035087719296,
+     "acc_stderr": 0.04685473041907789,
+     "acc_norm": 0.45614035087719296,
+     "acc_norm_stderr": 0.04685473041907789
+   },
+   "harness|hendrycksTest-electrical_engineering|5": {
+     "acc": 0.5379310344827586,
+     "acc_stderr": 0.041546596717075474,
+     "acc_norm": 0.5379310344827586,
+     "acc_norm_stderr": 0.041546596717075474
+   },
+   "harness|hendrycksTest-elementary_mathematics|5": {
+     "acc": 0.4126984126984127,
+     "acc_stderr": 0.02535574126305527,
+     "acc_norm": 0.4126984126984127,
+     "acc_norm_stderr": 0.02535574126305527
+   },
+   "harness|hendrycksTest-formal_logic|5": {
+     "acc": 0.4603174603174603,
+     "acc_stderr": 0.04458029125470973,
+     "acc_norm": 0.4603174603174603,
+     "acc_norm_stderr": 0.04458029125470973
+   },
+   "harness|hendrycksTest-global_facts|5": {
+     "acc": 0.39,
+     "acc_stderr": 0.04902071300001975,
+     "acc_norm": 0.39,
+     "acc_norm_stderr": 0.04902071300001975
+   },
+   "harness|hendrycksTest-high_school_biology|5": {
+     "acc": 0.7967741935483871,
+     "acc_stderr": 0.02289168798455496,
+     "acc_norm": 0.7967741935483871,
+     "acc_norm_stderr": 0.02289168798455496
+   },
+   "harness|hendrycksTest-high_school_chemistry|5": {
+     "acc": 0.5073891625615764,
+     "acc_stderr": 0.035176035403610105,
+     "acc_norm": 0.5073891625615764,
+     "acc_norm_stderr": 0.035176035403610105
+   },
+   "harness|hendrycksTest-high_school_computer_science|5": {
+     "acc": 0.71,
+     "acc_stderr": 0.045604802157206845,
+     "acc_norm": 0.71,
+     "acc_norm_stderr": 0.045604802157206845
+   },
+   "harness|hendrycksTest-high_school_european_history|5": {
+     "acc": 0.7878787878787878,
+     "acc_stderr": 0.031922715695483,
+     "acc_norm": 0.7878787878787878,
+     "acc_norm_stderr": 0.031922715695483
+   },
+   "harness|hendrycksTest-high_school_geography|5": {
+     "acc": 0.7878787878787878,
+     "acc_stderr": 0.029126522834586815,
+     "acc_norm": 0.7878787878787878,
+     "acc_norm_stderr": 0.029126522834586815
+   },
+   "harness|hendrycksTest-high_school_government_and_politics|5": {
+     "acc": 0.8963730569948186,
+     "acc_stderr": 0.021995311963644237,
+     "acc_norm": 0.8963730569948186,
+     "acc_norm_stderr": 0.021995311963644237
+   },
+   "harness|hendrycksTest-high_school_macroeconomics|5": {
+     "acc": 0.6846153846153846,
+     "acc_stderr": 0.02355964698318994,
+     "acc_norm": 0.6846153846153846,
+     "acc_norm_stderr": 0.02355964698318994
+   },
+   "harness|hendrycksTest-high_school_mathematics|5": {
+     "acc": 0.35185185185185186,
+     "acc_stderr": 0.02911661760608301,
+     "acc_norm": 0.35185185185185186,
+     "acc_norm_stderr": 0.02911661760608301
+   },
+   "harness|hendrycksTest-high_school_microeconomics|5": {
+     "acc": 0.7100840336134454,
+     "acc_stderr": 0.029472485833136094,
+     "acc_norm": 0.7100840336134454,
+     "acc_norm_stderr": 0.029472485833136094
+   },
+   "harness|hendrycksTest-high_school_physics|5": {
+     "acc": 0.3708609271523179,
+     "acc_stderr": 0.03943966699183629,
+     "acc_norm": 0.3708609271523179,
+     "acc_norm_stderr": 0.03943966699183629
+   },
+   "harness|hendrycksTest-high_school_psychology|5": {
+     "acc": 0.8293577981651377,
+     "acc_stderr": 0.016129271025099857,
+     "acc_norm": 0.8293577981651377,
+     "acc_norm_stderr": 0.016129271025099857
+   },
+   "harness|hendrycksTest-high_school_statistics|5": {
+     "acc": 0.5462962962962963,
+     "acc_stderr": 0.033953227263757976,
+     "acc_norm": 0.5462962962962963,
+     "acc_norm_stderr": 0.033953227263757976
+   },
+   "harness|hendrycksTest-high_school_us_history|5": {
+     "acc": 0.8333333333333334,
+     "acc_stderr": 0.026156867523931045,
+     "acc_norm": 0.8333333333333334,
+     "acc_norm_stderr": 0.026156867523931045
+   },
+   "harness|hendrycksTest-high_school_world_history|5": {
+     "acc": 0.8227848101265823,
+     "acc_stderr": 0.024856364184503224,
+     "acc_norm": 0.8227848101265823,
+     "acc_norm_stderr": 0.024856364184503224
+   },
+   "harness|hendrycksTest-human_aging|5": {
+     "acc": 0.7130044843049327,
+     "acc_stderr": 0.03036037971029195,
+     "acc_norm": 0.7130044843049327,
+     "acc_norm_stderr": 0.03036037971029195
+   },
+   "harness|hendrycksTest-human_sexuality|5": {
+     "acc": 0.7709923664122137,
+     "acc_stderr": 0.036853466317118506,
+     "acc_norm": 0.7709923664122137,
+     "acc_norm_stderr": 0.036853466317118506
+   },
+   "harness|hendrycksTest-international_law|5": {
+     "acc": 0.7520661157024794,
+     "acc_stderr": 0.03941897526516302,
+     "acc_norm": 0.7520661157024794,
+     "acc_norm_stderr": 0.03941897526516302
+   },
+   "harness|hendrycksTest-jurisprudence|5": {
+     "acc": 0.7592592592592593,
+     "acc_stderr": 0.04133119440243839,
+     "acc_norm": 0.7592592592592593,
+     "acc_norm_stderr": 0.04133119440243839
+   },
+   "harness|hendrycksTest-logical_fallacies|5": {
+     "acc": 0.754601226993865,
+     "acc_stderr": 0.03380939813943354,
+     "acc_norm": 0.754601226993865,
+     "acc_norm_stderr": 0.03380939813943354
+   },
+   "harness|hendrycksTest-machine_learning|5": {
+     "acc": 0.5089285714285714,
+     "acc_stderr": 0.04745033255489123,
+     "acc_norm": 0.5089285714285714,
+     "acc_norm_stderr": 0.04745033255489123
+   },
+   "harness|hendrycksTest-management|5": {
+     "acc": 0.7378640776699029,
+     "acc_stderr": 0.043546310772605956,
+     "acc_norm": 0.7378640776699029,
+     "acc_norm_stderr": 0.043546310772605956
+   },
+   "harness|hendrycksTest-marketing|5": {
+     "acc": 0.8547008547008547,
+     "acc_stderr": 0.023086635086841407,
+     "acc_norm": 0.8547008547008547,
+     "acc_norm_stderr": 0.023086635086841407
+   },
+   "harness|hendrycksTest-medical_genetics|5": {
+     "acc": 0.72,
+     "acc_stderr": 0.045126085985421276,
+     "acc_norm": 0.72,
+     "acc_norm_stderr": 0.045126085985421276
+   },
+   "harness|hendrycksTest-miscellaneous|5": {
+     "acc": 0.8186462324393359,
+     "acc_stderr": 0.01377869377846408,
+     "acc_norm": 0.8186462324393359,
+     "acc_norm_stderr": 0.01377869377846408
+   },
+   "harness|hendrycksTest-moral_disputes|5": {
+     "acc": 0.7341040462427746,
+     "acc_stderr": 0.023786203255508283,
+     "acc_norm": 0.7341040462427746,
+     "acc_norm_stderr": 0.023786203255508283
+   },
+   "harness|hendrycksTest-moral_scenarios|5": {
+     "acc": 0.329608938547486,
+     "acc_stderr": 0.01572153107518388,
+     "acc_norm": 0.329608938547486,
+     "acc_norm_stderr": 0.01572153107518388
+   },
+   "harness|hendrycksTest-nutrition|5": {
+     "acc": 0.7450980392156863,
+     "acc_stderr": 0.02495418432487991,
+     "acc_norm": 0.7450980392156863,
+     "acc_norm_stderr": 0.02495418432487991
+   },
+   "harness|hendrycksTest-philosophy|5": {
+     "acc": 0.6913183279742765,
+     "acc_stderr": 0.026236965881153266,
+     "acc_norm": 0.6913183279742765,
+     "acc_norm_stderr": 0.026236965881153266
+   },
+   "harness|hendrycksTest-prehistory|5": {
+     "acc": 0.7314814814814815,
+     "acc_stderr": 0.02465968518596728,
+     "acc_norm": 0.7314814814814815,
+     "acc_norm_stderr": 0.02465968518596728
+   },
+   "harness|hendrycksTest-professional_accounting|5": {
+     "acc": 0.49645390070921985,
+     "acc_stderr": 0.02982674915328092,
+     "acc_norm": 0.49645390070921985,
+     "acc_norm_stderr": 0.02982674915328092
+   },
+   "harness|hendrycksTest-professional_law|5": {
+     "acc": 0.4817470664928292,
+     "acc_stderr": 0.012761723960595472,
+     "acc_norm": 0.4817470664928292,
+     "acc_norm_stderr": 0.012761723960595472
+   },
+   "harness|hendrycksTest-professional_medicine|5": {
+     "acc": 0.6875,
+     "acc_stderr": 0.02815637344037142,
+     "acc_norm": 0.6875,
+     "acc_norm_stderr": 0.02815637344037142
+   },
+   "harness|hendrycksTest-professional_psychology|5": {
+     "acc": 0.6454248366013072,
+     "acc_stderr": 0.0193533605475537,
+     "acc_norm": 0.6454248366013072,
+     "acc_norm_stderr": 0.0193533605475537
+   },
+   "harness|hendrycksTest-public_relations|5": {
+     "acc": 0.6727272727272727,
+     "acc_stderr": 0.0449429086625209,
+     "acc_norm": 0.6727272727272727,
+     "acc_norm_stderr": 0.0449429086625209
+   },
+   "harness|hendrycksTest-security_studies|5": {
+     "acc": 0.7306122448979592,
+     "acc_stderr": 0.02840125202902294,
+     "acc_norm": 0.7306122448979592,
+     "acc_norm_stderr": 0.02840125202902294
+   },
+   "harness|hendrycksTest-sociology|5": {
+     "acc": 0.8159203980099502,
+     "acc_stderr": 0.027403859410786845,
+     "acc_norm": 0.8159203980099502,
+     "acc_norm_stderr": 0.027403859410786845
+   },
+   "harness|hendrycksTest-us_foreign_policy|5": {
+     "acc": 0.88,
+     "acc_stderr": 0.03265986323710906,
+     "acc_norm": 0.88,
+     "acc_norm_stderr": 0.03265986323710906
+   },
+   "harness|hendrycksTest-virology|5": {
+     "acc": 0.5301204819277109,
+     "acc_stderr": 0.03885425420866767,
+     "acc_norm": 0.5301204819277109,
+     "acc_norm_stderr": 0.03885425420866767
+   },
+   "harness|hendrycksTest-world_religions|5": {
+     "acc": 0.8538011695906432,
+     "acc_stderr": 0.02709729011807081,
+     "acc_norm": 0.8538011695906432,
+     "acc_norm_stderr": 0.02709729011807081
+   },
+   "harness|truthfulqa:mc|0": {
+     "mc1": 0.4283965728274174,
+     "mc1_stderr": 0.017323088597314757,
+     "mc2": 0.5998074537794252,
+     "mc2_stderr": 0.015494960379071198
+   },
+   "harness|winogrande|5": {
+     "acc": 0.7853196527229677,
+     "acc_stderr": 0.011539912734345403
+   },
+   "harness|gsm8k|5": {
+     "acc": 0.6156178923426838,
+     "acc_stderr": 0.013399219253698186
+   }
+ }
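
The leaderboard's headline number is the mean of six scores drawn from this blob: ARC (25-shot, `acc_norm`), HellaSwag (10-shot, `acc_norm`), MMLU (5-shot, `acc` averaged over the `hendrycksTest` subtasks), TruthfulQA (0-shot, `mc2`), Winogrande (5-shot, `acc`), and GSM8K (5-shot, `acc`). Below is a minimal Python sketch that recomputes it from the results, assuming the JSON object above (without the caption line) is saved as `results.json` (the filename is illustrative, not part of this repo):

```python
import json
from statistics import mean

# Load the lm-evaluation-harness results shown above.
# Assumes the JSON object was saved locally as results.json.
with open("results.json") as f:
    results = json.load(f)

# MMLU: average the per-subtask 5-shot accuracies.
# (In this blob, acc and acc_norm coincide for every MMLU subtask.)
mmlu = mean(
    v["acc"]
    for k, v in results.items()
    if k.startswith("harness|hendrycksTest-")
)

# Six-benchmark average, following the Open LLM Leaderboard (v1) recipe.
scores = {
    "ARC": results["harness|arc:challenge|25"]["acc_norm"],
    "HellaSwag": results["harness|hellaswag|10"]["acc_norm"],
    "MMLU": mmlu,
    "TruthfulQA": results["harness|truthfulqa:mc|0"]["mc2"],
    "Winogrande": results["harness|winogrande|5"]["acc"],
    "GSM8K": results["harness|gsm8k|5"]["acc"],
}

for name, score in scores.items():
    print(f"{name}: {100 * score:.2f}")
print(f"Average: {100 * mean(scores.values()):.2f}")
```

This only re-aggregates the numbers already present in the JSON; no model evaluation is rerun.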