Corey Morris committed
Commit ffdb8d3
1 Parent(s): 22725f7

Added one llama result for MMLU

Files changed (2)
  1. app.py +9 -7
  2. llama-30B_mmlu_5-shot.json +416 -0
app.py CHANGED
@@ -1,15 +1,17 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
+import json
 
-# Create a sample dataframe with 10 language models and 57 attributes
-np.random.seed(0)
-data = pd.DataFrame(np.random.rand(10, 57),
-                    columns=[f'Attribute_{i+1}' for i in range(57)],
-                    index=[f'Model_{i+1}' for i in range(10)])
+# Load the data from the JSON file
+with open('llama-30B_mmlu_5-shot.json', 'r') as f:
+    results = json.load(f)
 
-# Let's consider the first attribute as the main one for sorting the leaderboard
-data = data.sort_values('Attribute_1', ascending=False)
+# Create a DataFrame from the results dictionary
+data = pd.DataFrame(results['results']).T
+
+# Sort the DataFrame by 'acc' column
+data = data.sort_values('acc', ascending=False)
 
 def show_leaderboard():
     # Convert dataframe to html so that it can be displayed properly in Gradio
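For context, a minimal sketch of how app.py presumably fits together after this change. The diff cuts off inside show_leaderboard, so the function body and the Gradio wiring below (the demo Interface and the "html" output) are assumptions for illustration, not the actual contents of the file.

import json

import gradio as gr
import pandas as pd

# Load the per-subtask MMLU results added in this commit
with open('llama-30B_mmlu_5-shot.json', 'r') as f:
    results = json.load(f)

# Rows = MMLU subtasks; columns = acc, acc_stderr, acc_norm, acc_norm_stderr
data = pd.DataFrame(results['results']).T
data = data.sort_values('acc', ascending=False)

def show_leaderboard():
    # Assumed body: render the sorted DataFrame as an HTML table for Gradio
    return data.to_html()

# Hypothetical wiring: the diff does not show how show_leaderboard is exposed
demo = gr.Interface(fn=show_leaderboard, inputs=[], outputs="html")

if __name__ == "__main__":
    demo.launch()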
llama-30B_mmlu_5-shot.json ADDED
@@ -0,0 +1,416 @@
+{
+    "results": {
+        "hendrycksTest-high_school_world_history": {
+            "acc": 0.6962025316455697,
+            "acc_stderr": 0.029936696387138598,
+            "acc_norm": 0.569620253164557,
+            "acc_norm_stderr": 0.032230171959375976
+        },
+        "hendrycksTest-formal_logic": {
+            "acc": 0.42063492063492064,
+            "acc_stderr": 0.04415438226743743,
+            "acc_norm": 0.3968253968253968,
+            "acc_norm_stderr": 0.043758884927270605
+        },
+        "hendrycksTest-human_aging": {
+            "acc": 0.672645739910314,
+            "acc_stderr": 0.03149384670994131,
+            "acc_norm": 0.3632286995515695,
+            "acc_norm_stderr": 0.032277904428505
+        },
+        "hendrycksTest-international_law": {
+            "acc": 0.7024793388429752,
+            "acc_stderr": 0.04173349148083499,
+            "acc_norm": 0.768595041322314,
+            "acc_norm_stderr": 0.03849856098794088
+        },
+        "hendrycksTest-security_studies": {
+            "acc": 0.5714285714285714,
+            "acc_stderr": 0.031680911612338825,
+            "acc_norm": 0.40408163265306124,
+            "acc_norm_stderr": 0.0314147080258659
+        },
+        "hendrycksTest-medical_genetics": {
+            "acc": 0.6,
+            "acc_stderr": 0.049236596391733084,
+            "acc_norm": 0.54,
+            "acc_norm_stderr": 0.05009082659620332
+        },
+        "hendrycksTest-econometrics": {
+            "acc": 0.3508771929824561,
+            "acc_stderr": 0.044895393502707,
+            "acc_norm": 0.3157894736842105,
+            "acc_norm_stderr": 0.043727482902780064
+        },
+        "hendrycksTest-high_school_macroeconomics": {
+            "acc": 0.5153846153846153,
+            "acc_stderr": 0.025339003010106515,
+            "acc_norm": 0.4153846153846154,
+            "acc_norm_stderr": 0.024985354923102332
+        },
+        "hendrycksTest-us_foreign_policy": {
+            "acc": 0.79,
+            "acc_stderr": 0.040936018074033256,
+            "acc_norm": 0.59,
+            "acc_norm_stderr": 0.049431107042371025
+        },
+        "hendrycksTest-logical_fallacies": {
+            "acc": 0.6993865030674846,
+            "acc_stderr": 0.03602511318806771,
+            "acc_norm": 0.5398773006134969,
+            "acc_norm_stderr": 0.039158572914369714
+        },
+        "hendrycksTest-prehistory": {
+            "acc": 0.6635802469135802,
+            "acc_stderr": 0.026289734945952926,
+            "acc_norm": 0.42901234567901236,
+            "acc_norm_stderr": 0.027538925613470867
+        },
+        "hendrycksTest-professional_psychology": {
+            "acc": 0.5882352941176471,
+            "acc_stderr": 0.019910377463105932,
+            "acc_norm": 0.43300653594771243,
+            "acc_norm_stderr": 0.02004544247332422
+        },
+        "hendrycksTest-professional_accounting": {
+            "acc": 0.3971631205673759,
+            "acc_stderr": 0.029189805673587105,
+            "acc_norm": 0.33687943262411346,
+            "acc_norm_stderr": 0.02819553487396673
+        },
+        "hendrycksTest-college_biology": {
+            "acc": 0.6111111111111112,
+            "acc_stderr": 0.04076663253918567,
+            "acc_norm": 0.4236111111111111,
+            "acc_norm_stderr": 0.04132125019723369
+        },
+        "hendrycksTest-high_school_biology": {
+            "acc": 0.6709677419354839,
+            "acc_stderr": 0.02672949906834996,
+            "acc_norm": 0.5451612903225806,
+            "acc_norm_stderr": 0.028327743091561074
+        },
+        "hendrycksTest-philosophy": {
+            "acc": 0.6752411575562701,
+            "acc_stderr": 0.02659678228769704,
+            "acc_norm": 0.5016077170418006,
+            "acc_norm_stderr": 0.02839794490780661
+        },
+        "hendrycksTest-high_school_european_history": {
+            "acc": 0.696969696969697,
+            "acc_stderr": 0.03588624800091707,
+            "acc_norm": 0.5636363636363636,
+            "acc_norm_stderr": 0.03872592983524754
+        },
+        "hendrycksTest-college_medicine": {
+            "acc": 0.5144508670520231,
+            "acc_stderr": 0.03810871630454764,
+            "acc_norm": 0.43352601156069365,
+            "acc_norm_stderr": 0.03778621079092055
+        },
+        "hendrycksTest-professional_medicine": {
+            "acc": 0.5551470588235294,
+            "acc_stderr": 0.03018753206032938,
+            "acc_norm": 0.35661764705882354,
+            "acc_norm_stderr": 0.02909720956841195
+        },
+        "hendrycksTest-moral_scenarios": {
+            "acc": 0.34301675977653634,
+            "acc_stderr": 0.015876912673057724,
+            "acc_norm": 0.27262569832402234,
+            "acc_norm_stderr": 0.014893391735249588
+        },
+        "hendrycksTest-high_school_chemistry": {
+            "acc": 0.39901477832512317,
+            "acc_stderr": 0.03445487686264716,
+            "acc_norm": 0.3694581280788177,
+            "acc_norm_stderr": 0.03395970381998573
+        },
+        "hendrycksTest-high_school_physics": {
+            "acc": 0.31788079470198677,
+            "acc_stderr": 0.038020397601079024,
+            "acc_norm": 0.31125827814569534,
+            "acc_norm_stderr": 0.03780445850526733
+        },
+        "hendrycksTest-high_school_government_and_politics": {
+            "acc": 0.8082901554404145,
+            "acc_stderr": 0.028408953626245282,
+            "acc_norm": 0.6113989637305699,
+            "acc_norm_stderr": 0.03517739796373132
+        },
+        "hendrycksTest-high_school_geography": {
+            "acc": 0.7575757575757576,
+            "acc_stderr": 0.030532892233932026,
+            "acc_norm": 0.5505050505050505,
+            "acc_norm_stderr": 0.0354413249194797
+        },
+        "hendrycksTest-global_facts": {
+            "acc": 0.47,
+            "acc_stderr": 0.05016135580465919,
+            "acc_norm": 0.37,
+            "acc_norm_stderr": 0.04852365870939099
+        },
+        "hendrycksTest-professional_law": {
+            "acc": 0.4002607561929596,
+            "acc_stderr": 0.012513582529136213,
+            "acc_norm": 0.3435462842242503,
+            "acc_norm_stderr": 0.012128961174190158
+        },
+        "hendrycksTest-college_mathematics": {
+            "acc": 0.37,
+            "acc_stderr": 0.048523658709391,
+            "acc_norm": 0.3,
+            "acc_norm_stderr": 0.046056618647183814
+        },
+        "hendrycksTest-college_physics": {
+            "acc": 0.23529411764705882,
+            "acc_stderr": 0.04220773659171452,
+            "acc_norm": 0.29411764705882354,
+            "acc_norm_stderr": 0.04533838195929774
+        },
+        "hendrycksTest-high_school_statistics": {
+            "acc": 0.4351851851851852,
+            "acc_stderr": 0.03381200005643525,
+            "acc_norm": 0.35648148148148145,
+            "acc_norm_stderr": 0.032664783315272714
+        },
+        "hendrycksTest-machine_learning": {
+            "acc": 0.4017857142857143,
+            "acc_stderr": 0.04653333146973646,
+            "acc_norm": 0.30357142857142855,
+            "acc_norm_stderr": 0.04364226155841044
+        },
+        "hendrycksTest-public_relations": {
+            "acc": 0.6454545454545455,
+            "acc_stderr": 0.045820048415054174,
+            "acc_norm": 0.4090909090909091,
+            "acc_norm_stderr": 0.047093069786618966
+        },
+        "hendrycksTest-high_school_computer_science": {
+            "acc": 0.61,
+            "acc_stderr": 0.04902071300001974,
+            "acc_norm": 0.47,
+            "acc_norm_stderr": 0.05016135580465919
+        },
+        "hendrycksTest-high_school_psychology": {
+            "acc": 0.7706422018348624,
+            "acc_stderr": 0.018025349724618684,
+            "acc_norm": 0.5541284403669725,
+            "acc_norm_stderr": 0.021311335009708582
+        },
+        "hendrycksTest-virology": {
+            "acc": 0.4939759036144578,
+            "acc_stderr": 0.03892212195333045,
+            "acc_norm": 0.3433734939759036,
+            "acc_norm_stderr": 0.03696584317010601
+        },
+        "hendrycksTest-marketing": {
+            "acc": 0.8461538461538461,
+            "acc_stderr": 0.023636873317489294,
+            "acc_norm": 0.7649572649572649,
+            "acc_norm_stderr": 0.027778835904935437
+        },
+        "hendrycksTest-human_sexuality": {
+            "acc": 0.7022900763358778,
+            "acc_stderr": 0.04010358942462203,
+            "acc_norm": 0.46564885496183206,
+            "acc_norm_stderr": 0.04374928560599738
+        },
+        "hendrycksTest-sociology": {
+            "acc": 0.7611940298507462,
+            "acc_stderr": 0.03014777593540922,
+            "acc_norm": 0.6616915422885572,
+            "acc_norm_stderr": 0.033455630703391914
+        },
+        "hendrycksTest-college_computer_science": {
+            "acc": 0.43,
+            "acc_stderr": 0.049756985195624284,
+            "acc_norm": 0.34,
+            "acc_norm_stderr": 0.04760952285695236
+        },
+        "hendrycksTest-conceptual_physics": {
+            "acc": 0.5106382978723404,
+            "acc_stderr": 0.03267862331014063,
+            "acc_norm": 0.3276595744680851,
+            "acc_norm_stderr": 0.030683020843231004
+        },
+        "hendrycksTest-anatomy": {
+            "acc": 0.5185185185185185,
+            "acc_stderr": 0.043163785995113245,
+            "acc_norm": 0.4074074074074074,
+            "acc_norm_stderr": 0.04244633238353228
+        },
+        "hendrycksTest-miscellaneous": {
+            "acc": 0.8186462324393359,
+            "acc_stderr": 0.013778693778464062,
+            "acc_norm": 0.6143039591315453,
+            "acc_norm_stderr": 0.017406476619212907
+        },
+        "hendrycksTest-jurisprudence": {
+            "acc": 0.6666666666666666,
+            "acc_stderr": 0.04557239513497751,
+            "acc_norm": 0.5555555555555556,
+            "acc_norm_stderr": 0.04803752235190193
+        },
+        "hendrycksTest-moral_disputes": {
+            "acc": 0.6184971098265896,
+            "acc_stderr": 0.026152198619726792,
+            "acc_norm": 0.4595375722543353,
+            "acc_norm_stderr": 0.026830805998952236
+        },
+        "hendrycksTest-high_school_us_history": {
+            "acc": 0.7205882352941176,
+            "acc_stderr": 0.031493281045079556,
+            "acc_norm": 0.553921568627451,
+            "acc_norm_stderr": 0.03488845451304974
+        },
+        "hendrycksTest-high_school_mathematics": {
+            "acc": 0.25925925925925924,
+            "acc_stderr": 0.026719240783712177,
+            "acc_norm": 0.3148148148148148,
+            "acc_norm_stderr": 0.02831753349606648
+        },
+        "hendrycksTest-high_school_microeconomics": {
+            "acc": 0.5840336134453782,
+            "acc_stderr": 0.032016501007396114,
+            "acc_norm": 0.4831932773109244,
+            "acc_norm_stderr": 0.03246013680375308
+        },
+        "hendrycksTest-astronomy": {
+            "acc": 0.5723684210526315,
+            "acc_stderr": 0.04026097083296564,
+            "acc_norm": 0.5657894736842105,
+            "acc_norm_stderr": 0.04033565667848319
+        },
+        "hendrycksTest-world_religions": {
+            "acc": 0.8128654970760234,
+            "acc_stderr": 0.029913127232368043,
+            "acc_norm": 0.7660818713450293,
+            "acc_norm_stderr": 0.03246721765117825
+        },
+        "hendrycksTest-clinical_knowledge": {
+            "acc": 0.5320754716981132,
+            "acc_stderr": 0.03070948699255654,
+            "acc_norm": 0.4641509433962264,
+            "acc_norm_stderr": 0.030693675018458003
+        },
+        "hendrycksTest-college_chemistry": {
+            "acc": 0.31,
+            "acc_stderr": 0.04648231987117316,
+            "acc_norm": 0.32,
+            "acc_norm_stderr": 0.046882617226215034
+        },
+        "hendrycksTest-abstract_algebra": {
+            "acc": 0.26,
+            "acc_stderr": 0.04408440022768078,
+            "acc_norm": 0.29,
+            "acc_norm_stderr": 0.04560480215720684
+        },
+        "hendrycksTest-business_ethics": {
+            "acc": 0.67,
+            "acc_stderr": 0.04725815626252609,
+            "acc_norm": 0.48,
+            "acc_norm_stderr": 0.050211673156867795
+        },
+        "hendrycksTest-elementary_mathematics": {
+            "acc": 0.4417989417989418,
+            "acc_stderr": 0.02557625706125384,
+            "acc_norm": 0.37037037037037035,
+            "acc_norm_stderr": 0.024870815251057075
+        },
+        "hendrycksTest-management": {
+            "acc": 0.7184466019417476,
+            "acc_stderr": 0.044532548363264673,
+            "acc_norm": 0.5533980582524272,
+            "acc_norm_stderr": 0.04922424153458933
+        },
+        "hendrycksTest-electrical_engineering": {
+            "acc": 0.5172413793103449,
+            "acc_stderr": 0.04164188720169375,
+            "acc_norm": 0.38620689655172413,
+            "acc_norm_stderr": 0.040573247344190336
+        },
+        "hendrycksTest-nutrition": {
+            "acc": 0.6111111111111112,
+            "acc_stderr": 0.02791405551046801,
+            "acc_norm": 0.5032679738562091,
+            "acc_norm_stderr": 0.028629305194003543
+        },
+        "hendrycksTest-computer_security": {
+            "acc": 0.66,
+            "acc_stderr": 0.04760952285695237,
+            "acc_norm": 0.58,
+            "acc_norm_stderr": 0.049604496374885836
+        }
+    },
+    "versions": {
+        "hendrycksTest-high_school_world_history": 0,
+        "hendrycksTest-formal_logic": 0,
+        "hendrycksTest-human_aging": 0,
+        "hendrycksTest-international_law": 0,
+        "hendrycksTest-security_studies": 0,
+        "hendrycksTest-medical_genetics": 0,
+        "hendrycksTest-econometrics": 0,
+        "hendrycksTest-high_school_macroeconomics": 0,
+        "hendrycksTest-us_foreign_policy": 0,
+        "hendrycksTest-logical_fallacies": 0,
+        "hendrycksTest-prehistory": 0,
+        "hendrycksTest-professional_psychology": 0,
+        "hendrycksTest-professional_accounting": 0,
+        "hendrycksTest-college_biology": 0,
+        "hendrycksTest-high_school_biology": 0,
+        "hendrycksTest-philosophy": 0,
+        "hendrycksTest-high_school_european_history": 0,
+        "hendrycksTest-college_medicine": 0,
+        "hendrycksTest-professional_medicine": 0,
+        "hendrycksTest-moral_scenarios": 0,
+        "hendrycksTest-high_school_chemistry": 0,
+        "hendrycksTest-high_school_physics": 0,
+        "hendrycksTest-high_school_government_and_politics": 0,
+        "hendrycksTest-high_school_geography": 0,
+        "hendrycksTest-global_facts": 0,
+        "hendrycksTest-professional_law": 0,
+        "hendrycksTest-college_mathematics": 0,
+        "hendrycksTest-college_physics": 0,
+        "hendrycksTest-high_school_statistics": 0,
+        "hendrycksTest-machine_learning": 0,
+        "hendrycksTest-public_relations": 0,
+        "hendrycksTest-high_school_computer_science": 0,
+        "hendrycksTest-high_school_psychology": 0,
+        "hendrycksTest-virology": 0,
+        "hendrycksTest-marketing": 0,
+        "hendrycksTest-human_sexuality": 0,
+        "hendrycksTest-sociology": 0,
+        "hendrycksTest-college_computer_science": 0,
+        "hendrycksTest-conceptual_physics": 0,
+        "hendrycksTest-anatomy": 0,
+        "hendrycksTest-miscellaneous": 0,
+        "hendrycksTest-jurisprudence": 0,
+        "hendrycksTest-moral_disputes": 0,
+        "hendrycksTest-high_school_us_history": 0,
+        "hendrycksTest-high_school_mathematics": 0,
+        "hendrycksTest-high_school_microeconomics": 0,
+        "hendrycksTest-astronomy": 0,
+        "hendrycksTest-world_religions": 0,
+        "hendrycksTest-clinical_knowledge": 0,
+        "hendrycksTest-college_chemistry": 0,
+        "hendrycksTest-abstract_algebra": 0,
+        "hendrycksTest-business_ethics": 0,
+        "hendrycksTest-elementary_mathematics": 0,
+        "hendrycksTest-management": 0,
+        "hendrycksTest-electrical_engineering": 0,
+        "hendrycksTest-nutrition": 0,
+        "hendrycksTest-computer_security": 0
+    },
+    "config": {
+        "model": "hf-causal-experimental",
+        "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
+        "num_fewshot": 5,
+        "batch_size": "auto",
+        "device": "cuda:0",
+        "no_cache": true,
+        "limit": null,
+        "bootstrap_iters": 100000,
+        "description_dict": {}
+    }
+}
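The JSON stores one acc/acc_norm pair per MMLU subtask rather than a single score. A minimal sketch of how such a file could be collapsed into one leaderboard number, assuming an unweighted mean over the 57 subtasks (the commit itself does not say how the leaderboard aggregates):

import json

import pandas as pd

with open('llama-30B_mmlu_5-shot.json', 'r') as f:
    results = json.load(f)

# Rows = subtasks; columns include acc, acc_stderr, acc_norm, acc_norm_stderr
df = pd.DataFrame(results['results']).T

print(f"subtasks evaluated: {len(df)}")
print(f"mean acc:      {df['acc'].mean():.4f}")
print(f"mean acc_norm: {df['acc_norm'].mean():.4f}")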