CoreyMorris committed on
Commit 28fcccf
1 Parent(s): 3ebf7a7
Files changed (1)
  1. generate_csv.ipynb +500 -1
generate_csv.ipynb CHANGED
@@ -22,7 +22,16 @@
  "1100\n",
  "1200\n",
  "1300\n",
- "1400\n"
+ "1400\n",
+ "1500\n",
+ "1600\n",
+ "1700\n",
+ "1800\n",
+ "full_model_names\n",
+ "1889\n",
+ "organization_names\n",
+ "12\n",
+ "['Parameters', 'drop|3', 'gsm8k', 'MMLU_average', 'winogrande', 'all', 'arc:challenge|25', 'hellaswag|10', 'MMLU_abstract_algebra', 'MMLU_anatomy', 'MMLU_astronomy', 'MMLU_business_ethics', 'MMLU_clinical_knowledge', 'MMLU_college_biology', 'MMLU_college_chemistry', 'MMLU_college_computer_science', 'MMLU_college_mathematics', 'MMLU_college_medicine', 'MMLU_college_physics', 'MMLU_computer_security', 'MMLU_conceptual_physics', 'MMLU_econometrics', 'MMLU_electrical_engineering', 'MMLU_elementary_mathematics', 'MMLU_formal_logic', 'MMLU_global_facts', 'MMLU_high_school_biology', 'MMLU_high_school_chemistry', 'MMLU_high_school_computer_science', 'MMLU_high_school_european_history', 'MMLU_high_school_geography', 'MMLU_high_school_government_and_politics', 'MMLU_high_school_macroeconomics', 'MMLU_high_school_mathematics', 'MMLU_high_school_microeconomics', 'MMLU_high_school_physics', 'MMLU_high_school_psychology', 'MMLU_high_school_statistics', 'MMLU_high_school_us_history', 'MMLU_high_school_world_history', 'MMLU_human_aging', 'MMLU_human_sexuality', 'MMLU_international_law', 'MMLU_jurisprudence', 'MMLU_logical_fallacies', 'MMLU_machine_learning', 'MMLU_management', 'MMLU_marketing', 'MMLU_medical_genetics', 'MMLU_miscellaneous', 'MMLU_moral_disputes', 'MMLU_moral_scenarios', 'MMLU_nutrition', 'MMLU_philosophy', 'MMLU_prehistory', 'MMLU_professional_accounting', 'MMLU_professional_law', 'MMLU_professional_medicine', 'MMLU_professional_psychology', 'MMLU_public_relations', 'MMLU_security_studies', 'MMLU_sociology', 'MMLU_us_foreign_policy', 'MMLU_virology', 'MMLU_world_religions', 'truthfulqa:mc|0', 'full_model_name']\n"
  ]
  }
  ],
@@ -31,6 +40,496 @@
  "result = ResultDataProcessor()"
  ]
  },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>URL</th>\n",
+ " <th>full_model_name</th>\n",
+ " <th>Parameters</th>\n",
+ " <th>MMLU_average</th>\n",
+ " <th>arc:challenge|25</th>\n",
+ " <th>hellaswag|10</th>\n",
+ " <th>MMLU_abstract_algebra</th>\n",
+ " <th>MMLU_anatomy</th>\n",
+ " <th>MMLU_astronomy</th>\n",
+ " <th>MMLU_business_ethics</th>\n",
+ " <th>...</th>\n",
+ " <th>MMLU_professional_accounting</th>\n",
+ " <th>MMLU_professional_law</th>\n",
+ " <th>MMLU_professional_medicine</th>\n",
+ " <th>MMLU_professional_psychology</th>\n",
+ " <th>MMLU_public_relations</th>\n",
+ " <th>MMLU_security_studies</th>\n",
+ " <th>MMLU_sociology</th>\n",
+ " <th>MMLU_us_foreign_policy</th>\n",
+ " <th>MMLU_virology</th>\n",
+ " <th>MMLU_world_religions</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>SparseOPT-1.3B</th>\n",
+ " <td>https://huggingface.co/shaohang/SparseOPT-1.3B</td>\n",
+ " <td>shaohang/SparseOPT-1.3B</td>\n",
+ " <td>1.3</td>\n",
+ " <td>0.255963</td>\n",
+ " <td>0.240614</td>\n",
+ " <td>0.383689</td>\n",
+ " <td>0.22</td>\n",
+ " <td>0.214815</td>\n",
+ " <td>0.157895</td>\n",
+ " <td>0.20</td>\n",
+ " <td>...</td>\n",
+ " <td>0.262411</td>\n",
+ " <td>0.238592</td>\n",
+ " <td>0.448529</td>\n",
+ " <td>0.254902</td>\n",
+ " <td>0.236364</td>\n",
+ " <td>0.171429</td>\n",
+ " <td>0.228856</td>\n",
+ " <td>0.27</td>\n",
+ " <td>0.283133</td>\n",
+ " <td>0.216374</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Athena-v1</th>\n",
+ " <td>https://huggingface.co/IkariDev/Athena-v1</td>\n",
+ " <td>IkariDev/Athena-v1</td>\n",
+ " <td>NaN</td>\n",
+ " <td>0.556052</td>\n",
+ " <td>0.560580</td>\n",
+ " <td>0.631548</td>\n",
+ " <td>0.31</td>\n",
+ " <td>0.496296</td>\n",
+ " <td>0.526316</td>\n",
+ " <td>0.58</td>\n",
+ " <td>...</td>\n",
+ " <td>0.404255</td>\n",
+ " <td>0.392438</td>\n",
+ " <td>0.525735</td>\n",
+ " <td>0.540850</td>\n",
+ " <td>0.645455</td>\n",
+ " <td>0.640816</td>\n",
+ " <td>0.751244</td>\n",
+ " <td>0.83</td>\n",
+ " <td>0.493976</td>\n",
+ " <td>0.725146</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Athena-tmp</th>\n",
+ " <td>https://huggingface.co/IkariDev/Athena-tmp</td>\n",
+ " <td>IkariDev/Athena-tmp</td>\n",
+ " <td>NaN</td>\n",
+ " <td>0.588685</td>\n",
+ " <td>0.567406</td>\n",
+ " <td>0.621888</td>\n",
+ " <td>0.29</td>\n",
+ " <td>0.518519</td>\n",
+ " <td>0.638158</td>\n",
+ " <td>0.62</td>\n",
+ " <td>...</td>\n",
+ " <td>0.450355</td>\n",
+ " <td>0.462842</td>\n",
+ " <td>0.569853</td>\n",
+ " <td>0.588235</td>\n",
+ " <td>0.645455</td>\n",
+ " <td>0.653061</td>\n",
+ " <td>0.721393</td>\n",
+ " <td>0.81</td>\n",
+ " <td>0.463855</td>\n",
+ " <td>0.801170</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>13B-Legerdemain-L2</th>\n",
+ " <td>https://huggingface.co/CalderaAI/13B-Legerdema...</td>\n",
+ " <td>CalderaAI/13B-Legerdemain-L2</td>\n",
+ " <td>13.0</td>\n",
+ " <td>0.560030</td>\n",
+ " <td>0.573379</td>\n",
+ " <td>0.635431</td>\n",
+ " <td>0.36</td>\n",
+ " <td>0.525926</td>\n",
+ " <td>0.572368</td>\n",
+ " <td>0.53</td>\n",
+ " <td>...</td>\n",
+ " <td>0.429078</td>\n",
+ " <td>0.424381</td>\n",
+ " <td>0.522059</td>\n",
+ " <td>0.532680</td>\n",
+ " <td>0.609091</td>\n",
+ " <td>0.636735</td>\n",
+ " <td>0.766169</td>\n",
+ " <td>0.87</td>\n",
+ " <td>0.427711</td>\n",
+ " <td>0.777778</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>13B-Ouroboros</th>\n",
+ " <td>https://huggingface.co/CalderaAI/13B-Ouroboros</td>\n",
+ " <td>CalderaAI/13B-Ouroboros</td>\n",
+ " <td>13.0</td>\n",
+ " <td>0.514311</td>\n",
+ " <td>0.560580</td>\n",
+ " <td>0.624378</td>\n",
+ " <td>0.31</td>\n",
+ " <td>0.466667</td>\n",
+ " <td>0.506579</td>\n",
+ " <td>0.52</td>\n",
+ " <td>...</td>\n",
+ " <td>0.365248</td>\n",
+ " <td>0.405476</td>\n",
+ " <td>0.481618</td>\n",
+ " <td>0.524510</td>\n",
+ " <td>0.609091</td>\n",
+ " <td>0.538776</td>\n",
+ " <td>0.691542</td>\n",
+ " <td>0.83</td>\n",
+ " <td>0.457831</td>\n",
+ " <td>0.760234</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Robin-v2</th>\n",
+ " <td>https://huggingface.co/HanningZhang/Robin-v2</td>\n",
+ " <td>HanningZhang/Robin-v2</td>\n",
+ " <td>NaN</td>\n",
+ " <td>0.392680</td>\n",
+ " <td>0.435154</td>\n",
+ " <td>0.545310</td>\n",
+ " <td>0.32</td>\n",
+ " <td>0.437037</td>\n",
+ " <td>0.335526</td>\n",
+ " <td>0.46</td>\n",
+ " <td>...</td>\n",
+ " <td>0.290780</td>\n",
+ " <td>0.302477</td>\n",
+ " <td>0.382353</td>\n",
+ " <td>0.374183</td>\n",
+ " <td>0.445455</td>\n",
+ " <td>0.326531</td>\n",
+ " <td>0.457711</td>\n",
+ " <td>0.59</td>\n",
+ " <td>0.379518</td>\n",
+ " <td>0.590643</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>CodeUp-Llama-2-13b-chat-hf</th>\n",
+ " <td>https://huggingface.co/deepse/CodeUp-Llama-2-1...</td>\n",
+ " <td>deepse/CodeUp-Llama-2-13b-chat-hf</td>\n",
+ " <td>13.0</td>\n",
+ " <td>0.546262</td>\n",
+ " <td>0.558020</td>\n",
+ " <td>0.629257</td>\n",
+ " <td>0.31</td>\n",
+ " <td>0.474074</td>\n",
+ " <td>0.546053</td>\n",
+ " <td>0.53</td>\n",
+ " <td>...</td>\n",
+ " <td>0.390071</td>\n",
+ " <td>0.391786</td>\n",
+ " <td>0.500000</td>\n",
+ " <td>0.544118</td>\n",
+ " <td>0.663636</td>\n",
+ " <td>0.636735</td>\n",
+ " <td>0.751244</td>\n",
+ " <td>0.81</td>\n",
+ " <td>0.481928</td>\n",
+ " <td>0.730994</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Hermes-Platypus2-mini-7B</th>\n",
+ " <td>https://huggingface.co/edor/Hermes-Platypus2-m...</td>\n",
+ " <td>edor/Hermes-Platypus2-mini-7B</td>\n",
+ " <td>7.0</td>\n",
+ " <td>0.470828</td>\n",
+ " <td>0.523038</td>\n",
+ " <td>0.601573</td>\n",
+ " <td>0.33</td>\n",
+ " <td>0.488889</td>\n",
+ " <td>0.421053</td>\n",
+ " <td>0.48</td>\n",
+ " <td>...</td>\n",
+ " <td>0.390071</td>\n",
+ " <td>0.353977</td>\n",
+ " <td>0.470588</td>\n",
+ " <td>0.446078</td>\n",
+ " <td>0.518182</td>\n",
+ " <td>0.563265</td>\n",
+ " <td>0.621891</td>\n",
+ " <td>0.68</td>\n",
+ " <td>0.421687</td>\n",
+ " <td>0.637427</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Stable-Platypus2-mini-7B</th>\n",
+ " <td>https://huggingface.co/edor/Stable-Platypus2-m...</td>\n",
+ " <td>edor/Stable-Platypus2-mini-7B</td>\n",
+ " <td>7.0</td>\n",
+ " <td>0.517800</td>\n",
+ " <td>0.523891</td>\n",
+ " <td>0.596594</td>\n",
+ " <td>0.37</td>\n",
+ " <td>0.488889</td>\n",
+ " <td>0.407895</td>\n",
+ " <td>0.50</td>\n",
+ " <td>...</td>\n",
+ " <td>0.390071</td>\n",
+ " <td>0.391786</td>\n",
+ " <td>0.518382</td>\n",
+ " <td>0.509804</td>\n",
+ " <td>0.618182</td>\n",
+ " <td>0.657143</td>\n",
+ " <td>0.631841</td>\n",
+ " <td>0.73</td>\n",
+ " <td>0.427711</td>\n",
+ " <td>0.695906</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>llava-v1.5-13b-hf</th>\n",
+ " <td>https://huggingface.co/Community-LM/llava-v1.5...</td>\n",
+ " <td>Community-LM/llava-v1.5-13b-hf</td>\n",
+ " <td>13.0</td>\n",
+ " <td>0.568868</td>\n",
+ " <td>0.532423</td>\n",
+ " <td>0.601175</td>\n",
+ " <td>0.30</td>\n",
+ " <td>0.496296</td>\n",
+ " <td>0.585526</td>\n",
+ " <td>0.67</td>\n",
+ " <td>...</td>\n",
+ " <td>0.407801</td>\n",
+ " <td>0.415906</td>\n",
+ " <td>0.547794</td>\n",
+ " <td>0.578431</td>\n",
+ " <td>0.600000</td>\n",
+ " <td>0.653061</td>\n",
+ " <td>0.761194</td>\n",
+ " <td>0.81</td>\n",
+ " <td>0.506024</td>\n",
+ " <td>0.795322</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>1121 rows × 63 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " URL \\\n",
+ "SparseOPT-1.3B https://huggingface.co/shaohang/SparseOPT-1.3B \n",
+ "Athena-v1 https://huggingface.co/IkariDev/Athena-v1 \n",
+ "Athena-tmp https://huggingface.co/IkariDev/Athena-tmp \n",
+ "13B-Legerdemain-L2 https://huggingface.co/CalderaAI/13B-Legerdema... \n",
+ "13B-Ouroboros https://huggingface.co/CalderaAI/13B-Ouroboros \n",
+ "... ... \n",
+ "Robin-v2 https://huggingface.co/HanningZhang/Robin-v2 \n",
+ "CodeUp-Llama-2-13b-chat-hf https://huggingface.co/deepse/CodeUp-Llama-2-1... \n",
+ "Hermes-Platypus2-mini-7B https://huggingface.co/edor/Hermes-Platypus2-m... \n",
+ "Stable-Platypus2-mini-7B https://huggingface.co/edor/Stable-Platypus2-m... \n",
+ "llava-v1.5-13b-hf https://huggingface.co/Community-LM/llava-v1.5... \n",
+ "\n",
+ " full_model_name Parameters \\\n",
+ "SparseOPT-1.3B shaohang/SparseOPT-1.3B 1.3 \n",
+ "Athena-v1 IkariDev/Athena-v1 NaN \n",
+ "Athena-tmp IkariDev/Athena-tmp NaN \n",
+ "13B-Legerdemain-L2 CalderaAI/13B-Legerdemain-L2 13.0 \n",
+ "13B-Ouroboros CalderaAI/13B-Ouroboros 13.0 \n",
+ "... ... ... \n",
+ "Robin-v2 HanningZhang/Robin-v2 NaN \n",
+ "CodeUp-Llama-2-13b-chat-hf deepse/CodeUp-Llama-2-13b-chat-hf 13.0 \n",
+ "Hermes-Platypus2-mini-7B edor/Hermes-Platypus2-mini-7B 7.0 \n",
+ "Stable-Platypus2-mini-7B edor/Stable-Platypus2-mini-7B 7.0 \n",
+ "llava-v1.5-13b-hf Community-LM/llava-v1.5-13b-hf 13.0 \n",
+ "\n",
+ " MMLU_average arc:challenge|25 hellaswag|10 \\\n",
+ "SparseOPT-1.3B 0.255963 0.240614 0.383689 \n",
+ "Athena-v1 0.556052 0.560580 0.631548 \n",
+ "Athena-tmp 0.588685 0.567406 0.621888 \n",
+ "13B-Legerdemain-L2 0.560030 0.573379 0.635431 \n",
+ "13B-Ouroboros 0.514311 0.560580 0.624378 \n",
+ "... ... ... ... \n",
+ "Robin-v2 0.392680 0.435154 0.545310 \n",
+ "CodeUp-Llama-2-13b-chat-hf 0.546262 0.558020 0.629257 \n",
+ "Hermes-Platypus2-mini-7B 0.470828 0.523038 0.601573 \n",
+ "Stable-Platypus2-mini-7B 0.517800 0.523891 0.596594 \n",
+ "llava-v1.5-13b-hf 0.568868 0.532423 0.601175 \n",
+ "\n",
+ " MMLU_abstract_algebra MMLU_anatomy \\\n",
+ "SparseOPT-1.3B 0.22 0.214815 \n",
+ "Athena-v1 0.31 0.496296 \n",
+ "Athena-tmp 0.29 0.518519 \n",
+ "13B-Legerdemain-L2 0.36 0.525926 \n",
+ "13B-Ouroboros 0.31 0.466667 \n",
+ "... ... ... \n",
+ "Robin-v2 0.32 0.437037 \n",
+ "CodeUp-Llama-2-13b-chat-hf 0.31 0.474074 \n",
+ "Hermes-Platypus2-mini-7B 0.33 0.488889 \n",
+ "Stable-Platypus2-mini-7B 0.37 0.488889 \n",
+ "llava-v1.5-13b-hf 0.30 0.496296 \n",
+ "\n",
+ " MMLU_astronomy MMLU_business_ethics ... \\\n",
+ "SparseOPT-1.3B 0.157895 0.20 ... \n",
+ "Athena-v1 0.526316 0.58 ... \n",
+ "Athena-tmp 0.638158 0.62 ... \n",
+ "13B-Legerdemain-L2 0.572368 0.53 ... \n",
+ "13B-Ouroboros 0.506579 0.52 ... \n",
+ "... ... ... ... \n",
+ "Robin-v2 0.335526 0.46 ... \n",
+ "CodeUp-Llama-2-13b-chat-hf 0.546053 0.53 ... \n",
+ "Hermes-Platypus2-mini-7B 0.421053 0.48 ... \n",
+ "Stable-Platypus2-mini-7B 0.407895 0.50 ... \n",
+ "llava-v1.5-13b-hf 0.585526 0.67 ... \n",
+ "\n",
+ " MMLU_professional_accounting \\\n",
+ "SparseOPT-1.3B 0.262411 \n",
+ "Athena-v1 0.404255 \n",
+ "Athena-tmp 0.450355 \n",
+ "13B-Legerdemain-L2 0.429078 \n",
+ "13B-Ouroboros 0.365248 \n",
+ "... ... \n",
+ "Robin-v2 0.290780 \n",
+ "CodeUp-Llama-2-13b-chat-hf 0.390071 \n",
+ "Hermes-Platypus2-mini-7B 0.390071 \n",
+ "Stable-Platypus2-mini-7B 0.390071 \n",
+ "llava-v1.5-13b-hf 0.407801 \n",
+ "\n",
+ " MMLU_professional_law MMLU_professional_medicine \\\n",
+ "SparseOPT-1.3B 0.238592 0.448529 \n",
+ "Athena-v1 0.392438 0.525735 \n",
+ "Athena-tmp 0.462842 0.569853 \n",
+ "13B-Legerdemain-L2 0.424381 0.522059 \n",
+ "13B-Ouroboros 0.405476 0.481618 \n",
+ "... ... ... \n",
+ "Robin-v2 0.302477 0.382353 \n",
+ "CodeUp-Llama-2-13b-chat-hf 0.391786 0.500000 \n",
+ "Hermes-Platypus2-mini-7B 0.353977 0.470588 \n",
+ "Stable-Platypus2-mini-7B 0.391786 0.518382 \n",
+ "llava-v1.5-13b-hf 0.415906 0.547794 \n",
+ "\n",
+ " MMLU_professional_psychology \\\n",
+ "SparseOPT-1.3B 0.254902 \n",
+ "Athena-v1 0.540850 \n",
+ "Athena-tmp 0.588235 \n",
+ "13B-Legerdemain-L2 0.532680 \n",
+ "13B-Ouroboros 0.524510 \n",
+ "... ... \n",
+ "Robin-v2 0.374183 \n",
+ "CodeUp-Llama-2-13b-chat-hf 0.544118 \n",
+ "Hermes-Platypus2-mini-7B 0.446078 \n",
+ "Stable-Platypus2-mini-7B 0.509804 \n",
+ "llava-v1.5-13b-hf 0.578431 \n",
+ "\n",
+ " MMLU_public_relations MMLU_security_studies \\\n",
+ "SparseOPT-1.3B 0.236364 0.171429 \n",
+ "Athena-v1 0.645455 0.640816 \n",
+ "Athena-tmp 0.645455 0.653061 \n",
+ "13B-Legerdemain-L2 0.609091 0.636735 \n",
+ "13B-Ouroboros 0.609091 0.538776 \n",
+ "... ... ... \n",
+ "Robin-v2 0.445455 0.326531 \n",
+ "CodeUp-Llama-2-13b-chat-hf 0.663636 0.636735 \n",
+ "Hermes-Platypus2-mini-7B 0.518182 0.563265 \n",
+ "Stable-Platypus2-mini-7B 0.618182 0.657143 \n",
+ "llava-v1.5-13b-hf 0.600000 0.653061 \n",
+ "\n",
+ " MMLU_sociology MMLU_us_foreign_policy \\\n",
+ "SparseOPT-1.3B 0.228856 0.27 \n",
+ "Athena-v1 0.751244 0.83 \n",
+ "Athena-tmp 0.721393 0.81 \n",
+ "13B-Legerdemain-L2 0.766169 0.87 \n",
+ "13B-Ouroboros 0.691542 0.83 \n",
+ "... ... ... \n",
+ "Robin-v2 0.457711 0.59 \n",
+ "CodeUp-Llama-2-13b-chat-hf 0.751244 0.81 \n",
+ "Hermes-Platypus2-mini-7B 0.621891 0.68 \n",
+ "Stable-Platypus2-mini-7B 0.631841 0.73 \n",
+ "llava-v1.5-13b-hf 0.761194 0.81 \n",
+ "\n",
+ " MMLU_virology MMLU_world_religions \n",
+ "SparseOPT-1.3B 0.283133 0.216374 \n",
+ "Athena-v1 0.493976 0.725146 \n",
+ "Athena-tmp 0.463855 0.801170 \n",
+ "13B-Legerdemain-L2 0.427711 0.777778 \n",
+ "13B-Ouroboros 0.457831 0.760234 \n",
+ "... ... ... \n",
+ "Robin-v2 0.379518 0.590643 \n",
+ "CodeUp-Llama-2-13b-chat-hf 0.481928 0.730994 \n",
+ "Hermes-Platypus2-mini-7B 0.421687 0.637427 \n",
+ "Stable-Platypus2-mini-7B 0.427711 0.695906 \n",
+ "llava-v1.5-13b-hf 0.506024 0.795322 \n",
+ "\n",
+ "[1121 rows x 63 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = result.data\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
  {
  "cell_type": "code",
  "execution_count": null,