ekurtic committed
Commit 356f10f
1 parent: 32b6206

add openllm-v2

Files changed (1): README.md (+45, -24)
README.md CHANGED
@@ -162,11 +162,44 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
  <tr>
   <td><strong>Arena Hard</strong>
   </td>
-  <td>85.0
+  <td><strong>85.0</strong>
   </td>
-  <td>84.5
+  <td><strong>84.5</strong>
   </td>
-  <td>99.41%
+  <td><strong>99.41%</strong>
+  </td>
+ </tr>
+ <tr>
+  <td><strong>OpenLLM Leaderboard v1</strong>
+  </td>
+  <td><strong>80.13</strong>
+  </td>
+  <td><strong>80.29</strong>
+  </td>
+  <td><strong>100.2%</strong>
+  </td>
+ </tr>
+ <tr>
+  <td><strong>OpenLLM Leaderboard v2</strong>
+  </td>
+  <td><strong>40.25</strong>
+  </td>
+  <td><strong>39.82</strong>
+  </td>
+  <td><strong>98.93%</strong>
+  </td>
+ </tr>
+</table>
+
+<table>
+ <tr>
+  <td><strong>Benchmark (per-task breakdown)</strong>
+  </td>
+  <td><strong>nvidia/Llama-3.1-Nemotron-70B-Instruct-HF</strong>
+  </td>
+  <td><strong>neuralmagic/Llama-3.1-Nemotron-70B-Instruct-HF-FP8-dynamic (this model)</strong>
+  </td>
+  <td><strong>Recovery</strong>
   </td>
  </tr>
  <tr>
@@ -243,7 +276,6 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
   <td>102.05%
   </td>
  </tr>
- <tr>
   <td><strong>Average</strong>
   </td>
   <td><strong>80.13</strong>
@@ -252,7 +284,6 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
   </td>
   <td><strong>100.2%</strong>
   </td>
- </tr>
  <tr>
   <td><strong>OpenLLM v2</strong>
   </td>
@@ -260,11 +291,11 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
  <tr>
   <td>MMLU-Pro (5-shot)
   </td>
-  <td>ToDo
+  <td>43.45
   </td>
-  <td>ToDo
+  <td>42.99
   </td>
-  <td>ToDo
+  <td>98.94%
   </td>
  </tr>
  <tr>
@@ -280,11 +311,11 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
  <tr>
   <td>BBH (3-shot)
   </td>
-  <td>ToDo
+  <td>47.12
   </td>
-  <td>ToDo
+  <td>46.88
   </td>
-  <td>ToDo
+  <td>99.5%
   </td>
  </tr>
  <tr>
@@ -297,16 +328,6 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
   <td>91.32%
   </td>
  </tr>
- <tr>
-  <td>GPQA (0-shot)
-  </td>
-  <td>34.05
-  </td>
-  <td>35.97
-  </td>
-  <td>105.63%
-  </td>
- </tr>
  <tr>
   <td>MuSR (0-shot)
   </td>
@@ -320,11 +341,11 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
  <tr>
   <td><strong>Average</strong>
   </td>
-  <td><strong>ToDo</strong>
+  <td><strong>40.25</strong>
   </td>
-  <td><strong>ToDo</strong>
+  <td><strong>39.82</strong>
   </td>
-  <td><strong>ToDo</strong>
+  <td><strong>98.93%</strong>
   </td>
  </tr>
 </table>
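
A quick note for readers comparing the new numbers: the Recovery column in both tables is consistent with the quantized model's score divided by the unquantized baseline's score, expressed as a percentage. The sketch below restates a few (baseline, quantized) pairs from the tables and recomputes Recovery under that assumption; the formula is inferred from the data, not stated anywhere in this commit.

```python
# Sanity check for the "Recovery" column.
# Assumption (inferred, not stated in the commit): recovery = quantized / baseline * 100.
# All score pairs below are copied verbatim from the tables in this commit.

scores = {
    #                          (baseline, FP8-dynamic)   table's Recovery
    "Arena Hard":              (85.00, 84.50),         # 99.41%
    "OpenLLM Leaderboard v1":  (80.13, 80.29),         # 100.2%
    "OpenLLM Leaderboard v2":  (40.25, 39.82),         # 98.93%
    "MMLU-Pro (5-shot)":       (43.45, 42.99),         # 98.94%
    "BBH (3-shot)":            (47.12, 46.88),         # 99.5%
}

for task, (baseline, quantized) in scores.items():
    recovery = 100.0 * quantized / baseline
    print(f"{task:24s} {recovery:6.2f}%")
```

Rounded to two decimals this reproduces every Recovery value above (BBH comes out as 99.49%, shown in the table as 99.5%), including the entries above 100% where the FP8 model slightly outscores the baseline.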