melaseddik committed on
Commit
0b20cce
1 Parent(s): dfd181f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +178 -0
README.md CHANGED
@@ -119,6 +119,184 @@ print(tokenizer.decode(outputs[0]))
119
 
120
  # Evaluation
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
 
124
  # Citation
 
119
 
120
  # Evaluation
121
 
122
+ <table border="1" style="width: 100%; text-align: center; border-collapse: collapse;">
123
+ <colgroup>
124
+ <col style="width: 10%;">
125
+ <col style="width: 10%;">
126
+ <col style="width: 7%;">
127
+ <col style="width: 7%;">
128
+ <col style="width: 7%;">
129
+ <col style="background-color: rgba(80, 15, 213, 0.5); width: 7%;">
130
+ <col style="width: 7%;">
131
+ <col style="width: 7%;">
132
+ <col style="width: 7%;">
133
+ <col style="background-color: rgba(80, 15, 213, 0.5); width: 7%;">
134
+ </colgroup>
135
+ <thead>
136
+ <tr>
137
+ <th>Category</th>
138
+ <th>Benchmark</th>
139
+ <th>Llama3.1-8B</th>
140
+ <th>Qwen2-7B</th>
141
+ <th>Qwen2.5-7B</th>
142
+ <th>Falcon3-7B-Base</th>
143
+ <th>Gemma2-9B</th>
144
+ <th>Yi1.5-9B</th>
145
+ <th>Mistral-NeMo-12B</th>
146
+ <th>Falcon3-10B-Base</th>
147
+ </tr>
148
+ </thead>
149
+ <tbody>
150
+ <tr>
151
+ <td rowspan="3">General</td>
152
+ <td>MMLU (5-shot)</td>
153
+ <td>65.2</td>
154
+ <td>70.4</td>
155
+ <td>74.2</td>
156
+ <td>67.5</td>
157
+ <td>0</td>
158
+ <td>69.6</td>
159
+ <td>68.8</td>
160
+ <td>73.1</td>
161
+ </tr>
162
+ <tr>
163
+ <td>MMLU-Pro (5-shot)</td>
164
+ <td>32.7</td>
165
+ <td>42.1</td>
166
+ <td>43.5</td>
167
+ <td>39.2</td>
168
+ <td>0</td>
169
+ <td>39.3</td>
170
+ <td>34.7</td>
171
+ <td>42.5</td>
172
+ </tr>
173
+ <tr>
174
+ <td>IFEval</td>
175
+ <td>12.0</td>
176
+ <td>30.6</td>
177
+ <td>33.9</td>
178
+ <td>34.3</td>
179
+ <td>0</td>
180
+ <td>29.1</td>
181
+ <td>16.1</td>
182
+ <td>36.4</td>
183
+ </tr>
184
+ <tr>
185
+ <td rowspan="2">Math</td>
186
+ <td>GSM8K (5-shot)</td>
187
+ <td>49.4</td>
188
+ <td>77.9</td>
189
+ <td>82.9</td>
190
+ <td>76.2</td>
191
+ <td>69.1</td>
192
+ <td>63.8</td>
193
+ <td>55.3</td>
194
+ <td>81.4</td>
195
+ </tr>
196
+ <tr>
197
+ <td>MATH (4-shot)</td>
198
+ <td>4.1</td>
199
+ <td>17.5</td>
200
+ <td>15.5</td>
201
+ <td>18.0</td>
202
+ <td>0</td>
203
+ <td>9.2</td>
204
+ <td>4.9</td>
205
+ <td>22.9</td>
206
+ </tr>
207
+ <tr>
208
+ <td rowspan="4">Reasoning</td>
209
+ <td>ARC Challenge (25-shot)</td>
210
+ <td>53.4</td>
211
+ <td>57.4</td>
212
+ <td>59.0</td>
213
+ <td>59.6</td>
214
+ <td>63.7</td>
215
+ <td>58.2</td>
216
+ <td>60.6</td>
217
+ <td>62.6</td>
218
+ </tr>
219
+ <tr>
220
+ <td>GPQA (0-shot)</td>
221
+ <td>31.0</td>
222
+ <td>31.9</td>
223
+ <td>33.0</td>
224
+ <td>35.5</td>
225
+ <td>0</td>
226
+ <td>36.6</td>
227
+ <td>28.8</td>
228
+ <td>34.1</td>
229
+ </tr>
230
+ <tr>
231
+ <td>MuSR (0-shot)</td>
232
+ <td>38.0</td>
233
+ <td>44.1</td>
234
+ <td>44.2</td>
235
+ <td>47.3</td>
236
+ <td>0</td>
237
+ <td>43.3</td>
238
+ <td>39.2</td>
239
+ <td>44.2</td>
240
+ </tr>
241
+ <tr>
242
+ <td>BBH (3-shot)</td>
243
+ <td>46.5</td>
244
+ <td>53.3</td>
245
+ <td>54.0</td>
246
+ <td>51.0</td>
247
+ <td>0</td>
248
+ <td>51.3</td>
249
+ <td>50.2</td>
250
+ <td>59.7</td>
251
+ </tr>
252
+ <tr>
253
+ <td rowspan="4">CommonSense Understanding</td>
254
+ <td>PIQA (0-shot)</td>
255
+ <td>80.3</td>
256
+ <td>79.8</td>
257
+ <td>78.7</td>
258
+ <td>77.7</td>
259
+ <td>81.4</td>
260
+ <td>79.8</td>
261
+ <td>81.4</td>
262
+ <td>79.1</td>
263
+ </tr>
264
+ <tr>
265
+ <td>SciQ (0-shot)</td>
266
+ <td>96.3</td>
267
+ <td>95.9</td>
268
+ <td>96.6</td>
269
+ <td>95.3</td>
270
+ <td>97.2</td>
271
+ <td>95.8</td>
272
+ <td>96.4</td>
273
+ <td>96.0</td>
274
+ </tr>
275
+ <tr>
276
+ <td>Winogrande (0-shot)</td>
277
+ <td>74.0</td>
278
+ <td>72.1</td>
279
+ <td>72.9</td>
280
+ <td>71.0</td>
281
+ <td>74.2</td>
282
+ <td>72.7</td>
283
+ <td>73.2</td>
284
+ <td>73.6</td>
285
+ </tr>
286
+ <tr>
287
+ <td>OpenbookQA (0-shot)</td>
288
+ <td>33.4</td>
289
+ <td>35.2</td>
290
+ <td>33.6</td>
291
+ <td>31.4</td>
292
+ <td>34.0</td>
293
+ <td>35.4</td>
294
+ <td>36.4</td>
295
+ <td>34.0</td>
296
+ </tr>
297
+ </tbody>
298
+ </table>
299
+
300
 
301
 
302
  # Citation