DhiyaEddine wdevazelhes committed on
Commit
7f94a35
1 Parent(s): cc56a5a

add scores (#1)

Browse files

- add scores (5a0835aa8b06429472dd5971641cce9645a6f3aa)


Co-authored-by: William de Vazelhes <wdevazelhes@users.noreply.huggingface.co>

Files changed (1) hide show
  1. README.md +79 -64
README.md CHANGED
@@ -128,125 +128,140 @@ Falcon3-7B is trained on 256 H100 nodes (world size 2048).
128
  | | | during the training |
129
 
130
  # Evaluation
131
-
132
  <table border="1" style="width: 100%; text-align: center; border-collapse: collapse;">
133
  <colgroup>
134
- <col style="width: 10%;">
135
- <col style="width: 10%;">
136
- <col style="width: 7%;">
137
- <col style="width: 7%;">
138
- <col style="width: 7%;">
139
- <col style="background-color: rgba(80, 15, 213, 0.5); width: 7%;">
 
140
  </colgroup>
141
  <thead>
142
  <tr>
143
  <th>Category</th>
144
  <th>Benchmark</th>
145
- <th>Llama3.1-8B</th>
146
- <th>Qwen2-7B</th>
147
- <th>Qwen2.5-7B</th>
148
- <th>Falcon3-7B-Base</th>
 
149
  </tr>
150
  </thead>
151
  <tbody>
152
  <tr>
153
  <td rowspan="3">General</td>
154
  <td>MMLU (5-shot)</td>
155
- <td>65.2</td>
156
- <td>70.4</td>
157
- <td>74.2</td>
158
- <td>67.5</td>
 
159
  </tr>
160
  <tr>
161
  <td>MMLU-PRO (5-shot)</td>
162
- <td>32.7</td>
163
- <td>42.1</td>
164
- <td>43.5</td>
165
- <td>39.2</td>
 
166
  </tr>
167
  <tr>
168
  <td>IFEval</td>
169
- <td>12.0</td>
170
- <td>30.6</td>
171
- <td>33.9</td>
172
- <td>34.3</td>
 
173
  </tr>
174
  <tr>
175
  <td rowspan="2">Math</td>
176
  <td>GSM8K (5-shot)</td>
177
- <td>49.4</td>
178
- <td>77.9</td>
179
- <td>82.9</td>
180
- <td>76.2</td>
 
181
  </tr>
182
  <tr>
183
- <td>MATH(4-shot)</td>
184
- <td>4.1</td>
185
- <td>17.5</td>
186
- <td>15.5</td>
187
- <td>18.0</td>
 
188
  </tr>
189
  <tr>
190
  <td rowspan="4">Reasoning</td>
191
  <td>Arc Challenge (25-shot)</td>
192
- <td>53.4</td>
193
- <td>57.4</td>
194
- <td>59.0</td>
195
- <td>59.6</td>
 
196
  </tr>
197
  <tr>
198
  <td>GPQA (0-shot)</td>
199
- <td>31.0</td>
200
- <td>31.9</td>
201
- <td>33.0</td>
202
- <td>35.5</td>
 
203
  </tr>
204
  <tr>
205
  <td>MUSR (0-shot)</td>
206
- <td>38.0</td>
207
- <td>44.1</td>
208
- <td>44.2</td>
209
- <td>47.3</td>
 
210
  </tr>
211
  <tr>
212
  <td>BBH (3-shot)</td>
213
- <td>46.5</td>
214
- <td>53.3</td>
215
- <td>54.0</td>
216
- <td>51.0</td>
 
217
  </tr>
218
  <tr>
219
  <td rowspan="4">CommonSense Understanding</td>
220
  <td>PIQA (0-shot)</td>
221
- <td>80.3</td>
222
- <td>79.8</td>
223
- <td>78.7</td>
224
- <td>77.7</td>
 
225
  </tr>
226
  <tr>
227
  <td>SciQ (0-shot)</td>
228
- <td>96.3</td>
229
- <td>95.9</td>
230
- <td>96.6</td>
231
- <td>95.3</td>
 
232
  </tr>
233
  <tr>
234
  <td>Winogrande (0-shot)</td>
235
- <td>74.0</td>
236
- <td>72.1</td>
237
- <td>72.9</td>
238
- <td>71.0</td>
 
239
  </tr>
240
  <tr>
241
  <td>OpenbookQA (0-shot)</td>
242
- <td>33.4</td>
243
- <td>35.2</td>
244
- <td>33.6</td>
245
- <td>31.4</td>
 
246
  </tr>
247
  </tbody>
248
  </table>
249
 
250
 
251
 
 
252
  # Citation
 
128
  | | | during the training |
129
 
130
  # Evaluation
 
131
  <table border="1" style="width: 100%; text-align: center; border-collapse: collapse;">
132
  <colgroup>
133
+ <col style="width: 15%;">
134
+ <col style="width: 15%;">
135
+ <col style="width: 14%;">
136
+ <col style="width: 14%;">
137
+ <col style="width: 14%;">
138
+ <col style="width: 14%;">
139
+ <col style="background-color: rgba(80, 15, 213, 0.5); width: 14%;">
140
  </colgroup>
141
  <thead>
142
  <tr>
143
  <th>Category</th>
144
  <th>Benchmark</th>
145
+ <th>meta-llama/Llama-3.2-1B</th>
146
+ <th>Qwen/Qwen2.5-1.5B</th>
147
+ <th>HuggingFaceTB/SmolLM2-1.7B</th>
148
+ <th>google/gemma-2-2b</th>
149
+ <th>Falcon3-1B-Base</th>
150
  </tr>
151
  </thead>
152
  <tbody>
153
  <tr>
154
  <td rowspan="3">General</td>
155
  <td>MMLU (5-shot)</td>
156
+ <td>31.1</td>
157
+ <td>61</td>
158
+ <td>50.2</td>
159
+ <td>53.1</td>
160
+ <td>42.5</td>
161
  </tr>
162
  <tr>
163
  <td>MMLU-PRO (5-shot)</td>
164
+ <td>11.7</td>
165
+ <td>28.5</td>
166
+ <td>21.4</td>
167
+ <td>22.1</td>
168
+ <td>16.2</td>
169
  </tr>
170
  <tr>
171
  <td>IFEval</td>
172
+ <td>14.9</td>
173
+ <td>26.1</td>
174
+ <td>24.2</td>
175
+ <td>20.4</td>
176
+ <td>25.3</td>
177
  </tr>
178
  <tr>
179
  <td rowspan="2">Math</td>
180
  <td>GSM8K (5-shot)</td>
181
+ <td>6.6</td>
182
+ <td>62.3</td>
183
+ <td>31.1</td>
184
+ <td>25.6</td>
185
+ <td>34.3</td>
186
  </tr>
187
  <tr>
188
+ <td>MATH (4-shot)</td>
189
+ <td>0.3</td>
190
+ <td>6.8</td>
191
+ <td>1.5</td>
192
+ <td>2.6</td>
193
+ <td>2.2</td>
194
  </tr>
195
  <tr>
196
  <td rowspan="4">Reasoning</td>
197
  <td>Arc Challenge (25-shot)</td>
198
+ <td>40.2</td>
199
+ <td>54.8</td>
200
+ <td>54.1</td>
201
+ <td>53.7</td>
202
+ <td>48.2</td>
203
  </tr>
204
  <tr>
205
  <td>GPQA (0-shot)</td>
206
+ <td>24.3</td>
207
+ <td>28.2</td>
208
+ <td>28.9</td>
209
+ <td>25.5</td>
210
+ <td>28.1</td>
211
  </tr>
212
  <tr>
213
  <td>MUSR (0-shot)</td>
214
+ <td>34.5</td>
215
+ <td>35.5</td>
216
+ <td>34.8</td>
217
+ <td>42.8</td>
218
+ <td>41.9</td>
219
  </tr>
220
  <tr>
221
  <td>BBH (3-shot)</td>
222
+ <td>31.2</td>
223
+ <td>41.1</td>
224
+ <td>34.3</td>
225
+ <td>36.8</td>
226
+ <td>36.1</td>
227
  </tr>
228
  <tr>
229
  <td rowspan="4">CommonSense Understanding</td>
230
  <td>PIQA (0-shot)</td>
231
+ <td>74.6</td>
232
+ <td>76</td>
233
+ <td>77.5</td>
234
+ <td>79.2</td>
235
+ <td>74.5</td>
236
  </tr>
237
  <tr>
238
  <td>SciQ (0-shot)</td>
239
+ <td>88.5</td>
240
+ <td>93.1</td>
241
+ <td>90.8</td>
242
+ <td>95.7</td>
243
+ <td>91.1</td>
244
  </tr>
245
  <tr>
246
  <td>Winogrande (0-shot)</td>
247
+ <td>60.4</td>
248
+ <td>63</td>
249
+ <td>66.1</td>
250
+ <td>68.6</td>
251
+ <td>61.2</td>
252
  </tr>
253
  <tr>
254
  <td>OpenbookQA (0-shot)</td>
255
+ <td>37.4</td>
256
+ <td>40.4</td>
257
+ <td>44</td>
258
+ <td>41.8</td>
259
+ <td>41</td>
260
  </tr>
261
  </tbody>
262
  </table>
263
 
264
 
265
 
266
+
267
  # Citation