puneeshkhanna committed
Commit 7c7b3f8
Parent(s): 2f84a59

Update README.md


remove qwen2 and only have qwen2.5

Files changed (1)
  1. README.md +7 -20
README.md CHANGED
@@ -25,7 +25,7 @@ Falcon3-7B-Base supports 4 languages (english, french, spanish, portuguese) and
 - Wider head dimension: 256
 - High RoPE value to support long context understanding: 1000042
 - 32k context length
-- 131k vocab size
+- 131K vocab size
 - Pretrained on 14 Gigatokens of datasets comprising of web, code, STEM, high quality and mutlilingual data using 2048 H100 GPU chips
 - Supports EN, FR, ES, PT
 - Developed by [Technology Innovation Institute](https://www.tii.ae)
@@ -76,7 +76,6 @@ We report in the following table our internal pipeline benchmarks:
 <th>Category</th>
 <th>Benchmark</th>
 <th>Llama3.1-8B</th>
-<th>Qwen2-7B</th>
 <th>Qwen2.5-7B</th>
 <th>gemma-2-9b</th>
 <th>Falcon3-7B-Base</th>
@@ -87,7 +86,6 @@ We report in the following table our internal pipeline benchmarks:
 <td rowspan="3">General</td>
 <td>MMLU (5-shot)</td>
 <td>65.2</td>
-<td>70.4</td>
 <td>74.2</td>
 <td>-</td>
 <td>67.5</td>
@@ -95,7 +93,6 @@ We report in the following table our internal pipeline benchmarks:
 <tr>
 <td>MMLU-PRO (5-shot)</td>
 <td>32.7</td>
-<td>42.1</td>
 <td>43.5</td>
 <td>-</td>
 <td>39.2</td>
@@ -103,7 +100,6 @@ We report in the following table our internal pipeline benchmarks:
 <tr>
 <td>IFEval</td>
 <td>12.0</td>
-<td>30.6</td>
 <td>33.9</td>
 <td>-</td>
 <td>34.3</td>
@@ -112,15 +108,13 @@ We report in the following table our internal pipeline benchmarks:
 <td rowspan="2">Math</td>
 <td>GSM8K (5-shot)</td>
 <td>49.4</td>
-<td>77.9</td>
 <td>82.9</td>
-<td>-</td>
+<td>69.1</td>
 <td>76.2</td>
 </tr>
 <tr>
 <td>MATH(4-shot)</td>
 <td>4.1</td>
-<td>17.5</td>
 <td>15.5</td>
 <td>-</td>
 <td>18.0</td>
@@ -129,15 +123,13 @@ We report in the following table our internal pipeline benchmarks:
 <td rowspan="4">Reasoning</td>
 <td>Arc Challenge (25-shot)</td>
 <td>53.4</td>
-<td>57.4</td>
 <td>59.0</td>
-<td>-</td>
+<td>63.7</td>
 <td>59.6</td>
 </tr>
 <tr>
 <td>GPQA (0-shot)</td>
 <td>31.0</td>
-<td>31.9</td>
 <td>33.0</td>
 <td>-</td>
 <td>35.5</td>
@@ -145,7 +137,6 @@ We report in the following table our internal pipeline benchmarks:
 <tr>
 <td>MUSR (0-shot)</td>
 <td>38.0</td>
-<td>44.1</td>
 <td>44.2</td>
 <td>-</td>
 <td>47.3</td>
@@ -153,7 +144,6 @@ We report in the following table our internal pipeline benchmarks:
 <tr>
 <td>BBH (3-shot)</td>
 <td>46.5</td>
-<td>53.3</td>
 <td>54.0</td>
 <td>-</td>
 <td>51.0</td>
@@ -162,25 +152,22 @@ We report in the following table our internal pipeline benchmarks:
 <td rowspan="4">CommonSense Understanding</td>
 <td>PIQA (0-shot)</td>
 <td>80.3</td>
-<td>79.8</td>
 <td>78.7</td>
-<td>-</td>
+<td>81.4</td>
 <td>77.7</td>
 </tr>
 <tr>
 <td>SciQ (0-shot)</td>
 <td>96.3</td>
-<td>95.9</td>
 <td>96.6</td>
-<td>-</td>
+<td>97.2</td>
 <td>95.3</td>
 </tr>
 <tr>
 <td>Winogrande (0-shot)</td>
 <td>74.0</td>
-<td>72.1</td>
 <td>72.9</td>
-<td>-</td>
+<td>74.2</td>
 <td>71.0</td>
 </tr>
 <tr>
@@ -188,7 +175,7 @@ We report in the following table our internal pipeline benchmarks:
 <td>33.4</td>
 <td>35.2</td>
 <td>33.6</td>
-<td>-</td>
+<td>34.0</td>
 <td>31.4</td>
 </tr>
 </tbody>
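
For reference, the spec bullets touched by the first hunk (head dimension, RoPE value, context length, vocab size) can be sanity-checked against the published model config. A minimal sketch, assuming the public repo id `tiiuae/Falcon3-7B-Base` and Llama-style config attribute names (`rope_theta`, `max_position_embeddings`, `vocab_size`) — both assumptions on my part, not stated in this commit:

```python
# Sanity-check the card's spec bullets against the published config.
# Assumptions: the repo id is tiiuae/Falcon3-7B-Base and the config is
# Llama-style (rope_theta, max_position_embeddings, vocab_size fields).
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("tiiuae/Falcon3-7B-Base")

# Head dim may be stored directly; otherwise derive it from hidden size.
head_dim = getattr(cfg, "head_dim", None) or cfg.hidden_size // cfg.num_attention_heads

print("head dim:      ", head_dim)                     # card says 256
print("rope theta:    ", cfg.rope_theta)               # card says 1000042
print("context length:", cfg.max_position_embeddings)  # card says 32k
print("vocab size:    ", cfg.vocab_size)               # card says 131K
```

The card's "131K" is shorthand, so the exact `vocab_size` the config reports is the number to compare against.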