yzabc007 committed on
Commit
5e7bf95
1 Parent(s): 2642851

Update space

Browse files
src/results/models_2024-10-07-14:50:12.666068.jsonl ADDED
@@ -0,0 +1,677 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {"config": {
3
+ "model_name": "ChatGPT-4o-latest (2024-09-03)",
4
+ "organization": "OpenAI",
5
+ "license": "Proprietary",
6
+ "knowledge_cutoff": "2023-10"
7
+ },
8
+ "results": {
9
+ "math-algebra": {"Score": 99.19484702, "Avg Rank": 1.666666667, "Min Rank": 1, "Max Rank": 3},
10
+ "math-probability": {"Score": 100, "Avg Rank": 1, "Min Rank": 1, "Max Rank": 1},
11
+ "reasoning-logical": {"Avg Rank": 1, "Min Rank": 1, "Max Rank": 1},
12
+ "overall": {"Avg Rank": 2, "Min Rank": 2, "Max Rank": 2}
13
+ }},
14
+
15
+ {"config": {
16
+ "model_name": "gpt-4o-2024-08-06",
17
+ "organization": "OpenAI",
18
+ "license": "Proprietary",
19
+ "knowledge_cutoff": "2023-10"
20
+ },
21
+ "results": {
22
+ "math-algebra": {"Score": 98.38969404, "Avg Rank": 1.666666667, "Min Rank": 1, "Max Rank": 2},
23
+ "math-probability": {"Score": 96.49758454, "Avg Rank": 2.666666667, "Min Rank": 2, "Max Rank": 4},
24
+ "reasoning-logical": {"Avg Rank": 4.333333333, "Min Rank": 3, "Max Rank": 5},
25
+ "overall": {"Avg Rank": 7.33, "Min Rank": 4, "Max Rank": 9}
26
+ }},
27
+
28
+ {"config": {
29
+ "model_name": "gpt-4o-2024-05-13",
30
+ "organization": "OpenAI",
31
+ "license": "Proprietary",
32
+ "knowledge_cutoff": "2023-10"
33
+ },
34
+ "results": {
35
+ "math-algebra": {"Score": 98.15480333, "Avg Rank": 2.666666667, "Min Rank": 2, "Max Rank": 3},
36
+ "math-probability": {"Score": 94.83939431, "Avg Rank": 3.666666667, "Min Rank": 2, "Max Rank": 5},
37
+ "reasoning-logical": {"Avg Rank": 6.333333333, "Min Rank": 3, "Max Rank": 8},
38
+ "overall": {"Avg Rank": 7.67, "Min Rank": 7, "Max Rank": 9}
39
+ }},
40
+
41
+ {"config": {
42
+ "model_name": "gpt-4-turbo-2024-04-09",
43
+ "organization": "OpenAI",
44
+ "license": "Proprietary",
45
+ "knowledge_cutoff": "2023-12"
46
+ },
47
+ "results": {
48
+ "math-algebra": {"Score": 96.03195879, "Avg Rank": 4, "Min Rank": 4, "Max Rank": 4},
49
+ "math-probability": {"Score": 93.59903382, "Avg Rank": 6.666666667, "Min Rank": 6, "Max Rank": 8},
50
+ "reasoning-logical": {"Avg Rank": 4, "Min Rank": 2, "Max Rank": 7},
51
+ "overall": {"Avg Rank": 6, "Min Rank": 5, "Max Rank": 8}
52
+ }},
53
+
54
+ {"config": {
55
+ "model_name": "gemini-1.5-pro-001",
56
+ "organization": "Google",
57
+ "license": "Proprietary",
58
+ "knowledge_cutoff": "2024-01"
59
+ },
60
+ "results": {
61
+ "math-algebra": {"Score": 94.7572213, "Avg Rank": 5, "Min Rank": 5, "Max Rank": 5},
62
+ "math-probability": {"Score": 91.42512077, "Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 10},
63
+ "reasoning-logical": {"Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 11},
64
+ "overall": {"Avg Rank": 14, "Min Rank": 13, "Max Rank": 15}
65
+ }},
66
+
67
+ {"config": {
68
+ "model_name": "qwen2-72b-instruct",
69
+ "organization": "Alibaba",
70
+ "license": "Qianwen LICENSE",
71
+ "knowledge_cutoff": "2024-02"
72
+ },
73
+ "results": {
74
+ "math-algebra": {"Score": 93.88818605, "Avg Rank": 6, "Min Rank": 6, "Max Rank": 6},
75
+ "math-probability": {"Score": 91.54326174, "Avg Rank": 4, "Min Rank": 3, "Max Rank": 5},
76
+ "reasoning-logical": {"Avg Rank": 15.66666667, "Min Rank": 15, "Max Rank": 17},
77
+ "overall": {"Avg Rank": 17, "Min Rank": 17, "Max Rank": 17}
78
+ }},
79
+
80
+ {"config": {
81
+ "model_name": "gpt-4o-mini-2024-07-18",
82
+ "organization": "OpenAI",
83
+ "license": "Proprietary",
84
+ "knowledge_cutoff": "2024-07"
85
+ },
86
+ "results": {
87
+ "math-algebra": {"Score": 93.22073596, "Avg Rank": 7, "Min Rank": 7, "Max Rank": 7},
88
+ "math-probability": {"Score": 92.17351456, "Avg Rank": 3.666666667, "Min Rank": 3, "Max Rank": 5},
89
+ "reasoning-logical": {"Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 10},
90
+ "overall": {"Avg Rank": 7, "Min Rank": 5, "Max Rank": 8}
91
+ }},
92
+
93
+ {"config": {
94
+ "model_name": "claude-3.5-sonnet",
95
+ "organization": "Anthropic",
96
+ "license": "Proprietary",
97
+ "knowledge_cutoff": "2024-03"
98
+ },
99
+ "results": {
100
+ "math-algebra": {"Score": 91.5823805, "Avg Rank": 8.333333333, "Min Rank": 8, "Max Rank": 9},
101
+ "math-probability": {"Score": 91.55011915, "Avg Rank": 8, "Min Rank": 7, "Max Rank": 9},
102
+ "reasoning-logical": {"Avg Rank": 5, "Min Rank": 2, "Max Rank": 7},
103
+ "overall": {"Avg Rank": 5, "Min Rank": 4, "Max Rank": 7}
104
+ }},
105
+
106
+ {"config": {
107
+ "model_name": "o1-mini",
108
+ "organization": "OpenAI",
109
+ "license": "Proprietary",
110
+ "knowledge_cutoff": "2024-01"
111
+ },
112
+ "results": {
113
+ "math-algebra": null,
114
+ "math-probability": null,
115
+ "reasoning-logical": null,
116
+ "overall": {"Avg Rank": 1, "Min Rank": 1, "Max Rank": 1}
117
+ }},
118
+
119
+ {"config": {
120
+ "model_name": "o1-preview",
121
+ "organization": "OpenAI",
122
+ "license": "Proprietary",
123
+ "knowledge_cutoff": "2024-01"
124
+ },
125
+ "results": {
126
+ "math-algebra": null,
127
+ "math-probability": null,
128
+ "reasoning-logical": null,
129
+ "overall": {"Avg Rank": 3, "Min Rank": 3, "Max Rank": 3}
130
+ }},
131
+
132
+ {"config": {
133
+ "model_name": "gemini-1.5-flash-001",
134
+ "organization": "Google",
135
+ "license": "Proprietary",
136
+ "knowledge_cutoff": "2024-02"
137
+ },
138
+ "results": {
139
+ "math-algebra": {"Score": 91.30211121, "Avg Rank": 11, "Min Rank": 11, "Max Rank": 11},
140
+ "math-probability": {"Score": 91.066099, "Avg Rank": 12, "Min Rank": 10, "Max Rank": 13},
141
+ "reasoning-logical": {"Avg Rank": 15.66666667, "Min Rank": 15, "Max Rank": 16},
142
+ "overall": {"Avg Rank": 14, "Min Rank": 13, "Max Rank": 15}
143
+ }},
144
+
145
+ {"config": {
146
+ "model_name": "gpt4-1106",
147
+ "organization": "OpenAI",
148
+ "license": "Proprietary",
149
+ "knowledge_cutoff": "2024-04"
150
+ },
151
+ "results": {
152
+ "math-algebra": {"Score": 91.2227739, "Avg Rank": 12, "Min Rank": 12, "Max Rank": 12},
153
+ "math-probability": {"Score": 91.09550085, "Avg Rank": 11.66666667, "Min Rank": 11, "Max Rank": 12},
154
+ "reasoning-logical": {"Avg Rank": 12, "Min Rank": 12, "Max Rank": 12},
155
+ "overall": {"Avg Rank": 12, "Min Rank": 11, "Max Rank": 12}
156
+ }},
157
+
158
+ {"config": {
159
+ "model_name": "gemma-2-27b-it",
160
+ "organization": "Google",
161
+ "license": "Gemma License",
162
+ "knowledge_cutoff": "2024-03"
163
+ },
164
+ "results": {
165
+ "math-algebra": {"Score": 91.08554346, "Avg Rank": 13.33333333, "Min Rank": 13, "Max Rank": 14},
166
+ "math-probability": {"Score": 91.09516215, "Avg Rank": 14, "Min Rank": 14, "Max Rank": 14},
167
+ "reasoning-logical": {"Avg Rank": 13, "Min Rank": 13, "Max Rank": 13},
168
+ "overall": {"Avg Rank": 13, "Min Rank": 12, "Max Rank": 14}
169
+ }},
170
+
171
+ {"config": {
172
+ "model_name": "claude-3-opus",
173
+ "organization": "Anthropic",
174
+ "license": "Proprietary",
175
+ "knowledge_cutoff": "2024-01"
176
+ },
177
+ "results": {
178
+ "math-algebra": {"Score": 89.75345785, "Avg Rank": 13.66666667, "Min Rank": 13, "Max Rank": 14},
179
+ "math-probability": {"Score": 91.06939607, "Avg Rank": 11.33333333, "Min Rank": 11, "Max Rank": 12},
180
+ "reasoning-logical": {"Avg Rank": 10.66666667, "Min Rank": 10, "Max Rank": 11},
181
+ "overall": {"Avg Rank": 12, "Min Rank": 10, "Max Rank": 15}
182
+ }},
183
+
184
+ {"config": {
185
+ "model_name": "gemma-2-9b-it-simpo",
186
+ "organization": "Google",
187
+ "license": "Gemma License",
188
+ "knowledge_cutoff": "2024-02"
189
+ },
190
+ "results": {
191
+ "math-algebra": {"Score": 87.66368227, "Avg Rank": 15, "Min Rank": 15, "Max Rank": 15},
192
+ "math-probability": {"Score": 73.64665336, "Avg Rank": 17, "Min Rank": 17, "Max Rank": 17},
193
+ "reasoning-logical": {"Avg Rank": 19, "Min Rank": 19, "Max Rank": 19},
194
+ "overall": {"Avg Rank": 17, "Min Rank": 15, "Max Rank": 19}
195
+ }},
196
+
197
+ {"config": {
198
+ "model_name": "qwen1.5-72b-chat",
199
+ "organization": "Alibaba",
200
+ "license": "Qianwen LICENSE",
201
+ "knowledge_cutoff": "2024-03"
202
+ },
203
+ "results": {
204
+ "math-algebra": {"Score": 86.56207015, "Avg Rank": 16, "Min Rank": 16, "Max Rank": 16},
205
+ "math-probability": {"Score": 72.7735874, "Avg Rank": 21, "Min Rank": 20, "Max Rank": 22},
206
+ "reasoning-logical": {"Avg Rank": 29.66666667, "Min Rank": 28, "Max Rank": 31},
207
+ "overall": {"Avg Rank": 23, "Min Rank": 16, "Max Rank": 31}
208
+ }},
209
+
210
+ {"config": {
211
+ "model_name": "qwen1.5-32b-chat",
212
+ "organization": "Alibaba",
213
+ "license": "Qianwen LICENSE",
214
+ "knowledge_cutoff": "2024-03"
215
+ },
216
+ "results": {
217
+ "math-algebra": {"Score": 84.59439036, "Avg Rank": 17.33333333, "Min Rank": 17, "Max Rank": 18},
218
+ "math-probability": {"Score": 76.61348265, "Avg Rank": 22.33333333, "Min Rank": 22, "Max Rank": 23},
219
+ "reasoning-logical": {"Avg Rank": 28.66666667, "Min Rank": 27, "Max Rank": 30},
220
+ "overall": {"Avg Rank": 22, "Min Rank": 17, "Max Rank": 30}
221
+ }},
222
+
223
+ {"config": {
224
+ "model_name": "google-gemma-2-9b-it",
225
+ "organization": "Google",
226
+ "license": "Gemma License",
227
+ "knowledge_cutoff": "2024-01"
228
+ },
229
+ "results": {
230
+ "math-algebra": {"Score": 84.18901776, "Avg Rank": 18, "Min Rank": 17, "Max Rank": 19},
231
+ "math-probability": {"Score": 74.46332504, "Avg Rank": 16, "Min Rank": 16, "Max Rank": 16},
232
+ "reasoning-logical": {"Avg Rank": 14, "Min Rank": 14, "Max Rank": 14},
233
+ "overall": {"Avg Rank": 16, "Min Rank": 14, "Max Rank": 19}
234
+ }},
235
+
236
+ {"config": {
237
+ "model_name": "yi-1.5-34b-chat",
238
+ "organization": "01 AI",
239
+ "license": "Proprietary",
240
+ "knowledge_cutoff": "2024-01"
241
+ },
242
+ "results": {
243
+ "math-algebra": {"Score": 81.82921677, "Avg Rank": 18.66666667, "Min Rank": 18, "Max Rank": 19},
244
+ "math-probability": {"Score": 77.41945842, "Avg Rank": 15, "Min Rank": 15, "Max Rank": 15},
245
+ "reasoning-logical": {"Avg Rank": 17.33333333, "Min Rank": 17, "Max Rank": 18},
246
+ "overall": {"Avg Rank": 18, "Min Rank": 15, "Max Rank": 19}
247
+ }},
248
+
249
+ {"config": {
250
+ "model_name": "meta-llama-3.1-8b-instruct",
251
+ "organization": "Meta",
252
+ "license": "Llama 3.1 Community",
253
+ "knowledge_cutoff": "2024-02"
254
+ },
255
+ "results": {
256
+ "math-algebra": {"Score": 75.57121963, "Avg Rank": 20.33333333, "Min Rank": 20, "Max Rank": 21},
257
+ "math-probability": {"Score": 75.46243493, "Avg Rank": 20.33333333, "Min Rank": 20, "Max Rank": 21},
258
+ "reasoning-logical": {"Avg Rank": 23.66666667, "Min Rank": 23, "Max Rank": 24},
259
+ "overall": {"Avg Rank": 21, "Min Rank": 20, "Max Rank": 24}
260
+ }},
261
+
262
+ {"config": {
263
+ "model_name": "gpt3.5-turbo-0125",
264
+ "organization": "OpenAI",
265
+ "license": "Proprietary",
266
+ "knowledge_cutoff": "2023-12"
267
+ },
268
+ "results": {
269
+ "math-algebra": {"Score": 73.29235048, "Avg Rank": 21.33333333, "Min Rank": 21, "Max Rank": 22},
270
+ "math-probability": {"Score": 66.27452275, "Avg Rank": 24, "Min Rank": 24, "Max Rank": 24},
271
+ "reasoning-logical": {"Avg Rank": 42.66666667, "Min Rank": 42, "Max Rank": 44},
272
+ "overall": {"Avg Rank": 29, "Min Rank": 21, "Max Rank": 44}
273
+ }},
274
+
275
+ {"config": {
276
+ "model_name": "llama-3-70b-instruct",
277
+ "organization": "Meta",
278
+ "license": "Llama 3 Community",
279
+ "knowledge_cutoff": "2024-03"
280
+ },
281
+ "results": {
282
+ "math-algebra": {"Score": 73.75419539, "Avg Rank": 21.33333333, "Min Rank": 20, "Max Rank": 22},
283
+ "math-probability": {"Score": 87.86358478, "Avg Rank": 18.33333333, "Min Rank": 18, "Max Rank": 19},
284
+ "reasoning-logical": {"Avg Rank": 3.333333333, "Min Rank": 2, "Max Rank": 4},
285
+ "overall": {"Avg Rank": 15, "Min Rank": 3, "Max Rank": 22}
286
+ }},
287
+
288
+ {"config": {
289
+ "model_name": "claude-3-sonnet",
290
+ "organization": "Anthropic",
291
+ "license": "Proprietary",
292
+ "knowledge_cutoff": "2024-02"
293
+ },
294
+ "results": {
295
+ "math-algebra": {"Score": 71.15353833, "Avg Rank": 23, "Min Rank": 23, "Max Rank": 23},
296
+ "math-probability": {"Score": 88.02362801, "Avg Rank": 18.66666667, "Min Rank": 18, "Max Rank": 19},
297
+ "reasoning-logical": {"Avg Rank": 17.33333333, "Min Rank": 16, "Max Rank": 18},
298
+ "overall": {"Avg Rank": 20, "Min Rank": 16, "Max Rank": 23}
299
+ }},
300
+
301
+ {"config": {
302
+ "model_name": "qwen1.5-14b-chat",
303
+ "organization": "Alibaba",
304
+ "license": "Qianwen LICENSE",
305
+ "knowledge_cutoff": "2024-01"
306
+ },
307
+ "results": {
308
+ "math-algebra": {"Score": 69.70470323, "Avg Rank": 24, "Min Rank": 24, "Max Rank": 24},
309
+ "math-probability": {"Score": 66.41420544, "Avg Rank": 28.66666667, "Min Rank": 28, "Max Rank": 29},
310
+ "reasoning-logical": {"Avg Rank": 34, "Min Rank": 34, "Max Rank": 34},
311
+ "overall": {"Avg Rank": 28, "Min Rank": 24, "Max Rank": 34}
312
+ }},
313
+
314
+ {"config": {
315
+ "model_name": "claude-3-haiku",
316
+ "organization": "Anthropic",
317
+ "license": "Proprietary",
318
+ "knowledge_cutoff": "2024-01"
319
+ },
320
+ "results": {
321
+ "math-algebra": {"Score": 68.44060149, "Avg Rank": 25, "Min Rank": 25, "Max Rank": 25},
322
+ "math-probability": {"Score": 76.46075239, "Avg Rank": 22.33333333, "Min Rank": 21, "Max Rank": 23},
323
+ "reasoning-logical": {"Avg Rank": 20, "Min Rank": 20, "Max Rank": 20},
324
+ "overall": {"Avg Rank": 22, "Min Rank": 20, "Max Rank": 25}
325
+ }},
326
+
327
+ {"config": {
328
+ "model_name": "claude-2.1",
329
+ "organization": "Anthropic",
330
+ "license": "Proprietary",
331
+ "knowledge_cutoff": "2023-12"
332
+ },
333
+ "results": {
334
+ "math-algebra": {"Score": 67.59939121, "Avg Rank": 26, "Min Rank": 26, "Max Rank": 26},
335
+ "math-probability": {"Score": 68.89772398, "Avg Rank": 27, "Min Rank": 27, "Max Rank": 27},
336
+ "reasoning-logical": {"Avg Rank": 21, "Min Rank": 21, "Max Rank": 21},
337
+ "overall": {"Avg Rank": 25, "Min Rank": 21, "Max Rank": 27}
338
+ }},
339
+
340
+ {"config": {
341
+ "model_name": "mistral-8x7b-instruct-v0.1",
342
+ "organization": "Mistral",
343
+ "license": "Apache 2.0",
344
+ "knowledge_cutoff": "2023-12"
345
+ },
346
+ "results": {
347
+ "math-algebra": {"Score": 64.71364004, "Avg Rank": 27, "Min Rank": 27, "Max Rank": 27},
348
+ "math-probability": {"Score": 67.67468595, "Avg Rank": 26, "Min Rank": 26, "Max Rank": 26},
349
+ "reasoning-logical": {"Avg Rank": 29, "Min Rank": 28, "Max Rank": 30},
350
+ "overall": {"Avg Rank": 27, "Min Rank": 26, "Max Rank": 30}
351
+ }},
352
+
353
+ {"config": {
354
+ "model_name": "claude-2.0",
355
+ "organization": "Anthropic",
356
+ "license": "Proprietary",
357
+ "knowledge_cutoff": "2023-10"
358
+ },
359
+ "results": {
360
+ "math-algebra": {"Score": 64.77311289, "Avg Rank": 28, "Min Rank": 28, "Max Rank": 28},
361
+ "math-probability": {"Score": 74.34063069, "Avg Rank": 25, "Min Rank": 25, "Max Rank": 25},
362
+ "reasoning-logical": {"Avg Rank": 23.33333333, "Min Rank": 23, "Max Rank": 24},
363
+ "overall": {"Avg Rank": 25, "Min Rank": 23, "Max Rank": 28}
364
+ }},
365
+
366
+ {"config": {
367
+ "model_name": "starling-lm-7b-beta",
368
+ "organization": "Nexusflow",
369
+ "license": "Apache-2.0",
370
+ "knowledge_cutoff": "2024-01"
371
+ },
372
+ "results": {
373
+ "math-algebra": {"Score": 64.01222884, "Avg Rank": 29.33333333, "Min Rank": 29, "Max Rank": 30},
374
+ "math-probability": {"Score": 70.42025806, "Avg Rank": 28.33333333, "Min Rank": 28, "Max Rank": 29},
375
+ "reasoning-logical": {"Avg Rank": 25, "Min Rank": 25, "Max Rank": 25},
376
+ "overall": {"Avg Rank": 27, "Min Rank": 25, "Max Rank": 30}
377
+ }},
378
+
379
+ {"config": {
380
+ "model_name": "gemini-1.0-pro-001",
381
+ "organization": "Google",
382
+ "license": "Proprietary",
383
+ "knowledge_cutoff": "2023-11"
384
+ },
385
+ "results": {
386
+ "math-algebra": {"Score": 63.93365247, "Avg Rank": 29.66666667, "Min Rank": 29, "Max Rank": 30},
387
+ "math-probability": {"Score": 62.13077748, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 38},
388
+ "reasoning-logical": {"Avg Rank": 37.33333333, "Min Rank": 36, "Max Rank": 40},
389
+ "overall": {"Avg Rank": 34, "Min Rank": 29, "Max Rank": 40}
390
+ }},
391
+
392
+ {"config": {
393
+ "model_name": "openchat-3.5-0106",
394
+ "organization": "OpenChat",
395
+ "license": "Apache-2.0",
396
+ "knowledge_cutoff": "2024-01"
397
+ },
398
+ "results": {
399
+ "math-algebra": {"Score": 63.02959506, "Avg Rank": 31, "Min Rank": 31, "Max Rank": 31},
400
+ "math-probability": {"Score": 61.00599665, "Avg Rank": 30, "Min Rank": 30, "Max Rank": 30},
401
+ "reasoning-logical": {"Avg Rank": 27.66666667, "Min Rank": 27, "Max Rank": 29},
402
+ "overall": {"Avg Rank": 29, "Min Rank": 27, "Max Rank": 31}
403
+ }},
404
+
405
+ {"config": {
406
+ "model_name": "openchat-3.5",
407
+ "organization": "OpenChat",
408
+ "license": "Apache-2.0",
409
+ "knowledge_cutoff": "2023-12"
410
+ },
411
+ "results": {
412
+ "math-algebra": {"Score": 61.45954168, "Avg Rank": 32.33333333, "Min Rank": 32, "Max Rank": 33},
413
+ "math-probability": {"Score": 62.56195929, "Avg Rank": 32, "Min Rank": 32, "Max Rank": 32},
414
+ "reasoning-logical": {"Avg Rank": 32, "Min Rank": 31, "Max Rank": 33},
415
+ "overall": {"Avg Rank": 32, "Min Rank": 31, "Max Rank": 33}
416
+ }},
417
+
418
+ {"config": {
419
+ "model_name": "command-r-(08-2024)",
420
+ "organization": "Cohere",
421
+ "license": "CC-BY-NC-4.0",
422
+ "knowledge_cutoff": "2024-08"
423
+ },
424
+ "results": {
425
+ "math-algebra": {"Score": 61.0679475, "Avg Rank": 32.66666667, "Min Rank": 32, "Max Rank": 33},
426
+ "math-probability": {"Score": 66.00833826, "Avg Rank": 31, "Min Rank": 31, "Max Rank": 31},
427
+ "reasoning-logical": {"Avg Rank": 37.66666667, "Min Rank": 37, "Max Rank": 38},
428
+ "overall": {"Avg Rank": 34, "Min Rank": 31, "Max Rank": 38}
429
+ }},
430
+
431
+ {"config": {
432
+ "model_name": "gemma-1.1-7b-it",
433
+ "organization": "Google",
434
+ "license": "Gemma License",
435
+ "knowledge_cutoff": "2023-11"
436
+ },
437
+ "results": {
438
+ "math-algebra": {"Score": 60.92904194, "Avg Rank": 34.33333333, "Min Rank": 34, "Max Rank": 35},
439
+ "math-probability": {"Score": 62.17574935, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37},
440
+ "reasoning-logical": {"Avg Rank": 30.33333333, "Min Rank": 28, "Max Rank": 32},
441
+ "overall": {"Avg Rank": 34, "Min Rank": 28, "Max Rank": 37}
442
+ }},
443
+
444
+ {"config": {
445
+ "model_name": "llama3-8b-instruct",
446
+ "organization": "Meta",
447
+ "license": "Llama 3 Community",
448
+ "knowledge_cutoff": "2024-01"
449
+ },
450
+ "results": {
451
+ "math-algebra": {"Score": 61.06411319, "Avg Rank": 35, "Min Rank": 34, "Max Rank": 36},
452
+ "math-probability": {"Score": 62.13077748, "Avg Rank": 34.66666667, "Min Rank": 34, "Max Rank": 35},
453
+ "reasoning-logical": {"Avg Rank": 22, "Min Rank": 22, "Max Rank": 22},
454
+ "overall": {"Avg Rank": 30, "Min Rank": 22, "Max Rank": 36}
455
+ }},
456
+
457
+ {"config": {
458
+ "model_name": "gemma-2-2b-it",
459
+ "organization": "Google",
460
+ "license": "Gemma License",
461
+ "knowledge_cutoff": "2023-12"
462
+ },
463
+ "results": {
464
+ "math-algebra": {"Score": 59.70248014, "Avg Rank": 36, "Min Rank": 35, "Max Rank": 37},
465
+ "math-probability": {"Score": 61.08084527, "Avg Rank": 33.66666667, "Min Rank": 33, "Max Rank": 35},
466
+ "reasoning-logical": {"Avg Rank": 26, "Min Rank": 26, "Max Rank": 26},
467
+ "overall": {"Avg Rank": 32, "Min Rank": 26, "Max Rank": 37}
468
+ }},
469
+
470
+ {"config": {
471
+ "model_name": "starling-lm-7b-alpha",
472
+ "organization": "Nexusflow",
473
+ "license": "Apache-2.0",
474
+ "knowledge_cutoff": "2023-12"
475
+ },
476
+ "results": {
477
+ "math-algebra": {"Score": 59.574329, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37},
478
+ "math-probability": {"Score": 64.03683254, "Avg Rank": 33.66666667, "Min Rank": 33, "Max Rank": 34},
479
+ "reasoning-logical": {"Avg Rank": 35, "Min Rank": 35, "Max Rank": 35},
480
+ "overall": {"Avg Rank": 35, "Min Rank": 33, "Max Rank": 37}
481
+ }},
482
+
483
+ {"config": {
484
+ "model_name": "qwen1.5-4b-chat",
485
+ "organization": "Alibaba",
486
+ "license": "Qianwen LICENSE",
487
+ "knowledge_cutoff": "2024-02"
488
+ },
489
+ "results": {
490
+ "math-algebra": {"Score": 56.66282914, "Avg Rank": 38.33333333, "Min Rank": 38, "Max Rank": 39},
491
+ "math-probability": {"Score": 57.39032697, "Avg Rank": 43, "Min Rank": 43, "Max Rank": 43},
492
+ "reasoning-logical": {"Avg Rank": 46, "Min Rank": 46, "Max Rank": 46},
493
+ "overall": {"Avg Rank": 42, "Min Rank": 38, "Max Rank": 46}
494
+ }},
495
+
496
+ {"config": {
497
+ "model_name": "command-r-(04-2024)",
498
+ "organization": "Cohere",
499
+ "license": "CC-BY-NC-4.0",
500
+ "knowledge_cutoff": "2024-04"
501
+ },
502
+ "results": {
503
+ "math-algebra": {"Score": 56.19063413, "Avg Rank": 38.66666667, "Min Rank": 38, "Max Rank": 39},
504
+ "math-probability": {"Score": 54.37641509, "Avg Rank": 37.66666667, "Min Rank": 37, "Max Rank": 38},
505
+ "reasoning-logical": {"Avg Rank": 32.66666667, "Min Rank": 32, "Max Rank": 33},
506
+ "overall": {"Avg Rank": 36, "Min Rank": 32, "Max Rank": 39}
507
+ }},
508
+
509
+ {"config": {
510
+ "model_name": "vicuna-33b",
511
+ "organization": "LMSYS",
512
+ "license": "Non-commercial",
513
+ "knowledge_cutoff": "2023-12"
514
+ },
515
+ "results": {
516
+ "math-algebra": {"Score": 54.71037983, "Avg Rank": 40.66666667, "Min Rank": 40, "Max Rank": 42},
517
+ "math-probability": {"Score": 55.02214588, "Avg Rank": 41, "Min Rank": 41, "Max Rank": 41},
518
+ "reasoning-logical": {"Avg Rank": 41, "Min Rank": 41, "Max Rank": 41},
519
+ "overall": {"Avg Rank": 41, "Min Rank": 40, "Max Rank": 42}
520
+ }},
521
+
522
+ {"config": {
523
+ "model_name": "gemma-7b-it",
524
+ "organization": "Google",
525
+ "license": "Gemma License",
526
+ "knowledge_cutoff": "2023-12"
527
+ },
528
+ "results": {
529
+ "math-algebra": {"Score": 54.35817186, "Avg Rank": 40.66666667, "Min Rank": 40, "Max Rank": 41},
530
+ "math-probability": {"Score": 58.19573446, "Avg Rank": 42, "Min Rank": 42, "Max Rank": 42},
531
+ "reasoning-logical": {"Avg Rank": 39.33333333, "Min Rank": 39, "Max Rank": 40},
532
+ "overall": {"Avg Rank": 41, "Min Rank": 39, "Max Rank": 42}
533
+ }},
534
+
535
+ {"config": {
536
+ "model_name": "mistral-7b-instruct-2",
537
+ "organization": "Mistral",
538
+ "license": "Apache 2.0",
539
+ "knowledge_cutoff": "2023-12"
540
+ },
541
+ "results": {
542
+ "math-algebra": {"Score": 54.39240703, "Avg Rank": 41.66666667, "Min Rank": 41, "Max Rank": 42},
543
+ "math-probability": {"Score": 60.35257542, "Avg Rank": 39, "Min Rank": 39, "Max Rank": 39},
544
+ "reasoning-logical": {"Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37},
545
+ "overall": {"Avg Rank": 39, "Min Rank": 36, "Max Rank": 42}
546
+ }},
547
+
548
+ {"config": {
549
+ "model_name": "mistral-7b-instruct-1",
550
+ "organization": "Mistral",
551
+ "license": "Apache 2.0",
552
+ "knowledge_cutoff": "2023-12"
553
+ },
554
+ "results": {
555
+ "math-algebra": {"Score": 53.80157944, "Avg Rank": 43, "Min Rank": 43, "Max Rank": 43},
556
+ "math-probability": {"Score": 56.51960666, "Avg Rank": 40, "Min Rank": 40, "Max Rank": 40},
557
+ "reasoning-logical": {"Avg Rank": 45, "Min Rank": 45, "Max Rank": 45},
558
+ "overall": {"Avg Rank": 43, "Min Rank": 40, "Max Rank": 45}
559
+ }},
560
+
561
+ {"config": {
562
+ "model_name": "vicuna-13b",
563
+ "organization": "LMSYS",
564
+ "license": "Non-commercial",
565
+ "knowledge_cutoff": "2023-11"
566
+ },
567
+ "results": {
568
+ "math-algebra": {"Score": 53.5413765, "Avg Rank": 44, "Min Rank": 44, "Max Rank": 44},
569
+ "math-probability": {"Score": 53.53586693, "Avg Rank": 44, "Min Rank": 44, "Max Rank": 44},
570
+ "reasoning-logical": {"Avg Rank": 43.66666667, "Min Rank": 43, "Max Rank": 44},
571
+ "overall": {"Avg Rank": 44, "Min Rank": 43, "Max Rank": 44}
572
+ }},
573
+
574
+ {"config": {
575
+ "model_name": "zephyr-7b-beta",
576
+ "organization": "HuggingFace",
577
+ "license": "MIT",
578
+ "knowledge_cutoff": "2023-10"
579
+ },
580
+ "results": {
581
+ "math-algebra": {"Score": 52.23039742, "Avg Rank": 46, "Min Rank": 45, "Max Rank": 48},
582
+ "math-probability": {"Score": 51.67173535, "Avg Rank": 47.33333333, "Min Rank": 47, "Max Rank": 48},
583
+ "reasoning-logical": {"Avg Rank": 50, "Min Rank": 50, "Max Rank": 50},
584
+ "overall": {"Avg Rank": 48, "Min Rank": 45, "Max Rank": 50}
585
+ }},
586
+
587
+ {"config": {
588
+ "model_name": "gemma-1.1-2b-it",
589
+ "organization": "Google",
590
+ "license": "Gemma License",
591
+ "knowledge_cutoff": "2023-12"
592
+ },
593
+ "results": {
594
+ "math-algebra": {"Score": 52.22372428, "Avg Rank": 46, "Min Rank": 45, "Max Rank": 47},
595
+ "math-probability": {"Score": 51.74306688, "Avg Rank": 46.33333333, "Min Rank": 46, "Max Rank": 47},
596
+ "reasoning-logical": {"Avg Rank": 48, "Min Rank": 48, "Max Rank": 48},
597
+ "overall": {"Avg Rank": 47, "Min Rank": 45, "Max Rank": 48}
598
+ }},
599
+
600
+ {"config": {
601
+ "model_name": "llama2-7b-chat",
602
+ "organization": "Meta",
603
+ "license": "Llama 2 Community",
604
+ "knowledge_cutoff": "2023-10"
605
+ },
606
+ "results": {
607
+ "math-algebra": {"Score": 51.83025857, "Avg Rank": 46.33333333, "Min Rank": 46, "Max Rank": 47},
608
+ "math-probability": {"Score": 51.19585847, "Avg Rank": 47.33333333, "Min Rank": 46, "Max Rank": 48},
609
+ "reasoning-logical": {"Avg Rank": 42.66666667, "Min Rank": 42, "Max Rank": 43},
610
+ "overall": {"Avg Rank": 45, "Min Rank": 42, "Max Rank": 48}
611
+ }},
612
+
613
+ {"config": {
614
+ "model_name": "gemma-2b-it",
615
+ "organization": "Google",
616
+ "license": "Gemma License",
617
+ "knowledge_cutoff": "2023-11"
618
+ },
619
+ "results": {
620
+ "math-algebra": {"Score": 51.60281474, "Avg Rank": 47.66666667, "Min Rank": 47, "Max Rank": 48},
621
+ "math-probability": {"Score": 51.52250905, "Avg Rank": 50, "Min Rank": 50, "Max Rank": 50},
622
+ "reasoning-logical": {"Avg Rank": 51, "Min Rank": 51, "Max Rank": 51},
623
+ "overall": {"Avg Rank": 49, "Min Rank": 47, "Max Rank": 51}
624
+ }},
625
+
626
+ {"config": {
627
+ "model_name": "llama2-13b-chat",
628
+ "organization": "Meta",
629
+ "license": "Llama 2 Community",
630
+ "knowledge_cutoff": "2023-12"
631
+ },
632
+ "results": {
633
+ "math-algebra": {"Score": 51.21273132, "Avg Rank": 49, "Min Rank": 49, "Max Rank": 49},
634
+ "math-probability": {"Score": 51.72056522, "Avg Rank": 45, "Min Rank": 45, "Max Rank": 45},
635
+ "reasoning-logical": {"Avg Rank": 39, "Min Rank": 38, "Max Rank": 40},
636
+ "overall": {"Avg Rank": 44, "Min Rank": 38, "Max Rank": 49}
637
+ }},
638
+
639
+ {"config": {
640
+ "model_name": "vicuna-7b",
641
+ "organization": "LMSYS",
642
+ "license": "Non-commercial",
643
+ "knowledge_cutoff": "2023-11"
644
+ },
645
+ "results": {
646
+ "math-algebra": {"Score": 51.31450547, "Avg Rank": 50, "Min Rank": 50, "Max Rank": 50},
647
+ "math-probability": {"Score": 52.72504618, "Avg Rank": 49, "Min Rank": 49, "Max Rank": 49},
648
+ "reasoning-logical": {"Avg Rank": 47, "Min Rank": 47, "Max Rank": 47},
649
+ "overall": {"Avg Rank": 48, "Min Rank": 47, "Max Rank": 50}
650
+ }},
651
+
652
+ {"config": {
653
+ "model_name": "koala-13b",
654
+ "organization": "UC Berkeley",
655
+ "license": "Non-commercial",
656
+ "knowledge_cutoff": "2023-10"
657
+ },
658
+ "results": {
659
+ "math-algebra": {"Score": 50.19054677, "Avg Rank": 51, "Min Rank": 51, "Max Rank": 51},
660
+ "math-probability": {"Score": 50.741989, "Avg Rank": 51, "Min Rank": 51, "Max Rank": 51},
661
+ "reasoning-logical": {"Avg Rank": 49, "Min Rank": 49, "Max Rank": 49},
662
+ "overall": {"Avg Rank": 50, "Min Rank": 49, "Max Rank": 51}
663
+ }},
664
+
665
+ {"config": {
666
+ "model_name": "openassistant-pythia-12b",
667
+ "organization": "OpenAssistant",
668
+ "license": "Non-commercial",
669
+ "knowledge_cutoff": "2023-09"
670
+ },
671
+ "results": {
672
+ "math-algebra": {"Score": 50, "Avg Rank": 52, "Min Rank": 52, "Max Rank": 52},
673
+ "math-probability": {"Score": 50, "Avg Rank": 52, "Min Rank": 52, "Max Rank": 52},
674
+ "reasoning-logical": {"Avg Rank": 52, "Min Rank": 52, "Max Rank": 52},
675
+ "overall": {"Avg Rank": 52, "Min Rank": 52, "Max Rank": 52}
676
+ }}
677
+ ]