yzabc007 committed on
Commit
8ef75a7
·
1 Parent(s): 92d7d3c
app.py CHANGED
@@ -105,8 +105,9 @@ def init_leaderboard(dataframe):
105
  # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
106
  # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
107
  # model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
108
- model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
109
  # model_result_path = "./src/results/models_2024-10-18-14:06:13.588399.json"
 
110
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
111
 
112
 
@@ -156,7 +157,7 @@ with demo:
156
  with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
157
 
158
  DESCRIPTION_TEXT = """
159
- Total #models: 53 (Last updated: 2024-10-09)
160
 
161
  This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks.
162
  (Missing values are due to the slow or problematic model responses to be fixed soon.)
@@ -182,7 +183,7 @@ with demo:
182
  )
183
  )
184
 
185
- with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
186
  DESCRIPTION_TEXT = """
187
  Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
188
  We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
@@ -190,21 +191,23 @@ with demo:
190
  """
191
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
192
 
193
- leaderboard = overall_leaderboard(
194
- get_model_leaderboard_df(
195
- model_result_path,
196
- benchmark_cols=[
197
- AutoEvalColumn.rank_overall.name,
198
- AutoEvalColumn.model.name,
199
- AutoEvalColumn.score_overall.name,
200
- AutoEvalColumn.sd_overall.name,
201
- AutoEvalColumn.license.name,
202
- AutoEvalColumn.organization.name,
203
- AutoEvalColumn.knowledge_cutoff.name,
204
- ],
205
- rank_col=[AutoEvalColumn.rank_overall.name],
206
- ))
 
207
 
 
208
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
209
  DESCRIPTION_TEXT="""
210
  Algebra, Geometry, and Probability are the current three main math domains in the leaderboard.
@@ -223,7 +226,22 @@ with demo:
223
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
224
 
225
  # leaderboard = init_leaderboard(LEADERBOARD_DF)
226
- with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  leaderboard = overall_leaderboard(
228
  get_model_leaderboard_df(
229
  model_result_path,
@@ -231,7 +249,7 @@ with demo:
231
  AutoEvalColumn.rank_math_algebra.name,
232
  AutoEvalColumn.model.name,
233
  AutoEvalColumn.score_math_algebra.name,
234
- AutoEvalColumn.sd_math_algebra.name,
235
  AutoEvalColumn.license.name,
236
  AutoEvalColumn.organization.name,
237
  AutoEvalColumn.knowledge_cutoff.name,
@@ -240,7 +258,7 @@ with demo:
240
  )
241
  )
242
 
243
- with gr.TabItem("📐 Geometry", elem_id="geometry_subtab", id=1, elem_classes="subtab"):
244
  leaderboard = overall_leaderboard(
245
  get_model_leaderboard_df(
246
  model_result_path,
@@ -248,7 +266,7 @@ with demo:
248
  AutoEvalColumn.rank_math_geometry.name,
249
  AutoEvalColumn.model.name,
250
  AutoEvalColumn.score_math_geometry.name,
251
- AutoEvalColumn.sd_math_geometry.name,
252
  AutoEvalColumn.license.name,
253
  AutoEvalColumn.organization.name,
254
  AutoEvalColumn.knowledge_cutoff.name,
@@ -257,7 +275,7 @@ with demo:
257
  )
258
  )
259
 
260
- with gr.TabItem("📊 Probability", elem_id="prob_subtab", id=2, elem_classes="subtab"):
261
  leaderboard = overall_leaderboard(
262
  get_model_leaderboard_df(
263
  model_result_path,
@@ -265,7 +283,7 @@ with demo:
265
  AutoEvalColumn.rank_math_probability.name,
266
  AutoEvalColumn.model.name,
267
  AutoEvalColumn.score_math_probability.name,
268
- AutoEvalColumn.sd_math_probability.name,
269
  AutoEvalColumn.license.name,
270
  AutoEvalColumn.organization.name,
271
  AutoEvalColumn.knowledge_cutoff.name,
@@ -299,7 +317,20 @@ with demo:
299
  """
300
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
301
 
302
- with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
 
 
 
 
 
 
 
 
 
 
 
 
 
303
  leaderboard = overall_leaderboard(
304
  get_model_leaderboard_df(
305
  model_result_path,
@@ -307,7 +338,7 @@ with demo:
307
  AutoEvalColumn.rank_reason_logical.name,
308
  AutoEvalColumn.model.name,
309
  AutoEvalColumn.score_reason_logical.name,
310
- AutoEvalColumn.sd_reason_logical.name,
311
  AutoEvalColumn.license.name,
312
  AutoEvalColumn.organization.name,
313
  AutoEvalColumn.knowledge_cutoff.name,
@@ -316,7 +347,7 @@ with demo:
316
  )
317
  )
318
 
319
- with gr.TabItem("🗣️ Social", elem_id="social_subtab", id=1, elem_classes="subtab"):
320
  leaderboard = overall_leaderboard(
321
  get_model_leaderboard_df(
322
  model_result_path,
@@ -324,7 +355,7 @@ with demo:
324
  AutoEvalColumn.rank_reason_social.name,
325
  AutoEvalColumn.model.name,
326
  AutoEvalColumn.score_reason_social.name,
327
- AutoEvalColumn.sd_reason_social.name,
328
  AutoEvalColumn.license.name,
329
  AutoEvalColumn.organization.name,
330
  AutoEvalColumn.knowledge_cutoff.name,
@@ -348,7 +379,19 @@ with demo:
348
  """
349
  gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
350
 
351
- with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
 
 
 
 
 
 
 
 
 
 
 
 
352
  leaderboard = overall_leaderboard(
353
  get_model_leaderboard_df(
354
  model_result_path,
 
105
  # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
106
  # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
107
  # model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
108
+ # model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
109
  # model_result_path = "./src/results/models_2024-10-18-14:06:13.588399.json"
110
+ model_result_path = "./src/results/models_2024-10-20-23:34:57.242641.json"
111
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
112
 
113
 
 
157
  with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
158
 
159
  DESCRIPTION_TEXT = """
160
+ Total #models: 57 (Last updated: 2024-10-21)
161
 
162
  This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks.
163
  (Missing values are due to the slow or problematic model responses to be fixed soon.)
 
183
  )
184
  )
185
 
186
+ with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
187
  DESCRIPTION_TEXT = """
188
  Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
189
  We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
 
191
  """
192
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
193
 
194
+ with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"):
195
+ leaderboard = overall_leaderboard(
196
+ get_model_leaderboard_df(
197
+ model_result_path,
198
+ benchmark_cols=[
199
+ AutoEvalColumn.rank_overall.name,
200
+ AutoEvalColumn.model.name,
201
+ AutoEvalColumn.score_overall.name,
202
+ AutoEvalColumn.sd_overall.name,
203
+ AutoEvalColumn.license.name,
204
+ AutoEvalColumn.organization.name,
205
+ AutoEvalColumn.knowledge_cutoff.name,
206
+ ],
207
+ rank_col=[AutoEvalColumn.rank_overall.name],
208
+ ))
209
 
210
+
211
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
212
  DESCRIPTION_TEXT="""
213
  Algebra, Geometry, and Probability are the current three main math domains in the leaderboard.
 
226
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
227
 
228
  # leaderboard = init_leaderboard(LEADERBOARD_DF)
229
+ with gr.TabItem("Overall", elem_id="math_overall_subtab", id=0, elem_classes="subtab"):
230
+ leaderboard = overall_leaderboard(
231
+ get_model_leaderboard_df(
232
+ model_result_path,
233
+ benchmark_cols=[
234
+ AutoEvalColumn.model.name,
235
+ AutoEvalColumn.rank_math_algebra.name,
236
+ AutoEvalColumn.rank_math_geometry.name,
237
+ AutoEvalColumn.rank_math_probability.name,
238
+ ],
239
+ rank_col=[],
240
+ )
241
+ )
242
+
243
+
244
+ with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=1, elem_classes="subtab"):
245
  leaderboard = overall_leaderboard(
246
  get_model_leaderboard_df(
247
  model_result_path,
 
249
  AutoEvalColumn.rank_math_algebra.name,
250
  AutoEvalColumn.model.name,
251
  AutoEvalColumn.score_math_algebra.name,
252
+ # AutoEvalColumn.sd_math_algebra.name,
253
  AutoEvalColumn.license.name,
254
  AutoEvalColumn.organization.name,
255
  AutoEvalColumn.knowledge_cutoff.name,
 
258
  )
259
  )
260
 
261
+ with gr.TabItem("📐 Geometry", elem_id="geometry_subtab", id=2, elem_classes="subtab"):
262
  leaderboard = overall_leaderboard(
263
  get_model_leaderboard_df(
264
  model_result_path,
 
266
  AutoEvalColumn.rank_math_geometry.name,
267
  AutoEvalColumn.model.name,
268
  AutoEvalColumn.score_math_geometry.name,
269
+ # AutoEvalColumn.sd_math_geometry.name,
270
  AutoEvalColumn.license.name,
271
  AutoEvalColumn.organization.name,
272
  AutoEvalColumn.knowledge_cutoff.name,
 
275
  )
276
  )
277
 
278
+ with gr.TabItem("📊 Probability", elem_id="prob_subtab", id=3, elem_classes="subtab"):
279
  leaderboard = overall_leaderboard(
280
  get_model_leaderboard_df(
281
  model_result_path,
 
283
  AutoEvalColumn.rank_math_probability.name,
284
  AutoEvalColumn.model.name,
285
  AutoEvalColumn.score_math_probability.name,
286
+ # AutoEvalColumn.sd_math_probability.name,
287
  AutoEvalColumn.license.name,
288
  AutoEvalColumn.organization.name,
289
  AutoEvalColumn.knowledge_cutoff.name,
 
317
  """
318
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
319
 
320
+ with gr.TabItem("Overall", elem_id="reasoning_overall_subtab", id=0, elem_classes="subtab"):
321
+ leaderboard = overall_leaderboard(
322
+ get_model_leaderboard_df(
323
+ model_result_path,
324
+ benchmark_cols=[
325
+ AutoEvalColumn.model.name,
326
+ AutoEvalColumn.rank_reason_logical.name,
327
+ AutoEvalColumn.rank_reason_social.name,
328
+ ],
329
+ rank_col=[],
330
+ )
331
+ )
332
+
333
+ with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=1, elem_classes="subtab"):
334
  leaderboard = overall_leaderboard(
335
  get_model_leaderboard_df(
336
  model_result_path,
 
338
  AutoEvalColumn.rank_reason_logical.name,
339
  AutoEvalColumn.model.name,
340
  AutoEvalColumn.score_reason_logical.name,
341
+ # AutoEvalColumn.sd_reason_logical.name,
342
  AutoEvalColumn.license.name,
343
  AutoEvalColumn.organization.name,
344
  AutoEvalColumn.knowledge_cutoff.name,
 
347
  )
348
  )
349
 
350
+ with gr.TabItem("🗣️ Social", elem_id="social_subtab", id=2, elem_classes="subtab"):
351
  leaderboard = overall_leaderboard(
352
  get_model_leaderboard_df(
353
  model_result_path,
 
355
  AutoEvalColumn.rank_reason_social.name,
356
  AutoEvalColumn.model.name,
357
  AutoEvalColumn.score_reason_social.name,
358
+ # AutoEvalColumn.sd_reason_social.name,
359
  AutoEvalColumn.license.name,
360
  AutoEvalColumn.organization.name,
361
  AutoEvalColumn.knowledge_cutoff.name,
 
379
  """
380
  gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
381
 
382
+ with gr.TabItem("Overall", elem_id="science_overall_subtab", id=0, elem_classes="subtab"):
383
+ leaderboard = overall_leaderboard(
384
+ get_model_leaderboard_df(
385
+ model_result_path,
386
+ benchmark_cols=[
387
+ AutoEvalColumn.model.name,
388
+ AutoEvalColumn.rank_chemistry.name,
389
+ ],
390
+ rank_col=[],
391
+ )
392
+ )
393
+
394
+ with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=1, elem_classes="subtab"):
395
  leaderboard = overall_leaderboard(
396
  get_model_leaderboard_df(
397
  model_result_path,
src/display/utils.py CHANGED
@@ -64,35 +64,48 @@ auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=l
64
  auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
65
 
66
  # fine-grained dimensions
67
- auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])
68
- auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
69
- auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])
70
- auto_eval_column_dict.append(["score_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Probability)", "number", True))])
71
- auto_eval_column_dict.append(["score_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Logical Reasoning)", "number", True))])
72
- auto_eval_column_dict.append(["score_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Social Reasoning)", "number", True))])
73
 
74
- auto_eval_column_dict.append(["sd_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev(Overall)", "number", True))])
75
  auto_eval_column_dict.append(["sd_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Algebra)", "number", True))])
76
- auto_eval_column_dict.append(["sd_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Geometry)", "number", True))])
77
- auto_eval_column_dict.append(["sd_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Probability)", "number", True))])
78
- auto_eval_column_dict.append(["sd_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Logical Reasoning)", "number", True))])
79
- auto_eval_column_dict.append(["sd_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Social Reasoning)", "number", True))])
80
-
81
- auto_eval_column_dict.append(["rank_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Overall)", "number", True))])
82
  auto_eval_column_dict.append(["rank_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Algebra)", "number", True))])
 
 
 
83
  auto_eval_column_dict.append(["rank_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Geometry)", "number", True))])
 
 
 
84
  auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Probability)", "number", True))])
 
 
 
85
  auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
 
 
 
86
  auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])
87
 
88
  auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
89
  auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
90
  auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])
91
 
 
 
 
 
 
 
 
 
 
92
  auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
93
  auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
94
  auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])
95
 
 
96
  for task in Tasks:
97
  auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
98
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
 
64
  auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
65
 
66
  # fine-grained dimensions
67
+ auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (MT-Bench)", "number", True))])
68
+ auto_eval_column_dict.append(["sd_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev(MT-Bench)", "number", True))])
69
+ auto_eval_column_dict.append(["rank_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (MT-Bench)", "number", True))])
 
 
 
70
 
71
+ auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
72
  auto_eval_column_dict.append(["sd_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Algebra)", "number", True))])
 
 
 
 
 
 
73
  auto_eval_column_dict.append(["rank_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Algebra)", "number", True))])
74
+
75
+ auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])
76
+ auto_eval_column_dict.append(["sd_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Geometry)", "number", True))])
77
  auto_eval_column_dict.append(["rank_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Geometry)", "number", True))])
78
+
79
+ auto_eval_column_dict.append(["score_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Probability)", "number", True))])
80
+ auto_eval_column_dict.append(["sd_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Probability)", "number", True))])
81
  auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Probability)", "number", True))])
82
+
83
+ auto_eval_column_dict.append(["score_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Logical Reasoning)", "number", True))])
84
+ auto_eval_column_dict.append(["sd_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Logical Reasoning)", "number", True))])
85
  auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
86
+
87
+ auto_eval_column_dict.append(["score_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Social Reasoning)", "number", True))])
88
+ auto_eval_column_dict.append(["sd_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Social Reasoning)", "number", True))])
89
  auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])
90
 
91
  auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
92
  auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
93
  auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])
94
 
95
+ auto_eval_column_dict.append(["score_physics", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Physics)", "number", True))])
96
+ auto_eval_column_dict.append(["sd_physics", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Physics)", "number", True))])
97
+ auto_eval_column_dict.append(["rank_physics", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Physics)", "number", True))])
98
+
99
+ auto_eval_column_dict.append(["score_biology", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Biology)", "number", True))])
100
+ auto_eval_column_dict.append(["sd_biology", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Biology)", "number", True))])
101
+ auto_eval_column_dict.append(["rank_biology", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Biology)", "number", True))])
102
+
103
+
104
  auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
105
  auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
106
  auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])
107
 
108
+
109
  for task in Tasks:
110
  auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
111
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
src/populate.py CHANGED
@@ -15,14 +15,20 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
15
  """Creates a dataframe from all the individual experiment results"""
16
  raw_data = get_raw_model_results(results_path)
17
  all_data_json = [v.to_dict() for v in raw_data]
 
18
 
19
  df = pd.DataFrame.from_records(all_data_json)
20
 
21
  df = df[benchmark_cols]
22
  # print(df.head())
23
 
24
- if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
25
- df = df.dropna(subset=benchmark_cols)
 
 
 
 
 
26
  df = df.sort_values(by=[rank_col[0]], ascending=True)
27
  # print(rank_col, benchmark_cols)
28
  # print(df.head())
@@ -31,7 +37,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
31
  avg_rank = df.iloc[:, 1:].mean(axis=1)
32
  df["Average Rank"] = avg_rank.round(decimals=4)
33
  df = df.sort_values(by=["Average Rank"], ascending=True)
34
- df["Average Rank"] = df["Average Rank"].map('{:.4f}'.format)
35
 
36
  # we'll skip NaN, instead of deleting the whole row
37
  df = df.fillna('--')
@@ -41,19 +47,25 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
41
 
42
 
43
  for col in benchmark_cols:
44
- # print(col)
45
- # if 'Std dev' in col or 'Score' in col:
46
  if 'Std dev' in col or 'Score' in col:
47
- # if set(['Chemistry', 'Reasoning']).intersection(set(col.split())):
48
- # df[col] = (df[col]).map('{:.2f}'.format)
49
- # else:
50
- # df[col] = (df[col]*100).map('{:.2f}'.format)
51
- if "Chemistry" in col or "C++" in col:
52
- # if "Chemistry" in col or "C++" in col or "Overall" in col or "Probability" in col or "Logical" in col:
53
- df[col] = (df[col]).map('{:.2f}'.format)
54
- else:
55
- df[col] = (df[col]*100).map('{:.2f}'.format)
56
  df[col] = df[col].round(decimals=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
59
  # df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")
 
15
  """Creates a dataframe from all the individual experiment results"""
16
  raw_data = get_raw_model_results(results_path)
17
  all_data_json = [v.to_dict() for v in raw_data]
18
+ assert len(rank_col) <= 1, "Only one column can be selected for ranking"
19
 
20
  df = pd.DataFrame.from_records(all_data_json)
21
 
22
  df = df[benchmark_cols]
23
  # print(df.head())
24
 
25
+ # if there is one col in rank_col, this is an isolated dimension to rank by
26
+ # sort by that selected column and remove NaN values
27
+ if rank_col:
28
+ # df = df.dropna(subset=benchmark_cols)
29
+ df = df.dropna(subset=rank_col)
30
+ df = df.fillna(0.00)
31
+ # print(df[rank_col[0]])
32
  df = df.sort_values(by=[rank_col[0]], ascending=True)
33
  # print(rank_col, benchmark_cols)
34
  # print(df.head())
 
37
  avg_rank = df.iloc[:, 1:].mean(axis=1)
38
  df["Average Rank"] = avg_rank.round(decimals=4)
39
  df = df.sort_values(by=["Average Rank"], ascending=True)
40
+ df["Average Rank"] = df["Average Rank"].map('{:.2f}'.format)
41
 
42
  # we'll skip NaN, instead of deleting the whole row
43
  df = df.fillna('--')
 
47
 
48
 
49
  for col in benchmark_cols:
 
 
50
  if 'Std dev' in col or 'Score' in col:
51
+ df[col] = (df[col]).map('{:.2f}'.format)
 
 
 
 
 
 
 
 
52
  df[col] = df[col].round(decimals=2)
53
+
54
+
55
+ # for col in benchmark_cols:
56
+ # # print(col)
57
+ # # if 'Std dev' in col or 'Score' in col:
58
+ # if 'Std dev' in col or 'Score' in col:
59
+ # # if set(['Chemistry', 'Reasoning']).intersection(set(col.split())):
60
+ # # df[col] = (df[col]).map('{:.2f}'.format)
61
+ # # else:
62
+ # # df[col] = (df[col]*100).map('{:.2f}'.format)
63
+ # # if "Chemistry" in col or "C++" in col:
64
+ # if "Chemistry" in col or "C++" in col or "Overall" in col or "Probability" in col or "Logical" in col:
65
+ # df[col] = (df[col]).map('{:.2f}'.format)
66
+ # else:
67
+ # df[col] = (df[col]*100).map('{:.2f}'.format)
68
+ # df[col] = df[col].round(decimals=2)
69
 
70
  # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
71
  # df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")
src/results/models_2024-10-20-23:34:57.242641.json ADDED
@@ -0,0 +1,2802 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "config": {
4
+ "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
+ "organization": "OpenAI",
6
+ "license": "Proprietary",
7
+ "knowledge_cutoff": "2023/10"
8
+ },
9
+ "results": {
10
+ "OVERALL": {
11
+ "Average Score": 93.51557945652831,
12
+ "Standard Deviation": 3.1900396436407785,
13
+ "Rank": 4
14
+ },
15
+ "Geometry": {
16
+ "Average Score": 81.8536937387725,
17
+ "Standard Deviation": null,
18
+ "Rank": 5
19
+ },
20
+ "Algebra": {
21
+ "Average Score": 89.3642910524324,
22
+ "Standard Deviation": null,
23
+ "Rank": 3
24
+ },
25
+ "Probability": {
26
+ "Average Score": 86.55761073510537,
27
+ "Standard Deviation": null,
28
+ "Rank": 4
29
+ },
30
+ "Logical": {
31
+ "Average Score": 97.39734315785844,
32
+ "Standard Deviation": null,
33
+ "Rank": 2
34
+ },
35
+ "Social": {
36
+ "Average Score": 91.03727530739368,
37
+ "Standard Deviation": null,
38
+ "Rank": 7
39
+ },
40
+ "Chemistry": {
41
+ "Average Score": 100.0,
42
+ "Standard Deviation": null,
43
+ "Rank": 1
44
+ },
45
+ "CPP": {
46
+ "Average Score": 100.0,
47
+ "Standard Deviation": null,
48
+ "Rank": 1
49
+ }
50
+ }
51
+ },
52
+ {
53
+ "config": {
54
+ "model_name": "gpt-4o-2024-08-06",
55
+ "organization": "OpenAI",
56
+ "license": "Proprietary",
57
+ "knowledge_cutoff": "2023/10"
58
+ },
59
+ "results": {
60
+ "OVERALL": {
61
+ "Average Score": 79.7806321863411,
62
+ "Standard Deviation": 0.8302330946013555,
63
+ "Rank": 14
64
+ },
65
+ "Geometry": {
66
+ "Average Score": 86.29041459755453,
67
+ "Standard Deviation": null,
68
+ "Rank": 2
69
+ },
70
+ "Algebra": {
71
+ "Average Score": 88.53373721863113,
72
+ "Standard Deviation": null,
73
+ "Rank": 4
74
+ },
75
+ "Probability": {
76
+ "Average Score": 78.694360721361,
77
+ "Standard Deviation": null,
78
+ "Rank": 7
79
+ },
80
+ "Logical": {
81
+ "Average Score": 78.3116623496895,
82
+ "Standard Deviation": null,
83
+ "Rank": 12
84
+ },
85
+ "Social": {
86
+ "Average Score": 79.90944696263446,
87
+ "Standard Deviation": null,
88
+ "Rank": 11
89
+ },
90
+ "Chemistry": {
91
+ "Average Score": 86.96011263543132,
92
+ "Standard Deviation": null,
93
+ "Rank": 7
94
+ },
95
+ "CPP": {
96
+ "Average Score": 92.43090226400756,
97
+ "Standard Deviation": null,
98
+ "Rank": 2
99
+ }
100
+ }
101
+ },
102
+ {
103
+ "config": {
104
+ "model_name": "gpt-4o-2024-05-13",
105
+ "organization": "OpenAI",
106
+ "license": "Proprietary",
107
+ "knowledge_cutoff": "2023/10"
108
+ },
109
+ "results": {
110
+ "OVERALL": {
111
+ "Average Score": 86.40675398236253,
112
+ "Standard Deviation": 6.473604235710212,
113
+ "Rank": 9
114
+ },
115
+ "Geometry": {
116
+ "Average Score": 82.42032988843268,
117
+ "Standard Deviation": null,
118
+ "Rank": 4
119
+ },
120
+ "Algebra": {
121
+ "Average Score": 83.51580675782952,
122
+ "Standard Deviation": null,
123
+ "Rank": 9
124
+ },
125
+ "Probability": {
126
+ "Average Score": 81.88434691830915,
127
+ "Standard Deviation": null,
128
+ "Rank": 5
129
+ },
130
+ "Logical": {
131
+ "Average Score": 87.92744931984977,
132
+ "Standard Deviation": null,
133
+ "Rank": 9
134
+ },
135
+ "Social": {
136
+ "Average Score": 76.12369632852445,
137
+ "Standard Deviation": null,
138
+ "Rank": 15
139
+ },
140
+ "Chemistry": {
141
+ "Average Score": 90.93459148149344,
142
+ "Standard Deviation": null,
143
+ "Rank": 4
144
+ },
145
+ "CPP": {
146
+ "Average Score": 79.1592634699295,
147
+ "Standard Deviation": null,
148
+ "Rank": 6
149
+ }
150
+ }
151
+ },
152
+ {
153
+ "config": {
154
+ "model_name": "gpt-4-turbo-2024-04-09",
155
+ "organization": "OpenAI",
156
+ "license": "Proprietary",
157
+ "knowledge_cutoff": "2023/12"
158
+ },
159
+ "results": {
160
+ "OVERALL": {
161
+ "Average Score": 87.17581147282237,
162
+ "Standard Deviation": 8.716963621850567,
163
+ "Rank": 8
164
+ },
165
+ "Geometry": {
166
+ "Average Score": 78.76635545274637,
167
+ "Standard Deviation": null,
168
+ "Rank": 7
169
+ },
170
+ "Algebra": {
171
+ "Average Score": 79.96323615621023,
172
+ "Standard Deviation": null,
173
+ "Rank": 11
174
+ },
175
+ "Probability": {
176
+ "Average Score": 77.65333799733705,
177
+ "Standard Deviation": null,
178
+ "Rank": 9
179
+ },
180
+ "Logical": {
181
+ "Average Score": 89.33307138659873,
182
+ "Standard Deviation": null,
183
+ "Rank": 8
184
+ },
185
+ "Social": {
186
+ "Average Score": 76.86597570996584,
187
+ "Standard Deviation": null,
188
+ "Rank": 14
189
+ },
190
+ "Chemistry": {
191
+ "Average Score": 84.02855687506661,
192
+ "Standard Deviation": null,
193
+ "Rank": 9
194
+ },
195
+ "CPP": {
196
+ "Average Score": 70.73143363230263,
197
+ "Standard Deviation": null,
198
+ "Rank": 11
199
+ }
200
+ }
201
+ },
202
+ {
203
+ "config": {
204
+ "model_name": "gemini-1.5-pro-001",
205
+ "organization": "Google",
206
+ "license": "Proprietary",
207
+ "knowledge_cutoff": "2023/11"
208
+ },
209
+ "results": {
210
+ "OVERALL": {
211
+ "Average Score": 80.38345723548734,
212
+ "Standard Deviation": 2.4635699815143584,
213
+ "Rank": 13
214
+ },
215
+ "Geometry": {
216
+ "Average Score": 84.30455076458965,
217
+ "Standard Deviation": null,
218
+ "Rank": 3
219
+ },
220
+ "Algebra": {
221
+ "Average Score": 85.9212061409364,
222
+ "Standard Deviation": null,
223
+ "Rank": 6
224
+ },
225
+ "Probability": {
226
+ "Average Score": 73.11806712394745,
227
+ "Standard Deviation": null,
228
+ "Rank": 13
229
+ },
230
+ "Logical": {
231
+ "Average Score": 78.27369746632996,
232
+ "Standard Deviation": null,
233
+ "Rank": 12
234
+ },
235
+ "Social": {
236
+ "Average Score": 79.57606824531047,
237
+ "Standard Deviation": null,
238
+ "Rank": 13
239
+ }
240
+ }
241
+ },
242
+ {
243
+ "config": {
244
+ "model_name": "qwen2-72b-instruct",
245
+ "organization": "Alibaba",
246
+ "license": "Qianwen LICENSE",
247
+ "knowledge_cutoff": "2024/09"
248
+ },
249
+ "results": {
250
+ "OVERALL": {
251
+ "Average Score": 74.44059692248071,
252
+ "Standard Deviation": 2.3957041566666697,
253
+ "Rank": 16
254
+ },
255
+ "Geometry": {
256
+ "Average Score": 72.58490369919883,
257
+ "Standard Deviation": null,
258
+ "Rank": 11
259
+ },
260
+ "Algebra": {
261
+ "Average Score": 88.53359632761772,
262
+ "Standard Deviation": null,
263
+ "Rank": 4
264
+ },
265
+ "Probability": {
266
+ "Average Score": 80.19789976985243,
267
+ "Standard Deviation": null,
268
+ "Rank": 6
269
+ },
270
+ "Logical": {
271
+ "Average Score": 72.76843081200641,
272
+ "Standard Deviation": null,
273
+ "Rank": 17
274
+ },
275
+ "Social": {
276
+ "Average Score": 57.256064868444426,
277
+ "Standard Deviation": null,
278
+ "Rank": 19
279
+ },
280
+ "Chemistry": {
281
+ "Average Score": 75.47190401351077,
282
+ "Standard Deviation": null,
283
+ "Rank": 12
284
+ },
285
+ "CPP": {
286
+ "Average Score": 73.54037778797029,
287
+ "Standard Deviation": null,
288
+ "Rank": 7
289
+ }
290
+ }
291
+ },
292
+ {
293
+ "config": {
294
+ "model_name": "gpt-4o-mini-2024-07-18",
295
+ "organization": "OpenAI",
296
+ "license": "Proprietary",
297
+ "knowledge_cutoff": "2023/10"
298
+ },
299
+ "results": {
300
+ "OVERALL": {
301
+ "Average Score": 82.82456893277315,
302
+ "Standard Deviation": 7.714840109805867,
303
+ "Rank": 12
304
+ },
305
+ "Geometry": {
306
+ "Average Score": 78.89323869622943,
307
+ "Standard Deviation": null,
308
+ "Rank": 6
309
+ },
310
+ "Algebra": {
311
+ "Average Score": 84.8722603687823,
312
+ "Standard Deviation": null,
313
+ "Rank": 8
314
+ },
315
+ "Probability": {
316
+ "Average Score": 78.6942843346463,
317
+ "Standard Deviation": null,
318
+ "Rank": 7
319
+ },
320
+ "Logical": {
321
+ "Average Score": 85.68921109829361,
322
+ "Standard Deviation": null,
323
+ "Rank": 10
324
+ },
325
+ "Social": {
326
+ "Average Score": 81.79892848722542,
327
+ "Standard Deviation": null,
328
+ "Rank": 10
329
+ },
330
+ "Chemistry": {
331
+ "Average Score": 81.46805623180109,
332
+ "Standard Deviation": null,
333
+ "Rank": 10
334
+ },
335
+ "CPP": {
336
+ "Average Score": 88.3877070580296,
337
+ "Standard Deviation": null,
338
+ "Rank": 3
339
+ }
340
+ }
341
+ },
342
+ {
343
+ "config": {
344
+ "model_name": "claude-3.5-sonnet",
345
+ "organization": "Anthropic",
346
+ "license": "Proprietary",
347
+ "knowledge_cutoff": "2024/04"
348
+ },
349
+ "results": {
350
+ "OVERALL": {
351
+ "Average Score": 88.43557924843628,
352
+ "Standard Deviation": 5.680338106806327,
353
+ "Rank": 7
354
+ },
355
+ "Geometry": {
356
+ "Average Score": 76.26169400931595,
357
+ "Standard Deviation": null,
358
+ "Rank": 10
359
+ },
360
+ "Algebra": {
361
+ "Average Score": 77.15040433072186,
362
+ "Standard Deviation": null,
363
+ "Rank": 13
364
+ },
365
+ "Probability": {
366
+ "Average Score": 73.9942759783754,
367
+ "Standard Deviation": null,
368
+ "Rank": 11
369
+ },
370
+ "Logical": {
371
+ "Average Score": 89.70827617930533,
372
+ "Standard Deviation": null,
373
+ "Rank": 7
374
+ },
375
+ "Social": {
376
+ "Average Score": 97.3810636467068,
377
+ "Standard Deviation": null,
378
+ "Rank": 3
379
+ },
380
+ "Chemistry": {
381
+ "Average Score": 94.92819763202698,
382
+ "Standard Deviation": null,
383
+ "Rank": 3
384
+ },
385
+ "CPP": {
386
+ "Average Score": 82.37734076815008,
387
+ "Standard Deviation": null,
388
+ "Rank": 5
389
+ }
390
+ }
391
+ },
392
+ {
393
+ "config": {
394
+ "model_name": "o1-mini",
395
+ "organization": "OpenAI",
396
+ "license": "Proprietary",
397
+ "knowledge_cutoff": "2023/10"
398
+ },
399
+ "results": {
400
+ "OVERALL": {
401
+ "Average Score": 96.12399889226096,
402
+ "Standard Deviation": 0.5674965705992511,
403
+ "Rank": 2
404
+ },
405
+ "Geometry": {
406
+ "Average Score": 100.0,
407
+ "Standard Deviation": null,
408
+ "Rank": 1
409
+ },
410
+ "Algebra": {
411
+ "Average Score": 100.0,
412
+ "Standard Deviation": null,
413
+ "Rank": 1
414
+ },
415
+ "Probability": {
416
+ "Average Score": 100.0,
417
+ "Standard Deviation": null,
418
+ "Rank": 1
419
+ },
420
+ "Logical": {
421
+ "Average Score": 96.52089445393929,
422
+ "Standard Deviation": null,
423
+ "Rank": 3
424
+ },
425
+ "Social": {
426
+ "Average Score": 95.00695256918654,
427
+ "Standard Deviation": null,
428
+ "Rank": 5
429
+ }
430
+ }
431
+ },
432
+ {
433
+ "config": {
434
+ "model_name": "o1-preview",
435
+ "organization": "OpenAI",
436
+ "license": "Proprietary",
437
+ "knowledge_cutoff": "2023/10"
438
+ },
439
+ "results": {
440
+ "OVERALL": {
441
+ "Average Score": 91.08240629161766,
442
+ "Standard Deviation": 4.83378135710071,
443
+ "Rank": 5
444
+ },
445
+ "Geometry": {
446
+ "Average Score": "N/A",
447
+ "Standard Deviation": "N/A",
448
+ "Rank": "N/A"
449
+ },
450
+ "Algebra": {
451
+ "Average Score": 98.1870991822192,
452
+ "Standard Deviation": null,
453
+ "Rank": 2
454
+ },
455
+ "Probability": {
456
+ "Average Score": 94.12657646584134,
457
+ "Standard Deviation": null,
458
+ "Rank": 2
459
+ },
460
+ "Logical": {
461
+ "Average Score": 100.0,
462
+ "Standard Deviation": null,
463
+ "Rank": 1
464
+ },
465
+ "Social": {
466
+ "Average Score": 96.56802743955569,
467
+ "Standard Deviation": null,
468
+ "Rank": 4
469
+ }
470
+ }
471
+ },
472
+ {
473
+ "config": {
474
+ "model_name": "gemini-1.5-flash-001",
475
+ "organization": "Google",
476
+ "license": "Proprietary",
477
+ "knowledge_cutoff": "2023/11"
478
+ },
479
+ "results": {
480
+ "OVERALL": {
481
+ "Average Score": 66.25275609135964,
482
+ "Standard Deviation": 2.5314573702881438,
483
+ "Rank": 20
484
+ },
485
+ "Geometry": {
486
+ "Average Score": 66.8010242138006,
487
+ "Standard Deviation": null,
488
+ "Rank": 13
489
+ },
490
+ "Algebra": {
491
+ "Average Score": 78.24639082497596,
492
+ "Standard Deviation": null,
493
+ "Rank": 12
494
+ },
495
+ "Probability": {
496
+ "Average Score": 67.84602916736804,
497
+ "Standard Deviation": null,
498
+ "Rank": 15
499
+ },
500
+ "Logical": {
501
+ "Average Score": 72.76845749138818,
502
+ "Standard Deviation": null,
503
+ "Rank": 17
504
+ },
505
+ "Social": {
506
+ "Average Score": 68.57728479711058,
507
+ "Standard Deviation": null,
508
+ "Rank": 16
509
+ },
510
+ "Chemistry": {
511
+ "Average Score": 75.47188329078935,
512
+ "Standard Deviation": null,
513
+ "Rank": 12
514
+ },
515
+ "CPP": {
516
+ "Average Score": 72.1127762005651,
517
+ "Standard Deviation": null,
518
+ "Rank": 10
519
+ }
520
+ }
521
+ },
522
+ {
523
+ "config": {
524
+ "model_name": "gpt4-1106",
525
+ "organization": "OpenAI",
526
+ "license": "Proprietary",
527
+ "knowledge_cutoff": "2024/04"
528
+ },
529
+ "results": {
530
+ "OVERALL": {
531
+ "Average Score": 85.660054434658,
532
+ "Standard Deviation": 7.392502344300497,
533
+ "Rank": 10
534
+ },
535
+ "Geometry": {
536
+ "Average Score": 63.36396165140893,
537
+ "Standard Deviation": null,
538
+ "Rank": 15
539
+ },
540
+ "Algebra": {
541
+ "Average Score": 74.67191687355754,
542
+ "Standard Deviation": null,
543
+ "Rank": 15
544
+ },
545
+ "Probability": {
546
+ "Average Score": 71.35141952665965,
547
+ "Standard Deviation": null,
548
+ "Rank": 14
549
+ },
550
+ "Logical": {
551
+ "Average Score": 76.34506017196868,
552
+ "Standard Deviation": null,
553
+ "Rank": 15
554
+ },
555
+ "Social": {
556
+ "Average Score": 46.00126575332808,
557
+ "Standard Deviation": null,
558
+ "Rank": 25
559
+ },
560
+ "Chemistry": {
561
+ "Average Score": 78.70156756289569,
562
+ "Standard Deviation": null,
563
+ "Rank": 11
564
+ },
565
+ "CPP": {
566
+ "Average Score": 69.11824072252848,
567
+ "Standard Deviation": null,
568
+ "Rank": 12
569
+ }
570
+ }
571
+ },
572
+ {
573
+ "config": {
574
+ "model_name": "gemma-2-27b-it",
575
+ "organization": "Google",
576
+ "license": "Gemma License",
577
+ "knowledge_cutoff": "2024/06"
578
+ },
579
+ "results": {
580
+ "OVERALL": {
581
+ "Average Score": 70.82622192650408,
582
+ "Standard Deviation": 0.18962869075029884,
583
+ "Rank": 18
584
+ },
585
+ "Geometry": {
586
+ "Average Score": 58.25724467150374,
587
+ "Standard Deviation": null,
588
+ "Rank": 16
589
+ },
590
+ "Algebra": {
591
+ "Average Score": 73.71614711121721,
592
+ "Standard Deviation": null,
593
+ "Rank": 16
594
+ },
595
+ "Probability": {
596
+ "Average Score": 66.08200742339983,
597
+ "Standard Deviation": null,
598
+ "Rank": 17
599
+ },
600
+ "Logical": {
601
+ "Average Score": 72.76841354275011,
602
+ "Standard Deviation": null,
603
+ "Rank": 17
604
+ },
605
+ "Social": {
606
+ "Average Score": 53.736358144621576,
607
+ "Standard Deviation": null,
608
+ "Rank": 21
609
+ },
610
+ "Chemistry": {
611
+ "Average Score": 68.1178055540124,
612
+ "Standard Deviation": null,
613
+ "Rank": 17
614
+ },
615
+ "CPP": {
616
+ "Average Score": 63.28920072143611,
617
+ "Standard Deviation": null,
618
+ "Rank": 14
619
+ }
620
+ }
621
+ },
622
+ {
623
+ "config": {
624
+ "model_name": "claude-3-opus",
625
+ "organization": "Anthropic",
626
+ "license": "Proprietary",
627
+ "knowledge_cutoff": "2023/08"
628
+ },
629
+ "results": {
630
+ "OVERALL": {
631
+ "Average Score": 82.28903171580336,
632
+ "Standard Deviation": 10.093273304495547,
633
+ "Rank": 11
634
+ },
635
+ "Geometry": {
636
+ "Average Score": 57.98602891013921,
637
+ "Standard Deviation": null,
638
+ "Rank": 17
639
+ },
640
+ "Algebra": {
641
+ "Average Score": 73.54334730242743,
642
+ "Standard Deviation": null,
643
+ "Rank": 18
644
+ },
645
+ "Probability": {
646
+ "Average Score": 67.8341594991468,
647
+ "Standard Deviation": null,
648
+ "Rank": 15
649
+ },
650
+ "Logical": {
651
+ "Average Score": 78.31155849680502,
652
+ "Standard Deviation": null,
653
+ "Rank": 12
654
+ },
655
+ "Social": {
656
+ "Average Score": 90.45833112761075,
657
+ "Standard Deviation": null,
658
+ "Rank": 8
659
+ },
660
+ "Chemistry": {
661
+ "Average Score": 85.97349470177741,
662
+ "Standard Deviation": null,
663
+ "Rank": 8
664
+ },
665
+ "CPP": {
666
+ "Average Score": 73.5404403567132,
667
+ "Standard Deviation": null,
668
+ "Rank": 8
669
+ }
670
+ }
671
+ },
672
+ {
673
+ "config": {
674
+ "model_name": "gemma-2-9b-it-simpo",
675
+ "organization": "Google",
676
+ "license": "Gemma License",
677
+ "knowledge_cutoff": "2024/07"
678
+ },
679
+ "results": {
680
+ "OVERALL": {
681
+ "Average Score": "N/A",
682
+ "Standard Deviation": "N/A",
683
+ "Rank": "N/A"
684
+ },
685
+ "Geometry": {
686
+ "Average Score": 52.80896798216458,
687
+ "Standard Deviation": null,
688
+ "Rank": 19
689
+ },
690
+ "Algebra": {
691
+ "Average Score": 69.60260038105677,
692
+ "Standard Deviation": null,
693
+ "Rank": 19
694
+ },
695
+ "Probability": {
696
+ "Average Score": 59.52630271491633,
697
+ "Standard Deviation": null,
698
+ "Rank": 21
699
+ },
700
+ "Logical": {
701
+ "Average Score": 63.57920031465781,
702
+ "Standard Deviation": null,
703
+ "Rank": 23
704
+ },
705
+ "Social": {
706
+ "Average Score": 79.90950201631269,
707
+ "Standard Deviation": null,
708
+ "Rank": 11
709
+ },
710
+ "Chemistry": {
711
+ "Average Score": 90.36508196626548,
712
+ "Standard Deviation": null,
713
+ "Rank": 5
714
+ },
715
+ "CPP": {
716
+ "Average Score": 73.43757596214863,
717
+ "Standard Deviation": null,
718
+ "Rank": 9
719
+ }
720
+ }
721
+ },
722
+ {
723
+ "config": {
724
+ "model_name": "qwen1.5-72b-chat",
725
+ "organization": "Alibaba",
726
+ "license": "Qianwen LICENSE",
727
+ "knowledge_cutoff": "2024/03"
728
+ },
729
+ "results": {
730
+ "OVERALL": {
731
+ "Average Score": 65.26710370586439,
732
+ "Standard Deviation": 9.198700753743012,
733
+ "Rank": 19
734
+ },
735
+ "Geometry": {
736
+ "Average Score": 48.52417714351894,
737
+ "Standard Deviation": null,
738
+ "Rank": 24
739
+ },
740
+ "Algebra": {
741
+ "Average Score": 68.55765479604507,
742
+ "Standard Deviation": null,
743
+ "Rank": 20
744
+ },
745
+ "Probability": {
746
+ "Average Score": 49.52382148131357,
747
+ "Standard Deviation": null,
748
+ "Rank": 26
749
+ },
750
+ "Logical": {
751
+ "Average Score": 37.33563924001827,
752
+ "Standard Deviation": null,
753
+ "Rank": 35
754
+ },
755
+ "Social": {
756
+ "Average Score": 46.00141195402727,
757
+ "Standard Deviation": null,
758
+ "Rank": 25
759
+ },
760
+ "Chemistry": {
761
+ "Average Score": 52.625823960166215,
762
+ "Standard Deviation": null,
763
+ "Rank": 23
764
+ },
765
+ "CPP": {
766
+ "Average Score": 48.69302376665551,
767
+ "Standard Deviation": null,
768
+ "Rank": 20
769
+ }
770
+ }
771
+ },
772
+ {
773
+ "config": {
774
+ "model_name": "qwen1.5-32b-chat",
775
+ "organization": "Alibaba",
776
+ "license": "Qianwen LICENSE",
777
+ "knowledge_cutoff": "2024/03"
778
+ },
779
+ "results": {
780
+ "OVERALL": {
781
+ "Average Score": 46.74335731441104,
782
+ "Standard Deviation": 4.096227849530709,
783
+ "Rank": 28
784
+ },
785
+ "Geometry": {
786
+ "Average Score": 44.96670224519297,
787
+ "Standard Deviation": null,
788
+ "Rank": 26
789
+ },
790
+ "Algebra": {
791
+ "Average Score": 63.19715848628476,
792
+ "Standard Deviation": null,
793
+ "Rank": 23
794
+ },
795
+ "Probability": {
796
+ "Average Score": 48.59873650270336,
797
+ "Standard Deviation": null,
798
+ "Rank": 27
799
+ },
800
+ "Logical": {
801
+ "Average Score": 42.028753105249216,
802
+ "Standard Deviation": null,
803
+ "Rank": 33
804
+ },
805
+ "Social": {
806
+ "Average Score": 43.183938768454986,
807
+ "Standard Deviation": null,
808
+ "Rank": 28
809
+ },
810
+ "Chemistry": {
811
+ "Average Score": 47.84488021045937,
812
+ "Standard Deviation": null,
813
+ "Rank": 26
814
+ },
815
+ "CPP": {
816
+ "Average Score": 45.14284028264288,
817
+ "Standard Deviation": null,
818
+ "Rank": 24
819
+ }
820
+ }
821
+ },
822
+ {
823
+ "config": {
824
+ "model_name": "google-gemma-2-9b-it",
825
+ "organization": "Google",
826
+ "license": "Proprietary",
827
+ "knowledge_cutoff": "2024/06"
828
+ },
829
+ "results": {
830
+ "OVERALL": {
831
+ "Average Score": 60.71065949101693,
832
+ "Standard Deviation": 0.12283018509137462,
833
+ "Rank": 23
834
+ },
835
+ "Geometry": {
836
+ "Average Score": 52.49270527783856,
837
+ "Standard Deviation": null,
838
+ "Rank": 20
839
+ },
840
+ "Algebra": {
841
+ "Average Score": 63.446032975128176,
842
+ "Standard Deviation": null,
843
+ "Rank": 21
844
+ },
845
+ "Probability": {
846
+ "Average Score": 63.95287475488081,
847
+ "Standard Deviation": null,
848
+ "Rank": 20
849
+ },
850
+ "Logical": {
851
+ "Average Score": 70.18644584116615,
852
+ "Standard Deviation": null,
853
+ "Rank": 20
854
+ },
855
+ "Social": {
856
+ "Average Score": 86.45401862572464,
857
+ "Standard Deviation": null,
858
+ "Rank": 9
859
+ },
860
+ "Chemistry": {
861
+ "Average Score": 57.56342217758078,
862
+ "Standard Deviation": null,
863
+ "Rank": 20
864
+ },
865
+ "CPP": {
866
+ "Average Score": 54.03167523687635,
867
+ "Standard Deviation": null,
868
+ "Rank": 17
869
+ }
870
+ }
871
+ },
872
+ {
873
+ "config": {
874
+ "model_name": "yi-1.5-34b-chat",
875
+ "organization": "01 AI",
876
+ "license": "Proprietary",
877
+ "knowledge_cutoff": "2024/05"
878
+ },
879
+ "results": {
880
+ "OVERALL": {
881
+ "Average Score": 71.53811567931923,
882
+ "Standard Deviation": 0.4838075734512934,
883
+ "Rank": 17
884
+ },
885
+ "Geometry": {
886
+ "Average Score": 53.98343904373819,
887
+ "Standard Deviation": null,
888
+ "Rank": 18
889
+ },
890
+ "Algebra": {
891
+ "Average Score": 63.317896075817885,
892
+ "Standard Deviation": null,
893
+ "Rank": 22
894
+ },
895
+ "Probability": {
896
+ "Average Score": 64.73492918491159,
897
+ "Standard Deviation": null,
898
+ "Rank": 19
899
+ },
900
+ "Logical": {
901
+ "Average Score": 66.39420245024361,
902
+ "Standard Deviation": null,
903
+ "Rank": 21
904
+ },
905
+ "Social": {
906
+ "Average Score": 53.73650350964252,
907
+ "Standard Deviation": null,
908
+ "Rank": 21
909
+ },
910
+ "Chemistry": {
911
+ "Average Score": 56.722360677914686,
912
+ "Standard Deviation": null,
913
+ "Rank": 21
914
+ },
915
+ "CPP": {
916
+ "Average Score": 52.148798061768964,
917
+ "Standard Deviation": null,
918
+ "Rank": 18
919
+ }
920
+ }
921
+ },
922
+ {
923
+ "config": {
924
+ "model_name": "meta-llama-3.1-70b-instruct",
925
+ "organization": "Meta",
926
+ "license": "Llama 3.1 Community",
927
+ "knowledge_cutoff": "2023/12"
928
+ },
929
+ "results": {
930
+ "OVERALL": {
931
+ "Average Score": 74.01502078434305,
932
+ "Standard Deviation": 0.24116839515156926,
933
+ "Rank": 15
934
+ },
935
+ "Geometry": {
936
+ "Average Score": 66.80097850274383,
937
+ "Standard Deviation": null,
938
+ "Rank": 13
939
+ },
940
+ "Algebra": {
941
+ "Average Score": 74.7667367179752,
942
+ "Standard Deviation": null,
943
+ "Rank": 14
944
+ },
945
+ "Probability": {
946
+ "Average Score": 66.0819470113051,
947
+ "Standard Deviation": null,
948
+ "Rank": 17
949
+ },
950
+ "Logical": {
951
+ "Average Score": 73.68238947162197,
952
+ "Standard Deviation": null,
953
+ "Rank": 16
954
+ },
955
+ "Social": {
956
+ "Average Score": 68.577541438994,
957
+ "Standard Deviation": null,
958
+ "Rank": 16
959
+ },
960
+ "Chemistry": {
961
+ "Average Score": 70.4019514562452,
962
+ "Standard Deviation": null,
963
+ "Rank": 15
964
+ },
965
+ "CPP": {
966
+ "Average Score": 84.36815192532764,
967
+ "Standard Deviation": null,
968
+ "Rank": 4
969
+ }
970
+ }
971
+ },
972
+ {
973
+ "config": {
974
+ "model_name": "meta-llama-3.1-8b-instruct",
975
+ "organization": "Meta",
976
+ "license": "Llama 3.1 Community",
977
+ "knowledge_cutoff": "2023/12"
978
+ },
979
+ "results": {
980
+ "OVERALL": {
981
+ "Average Score": 55.268736955905695,
982
+ "Standard Deviation": 7.060517225126177,
983
+ "Rank": 26
984
+ },
985
+ "Geometry": {
986
+ "Average Score": 42.44262022417502,
987
+ "Standard Deviation": null,
988
+ "Rank": 28
989
+ },
990
+ "Algebra": {
991
+ "Average Score": 60.632347391080486,
992
+ "Standard Deviation": null,
993
+ "Rank": 25
994
+ },
995
+ "Probability": {
996
+ "Average Score": 52.372362507453694,
997
+ "Standard Deviation": null,
998
+ "Rank": 24
999
+ },
1000
+ "Logical": {
1001
+ "Average Score": 54.17571378414435,
1002
+ "Standard Deviation": null,
1003
+ "Rank": 28
1004
+ },
1005
+ "Social": {
1006
+ "Average Score": 39.07966801070027,
1007
+ "Standard Deviation": null,
1008
+ "Rank": 31
1009
+ },
1010
+ "Chemistry": {
1011
+ "Average Score": 45.0170262190059,
1012
+ "Standard Deviation": null,
1013
+ "Rank": 29
1014
+ },
1015
+ "CPP": {
1016
+ "Average Score": 44.41846841004584,
1017
+ "Standard Deviation": null,
1018
+ "Rank": 26
1019
+ }
1020
+ }
1021
+ },
1022
+ {
1023
+ "config": {
1024
+ "model_name": "gpt3.5-turbo-0125",
1025
+ "organization": "OpenAI",
1026
+ "license": "Proprietary",
1027
+ "knowledge_cutoff": "2021/09"
1028
+ },
1029
+ "results": {
1030
+ "OVERALL": {
1031
+ "Average Score": 29.17379433602279,
1032
+ "Standard Deviation": 2.6813415847393878,
1033
+ "Rank": 44
1034
+ },
1035
+ "Geometry": {
1036
+ "Average Score": 51.47279337094397,
1037
+ "Standard Deviation": null,
1038
+ "Rank": 21
1039
+ },
1040
+ "Algebra": {
1041
+ "Average Score": 59.03601450977881,
1042
+ "Standard Deviation": null,
1043
+ "Rank": 26
1044
+ },
1045
+ "Probability": {
1046
+ "Average Score": 46.71541304474977,
1047
+ "Standard Deviation": null,
1048
+ "Rank": 28
1049
+ },
1050
+ "Logical": {
1051
+ "Average Score": 20.82026871015984,
1052
+ "Standard Deviation": null,
1053
+ "Rank": 46
1054
+ },
1055
+ "Social": {
1056
+ "Average Score": 28.31096293069848,
1057
+ "Standard Deviation": null,
1058
+ "Rank": 41
1059
+ },
1060
+ "Chemistry": {
1061
+ "Average Score": 42.899594571904004,
1062
+ "Standard Deviation": null,
1063
+ "Rank": 31
1064
+ },
1065
+ "CPP": {
1066
+ "Average Score": 40.46958736582551,
1067
+ "Standard Deviation": null,
1068
+ "Rank": 29
1069
+ }
1070
+ }
1071
+ },
1072
+ {
1073
+ "config": {
1074
+ "model_name": "llama-3-70b-instruct",
1075
+ "organization": "Meta",
1076
+ "license": "Llama 3 Community",
1077
+ "knowledge_cutoff": "2023/12"
1078
+ },
1079
+ "results": {
1080
+ "OVERALL": {
1081
+ "Average Score": 65.90407336557487,
1082
+ "Standard Deviation": 66.63940143516267,
1083
+ "Rank": 24
1084
+ },
1085
+ "Geometry": {
1086
+ "Average Score": 46.40555349958932,
1087
+ "Standard Deviation": null,
1088
+ "Rank": 25
1089
+ },
1090
+ "Algebra": {
1091
+ "Average Score": 60.86276607976933,
1092
+ "Standard Deviation": null,
1093
+ "Rank": 24
1094
+ },
1095
+ "Probability": {
1096
+ "Average Score": 55.0233135868055,
1097
+ "Standard Deviation": null,
1098
+ "Rank": 22
1099
+ },
1100
+ "Logical": {
1101
+ "Average Score": 83.99546392889077,
1102
+ "Standard Deviation": null,
1103
+ "Rank": 11
1104
+ },
1105
+ "Social": {
1106
+ "Average Score": 47.90189246663785,
1107
+ "Standard Deviation": null,
1108
+ "Rank": 23
1109
+ },
1110
+ "Chemistry": {
1111
+ "Average Score": 70.40198909396582,
1112
+ "Standard Deviation": null,
1113
+ "Rank": 15
1114
+ },
1115
+ "CPP": {
1116
+ "Average Score": 65.32140697218945,
1117
+ "Standard Deviation": null,
1118
+ "Rank": 13
1119
+ }
1120
+ }
1121
+ },
1122
+ {
1123
+ "config": {
1124
+ "model_name": "claude-3-sonnet",
1125
+ "organization": "Anthropic",
1126
+ "license": "Proprietary",
1127
+ "knowledge_cutoff": "2023/08"
1128
+ },
1129
+ "results": {
1130
+ "OVERALL": {
1131
+ "Average Score": 64.4278622266347,
1132
+ "Standard Deviation": 3.089828107392469,
1133
+ "Rank": 21
1134
+ },
1135
+ "Geometry": {
1136
+ "Average Score": 51.4677627365698,
1137
+ "Standard Deviation": null,
1138
+ "Rank": 21
1139
+ },
1140
+ "Algebra": {
1141
+ "Average Score": 57.157810499255426,
1142
+ "Standard Deviation": null,
1143
+ "Rank": 27
1144
+ },
1145
+ "Probability": {
1146
+ "Average Score": 54.68761427070592,
1147
+ "Standard Deviation": null,
1148
+ "Rank": 23
1149
+ },
1150
+ "Logical": {
1151
+ "Average Score": 65.8346271849297,
1152
+ "Standard Deviation": null,
1153
+ "Rank": 22
1154
+ },
1155
+ "Social": {
1156
+ "Average Score": 62.842721798877186,
1157
+ "Standard Deviation": null,
1158
+ "Rank": 18
1159
+ },
1160
+ "Chemistry": {
1161
+ "Average Score": 66.1914400411681,
1162
+ "Standard Deviation": null,
1163
+ "Rank": 18
1164
+ },
1165
+ "CPP": {
1166
+ "Average Score": 61.33538592327427,
1167
+ "Standard Deviation": null,
1168
+ "Rank": 15
1169
+ }
1170
+ }
1171
+ },
1172
+ {
1173
+ "config": {
1174
+ "model_name": "qwen1.5-14b-chat",
1175
+ "organization": "Alibaba",
1176
+ "license": "Qianwen LICENSE",
1177
+ "knowledge_cutoff": "2024/02"
1178
+ },
1179
+ "results": {
1180
+ "OVERALL": {
1181
+ "Average Score": 44.920016997055804,
1182
+ "Standard Deviation": 0.3041914765974254,
1183
+ "Rank": 30
1184
+ },
1185
+ "Geometry": {
1186
+ "Average Score": 36.40735570120079,
1187
+ "Standard Deviation": null,
1188
+ "Rank": 30
1189
+ },
1190
+ "Algebra": {
1191
+ "Average Score": 56.004717588310726,
1192
+ "Standard Deviation": null,
1193
+ "Rank": 28
1194
+ },
1195
+ "Probability": {
1196
+ "Average Score": 39.24866255465088,
1197
+ "Standard Deviation": null,
1198
+ "Rank": 33
1199
+ },
1200
+ "Logical": {
1201
+ "Average Score": 35.15462916949486,
1202
+ "Standard Deviation": null,
1203
+ "Rank": 38
1204
+ },
1205
+ "Social": {
1206
+ "Average Score": 35.236185321936766,
1207
+ "Standard Deviation": null,
1208
+ "Rank": 34
1209
+ },
1210
+ "Chemistry": {
1211
+ "Average Score": 40.803706763362605,
1212
+ "Standard Deviation": null,
1213
+ "Rank": 34
1214
+ },
1215
+ "CPP": {
1216
+ "Average Score": 38.552779976347026,
1217
+ "Standard Deviation": null,
1218
+ "Rank": 31
1219
+ }
1220
+ }
1221
+ },
1222
+ {
1223
+ "config": {
1224
+ "model_name": "claude-3-haiku",
1225
+ "organization": "Anthropic",
1226
+ "license": "Proprietary",
1227
+ "knowledge_cutoff": "2023/08"
1228
+ },
1229
+ "results": {
1230
+ "OVERALL": {
1231
+ "Average Score": 53.46814061793852,
1232
+ "Standard Deviation": 10.143567097006747,
1233
+ "Rank": 25
1234
+ },
1235
+ "Geometry": {
1236
+ "Average Score": 42.87542087805953,
1237
+ "Standard Deviation": null,
1238
+ "Rank": 27
1239
+ },
1240
+ "Algebra": {
1241
+ "Average Score": 53.706856083803686,
1242
+ "Standard Deviation": null,
1243
+ "Rank": 30
1244
+ },
1245
+ "Probability": {
1246
+ "Average Score": 49.80372052799326,
1247
+ "Standard Deviation": null,
1248
+ "Rank": 25
1249
+ },
1250
+ "Logical": {
1251
+ "Average Score": 62.585349577709394,
1252
+ "Standard Deviation": null,
1253
+ "Rank": 24
1254
+ },
1255
+ "Social": {
1256
+ "Average Score": 57.25601125762336,
1257
+ "Standard Deviation": null,
1258
+ "Rank": 19
1259
+ },
1260
+ "Chemistry": {
1261
+ "Average Score": 60.48921113945562,
1262
+ "Standard Deviation": null,
1263
+ "Rank": 19
1264
+ },
1265
+ "CPP": {
1266
+ "Average Score": 56.40200048817984,
1267
+ "Standard Deviation": null,
1268
+ "Rank": 16
1269
+ }
1270
+ }
1271
+ },
1272
+ {
1273
+ "config": {
1274
+ "model_name": "claude-2.1",
1275
+ "organization": "Anthropic",
1276
+ "license": "Proprietary",
1277
+ "knowledge_cutoff": "Unknown"
1278
+ },
1279
+ "results": {
1280
+ "OVERALL": {
1281
+ "Average Score": 39.855928282633364,
1282
+ "Standard Deviation": 8.396129652430814,
1283
+ "Rank": 35
1284
+ },
1285
+ "Geometry": {
1286
+ "Average Score": 51.1749207092159,
1287
+ "Standard Deviation": null,
1288
+ "Rank": 23
1289
+ },
1290
+ "Algebra": {
1291
+ "Average Score": 53.05386216145516,
1292
+ "Standard Deviation": null,
1293
+ "Rank": 31
1294
+ },
1295
+ "Probability": {
1296
+ "Average Score": 44.42150447611455,
1297
+ "Standard Deviation": null,
1298
+ "Rank": 30
1299
+ },
1300
+ "Logical": {
1301
+ "Average Score": 60.51381867118053,
1302
+ "Standard Deviation": null,
1303
+ "Rank": 25
1304
+ },
1305
+ "Social": {
1306
+ "Average Score": 38.492280755756035,
1307
+ "Standard Deviation": null,
1308
+ "Rank": 32
1309
+ },
1310
+ "Chemistry": {
1311
+ "Average Score": 50.66182745698702,
1312
+ "Standard Deviation": null,
1313
+ "Rank": 24
1314
+ },
1315
+ "CPP": {
1316
+ "Average Score": 47.23672563994903,
1317
+ "Standard Deviation": null,
1318
+ "Rank": 21
1319
+ }
1320
+ }
1321
+ },
1322
+ {
1323
+ "config": {
1324
+ "model_name": "mistral-8x7b-instruct-v0.1",
1325
+ "organization": "Mistral",
1326
+ "license": "Apache 2.0",
1327
+ "knowledge_cutoff": "2023/12"
1328
+ },
1329
+ "results": {
1330
+ "OVERALL": {
1331
+ "Average Score": 42.70451051343715,
1332
+ "Standard Deviation": 9.965602920103015,
1333
+ "Rank": 31
1334
+ },
1335
+ "Geometry": {
1336
+ "Average Score": 33.473933494899164,
1337
+ "Standard Deviation": null,
1338
+ "Rank": 34
1339
+ },
1340
+ "Algebra": {
1341
+ "Average Score": 48.99207852115047,
1342
+ "Standard Deviation": null,
1343
+ "Rank": 34
1344
+ },
1345
+ "Probability": {
1346
+ "Average Score": 44.46936520340586,
1347
+ "Standard Deviation": null,
1348
+ "Rank": 30
1349
+ },
1350
+ "Logical": {
1351
+ "Average Score": 42.656238987207246,
1352
+ "Standard Deviation": null,
1353
+ "Rank": 31
1354
+ },
1355
+ "Social": {
1356
+ "Average Score": 30.32900110312259,
1357
+ "Standard Deviation": null,
1358
+ "Rank": 40
1359
+ },
1360
+ "Chemistry": {
1361
+ "Average Score": 47.047104057571026,
1362
+ "Standard Deviation": null,
1363
+ "Rank": 27
1364
+ },
1365
+ "CPP": {
1366
+ "Average Score": 44.533118241976666,
1367
+ "Standard Deviation": null,
1368
+ "Rank": 25
1369
+ }
1370
+ }
1371
+ },
1372
+ {
1373
+ "config": {
1374
+ "model_name": "claude-2.0",
1375
+ "organization": "Anthropic",
1376
+ "license": "Proprietary",
1377
+ "knowledge_cutoff": "Unknown"
1378
+ },
1379
+ "results": {
1380
+ "OVERALL": {
1381
+ "Average Score": 33.53990717968659,
1382
+ "Standard Deviation": 7.640386327990536,
1383
+ "Rank": 41
1384
+ },
1385
+ "Geometry": {
1386
+ "Average Score": 38.40953902052666,
1387
+ "Standard Deviation": null,
1388
+ "Rank": 29
1389
+ },
1390
+ "Algebra": {
1391
+ "Average Score": 49.07235259762855,
1392
+ "Standard Deviation": null,
1393
+ "Rank": 33
1394
+ },
1395
+ "Probability": {
1396
+ "Average Score": 46.71546649299419,
1397
+ "Standard Deviation": null,
1398
+ "Rank": 28
1399
+ },
1400
+ "Logical": {
1401
+ "Average Score": 56.26908965013192,
1402
+ "Standard Deviation": null,
1403
+ "Rank": 27
1404
+ },
1405
+ "Social": {
1406
+ "Average Score": 47.84034165469707,
1407
+ "Standard Deviation": null,
1408
+ "Rank": 23
1409
+ },
1410
+ "Chemistry": {
1411
+ "Average Score": 55.20362543510563,
1412
+ "Standard Deviation": null,
1413
+ "Rank": 22
1414
+ },
1415
+ "CPP": {
1416
+ "Average Score": 50.773143448036464,
1417
+ "Standard Deviation": null,
1418
+ "Rank": 19
1419
+ }
1420
+ }
1421
+ },
1422
+ {
1423
+ "config": {
1424
+ "model_name": "starling-lm-7b-beta",
1425
+ "organization": "Nexusflow",
1426
+ "license": "Apache-2.0",
1427
+ "knowledge_cutoff": "2024/03"
1428
+ },
1429
+ "results": {
1430
+ "OVERALL": {
1431
+ "Average Score": 50.90398580969381,
1432
+ "Standard Deviation": 0.2839403187065694,
1433
+ "Rank": 27
1434
+ },
1435
+ "Geometry": {
1436
+ "Average Score": 34.653904247826965,
1437
+ "Standard Deviation": null,
1438
+ "Rank": 33
1439
+ },
1440
+ "Algebra": {
1441
+ "Average Score": 49.66265150940668,
1442
+ "Standard Deviation": null,
1443
+ "Rank": 32
1444
+ },
1445
+ "Probability": {
1446
+ "Average Score": 40.04695085773174,
1447
+ "Standard Deviation": null,
1448
+ "Rank": 32
1449
+ },
1450
+ "Logical": {
1451
+ "Average Score": 48.02284849364292,
1452
+ "Standard Deviation": null,
1453
+ "Rank": 29
1454
+ },
1455
+ "Social": {
1456
+ "Average Score": 42.82322308642107,
1457
+ "Standard Deviation": null,
1458
+ "Rank": 29
1459
+ },
1460
+ "Chemistry": {
1461
+ "Average Score": 40.54467030566931,
1462
+ "Standard Deviation": null,
1463
+ "Rank": 35
1464
+ },
1465
+ "CPP": {
1466
+ "Average Score": 38.27587102395908,
1467
+ "Standard Deviation": null,
1468
+ "Rank": 32
1469
+ }
1470
+ }
1471
+ },
1472
+ {
1473
+ "config": {
1474
+ "model_name": "gemini-1.0-pro-001",
1475
+ "organization": "Google",
1476
+ "license": "Proprietary",
1477
+ "knowledge_cutoff": "2023/04"
1478
+ },
1479
+ "results": {
1480
+ "OVERALL": {
1481
+ "Average Score": 37.91102687366529,
1482
+ "Standard Deviation": 15.15111885239772,
1483
+ "Rank": 38
1484
+ },
1485
+ "Geometry": {
1486
+ "Average Score": 35.480853719259684,
1487
+ "Standard Deviation": null,
1488
+ "Rank": 32
1489
+ },
1490
+ "Algebra": {
1491
+ "Average Score": 48.08542847805497,
1492
+ "Standard Deviation": null,
1493
+ "Rank": 35
1494
+ },
1495
+ "Probability": {
1496
+ "Average Score": 29.862669786973395,
1497
+ "Standard Deviation": null,
1498
+ "Rank": 42
1499
+ },
1500
+ "Logical": {
1501
+ "Average Score": 24.141794297157134,
1502
+ "Standard Deviation": null,
1503
+ "Rank": 43
1504
+ },
1505
+ "Social": {
1506
+ "Average Score": 15.062345665891504,
1507
+ "Standard Deviation": null,
1508
+ "Rank": 51
1509
+ },
1510
+ "Chemistry": {
1511
+ "Average Score": 46.52522766257804,
1512
+ "Standard Deviation": null,
1513
+ "Rank": 28
1514
+ },
1515
+ "CPP": {
1516
+ "Average Score": 45.22204471452975,
1517
+ "Standard Deviation": null,
1518
+ "Rank": 23
1519
+ }
1520
+ }
1521
+ },
1522
+ {
1523
+ "config": {
1524
+ "model_name": "openchat-3.5-0106",
1525
+ "organization": "OpenChat",
1526
+ "license": "Apache-2.0",
1527
+ "knowledge_cutoff": "2024/01"
1528
+ },
1529
+ "results": {
1530
+ "OVERALL": {
1531
+ "Average Score": 41.34314082389491,
1532
+ "Standard Deviation": 4.394481877390224,
1533
+ "Rank": 32
1534
+ },
1535
+ "Geometry": {
1536
+ "Average Score": 29.859015723426758,
1537
+ "Standard Deviation": null,
1538
+ "Rank": 36
1539
+ },
1540
+ "Algebra": {
1541
+ "Average Score": 45.79428201943078,
1542
+ "Standard Deviation": null,
1543
+ "Rank": 36
1544
+ },
1545
+ "Probability": {
1546
+ "Average Score": 38.766888608782956,
1547
+ "Standard Deviation": null,
1548
+ "Rank": 34
1549
+ },
1550
+ "Logical": {
1551
+ "Average Score": 42.1345774485532,
1552
+ "Standard Deviation": null,
1553
+ "Rank": 32
1554
+ },
1555
+ "Social": {
1556
+ "Average Score": 32.07155544930587,
1557
+ "Standard Deviation": null,
1558
+ "Rank": 39
1559
+ },
1560
+ "Chemistry": {
1561
+ "Average Score": 35.28601797606463,
1562
+ "Standard Deviation": null,
1563
+ "Rank": 37
1564
+ },
1565
+ "CPP": {
1566
+ "Average Score": 33.70639271807677,
1567
+ "Standard Deviation": null,
1568
+ "Rank": 33
1569
+ }
1570
+ }
1571
+ },
1572
+ {
1573
+ "config": {
1574
+ "model_name": "openchat-3.5",
1575
+ "organization": "OpenChat",
1576
+ "license": "Apache-2.0",
1577
+ "knowledge_cutoff": "2023/11"
1578
+ },
1579
+ "results": {
1580
+ "OVERALL": {
1581
+ "Average Score": 39.60454188051808,
1582
+ "Standard Deviation": 0.8232501722386516,
1583
+ "Rank": 36
1584
+ },
1585
+ "Geometry": {
1586
+ "Average Score": 30.77657388742533,
1587
+ "Standard Deviation": null,
1588
+ "Rank": 35
1589
+ },
1590
+ "Algebra": {
1591
+ "Average Score": 42.13028451761782,
1592
+ "Standard Deviation": null,
1593
+ "Rank": 38
1594
+ },
1595
+ "Probability": {
1596
+ "Average Score": 34.817635171077754,
1597
+ "Standard Deviation": null,
1598
+ "Rank": 37
1599
+ },
1600
+ "Logical": {
1601
+ "Average Score": 36.21944706732088,
1602
+ "Standard Deviation": null,
1603
+ "Rank": 36
1604
+ },
1605
+ "Social": {
1606
+ "Average Score": 37.59265084241427,
1607
+ "Standard Deviation": null,
1608
+ "Rank": 33
1609
+ },
1610
+ "Chemistry": {
1611
+ "Average Score": 37.21911183748652,
1612
+ "Standard Deviation": null,
1613
+ "Rank": 36
1614
+ },
1615
+ "CPP": {
1616
+ "Average Score": 33.020911255646965,
1617
+ "Standard Deviation": null,
1618
+ "Rank": 34
1619
+ }
1620
+ }
1621
+ },
1622
+ {
1623
+ "config": {
1624
+ "model_name": "command-r-(08-2024)",
1625
+ "organization": "Cohere",
1626
+ "license": "CC-BY-NC-4.0",
1627
+ "knowledge_cutoff": "2024/08"
1628
+ },
1629
+ "results": {
1630
+ "OVERALL": {
1631
+ "Average Score": 45.84310421663912,
1632
+ "Standard Deviation": 0.14535750785421472,
1633
+ "Rank": 29
1634
+ },
1635
+ "Geometry": {
1636
+ "Average Score": 36.33550343578038,
1637
+ "Standard Deviation": null,
1638
+ "Rank": 31
1639
+ },
1640
+ "Algebra": {
1641
+ "Average Score": 41.87079446639028,
1642
+ "Standard Deviation": null,
1643
+ "Rank": 39
1644
+ },
1645
+ "Probability": {
1646
+ "Average Score": 36.87662939858684,
1647
+ "Standard Deviation": null,
1648
+ "Rank": 36
1649
+ },
1650
+ "Logical": {
1651
+ "Average Score": 26.22482921268266,
1652
+ "Standard Deviation": null,
1653
+ "Rank": 41
1654
+ },
1655
+ "Social": {
1656
+ "Average Score": 35.11019761697373,
1657
+ "Standard Deviation": null,
1658
+ "Rank": 35
1659
+ },
1660
+ "Chemistry": {
1661
+ "Average Score": 41.81772722027254,
1662
+ "Standard Deviation": null,
1663
+ "Rank": 33
1664
+ },
1665
+ "CPP": {
1666
+ "Average Score": 39.61492485677676,
1667
+ "Standard Deviation": null,
1668
+ "Rank": 30
1669
+ }
1670
+ }
1671
+ },
1672
+ {
1673
+ "config": {
1674
+ "model_name": "gemma-1.1-7b-it",
1675
+ "organization": "Google",
1676
+ "license": "Gemma License",
1677
+ "knowledge_cutoff": "2024/02"
1678
+ },
1679
+ "results": {
1680
+ "OVERALL": {
1681
+ "Average Score": 35.873210924652795,
1682
+ "Standard Deviation": 6.462625645064649,
1683
+ "Rank": 37
1684
+ },
1685
+ "Geometry": {
1686
+ "Average Score": 25.79207201693066,
1687
+ "Standard Deviation": null,
1688
+ "Rank": 40
1689
+ },
1690
+ "Algebra": {
1691
+ "Average Score": 40.58046616460041,
1692
+ "Standard Deviation": null,
1693
+ "Rank": 40
1694
+ },
1695
+ "Probability": {
1696
+ "Average Score": 29.581773053230897,
1697
+ "Standard Deviation": null,
1698
+ "Rank": 43
1699
+ },
1700
+ "Logical": {
1701
+ "Average Score": 41.99821650962693,
1702
+ "Standard Deviation": null,
1703
+ "Rank": 33
1704
+ },
1705
+ "Social": {
1706
+ "Average Score": 24.39015213949678,
1707
+ "Standard Deviation": null,
1708
+ "Rank": 43
1709
+ },
1710
+ "Chemistry": {
1711
+ "Average Score": 45.01706482033765,
1712
+ "Standard Deviation": null,
1713
+ "Rank": 29
1714
+ },
1715
+ "CPP": {
1716
+ "Average Score": 42.666504105798204,
1717
+ "Standard Deviation": null,
1718
+ "Rank": 27
1719
+ }
1720
+ }
1721
+ },
1722
+ {
1723
+ "config": {
1724
+ "model_name": "llama3-8b-instruct",
1725
+ "organization": "Meta",
1726
+ "license": "Llama 3 Community",
1727
+ "knowledge_cutoff": "2023/03"
1728
+ },
1729
+ "results": {
1730
+ "OVERALL": {
1731
+ "Average Score": 39.00917270775336,
1732
+ "Standard Deviation": 3.999506140299149,
1733
+ "Rank": 39
1734
+ },
1735
+ "Geometry": {
1736
+ "Average Score": 29.224089668837465,
1737
+ "Standard Deviation": null,
1738
+ "Rank": 38
1739
+ },
1740
+ "Algebra": {
1741
+ "Average Score": 42.90961619082775,
1742
+ "Standard Deviation": null,
1743
+ "Rank": 37
1744
+ },
1745
+ "Probability": {
1746
+ "Average Score": 34.15721355738147,
1747
+ "Standard Deviation": null,
1748
+ "Rank": 38
1749
+ },
1750
+ "Logical": {
1751
+ "Average Score": 58.39773915370141,
1752
+ "Standard Deviation": null,
1753
+ "Rank": 26
1754
+ },
1755
+ "Social": {
1756
+ "Average Score": 40.88535401371015,
1757
+ "Standard Deviation": null,
1758
+ "Rank": 30
1759
+ },
1760
+ "Chemistry": {
1761
+ "Average Score": 49.70839372661025,
1762
+ "Standard Deviation": null,
1763
+ "Rank": 25
1764
+ },
1765
+ "CPP": {
1766
+ "Average Score": 45.35392139264795,
1767
+ "Standard Deviation": null,
1768
+ "Rank": 22
1769
+ }
1770
+ }
1771
+ },
1772
+ {
1773
+ "config": {
1774
+ "model_name": "gemma-2-2b-it",
1775
+ "organization": "Google",
1776
+ "license": "Gemma License",
1777
+ "knowledge_cutoff": "2024/07"
1778
+ },
1779
+ "results": {
1780
+ "OVERALL": {
1781
+ "Average Score": 57.45780847204313,
1782
+ "Standard Deviation": 16.310023687014333,
1783
+ "Rank": 22
1784
+ },
1785
+ "Geometry": {
1786
+ "Average Score": 29.820233374501843,
1787
+ "Standard Deviation": null,
1788
+ "Rank": 36
1789
+ },
1790
+ "Algebra": {
1791
+ "Average Score": 39.873024674507214,
1792
+ "Standard Deviation": null,
1793
+ "Rank": 41
1794
+ },
1795
+ "Probability": {
1796
+ "Average Score": 31.85692359301203,
1797
+ "Standard Deviation": null,
1798
+ "Rank": 40
1799
+ },
1800
+ "Logical": {
1801
+ "Average Score": 43.93437465788311,
1802
+ "Standard Deviation": null,
1803
+ "Rank": 30
1804
+ },
1805
+ "Social": {
1806
+ "Average Score": 44.689420554662476,
1807
+ "Standard Deviation": null,
1808
+ "Rank": 27
1809
+ },
1810
+ "Chemistry": {
1811
+ "Average Score": 32.05704364512495,
1812
+ "Standard Deviation": null,
1813
+ "Rank": 40
1814
+ },
1815
+ "CPP": {
1816
+ "Average Score": 30.53406933106768,
1817
+ "Standard Deviation": null,
1818
+ "Rank": 36
1819
+ }
1820
+ }
1821
+ },
1822
+ {
1823
+ "config": {
1824
+ "model_name": "starling-lm-7b-alpha",
1825
+ "organization": "Nexusflow",
1826
+ "license": "Apache-2.0",
1827
+ "knowledge_cutoff": "2023/11"
1828
+ },
1829
+ "results": {
1830
+ "OVERALL": {
1831
+ "Average Score": 40.625443347641045,
1832
+ "Standard Deviation": 3.0544259540377268,
1833
+ "Rank": 34
1834
+ },
1835
+ "Geometry": {
1836
+ "Average Score": 26.171147508308422,
1837
+ "Standard Deviation": null,
1838
+ "Rank": 39
1839
+ },
1840
+ "Algebra": {
1841
+ "Average Score": 39.149463007523856,
1842
+ "Standard Deviation": null,
1843
+ "Rank": 42
1844
+ },
1845
+ "Probability": {
1846
+ "Average Score": 32.36862021879827,
1847
+ "Standard Deviation": null,
1848
+ "Rank": 39
1849
+ },
1850
+ "Logical": {
1851
+ "Average Score": 34.17344938419256,
1852
+ "Standard Deviation": null,
1853
+ "Rank": 39
1854
+ },
1855
+ "Social": {
1856
+ "Average Score": 35.06966333212518,
1857
+ "Standard Deviation": null,
1858
+ "Rank": 35
1859
+ },
1860
+ "Chemistry": {
1861
+ "Average Score": 32.15932739848045,
1862
+ "Standard Deviation": null,
1863
+ "Rank": 39
1864
+ },
1865
+ "CPP": {
1866
+ "Average Score": 30.07926487356878,
1867
+ "Standard Deviation": null,
1868
+ "Rank": 37
1869
+ }
1870
+ }
1871
+ },
1872
+ {
1873
+ "config": {
1874
+ "model_name": "qwen1.5-4b-chat",
1875
+ "organization": "Alibaba",
1876
+ "license": "Qianwen LICENSE",
1877
+ "knowledge_cutoff": "2024/02"
1878
+ },
1879
+ "results": {
1880
+ "OVERALL": {
1881
+ "Average Score": 11.723779019126527,
1882
+ "Standard Deviation": 0.856230353584155,
1883
+ "Rank": 53
1884
+ },
1885
+ "Geometry": {
1886
+ "Average Score": 16.072772563608115,
1887
+ "Standard Deviation": null,
1888
+ "Rank": 45
1889
+ },
1890
+ "Algebra": {
1891
+ "Average Score": 32.22626131587612,
1892
+ "Standard Deviation": null,
1893
+ "Rank": 44
1894
+ },
1895
+ "Probability": {
1896
+ "Average Score": 13.98282712349133,
1897
+ "Standard Deviation": null,
1898
+ "Rank": 48
1899
+ },
1900
+ "Logical": {
1901
+ "Average Score": 13.993097991375581,
1902
+ "Standard Deviation": null,
1903
+ "Rank": 51
1904
+ },
1905
+ "Social": {
1906
+ "Average Score": 22.955898106386442,
1907
+ "Standard Deviation": null,
1908
+ "Rank": 45
1909
+ },
1910
+ "Chemistry": {
1911
+ "Average Score": 13.907481529463642,
1912
+ "Standard Deviation": null,
1913
+ "Rank": 51
1914
+ },
1915
+ "CPP": {
1916
+ "Average Score": 13.21208067122554,
1917
+ "Standard Deviation": null,
1918
+ "Rank": 47
1919
+ }
1920
+ }
1921
+ },
1922
+ {
1923
+ "config": {
1924
+ "model_name": "command-r-(04-2024)",
1925
+ "organization": "Cohere",
1926
+ "license": "CC-BY-NC-4.0",
1927
+ "knowledge_cutoff": "2024/04"
1928
+ },
1929
+ "results": {
1930
+ "OVERALL": {
1931
+ "Average Score": 43.08187135994592,
1932
+ "Standard Deviation": 0.7654553730614279,
1933
+ "Rank": 33
1934
+ },
1935
+ "Geometry": {
1936
+ "Average Score": 24.037084801508428,
1937
+ "Standard Deviation": null,
1938
+ "Rank": 41
1939
+ },
1940
+ "Algebra": {
1941
+ "Average Score": 32.37474440275246,
1942
+ "Standard Deviation": null,
1943
+ "Rank": 43
1944
+ },
1945
+ "Probability": {
1946
+ "Average Score": 31.014039425232298,
1947
+ "Standard Deviation": null,
1948
+ "Rank": 41
1949
+ },
1950
+ "Logical": {
1951
+ "Average Score": 35.49507014348235,
1952
+ "Standard Deviation": null,
1953
+ "Rank": 37
1954
+ },
1955
+ "Social": {
1956
+ "Average Score": 34.782695172510856,
1957
+ "Standard Deviation": null,
1958
+ "Rank": 37
1959
+ },
1960
+ "Chemistry": {
1961
+ "Average Score": 42.46395478814961,
1962
+ "Standard Deviation": null,
1963
+ "Rank": 32
1964
+ },
1965
+ "CPP": {
1966
+ "Average Score": 41.346336503003236,
1967
+ "Standard Deviation": null,
1968
+ "Rank": 28
1969
+ }
1970
+ }
1971
+ },
1972
+ {
1973
+ "config": {
1974
+ "model_name": "vicuna-33b",
1975
+ "organization": "LMSYS",
1976
+ "license": "Non-commercial",
1977
+ "knowledge_cutoff": "2023/08"
1978
+ },
1979
+ "results": {
1980
+ "OVERALL": {
1981
+ "Average Score": 30.8582386682731,
1982
+ "Standard Deviation": 2.3851186735858945,
1983
+ "Rank": 42
1984
+ },
1985
+ "Geometry": {
1986
+ "Average Score": 17.058968577112452,
1987
+ "Standard Deviation": null,
1988
+ "Rank": 44
1989
+ },
1990
+ "Algebra": {
1991
+ "Average Score": 25.22004544023738,
1992
+ "Standard Deviation": null,
1993
+ "Rank": 45
1994
+ },
1995
+ "Probability": {
1996
+ "Average Score": 21.097169680647767,
1997
+ "Standard Deviation": null,
1998
+ "Rank": 46
1999
+ },
2000
+ "Logical": {
2001
+ "Average Score": 23.212667585279515,
2002
+ "Standard Deviation": null,
2003
+ "Rank": 45
2004
+ },
2005
+ "Social": {
2006
+ "Average Score": 32.357116321848025,
2007
+ "Standard Deviation": null,
2008
+ "Rank": 38
2009
+ },
2010
+ "Chemistry": {
2011
+ "Average Score": 29.376389899632898,
2012
+ "Standard Deviation": null,
2013
+ "Rank": 42
2014
+ },
2015
+ "CPP": {
2016
+ "Average Score": 28.01838653090379,
2017
+ "Standard Deviation": null,
2018
+ "Rank": 38
2019
+ }
2020
+ }
2021
+ },
2022
+ {
2023
+ "config": {
2024
+ "model_name": "gemma-7b-it",
2025
+ "organization": "Google",
2026
+ "license": "Gemma License",
2027
+ "knowledge_cutoff": "2024/02"
2028
+ },
2029
+ "results": {
2030
+ "OVERALL": {
2031
+ "Average Score": 27.609692676933715,
2032
+ "Standard Deviation": 5.8350892031427435,
2033
+ "Rank": 45
2034
+ },
2035
+ "Geometry": {
2036
+ "Average Score": 20.127802528542947,
2037
+ "Standard Deviation": null,
2038
+ "Rank": 42
2039
+ },
2040
+ "Algebra": {
2041
+ "Average Score": 23.46400816161807,
2042
+ "Standard Deviation": null,
2043
+ "Rank": 47
2044
+ },
2045
+ "Probability": {
2046
+ "Average Score": 17.139514453170445,
2047
+ "Standard Deviation": null,
2048
+ "Rank": 47
2049
+ },
2050
+ "Logical": {
2051
+ "Average Score": 24.625290351028372,
2052
+ "Standard Deviation": null,
2053
+ "Rank": 42
2054
+ },
2055
+ "Social": {
2056
+ "Average Score": 26.715025606557614,
2057
+ "Standard Deviation": null,
2058
+ "Rank": 42
2059
+ },
2060
+ "Chemistry": {
2061
+ "Average Score": 29.383105099269972,
2062
+ "Standard Deviation": null,
2063
+ "Rank": 41
2064
+ },
2065
+ "CPP": {
2066
+ "Average Score": 28.014658234926813,
2067
+ "Standard Deviation": null,
2068
+ "Rank": 39
2069
+ }
2070
+ }
2071
+ },
2072
+ {
2073
+ "config": {
2074
+ "model_name": "mistral-7b-instruct-2",
2075
+ "organization": "Mistral",
2076
+ "license": "Apache 2.0",
2077
+ "knowledge_cutoff": "2023/12"
2078
+ },
2079
+ "results": {
2080
+ "OVERALL": {
2081
+ "Average Score": 32.583755237895794,
2082
+ "Standard Deviation": 1.6860156811686553,
2083
+ "Rank": 40
2084
+ },
2085
+ "Geometry": {
2086
+ "Average Score": 17.27716649229315,
2087
+ "Standard Deviation": null,
2088
+ "Rank": 43
2089
+ },
2090
+ "Algebra": {
2091
+ "Average Score": 23.58916877939791,
2092
+ "Standard Deviation": null,
2093
+ "Rank": 46
2094
+ },
2095
+ "Probability": {
2096
+ "Average Score": 25.1012270940144,
2097
+ "Standard Deviation": null,
2098
+ "Rank": 44
2099
+ },
2100
+ "Logical": {
2101
+ "Average Score": 29.07002036532878,
2102
+ "Standard Deviation": null,
2103
+ "Rank": 40
2104
+ },
2105
+ "Social": {
2106
+ "Average Score": 24.39006275978174,
2107
+ "Standard Deviation": null,
2108
+ "Rank": 43
2109
+ },
2110
+ "Chemistry": {
2111
+ "Average Score": 32.76096708662236,
2112
+ "Standard Deviation": null,
2113
+ "Rank": 38
2114
+ },
2115
+ "CPP": {
2116
+ "Average Score": 31.382959631870822,
2117
+ "Standard Deviation": null,
2118
+ "Rank": 35
2119
+ }
2120
+ }
2121
+ },
2122
+ {
2123
+ "config": {
2124
+ "model_name": "mistral-7b-instruct-1",
2125
+ "organization": "Mistral",
2126
+ "license": "Apache 2.0",
2127
+ "knowledge_cutoff": "2023/12"
2128
+ },
2129
+ "results": {
2130
+ "OVERALL": {
2131
+ "Average Score": 22.167930858422395,
2132
+ "Standard Deviation": 3.328543828571604,
2133
+ "Rank": 50
2134
+ },
2135
+ "Geometry": {
2136
+ "Average Score": 11.300762460776488,
2137
+ "Standard Deviation": null,
2138
+ "Rank": 49
2139
+ },
2140
+ "Algebra": {
2141
+ "Average Score": 21.016466430115493,
2142
+ "Standard Deviation": null,
2143
+ "Rank": 48
2144
+ },
2145
+ "Probability": {
2146
+ "Average Score": 24.506863192031716,
2147
+ "Standard Deviation": null,
2148
+ "Rank": 45
2149
+ },
2150
+ "Logical": {
2151
+ "Average Score": 17.0066100312336,
2152
+ "Standard Deviation": null,
2153
+ "Rank": 49
2154
+ },
2155
+ "Social": {
2156
+ "Average Score": 14.049392081101905,
2157
+ "Standard Deviation": null,
2158
+ "Rank": 52
2159
+ },
2160
+ "Chemistry": {
2161
+ "Average Score": 20.796521445473058,
2162
+ "Standard Deviation": null,
2163
+ "Rank": 45
2164
+ },
2165
+ "CPP": {
2166
+ "Average Score": 18.929093202755805,
2167
+ "Standard Deviation": null,
2168
+ "Rank": 42
2169
+ }
2170
+ }
2171
+ },
2172
+ {
2173
+ "config": {
2174
+ "model_name": "vicuna-13b",
2175
+ "organization": "LMSYS",
2176
+ "license": "Non-commercial",
2177
+ "knowledge_cutoff": "2023/07"
2178
+ },
2179
+ "results": {
2180
+ "OVERALL": {
2181
+ "Average Score": 20.105123059326157,
2182
+ "Standard Deviation": 4.100609090750239,
2183
+ "Rank": 51
2184
+ },
2185
+ "Geometry": {
2186
+ "Average Score": 13.080654946737525,
2187
+ "Standard Deviation": null,
2188
+ "Rank": 48
2189
+ },
2190
+ "Algebra": {
2191
+ "Average Score": 20.125194674408167,
2192
+ "Standard Deviation": null,
2193
+ "Rank": 49
2194
+ },
2195
+ "Probability": {
2196
+ "Average Score": 13.125942598704368,
2197
+ "Standard Deviation": null,
2198
+ "Rank": 49
2199
+ },
2200
+ "Logical": {
2201
+ "Average Score": 17.182300978389822,
2202
+ "Standard Deviation": null,
2203
+ "Rank": 48
2204
+ },
2205
+ "Social": {
2206
+ "Average Score": 16.258399348520832,
2207
+ "Standard Deviation": null,
2208
+ "Rank": 50
2209
+ },
2210
+ "Chemistry": {
2211
+ "Average Score": 23.79065696739089,
2212
+ "Standard Deviation": null,
2213
+ "Rank": 44
2214
+ },
2215
+ "CPP": {
2216
+ "Average Score": 21.840013221590294,
2217
+ "Standard Deviation": null,
2218
+ "Rank": 40
2219
+ }
2220
+ }
2221
+ },
2222
+ {
2223
+ "config": {
2224
+ "model_name": "zephyr-7b-beta",
2225
+ "organization": "HuggingFace",
2226
+ "license": "MIT",
2227
+ "knowledge_cutoff": "2023/10"
2228
+ },
2229
+ "results": {
2230
+ "OVERALL": {
2231
+ "Average Score": 11.581258432641418,
2232
+ "Standard Deviation": 1.677081510212375,
2233
+ "Rank": 54
2234
+ },
2235
+ "Geometry": {
2236
+ "Average Score": 8.432624521698594,
2237
+ "Standard Deviation": null,
2238
+ "Rank": 50
2239
+ },
2240
+ "Algebra": {
2241
+ "Average Score": 12.912859660357217,
2242
+ "Standard Deviation": null,
2243
+ "Rank": 51
2244
+ },
2245
+ "Probability": {
2246
+ "Average Score": 7.643552619113196,
2247
+ "Standard Deviation": null,
2248
+ "Rank": 54
2249
+ },
2250
+ "Logical": {
2251
+ "Average Score": 7.444095116649809,
2252
+ "Standard Deviation": null,
2253
+ "Rank": 55
2254
+ },
2255
+ "Social": {
2256
+ "Average Score": 0.0,
2257
+ "Standard Deviation": null,
2258
+ "Rank": 57
2259
+ },
2260
+ "Chemistry": {
2261
+ "Average Score": 16.150157007299235,
2262
+ "Standard Deviation": null,
2263
+ "Rank": 49
2264
+ },
2265
+ "CPP": {
2266
+ "Average Score": 18.92902220864132,
2267
+ "Standard Deviation": null,
2268
+ "Rank": 43
2269
+ }
2270
+ }
2271
+ },
2272
+ {
2273
+ "config": {
2274
+ "model_name": "gemma-1.1-2b-it",
2275
+ "organization": "Google",
2276
+ "license": "Gemma License",
2277
+ "knowledge_cutoff": "2024/02"
2278
+ },
2279
+ "results": {
2280
+ "OVERALL": {
2281
+ "Average Score": 25.06653151900311,
2282
+ "Standard Deviation": 5.340973431345662,
2283
+ "Rank": 48
2284
+ },
2285
+ "Geometry": {
2286
+ "Average Score": 13.161686218568628,
2287
+ "Standard Deviation": null,
2288
+ "Rank": 47
2289
+ },
2290
+ "Algebra": {
2291
+ "Average Score": 15.592205919293873,
2292
+ "Standard Deviation": null,
2293
+ "Rank": 50
2294
+ },
2295
+ "Probability": {
2296
+ "Average Score": 8.305764696120711,
2297
+ "Standard Deviation": null,
2298
+ "Rank": 51
2299
+ },
2300
+ "Logical": {
2301
+ "Average Score": 10.940766703849592,
2302
+ "Standard Deviation": null,
2303
+ "Rank": 53
2304
+ },
2305
+ "Social": {
2306
+ "Average Score": 21.925546766366356,
2307
+ "Standard Deviation": null,
2308
+ "Rank": 46
2309
+ },
2310
+ "Chemistry": {
2311
+ "Average Score": 18.700936936742952,
2312
+ "Standard Deviation": null,
2313
+ "Rank": 46
2314
+ },
2315
+ "CPP": {
2316
+ "Average Score": 20.724691953843916,
2317
+ "Standard Deviation": null,
2318
+ "Rank": 41
2319
+ }
2320
+ }
2321
+ },
2322
+ {
2323
+ "config": {
2324
+ "model_name": "llama2-7b-chat",
2325
+ "organization": "Meta",
2326
+ "license": "Llama 2 Community",
2327
+ "knowledge_cutoff": "2023/07"
2328
+ },
2329
+ "results": {
2330
+ "OVERALL": {
2331
+ "Average Score": 25.633612357313762,
2332
+ "Standard Deviation": 2.805639153654191,
2333
+ "Rank": 46
2334
+ },
2335
+ "Geometry": {
2336
+ "Average Score": 5.825877827672446,
2337
+ "Standard Deviation": null,
2338
+ "Rank": 51
2339
+ },
2340
+ "Algebra": {
2341
+ "Average Score": 8.58657284915635,
2342
+ "Standard Deviation": null,
2343
+ "Rank": 53
2344
+ },
2345
+ "Probability": {
2346
+ "Average Score": 8.164826137672431,
2347
+ "Standard Deviation": null,
2348
+ "Rank": 53
2349
+ },
2350
+ "Logical": {
2351
+ "Average Score": 20.697630462723275,
2352
+ "Standard Deviation": null,
2353
+ "Rank": 47
2354
+ },
2355
+ "Social": {
2356
+ "Average Score": 18.13821609304045,
2357
+ "Standard Deviation": null,
2358
+ "Rank": 47
2359
+ },
2360
+ "Chemistry": {
2361
+ "Average Score": 17.065363968846427,
2362
+ "Standard Deviation": null,
2363
+ "Rank": 47
2364
+ },
2365
+ "CPP": {
2366
+ "Average Score": 15.730513733660898,
2367
+ "Standard Deviation": null,
2368
+ "Rank": 45
2369
+ }
2370
+ }
2371
+ },
2372
+ {
2373
+ "config": {
2374
+ "model_name": "gemma-2b-it",
2375
+ "organization": "Google",
2376
+ "license": "Gemma License",
2377
+ "knowledge_cutoff": "2024/02"
2378
+ },
2379
+ "results": {
2380
+ "OVERALL": {
2381
+ "Average Score": 22.935122315202772,
2382
+ "Standard Deviation": 1.9451357494738446,
2383
+ "Rank": 49
2384
+ },
2385
+ "Geometry": {
2386
+ "Average Score": 15.523844579555126,
2387
+ "Standard Deviation": null,
2388
+ "Rank": 46
2389
+ },
2390
+ "Algebra": {
2391
+ "Average Score": 8.997563653883809,
2392
+ "Standard Deviation": null,
2393
+ "Rank": 52
2394
+ },
2395
+ "Probability": {
2396
+ "Average Score": 6.750305898269558,
2397
+ "Standard Deviation": null,
2398
+ "Rank": 55
2399
+ },
2400
+ "Logical": {
2401
+ "Average Score": 5.354222904092569,
2402
+ "Standard Deviation": null,
2403
+ "Rank": 56
2404
+ },
2405
+ "Social": {
2406
+ "Average Score": 10.938132042877358,
2407
+ "Standard Deviation": null,
2408
+ "Rank": 54
2409
+ },
2410
+ "Chemistry": {
2411
+ "Average Score": 17.06532733699507,
2412
+ "Standard Deviation": null,
2413
+ "Rank": 47
2414
+ },
2415
+ "CPP": {
2416
+ "Average Score": 17.2715657115764,
2417
+ "Standard Deviation": null,
2418
+ "Rank": 44
2419
+ }
2420
+ }
2421
+ },
2422
+ {
2423
+ "config": {
2424
+ "model_name": "llama2-13b-chat",
2425
+ "organization": "Meta",
2426
+ "license": "Llama 2 Community",
2427
+ "knowledge_cutoff": "2023/07"
2428
+ },
2429
+ "results": {
2430
+ "OVERALL": {
2431
+ "Average Score": 25.828530292775856,
2432
+ "Standard Deviation": 3.2503558704879296,
2433
+ "Rank": 47
2434
+ },
2435
+ "Geometry": {
2436
+ "Average Score": 4.119943280135397,
2437
+ "Standard Deviation": null,
2438
+ "Rank": 53
2439
+ },
2440
+ "Algebra": {
2441
+ "Average Score": 6.355347828676415,
2442
+ "Standard Deviation": null,
2443
+ "Rank": 54
2444
+ },
2445
+ "Probability": {
2446
+ "Average Score": 11.5585998384148,
2447
+ "Standard Deviation": null,
2448
+ "Rank": 50
2449
+ },
2450
+ "Logical": {
2451
+ "Average Score": 24.172674067890938,
2452
+ "Standard Deviation": null,
2453
+ "Rank": 43
2454
+ },
2455
+ "Social": {
2456
+ "Average Score": 17.850287642446094,
2457
+ "Standard Deviation": null,
2458
+ "Rank": 49
2459
+ },
2460
+ "Chemistry": {
2461
+ "Average Score": 13.887442704655687,
2462
+ "Standard Deviation": null,
2463
+ "Rank": 52
2464
+ },
2465
+ "CPP": {
2466
+ "Average Score": 13.17258252933903,
2467
+ "Standard Deviation": null,
2468
+ "Rank": 48
2469
+ }
2470
+ }
2471
+ },
2472
+ {
2473
+ "config": {
2474
+ "model_name": "vicuna-7b",
2475
+ "organization": "LMSYS",
2476
+ "license": "Non-commercial",
2477
+ "knowledge_cutoff": "2023/07"
2478
+ },
2479
+ "results": {
2480
+ "OVERALL": {
2481
+ "Average Score": 19.78471384913738,
2482
+ "Standard Deviation": 3.7936645273402276,
2483
+ "Rank": 52
2484
+ },
2485
+ "Geometry": {
2486
+ "Average Score": 5.434763675792798,
2487
+ "Standard Deviation": null,
2488
+ "Rank": 52
2489
+ },
2490
+ "Algebra": {
2491
+ "Average Score": 5.925959137419872,
2492
+ "Standard Deviation": null,
2493
+ "Rank": 55
2494
+ },
2495
+ "Probability": {
2496
+ "Average Score": 8.30566475354697,
2497
+ "Standard Deviation": null,
2498
+ "Rank": 51
2499
+ },
2500
+ "Logical": {
2501
+ "Average Score": 11.881223740003346,
2502
+ "Standard Deviation": null,
2503
+ "Rank": 52
2504
+ },
2505
+ "Social": {
2506
+ "Average Score": 12.864677350128595,
2507
+ "Standard Deviation": null,
2508
+ "Rank": 53
2509
+ },
2510
+ "Chemistry": {
2511
+ "Average Score": 14.187574975522333,
2512
+ "Standard Deviation": null,
2513
+ "Rank": 50
2514
+ },
2515
+ "CPP": {
2516
+ "Average Score": 14.255194156624162,
2517
+ "Standard Deviation": null,
2518
+ "Rank": 46
2519
+ }
2520
+ }
2521
+ },
2522
+ {
2523
+ "config": {
2524
+ "model_name": "koala-13b",
2525
+ "organization": "UC Berkeley",
2526
+ "license": "Non-commercial",
2527
+ "knowledge_cutoff": "2023/04"
2528
+ },
2529
+ "results": {
2530
+ "OVERALL": {
2531
+ "Average Score": 10.216910767982592,
2532
+ "Standard Deviation": 2.0597606260293655,
2533
+ "Rank": 55
2534
+ },
2535
+ "Geometry": {
2536
+ "Average Score": 0.1600118163292883,
2537
+ "Standard Deviation": null,
2538
+ "Rank": 54
2539
+ },
2540
+ "Algebra": {
2541
+ "Average Score": 2.2219841274068948,
2542
+ "Standard Deviation": null,
2543
+ "Rank": 56
2544
+ },
2545
+ "Probability": {
2546
+ "Average Score": 3.353938470588142,
2547
+ "Standard Deviation": null,
2548
+ "Rank": 56
2549
+ },
2550
+ "Logical": {
2551
+ "Average Score": 8.24436273551765,
2552
+ "Standard Deviation": null,
2553
+ "Rank": 54
2554
+ },
2555
+ "Social": {
2556
+ "Average Score": 10.96000067573448,
2557
+ "Standard Deviation": null,
2558
+ "Rank": 54
2559
+ },
2560
+ "Chemistry": {
2561
+ "Average Score": 6.272570799004611,
2562
+ "Standard Deviation": null,
2563
+ "Rank": 53
2564
+ },
2565
+ "CPP": {
2566
+ "Average Score": 6.36433272373514,
2567
+ "Standard Deviation": null,
2568
+ "Rank": 49
2569
+ }
2570
+ }
2571
+ },
2572
+ {
2573
+ "config": {
2574
+ "model_name": "openassistant-pythia-12b",
2575
+ "organization": "OpenAssistant",
2576
+ "license": "Non-commercial",
2577
+ "knowledge_cutoff": "2023/04"
2578
+ },
2579
+ "results": {
2580
+ "OVERALL": {
2581
+ "Average Score": 0.0,
2582
+ "Standard Deviation": 0.0,
2583
+ "Rank": 56
2584
+ },
2585
+ "Geometry": {
2586
+ "Average Score": 0.0,
2587
+ "Standard Deviation": null,
2588
+ "Rank": 55
2589
+ },
2590
+ "Algebra": {
2591
+ "Average Score": 0.0,
2592
+ "Standard Deviation": null,
2593
+ "Rank": 57
2594
+ },
2595
+ "Probability": {
2596
+ "Average Score": 0.0,
2597
+ "Standard Deviation": null,
2598
+ "Rank": 57
2599
+ },
2600
+ "Logical": {
2601
+ "Average Score": 0.0,
2602
+ "Standard Deviation": null,
2603
+ "Rank": 57
2604
+ },
2605
+ "Social": {
2606
+ "Average Score": 1.859688217710296,
2607
+ "Standard Deviation": null,
2608
+ "Rank": 56
2609
+ },
2610
+ "Chemistry": {
2611
+ "Average Score": 0.0,
2612
+ "Standard Deviation": null,
2613
+ "Rank": 54
2614
+ },
2615
+ "CPP": {
2616
+ "Average Score": 0.0,
2617
+ "Standard Deviation": null,
2618
+ "Rank": 50
2619
+ }
2620
+ }
2621
+ },
2622
+ {
2623
+ "config": {
2624
+ "model_name": "nemotron-70b",
2625
+ "organization": "NVIDIA",
2626
+ "license": "Unknown",
2627
+ "knowledge_cutoff": "Unknown"
2628
+ },
2629
+ "results": {
2630
+ "OVERALL": {
2631
+ "Average Score": 100.0,
2632
+ "Standard Deviation": 0.0,
2633
+ "Rank": 1
2634
+ },
2635
+ "Geometry": {
2636
+ "Average Score": 68.72757963233221,
2637
+ "Standard Deviation": null,
2638
+ "Rank": 12
2639
+ },
2640
+ "Algebra": {
2641
+ "Average Score": 73.71625129267943,
2642
+ "Standard Deviation": null,
2643
+ "Rank": 16
2644
+ },
2645
+ "Chemistry": {
2646
+ "Average Score": 72.48678626772566,
2647
+ "Standard Deviation": null,
2648
+ "Rank": 14
2649
+ },
2650
+ "Logical": {
2651
+ "Average Score": 92.57864400540329,
2652
+ "Standard Deviation": null,
2653
+ "Rank": 5
2654
+ },
2655
+ "Social": {
2656
+ "Average Score": 99.63342284899149,
2657
+ "Standard Deviation": null,
2658
+ "Rank": 2
2659
+ },
2660
+ "Probability": {
2661
+ "Average Score": 75.30735899300154,
2662
+ "Standard Deviation": null,
2663
+ "Rank": 10
2664
+ }
2665
+ }
2666
+ },
2667
+ {
2668
+ "config": {
2669
+ "model_name": "llama-3.2-3b-it",
2670
+ "organization": "Meta",
2671
+ "license": "Llama 3 Community",
2672
+ "knowledge_cutoff": "Unknown"
2673
+ },
2674
+ "results": {
2675
+ "OVERALL": {
2676
+ "Average Score": 29.47099904114387,
2677
+ "Standard Deviation": 1.6836027650802912,
2678
+ "Rank": 43
2679
+ },
2680
+ "Geometry": {
2681
+ "Average Score": 0.0,
2682
+ "Standard Deviation": 0.0,
2683
+ "Rank": 50
2684
+ },
2685
+ "Algebra": {
2686
+ "Average Score": 55.31592410564261,
2687
+ "Standard Deviation": null,
2688
+ "Rank": 29
2689
+ },
2690
+ "Chemistry": {
2691
+ "Average Score": 28.667640602193643,
2692
+ "Standard Deviation": null,
2693
+ "Rank": 43
2694
+ },
2695
+ "Logical": {
2696
+ "Average Score": 15.35430947415723,
2697
+ "Standard Deviation": null,
2698
+ "Rank": 49
2699
+ },
2700
+ "Social": {
2701
+ "Average Score": 18.087938295545133,
2702
+ "Standard Deviation": null,
2703
+ "Rank": 48
2704
+ },
2705
+ "Probability": {
2706
+ "Average Score": 37.84631410688676,
2707
+ "Standard Deviation": null,
2708
+ "Rank": 35
2709
+ }
2710
+ }
2711
+ },
2712
+ {
2713
+ "config": {
2714
+ "model_name": "yi-lightning",
2715
+ "organization": "01 AI",
2716
+ "license": "Proprietary",
2717
+ "knowledge_cutoff": "Unknown"
2718
+ },
2719
+ "results": {
2720
+ "OVERALL": {
2721
+ "Average Score": 96.10303362688546,
2722
+ "Standard Deviation": 0.5365246195716372,
2723
+ "Rank": 3
2724
+ },
2725
+ "Geometry": {
2726
+ "Average Score": 77.09570683128703,
2727
+ "Standard Deviation": null,
2728
+ "Rank": 8
2729
+ },
2730
+ "Algebra": {
2731
+ "Average Score": 85.92132293392635,
2732
+ "Standard Deviation": null,
2733
+ "Rank": 6
2734
+ },
2735
+ "Chemistry": {
2736
+ "Average Score": 95.7205664118507,
2737
+ "Standard Deviation": null,
2738
+ "Rank": 2
2739
+ },
2740
+ "Logical": {
2741
+ "Average Score": 94.60171867702756,
2742
+ "Standard Deviation": null,
2743
+ "Rank": 4
2744
+ },
2745
+ "Social": {
2746
+ "Average Score": 93.93680225135506,
2747
+ "Standard Deviation": null,
2748
+ "Rank": 6
2749
+ },
2750
+ "Probability": {
2751
+ "Average Score": 90.23858748317501,
2752
+ "Standard Deviation": null,
2753
+ "Rank": 3
2754
+ }
2755
+ }
2756
+ },
2757
+ {
2758
+ "config": {
2759
+ "model_name": "glm-4-plus",
2760
+ "organization": "Zhipu AI",
2761
+ "license": "Proprietary",
2762
+ "knowledge_cutoff": "Unknown"
2763
+ },
2764
+ "results": {
2765
+ "OVERALL": {
2766
+ "Average Score": 90.50303579501356,
2767
+ "Standard Deviation": 5.202472970969946,
2768
+ "Rank": 6
2769
+ },
2770
+ "Geometry": {
2771
+ "Average Score": 76.37543021571776,
2772
+ "Standard Deviation": null,
2773
+ "Rank": 9
2774
+ },
2775
+ "Algebra": {
2776
+ "Average Score": 81.39859078752944,
2777
+ "Standard Deviation": null,
2778
+ "Rank": 10
2779
+ },
2780
+ "Chemistry": {
2781
+ "Average Score": 90.15506569759444,
2782
+ "Standard Deviation": null,
2783
+ "Rank": 6
2784
+ },
2785
+ "Logical": {
2786
+ "Average Score": 92.26403821208403,
2787
+ "Standard Deviation": null,
2788
+ "Rank": 6
2789
+ },
2790
+ "Social": {
2791
+ "Average Score": 100.0,
2792
+ "Standard Deviation": null,
2793
+ "Rank": 1
2794
+ },
2795
+ "Probability": {
2796
+ "Average Score": 73.99418447190348,
2797
+ "Standard Deviation": null,
2798
+ "Rank": 11
2799
+ }
2800
+ }
2801
+ }
2802
+ ]