yzabc007 committed
Commit: 79d1bee
Parent: 6c01f70

Update space

Files changed (2):
  1. app.py +124 -59
  2. src/populate.py +24 -8
app.py CHANGED
@@ -164,44 +164,45 @@ with demo:
        """
        gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("Sort by Rank", elem_id="overall_sort_by_rank_subtab", id=0, elem_classes="subtab"):
+        with gr.TabItem("Sort by Rank", elem_id="overall_sort_by_rank_subtab", id=0, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        # AutoEvalColumn.rank_overall.name,
                        AutoEvalColumn.model.name,
-                        AutoEvalColumn.rank_overall.name,
+
                        AutoEvalColumn.rank_math_algebra.name,
                        AutoEvalColumn.rank_math_geometry.name,
                        AutoEvalColumn.rank_math_probability.name,
                        AutoEvalColumn.rank_reason_logical.name,
                        AutoEvalColumn.rank_reason_social.name,
                        AutoEvalColumn.rank_chemistry.name,
+                        AutoEvalColumn.rank_overall.name,
                        # AutoEvalColumn.rank_cpp.name,
                    ],
-                    rank_col=[],
+                    rank_col=['sort_by_rank', 1, 8],
                )
            )

-        with gr.TabItem("Sort by Score", elem_id="overall_sort_by_score_subtab", id=1, elem_classes="subtab"):
+        with gr.TabItem("Sort by Score", elem_id="overall_sort_by_score_subtab", id=1, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
                    model_result_path,
                    benchmark_cols=[
                        # AutoEvalColumn.rank_overall.name,
                        AutoEvalColumn.model.name,
-                        AutoEvalColumn.license.name,
-                        AutoEvalColumn.organization.name,
-                        AutoEvalColumn.knowledge_cutoff.name,
+                        # AutoEvalColumn.license.name,
+                        # AutoEvalColumn.organization.name,
+                        # AutoEvalColumn.knowledge_cutoff.name,

-                        AutoEvalColumn.score_overall.name,
                        AutoEvalColumn.score_math_algebra.name,
                        AutoEvalColumn.score_math_geometry.name,
                        AutoEvalColumn.score_math_probability.name,
                        AutoEvalColumn.score_reason_logical.name,
                        AutoEvalColumn.score_reason_social.name,
                        AutoEvalColumn.score_chemistry.name,
+                        AutoEvalColumn.score_overall.name,
                        # AutoEvalColumn.score_cpp.name,

                        # AutoEvalColumn.rank_overall.name,
@@ -213,7 +214,7 @@ with demo:
                        # AutoEvalColumn.rank_chemistry.name,
                        # AutoEvalColumn.rank_cpp.name,
                    ],
-                    rank_col=['sort_by_score'],
+                    rank_col=['sort_by_score', 1, 8],
                )
            )

@@ -236,26 +237,50 @@ with demo:
        gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

        # leaderboard = init_leaderboard(LEADERBOARD_DF)
-        with gr.TabItem("Overall", elem_id="math_overall_subtab", id=0, elem_classes="subtab"):
-            leaderboard = overall_leaderboard(
-                get_model_leaderboard_df(
-                    model_result_path,
-                    benchmark_cols=[
-                        AutoEvalColumn.model.name,
-                        AutoEvalColumn.license.name,
-                        AutoEvalColumn.organization.name,
-                        AutoEvalColumn.knowledge_cutoff.name,
-
-                        AutoEvalColumn.score_math_algebra.name,
-                        AutoEvalColumn.score_math_geometry.name,
-                        AutoEvalColumn.score_math_probability.name,
-                        # AutoEvalColumn.rank_math_algebra.name,
-                        # AutoEvalColumn.rank_math_geometry.name,
-                        # AutoEvalColumn.rank_math_probability.name,
-                    ],
-                    rank_col=['sort_by_score'],
+        with gr.TabItem("🏆 Overview", elem_id="math_overview_subtab", id=0, elem_classes="subtab"):
+
+            with gr.TabItem("⭐ Sort by Rank", elem_id="math_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"):
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.model.name,
+                            # AutoEvalColumn.license.name,
+                            # AutoEvalColumn.organization.name,
+                            # AutoEvalColumn.knowledge_cutoff.name,
+
+                            # AutoEvalColumn.score_math_algebra.name,
+                            # AutoEvalColumn.score_math_geometry.name,
+                            # AutoEvalColumn.score_math_probability.name,
+                            AutoEvalColumn.rank_math_algebra.name,
+                            AutoEvalColumn.rank_math_geometry.name,
+                            AutoEvalColumn.rank_math_probability.name,
+                        ],
+                        rank_col=['sort_by_rank', 1, 4],
+                    )
                )
-            )
+
+            with gr.TabItem("⭐ Sort by Score", elem_id="math_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.model.name,
+                            # AutoEvalColumn.license.name,
+                            # AutoEvalColumn.organization.name,
+                            # AutoEvalColumn.knowledge_cutoff.name,
+
+                            AutoEvalColumn.score_math_algebra.name,
+                            AutoEvalColumn.score_math_geometry.name,
+                            AutoEvalColumn.score_math_probability.name,
+                            # AutoEvalColumn.rank_math_algebra.name,
+                            # AutoEvalColumn.rank_math_geometry.name,
+                            # AutoEvalColumn.rank_math_probability.name,
+                        ],
+                        rank_col=['sort_by_score', 1, 4],
+                    )
+                )
+


        with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=1, elem_classes="subtab"):
@@ -349,24 +374,42 @@ with demo:
        """
        gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("Overall", elem_id="reasoning_overall_subtab", id=0, elem_classes="subtab"):
-            leaderboard = overall_leaderboard(
-                get_model_leaderboard_df(
-                    model_result_path,
-                    benchmark_cols=[
-                        AutoEvalColumn.model.name,
-                        AutoEvalColumn.license.name,
-                        AutoEvalColumn.organization.name,
-                        AutoEvalColumn.knowledge_cutoff.name,
+        with gr.TabItem("🏆 Overview", elem_id="reasoning_overview_subtab", id=0, elem_classes="subtab"):
+
+            with gr.TabItem("⭐ Sort by Rank", elem_id="reasoning_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"):
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.model.name,
+                            # AutoEvalColumn.license.name,
+                            # AutoEvalColumn.organization.name,
+                            # AutoEvalColumn.knowledge_cutoff.name,
+
+                            AutoEvalColumn.rank_reason_logical.name,
+                            AutoEvalColumn.rank_reason_social.name,
+                        ],
+                        rank_col=['sort_by_rank', 1, 3],
+                    )
+                )

-                        AutoEvalColumn.score_reason_logical.name,
-                        AutoEvalColumn.score_reason_social.name,
-                        # AutoEvalColumn.rank_reason_logical.name,
-                        # AutoEvalColumn.rank_reason_social.name,
-                    ],
-                    rank_col=['sort_by_score'],
+            with gr.TabItem("⭐ Sort by Score", elem_id="reasoning_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.model.name,
+                            # AutoEvalColumn.license.name,
+                            # AutoEvalColumn.organization.name,
+                            # AutoEvalColumn.knowledge_cutoff.name,
+
+                            AutoEvalColumn.score_reason_logical.name,
+                            AutoEvalColumn.score_reason_social.name,
+                        ],
+                        rank_col=['sort_by_score', 1, 3],
+                    )
                )
-            )
+

        with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=1, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
@@ -430,23 +473,45 @@ with demo:
        """
        gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("Overall", elem_id="science_overall_subtab", id=0, elem_classes="subtab"):
-            leaderboard = overall_leaderboard(
-                get_model_leaderboard_df(
-                    model_result_path,
-                    benchmark_cols=[
-                        AutoEvalColumn.model.name,
-                        AutoEvalColumn.license.name,
-                        AutoEvalColumn.organization.name,
-                        AutoEvalColumn.knowledge_cutoff.name,
-
-                        AutoEvalColumn.score_chemistry.name,
-                        # AutoEvalColumn.rank_chemistry.name,
-                    ],
-                    rank_col=['sort_by_score'],
+        with gr.TabItem("🏆 Overview", elem_id="science_overview_subtab", id=0, elem_classes="subtab"):
+
+            with gr.TabItem("⭐ Sort by Rank", elem_id="science_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"):
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.model.name,
+
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+
+                            AutoEvalColumn.rank_chemistry.name,
+                        ],
+                        rank_col=['sort_by_rank', 4, 5],
+                    )
                )
-            )

+            with gr.TabItem("⭐ Sort by Score", elem_id="science_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
+                leaderboard = overall_leaderboard(
+                    get_model_leaderboard_df(
+                        model_result_path,
+                        benchmark_cols=[
+                            AutoEvalColumn.model.name,
+
+                            AutoEvalColumn.license.name,
+                            AutoEvalColumn.organization.name,
+                            AutoEvalColumn.knowledge_cutoff.name,
+
+                            AutoEvalColumn.score_chemistry.name,
+                            # AutoEvalColumn.rank_chemistry.name,
+                        ],
+                        rank_col=['sort_by_score', 4, 5],  # two numbers are index to select the columns to average and sort
+                    )
+                )
+
+
+
        with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=1, elem_classes="subtab"):
            leaderboard = overall_leaderboard(
                get_model_leaderboard_df(
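Note on the updated calls above: every leaderboard subtab now passes rank_col as a three-element list, a mode string ('sort_by_rank' or 'sort_by_score') followed by a start and an end column index, which src/populate.py (below) feeds into df.iloc[:, start_idx:end_idx] before averaging. A minimal sketch of how the indices appear to line up for the overall "Sort by Rank" subtab, assuming the dataframe columns follow the order of benchmark_cols; the string labels are placeholders standing in for the AutoEvalColumn.*.name values, not the real display names:

    # Placeholder column labels; only the positions matter here.
    cols = [
        "model",                  # 0 (excluded from the average)
        "rank_math_algebra",      # 1
        "rank_math_geometry",     # 2
        "rank_math_probability",  # 3
        "rank_reason_logical",    # 4
        "rank_reason_social",     # 5
        "rank_chemistry",         # 6
        "rank_overall",           # 7
    ]
    mode, start_idx, end_idx = ['sort_by_rank', 1, 8]
    print(cols[start_idx:end_idx])  # the seven rank columns that get averaged into "Average Rank"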
src/populate.py CHANGED
@@ -15,7 +15,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
    """Creates a dataframe from all the individual experiment results"""
    raw_data = get_raw_model_results(results_path)
    all_data_json = [v.to_dict() for v in raw_data]
-    assert len(rank_col) <= 1, "Only one column can be selected for ranking"
+    # assert len(rank_col) <= 1, "Only one column can be selected for ranking"

    df = pd.DataFrame.from_records(all_data_json)

@@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis

    # if there is one col in rank_col, this is an isolated dimension to rank by
    # sort by that selected column and remove NaN values
-    if rank_col and rank_col[0] != "sort_by_score":
+    if rank_col and rank_col[0] not in ["sort_by_score", "sort_by_rank"]:
        # df = df.dropna(subset=benchmark_cols)
        df = df.dropna(subset=rank_col)
        df = df.fillna(0.00)
@@ -39,23 +39,37 @@
                df[col] = df[col].round(decimals=2)

    elif rank_col and rank_col[0] == "sort_by_score": # sorting by averaging all benchmark cols, except cols before offset_idx
-        offset_idx = 4
-        avg_scores = df.iloc[:, offset_idx:].mean(axis=1)
+        start_idx = rank_col[1]
+        end_idx = rank_col[2]
+        avg_scores = df.iloc[:, start_idx:end_idx].mean(axis=1)
        df.insert(1, "Average Score", avg_scores)

        df["Average Score"] = avg_scores.round(decimals=4)
        df = df.sort_values(by=["Average Score"], ascending=False)
        df["Average Score"] = df["Average Score"].map('{:.2f}'.format)

-        df = df.drop(columns=benchmark_cols[offset_idx:])
+        # df = df.drop(columns=benchmark_cols[offset_idx:])
        # print(benchmark_cols)
        # print(df.head())
        # insert a rank column
        rank = np.arange(1, len(df)+1)
        df.insert(0, 'Rank', rank)
-
-    else: # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
-        avg_rank = df.iloc[:, 1:].mean(axis=1)
+
+        for col in benchmark_cols:
+            if 'Std dev' in col or 'Score' in col:
+                df[col] = (df[col]).map('{:.2f}'.format)
+                df[col] = df[col].round(decimals=2)
+
+        # df = df.fillna('--')
+        df.replace("nan", '--', inplace=True)
+
+    elif rank_col and rank_col[0] == "sort_by_rank":
+        # else: # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
+        start_idx = rank_col[1]
+        end_idx = rank_col[2]
+        avg_rank = df.iloc[:, start_idx:end_idx].mean(axis=1)
+        df.insert(1, "Average Rank", avg_rank)
+
        df["Average Rank"] = avg_rank.round(decimals=4)
        df = df.sort_values(by=["Average Rank"], ascending=True)
        df["Average Rank"] = df["Average Rank"].map('{:.2f}'.format)
@@ -66,6 +80,8 @@
        rank = np.arange(1, len(df)+1)
        df.insert(0, 'Rank', rank)

+    # df.style.background_gradient(cmap='coolwarm', subset=benchmark_cols)
+
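For readers tracing the new branches: below is a minimal, self-contained sketch (toy data, not from this repository) of what the added sort_by_rank path does with the [mode, start_idx, end_idx] triple, namely average the selected rank columns, insert the result as "Average Rank", sort ascending, then prepend a 1-based "Rank" column; the sort_by_score path mirrors this with a descending sort and score formatting.

    import numpy as np
    import pandas as pd

    # Toy frame in the shape the leaderboard code expects:
    # column 0 is the model name, the remaining columns are per-task ranks.
    df = pd.DataFrame({
        "model": ["A", "B", "C"],
        "rank_algebra": [2, 1, 3],
        "rank_geometry": [3, 1, 2],
    })

    rank_col = ["sort_by_rank", 1, 3]  # mode, start index, end index (exclusive)
    start_idx, end_idx = rank_col[1], rank_col[2]

    avg_rank = df.iloc[:, start_idx:end_idx].mean(axis=1)  # row-wise mean of the rank columns
    df.insert(1, "Average Rank", avg_rank)

    df["Average Rank"] = avg_rank.round(decimals=4)
    df = df.sort_values(by=["Average Rank"], ascending=True)
    df["Average Rank"] = df["Average Rank"].map('{:.2f}'.format)

    df.insert(0, "Rank", np.arange(1, len(df) + 1))  # final leaderboard position
    print(df)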