Update space

Files changed:
- app.py (+124, -59)
- src/populate.py (+24, -8)

app.py  (changed)
@@ -164,44 +164,45 @@ with demo:
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

-            with gr.TabItem("Sort by Rank", elem_id="overall_sort_by_rank_subtab", id=0, elem_classes="subtab"):
+            with gr.TabItem("⭐ Sort by Rank", elem_id="overall_sort_by_rank_subtab", id=0, elem_classes="subtab"):
                leaderboard = overall_leaderboard(
                    get_model_leaderboard_df(
                        model_result_path,
                        benchmark_cols=[
                            # AutoEvalColumn.rank_overall.name,
                            AutoEvalColumn.model.name,
-
+
                            AutoEvalColumn.rank_math_algebra.name,
                            AutoEvalColumn.rank_math_geometry.name,
                            AutoEvalColumn.rank_math_probability.name,
                            AutoEvalColumn.rank_reason_logical.name,
                            AutoEvalColumn.rank_reason_social.name,
                            AutoEvalColumn.rank_chemistry.name,
+                            AutoEvalColumn.rank_overall.name,
                            # AutoEvalColumn.rank_cpp.name,
                        ],
-                        rank_col=[],
+                        rank_col=['sort_by_rank', 1, 8],
                    )
                )

-            with gr.TabItem("Sort by Score", elem_id="overall_sort_by_score_subtab", id=1, elem_classes="subtab"):
+            with gr.TabItem("⭐ Sort by Score", elem_id="overall_sort_by_score_subtab", id=1, elem_classes="subtab"):
                leaderboard = overall_leaderboard(
                    get_model_leaderboard_df(
                        model_result_path,
                        benchmark_cols=[
                            # AutoEvalColumn.rank_overall.name,
                            AutoEvalColumn.model.name,
-                            AutoEvalColumn.license.name,
-                            AutoEvalColumn.organization.name,
-                            AutoEvalColumn.knowledge_cutoff.name,
+                            # AutoEvalColumn.license.name,
+                            # AutoEvalColumn.organization.name,
+                            # AutoEvalColumn.knowledge_cutoff.name,

-                            AutoEvalColumn.score_overall.name,
                            AutoEvalColumn.score_math_algebra.name,
                            AutoEvalColumn.score_math_geometry.name,
                            AutoEvalColumn.score_math_probability.name,
                            AutoEvalColumn.score_reason_logical.name,
                            AutoEvalColumn.score_reason_social.name,
                            AutoEvalColumn.score_chemistry.name,
+                            AutoEvalColumn.score_overall.name,
                            # AutoEvalColumn.score_cpp.name,

                            # AutoEvalColumn.rank_overall.name,

@@ -213,7 +214,7 @@ with demo:
                            # AutoEvalColumn.rank_chemistry.name,
                            # AutoEvalColumn.rank_cpp.name,
                        ],
-                        rank_col=['sort_by_score'],
+                        rank_col=['sort_by_score', 1, 8],
                    )
                )

@@ -236,26 +237,50 @@ with demo:
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

            # leaderboard = init_leaderboard(LEADERBOARD_DF)
-            with gr.TabItem("
- …
+            with gr.TabItem("🏆 Overview", elem_id="math_overview_subtab", id=0, elem_classes="subtab"):
+
+                with gr.TabItem("⭐ Sort by Rank", elem_id="math_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"):
+                    leaderboard = overall_leaderboard(
+                        get_model_leaderboard_df(
+                            model_result_path,
+                            benchmark_cols=[
+                                AutoEvalColumn.model.name,
+                                # AutoEvalColumn.license.name,
+                                # AutoEvalColumn.organization.name,
+                                # AutoEvalColumn.knowledge_cutoff.name,
+
+                                # AutoEvalColumn.score_math_algebra.name,
+                                # AutoEvalColumn.score_math_geometry.name,
+                                # AutoEvalColumn.score_math_probability.name,
+                                AutoEvalColumn.rank_math_algebra.name,
+                                AutoEvalColumn.rank_math_geometry.name,
+                                AutoEvalColumn.rank_math_probability.name,
+                            ],
+                            rank_col=['sort_by_rank', 1, 4],
+                        )
                    )
- …
+
+                with gr.TabItem("⭐ Sort by Score", elem_id="math_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
+                    leaderboard = overall_leaderboard(
+                        get_model_leaderboard_df(
+                            model_result_path,
+                            benchmark_cols=[
+                                AutoEvalColumn.model.name,
+                                # AutoEvalColumn.license.name,
+                                # AutoEvalColumn.organization.name,
+                                # AutoEvalColumn.knowledge_cutoff.name,
+
+                                AutoEvalColumn.score_math_algebra.name,
+                                AutoEvalColumn.score_math_geometry.name,
+                                AutoEvalColumn.score_math_probability.name,
+                                # AutoEvalColumn.rank_math_algebra.name,
+                                # AutoEvalColumn.rank_math_geometry.name,
+                                # AutoEvalColumn.rank_math_probability.name,
+                            ],
+                            rank_col=['sort_by_score', 1, 4],
+                        )
+                    )
+


            with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=1, elem_classes="subtab"):

@@ -349,24 +374,42 @@ with demo:
            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")

-            with gr.TabItem("
- …
+            with gr.TabItem("🏆 Overview", elem_id="reasoning_overview_subtab", id=0, elem_classes="subtab"):
+
+                with gr.TabItem("⭐ Sort by Rank", elem_id="reasoning_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"):
+                    leaderboard = overall_leaderboard(
+                        get_model_leaderboard_df(
+                            model_result_path,
+                            benchmark_cols=[
+                                AutoEvalColumn.model.name,
+                                # AutoEvalColumn.license.name,
+                                # AutoEvalColumn.organization.name,
+                                # AutoEvalColumn.knowledge_cutoff.name,
+
+                                AutoEvalColumn.rank_reason_logical.name,
+                                AutoEvalColumn.rank_reason_social.name,
+                            ],
+                            rank_col=['sort_by_rank', 1, 3],
+                        )
+                    )

- …
+                with gr.TabItem("⭐ Sort by Score", elem_id="reasoning_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
+                    leaderboard = overall_leaderboard(
+                        get_model_leaderboard_df(
+                            model_result_path,
+                            benchmark_cols=[
+                                AutoEvalColumn.model.name,
+                                # AutoEvalColumn.license.name,
+                                # AutoEvalColumn.organization.name,
+                                # AutoEvalColumn.knowledge_cutoff.name,
+
+                                AutoEvalColumn.score_reason_logical.name,
+                                AutoEvalColumn.score_reason_social.name,
+                            ],
+                            rank_col=['sort_by_score', 1, 3],
+                        )
                    )
- …
+

            with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=1, elem_classes="subtab"):
                leaderboard = overall_leaderboard(

@@ -430,23 +473,45 @@ with demo:
            """
            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")

-            with gr.TabItem("
- …
+            with gr.TabItem("🏆 Overview", elem_id="science_overview_subtab", id=0, elem_classes="subtab"):
+
+                with gr.TabItem("⭐ Sort by Rank", elem_id="science_overview_sort_by_rank_subtab", id=0, elem_classes="subtab"):
+                    leaderboard = overall_leaderboard(
+                        get_model_leaderboard_df(
+                            model_result_path,
+                            benchmark_cols=[
+                                AutoEvalColumn.model.name,
+
+                                AutoEvalColumn.license.name,
+                                AutoEvalColumn.organization.name,
+                                AutoEvalColumn.knowledge_cutoff.name,
+
+                                AutoEvalColumn.rank_chemistry.name,
+                            ],
+                            rank_col=['sort_by_rank', 4, 5],
+                        )
                    )
-            )

+                with gr.TabItem("⭐ Sort by Score", elem_id="science_overview_sort_by_score_subtab", id=1, elem_classes="subtab"):
+                    leaderboard = overall_leaderboard(
+                        get_model_leaderboard_df(
+                            model_result_path,
+                            benchmark_cols=[
+                                AutoEvalColumn.model.name,
+
+                                AutoEvalColumn.license.name,
+                                AutoEvalColumn.organization.name,
+                                AutoEvalColumn.knowledge_cutoff.name,
+
+                                AutoEvalColumn.score_chemistry.name,
+                                # AutoEvalColumn.rank_chemistry.name,
+                            ],
+                            rank_col=['sort_by_score', 4, 5], # two numbers are index to select the columns to average and sort
+                        )
+                    )
+
+
+
            with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=1, elem_classes="subtab"):
                leaderboard = overall_leaderboard(
                    get_model_leaderboard_df(
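Across these tabs the rank_col argument now follows a single convention: a mode string ('sort_by_rank' or 'sort_by_score') followed by a start and an end column index, and get_model_leaderboard_df averages the DataFrame columns in that end-exclusive slice before sorting by the result. The sketch below only illustrates that convention; the DataFrame and its column names are made up and are not the Space's real AutoEvalColumn fields:

import pandas as pd

# Hypothetical rows; in the Space the DataFrame is built by get_model_leaderboard_df().
df = pd.DataFrame({
    "Model":              ["model-a", "model-b", "model-c"],
    "Rank (Algebra)":     [1, 3, 2],
    "Rank (Geometry)":    [2, 1, 3],
    "Rank (Probability)": [1, 2, 3],
})

mode, start_idx, end_idx = ['sort_by_rank', 1, 4]   # as passed from the Math overview tab

# Average the selected columns (iloc slicing is end-exclusive), then sort ascending
# because a lower average rank is better.
avg_rank = df.iloc[:, start_idx:end_idx].mean(axis=1)
df.insert(1, "Average Rank", avg_rank.round(2))
df = df.sort_values(by="Average Rank", ascending=True).reset_index(drop=True)
df.insert(0, "Rank", list(range(1, len(df) + 1)))
print(df)

Assuming the DataFrame columns come out in the order of benchmark_cols, ['sort_by_rank', 1, 8] on the overall tab covers the seven rank columns after the model name, while ['sort_by_rank', 4, 5] on the science overview averages just the single chemistry rank.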
src/populate.py  (changed)

@@ -15,7 +15,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
    """Creates a dataframe from all the individual experiment results"""
    raw_data = get_raw_model_results(results_path)
    all_data_json = [v.to_dict() for v in raw_data]
-    assert len(rank_col) <= 1, "Only one column can be selected for ranking"
+    # assert len(rank_col) <= 1, "Only one column can be selected for ranking"

    df = pd.DataFrame.from_records(all_data_json)

@@ -24,7 +24,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis

    # if there is one col in rank_col, this is an isolated dimension to rank by
    # sort by that selected column and remove NaN values
-    if rank_col and rank_col[0]
+    if rank_col and rank_col[0] not in ["sort_by_score", "sort_by_rank"]:
        # df = df.dropna(subset=benchmark_cols)
        df = df.dropna(subset=rank_col)
        df = df.fillna(0.00)

@@ -39,23 +39,37 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
                df[col] = df[col].round(decimals=2)

    elif rank_col and rank_col[0] == "sort_by_score": # sorting by averaging all benchmark cols, except cols before offset_idx
- …
+        start_idx = rank_col[1]
+        end_idx = rank_col[2]
+        avg_scores = df.iloc[:, start_idx:end_idx].mean(axis=1)
        df.insert(1, "Average Score", avg_scores)

        df["Average Score"] = avg_scores.round(decimals=4)
        df = df.sort_values(by=["Average Score"], ascending=False)
        df["Average Score"] = df["Average Score"].map('{:.2f}'.format)

-        df = df.drop(columns=benchmark_cols[offset_idx:])
+        # df = df.drop(columns=benchmark_cols[offset_idx:])
        # print(benchmark_cols)
        # print(df.head())
        # insert a rank column
        rank = np.arange(1, len(df)+1)
        df.insert(0, 'Rank', rank)
- …
+
+        for col in benchmark_cols:
+            if 'Std dev' in col or 'Score' in col:
+                df[col] = (df[col]).map('{:.2f}'.format)
+                df[col] = df[col].round(decimals=2)
+
+        # df = df.fillna('--')
+        df.replace("nan", '--', inplace=True)
+
+    elif rank_col and rank_col[0] == "sort_by_rank":
+        # else: # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
+        start_idx = rank_col[1]
+        end_idx = rank_col[2]
+        avg_rank = df.iloc[:, start_idx:end_idx].mean(axis=1)
+        df.insert(1, "Average Rank", avg_rank)
+
        df["Average Rank"] = avg_rank.round(decimals=4)
        df = df.sort_values(by=["Average Rank"], ascending=True)
        df["Average Rank"] = df["Average Rank"].map('{:.2f}'.format)

@@ -66,6 +80,8 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
    rank = np.arange(1, len(df)+1)
    df.insert(0, 'Rank', rank)

+    # df.style.background_gradient(cmap='coolwarm', subset=benchmark_cols)
+

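Taken together, the two new branches in get_model_leaderboard_df differ only in the name of the aggregate column and the sort direction: a higher average score is better, a lower average rank is better. The sketch below condenses that logic into a standalone helper; rank_leaderboard is a hypothetical name used for illustration, and the real code lives inline in src/populate.py and additionally does the per-column string formatting shown in the diff:

import numpy as np
import pandas as pd

def rank_leaderboard(df: pd.DataFrame, rank_col: list) -> pd.DataFrame:
    """Order a leaderboard DataFrame using the rank_col convention
    [mode, start_idx, end_idx]: the indices pick the columns to average
    (end-exclusive) and mode picks the aggregate name and sort direction."""
    mode, start_idx, end_idx = rank_col
    avg = df.iloc[:, start_idx:end_idx].mean(axis=1)

    if mode == "sort_by_score":
        df.insert(1, "Average Score", avg.round(4))
        df = df.sort_values(by="Average Score", ascending=False)  # higher score is better
        df["Average Score"] = df["Average Score"].map('{:.2f}'.format)
    elif mode == "sort_by_rank":
        df.insert(1, "Average Rank", avg.round(4))
        df = df.sort_values(by="Average Rank", ascending=True)    # lower rank is better
        df["Average Rank"] = df["Average Rank"].map('{:.2f}'.format)

    df.insert(0, "Rank", np.arange(1, len(df) + 1))  # positional rank after sorting
    return df

Called as rank_leaderboard(df, ['sort_by_score', 1, 8]), this reproduces the ordering of the overall "Sort by Score" tab on a DataFrame whose columns 1 through 7 hold the score columns being averaged.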