Spaces:
Running
Running
Amber Tanaka
committed on
Bug Bash Fixes (#79)
Browse files
- leaderboard_transformer.py +19 -31
- main_page.py +3 -2
- submission.py +3 -2
- ui_components.py +1 -1
leaderboard_transformer.py
CHANGED
|
@@ -87,7 +87,7 @@ ORDER_MAP = {
|
|
| 87 |
}
|
| 88 |
|
| 89 |
|
| 90 |
-
def _safe_round(value, digits=
|
| 91 |
"""Rounds a number if it's a valid float/int, otherwise returns it as is."""
|
| 92 |
return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
|
| 93 |
|
|
@@ -278,13 +278,7 @@ class DataTransformer:
|
|
| 278 |
if primary_metric == "Overall":
|
| 279 |
def calculate_attempted(row):
|
| 280 |
main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery']
|
| 281 |
-
count = sum(1 for category in main_categories if
|
| 282 |
-
|
| 283 |
-
# Return the formatted string with the correct emoji
|
| 284 |
-
if count == 4:
|
| 285 |
-
return f"4/4"
|
| 286 |
-
if count == 0:
|
| 287 |
-
return f"0/4"
|
| 288 |
return f"{count}/4"
|
| 289 |
|
| 290 |
# Apply the function row-wise to create the new column
|
|
@@ -295,13 +289,8 @@ class DataTransformer:
|
|
| 295 |
total_benchmarks = len(group_metrics)
|
| 296 |
def calculate_benchmarks_attempted(row):
|
| 297 |
# Count how many benchmarks in this category have COST data reported
|
| 298 |
-
count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark}
|
| 299 |
-
|
| 300 |
-
return f"{count}/{total_benchmarks} "
|
| 301 |
-
elif count == 0:
|
| 302 |
-
return f"{count}/{total_benchmarks} "
|
| 303 |
-
else:
|
| 304 |
-
return f"{count}/{total_benchmarks}"
|
| 305 |
# Insert the new column, for example, after "Date"
|
| 306 |
df_view.insert((cols - 2), "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
|
| 307 |
|
|
@@ -459,7 +448,7 @@ def _plot_scatter_plotly(
|
|
| 459 |
h_pad = " "
|
| 460 |
parts = ["<br>"]
|
| 461 |
parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
|
| 462 |
-
parts.append(f"{h_pad}Score: <b>{row[y_col]:.
|
| 463 |
parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
|
| 464 |
parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
|
| 465 |
parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
|
|
@@ -531,15 +520,14 @@ def _plot_scatter_plotly(
|
|
| 531 |
)
|
| 532 |
|
| 533 |
# ---Adjust x-axis range to make room for the new points ---
|
| 534 |
-
xaxis_config['range'] = [0, (max_reported_cost + (max_reported_cost / 4))]
|
| 535 |
|
| 536 |
-
logo_data_uri = svg_to_data_uri("assets/just-icon.svg")
|
| 537 |
|
| 538 |
fig.update_layout(
|
| 539 |
template="plotly_white",
|
| 540 |
title=f"AstaBench {name} Leaderboard",
|
| 541 |
xaxis=xaxis_config, # Use the updated config
|
| 542 |
-
yaxis=dict(title="Average (mean) score",
|
| 543 |
legend=dict(
|
| 544 |
bgcolor='#FAF2E9',
|
| 545 |
),
|
|
@@ -551,17 +539,17 @@ def _plot_scatter_plotly(
|
|
| 551 |
font_color="#d3dedc",
|
| 552 |
),
|
| 553 |
)
|
| 554 |
-
fig.add_layout_image(
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
)
|
| 565 |
|
| 566 |
return fig
|
| 567 |
|
|
@@ -621,7 +609,7 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
|
|
| 621 |
|
| 622 |
# For all other numbers, format them for consistency.
|
| 623 |
if isinstance(score_value, (int, float)):
|
| 624 |
-
return f"{score_value:.
|
| 625 |
|
| 626 |
# Fallback for any unexpected non-numeric data
|
| 627 |
return score_value
|
|
|
|
| 87 |
}
|
| 88 |
|
| 89 |
|
| 90 |
+
def _safe_round(value, digits=3):
|
| 91 |
"""Rounds a number if it's a valid float/int, otherwise returns it as is."""
|
| 92 |
return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
|
| 93 |
|
|
|
|
| 278 |
if primary_metric == "Overall":
|
| 279 |
def calculate_attempted(row):
|
| 280 |
main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery']
|
| 281 |
+
count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
return f"{count}/4"
|
| 283 |
|
| 284 |
# Apply the function row-wise to create the new column
|
|
|
|
| 289 |
total_benchmarks = len(group_metrics)
|
| 290 |
def calculate_benchmarks_attempted(row):
|
| 291 |
# Count how many benchmarks in this category have COST data reported
|
| 292 |
+
count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Score")))
|
| 293 |
+
return f"{count}/{total_benchmarks}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
# Insert the new column, for example, after "Date"
|
| 295 |
df_view.insert((cols - 2), "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
|
| 296 |
|
|
|
|
| 448 |
h_pad = " "
|
| 449 |
parts = ["<br>"]
|
| 450 |
parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
|
| 451 |
+
parts.append(f"{h_pad}Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
|
| 452 |
parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
|
| 453 |
parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
|
| 454 |
parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
|
|
|
|
| 520 |
)
|
| 521 |
|
| 522 |
# ---Adjust x-axis range to make room for the new points ---
|
| 523 |
+
xaxis_config['range'] = [-0.2, (max_reported_cost + (max_reported_cost / 4))]
|
| 524 |
|
|
|
|
| 525 |
|
| 526 |
fig.update_layout(
|
| 527 |
template="plotly_white",
|
| 528 |
title=f"AstaBench {name} Leaderboard",
|
| 529 |
xaxis=xaxis_config, # Use the updated config
|
| 530 |
+
yaxis=dict(title="Average (mean) score", range=[-0.2, None]),
|
| 531 |
legend=dict(
|
| 532 |
bgcolor='#FAF2E9',
|
| 533 |
),
|
|
|
|
| 539 |
font_color="#d3dedc",
|
| 540 |
),
|
| 541 |
)
|
| 542 |
+
# fig.add_layout_image(
|
| 543 |
+
# dict(
|
| 544 |
+
# source=logo_data_uri,
|
| 545 |
+
# xref="x domain", yref="y domain",
|
| 546 |
+
# x=1.1, y=1.1,
|
| 547 |
+
# sizex=0.2, sizey=0.2,
|
| 548 |
+
# xanchor="left",
|
| 549 |
+
# yanchor="bottom",
|
| 550 |
+
# layer="above",
|
| 551 |
+
# ),
|
| 552 |
+
# )
|
| 553 |
|
| 554 |
return fig
|
| 555 |
|
|
|
|
| 609 |
|
| 610 |
# For all other numbers, format them for consistency.
|
| 611 |
if isinstance(score_value, (int, float)):
|
| 612 |
+
return f"{score_value:.3f}"
|
| 613 |
|
| 614 |
# Fallback for any unexpected non-numeric data
|
| 615 |
return score_value
|
main_page.py
CHANGED
|
@@ -57,8 +57,9 @@ def build_page():
|
|
| 57 |
else:
|
| 58 |
gr.Markdown("No data available for validation split.")
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
# JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
|
|
|
|
| 57 |
else:
|
| 58 |
gr.Markdown("No data available for validation split.")
|
| 59 |
|
| 60 |
+
# hiding this for now till we have the real paper data
|
| 61 |
+
# with gr.Accordion("📙 Citation", open=False):
|
| 62 |
+
# gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
|
| 63 |
|
| 64 |
|
| 65 |
# JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
|
submission.py
CHANGED
|
@@ -428,5 +428,6 @@ def build_page():
|
|
| 428 |
],
|
| 429 |
[error_message, error_modal, success_modal, loading_modal],
|
| 430 |
)
|
| 431 |
-
|
| 432 |
-
|
|
|
|
|
|
| 428 |
],
|
| 429 |
[error_message, error_modal, success_modal, loading_modal],
|
| 430 |
)
|
| 431 |
+
# hiding this for now till we have the real paper data
|
| 432 |
+
# with gr.Accordion("📙 Citation", open=False):
|
| 433 |
+
# gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
|
ui_components.py
CHANGED
|
@@ -596,7 +596,7 @@ def create_leaderboard_display(
|
|
| 596 |
with gr.Column(scale=3):
|
| 597 |
plot_component = gr.Plot(
|
| 598 |
value=scatter_plot,
|
| 599 |
-
show_label=False
|
| 600 |
)
|
| 601 |
with gr.Column(scale=1):
|
| 602 |
gr.HTML(value=plot_legend_html)
|
|
|
|
| 596 |
with gr.Column(scale=3):
|
| 597 |
plot_component = gr.Plot(
|
| 598 |
value=scatter_plot,
|
| 599 |
+
show_label=False,
|
| 600 |
)
|
| 601 |
with gr.Column(scale=1):
|
| 602 |
gr.HTML(value=plot_legend_html)
|