Amber Tanaka committed on
Commit
8bd1c00
·
unverified ·
1 Parent(s): d60c9d9

Bug Bash Fixes (#79)

Browse files
Files changed (4) hide show
  1. leaderboard_transformer.py +19 -31
  2. main_page.py +3 -2
  3. submission.py +3 -2
  4. ui_components.py +1 -1
leaderboard_transformer.py CHANGED
@@ -87,7 +87,7 @@ ORDER_MAP = {
87
  }
88
 
89
 
90
- def _safe_round(value, digits=2):
91
  """Rounds a number if it's a valid float/int, otherwise returns it as is."""
92
  return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
93
 
@@ -278,13 +278,7 @@ class DataTransformer:
278
  if primary_metric == "Overall":
279
  def calculate_attempted(row):
280
  main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery']
281
- count = sum(1 for category in main_categories if pd.notna(row.get(f"{category} Cost")))
282
-
283
- # Return the formatted string with the correct emoji
284
- if count == 4:
285
- return f"4/4"
286
- if count == 0:
287
- return f"0/4"
288
  return f"{count}/4"
289
 
290
  # Apply the function row-wise to create the new column
@@ -295,13 +289,8 @@ class DataTransformer:
295
  total_benchmarks = len(group_metrics)
296
  def calculate_benchmarks_attempted(row):
297
  # Count how many benchmarks in this category have COST data reported
298
- count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Cost")))
299
- if count == total_benchmarks:
300
- return f"{count}/{total_benchmarks} "
301
- elif count == 0:
302
- return f"{count}/{total_benchmarks} "
303
- else:
304
- return f"{count}/{total_benchmarks}"
305
  # Insert the new column, for example, after "Date"
306
  df_view.insert((cols - 2), "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
307
 
@@ -459,7 +448,7 @@ def _plot_scatter_plotly(
459
  h_pad = " "
460
  parts = ["<br>"]
461
  parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
462
- parts.append(f"{h_pad}Score: <b>{row[y_col]:.2f}</b>{h_pad}<br>")
463
  parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
464
  parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
465
  parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
@@ -531,15 +520,14 @@ def _plot_scatter_plotly(
531
  )
532
 
533
  # ---Adjust x-axis range to make room for the new points ---
534
- xaxis_config['range'] = [0, (max_reported_cost + (max_reported_cost / 4))]
535
 
536
- logo_data_uri = svg_to_data_uri("assets/just-icon.svg")
537
 
538
  fig.update_layout(
539
  template="plotly_white",
540
  title=f"AstaBench {name} Leaderboard",
541
  xaxis=xaxis_config, # Use the updated config
542
- yaxis=dict(title="Average (mean) score", rangemode="tozero"),
543
  legend=dict(
544
  bgcolor='#FAF2E9',
545
  ),
@@ -551,17 +539,17 @@ def _plot_scatter_plotly(
551
  font_color="#d3dedc",
552
  ),
553
  )
554
- fig.add_layout_image(
555
- dict(
556
- source=logo_data_uri,
557
- xref="x domain", yref="y domain",
558
- x=1.1, y=1.1,
559
- sizex=0.2, sizey=0.2,
560
- xanchor="left",
561
- yanchor="bottom",
562
- layer="above",
563
- ),
564
- )
565
 
566
  return fig
567
 
@@ -621,7 +609,7 @@ def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame:
621
 
622
  # For all other numbers, format them for consistency.
623
  if isinstance(score_value, (int, float)):
624
- return f"{score_value:.2f}"
625
 
626
  # Fallback for any unexpected non-numeric data
627
  return score_value
 
87
  }
88
 
89
 
90
+ def _safe_round(value, digits=3):
91
  """Rounds a number if it's a valid float/int, otherwise returns it as is."""
92
  return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value
93
 
 
278
  if primary_metric == "Overall":
279
  def calculate_attempted(row):
280
  main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery']
281
+ count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0)
 
 
 
 
 
 
282
  return f"{count}/4"
283
 
284
  # Apply the function row-wise to create the new column
 
289
  total_benchmarks = len(group_metrics)
290
  def calculate_benchmarks_attempted(row):
291
  # Count how many benchmarks in this category have COST data reported
292
+ count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Score")))
293
+ return f"{count}/{total_benchmarks}"
 
 
 
 
 
294
  # Insert the new column, for example, after "Date"
295
  df_view.insert((cols - 2), "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1))
296
 
 
448
  h_pad = " "
449
  parts = ["<br>"]
450
  parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
451
+ parts.append(f"{h_pad}Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
452
  parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
453
  parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}<br>")
454
  parts.append(f"{h_pad}Tooling: <b>{row['Agent Tooling']}</b>{h_pad}")
 
520
  )
521
 
522
  # ---Adjust x-axis range to make room for the new points ---
523
+ xaxis_config['range'] = [-0.2, (max_reported_cost + (max_reported_cost / 4))]
524
 
 
525
 
526
  fig.update_layout(
527
  template="plotly_white",
528
  title=f"AstaBench {name} Leaderboard",
529
  xaxis=xaxis_config, # Use the updated config
530
+ yaxis=dict(title="Average (mean) score", range=[-0.2, None]),
531
  legend=dict(
532
  bgcolor='#FAF2E9',
533
  ),
 
539
  font_color="#d3dedc",
540
  ),
541
  )
542
+ # fig.add_layout_image(
543
+ # dict(
544
+ # source=logo_data_uri,
545
+ # xref="x domain", yref="y domain",
546
+ # x=1.1, y=1.1,
547
+ # sizex=0.2, sizey=0.2,
548
+ # xanchor="left",
549
+ # yanchor="bottom",
550
+ # layer="above",
551
+ # ),
552
+ # )
553
 
554
  return fig
555
 
 
609
 
610
  # For all other numbers, format them for consistency.
611
  if isinstance(score_value, (int, float)):
612
+ return f"{score_value:.3f}"
613
 
614
  # Fallback for any unexpected non-numeric data
615
  return score_value
main_page.py CHANGED
@@ -57,8 +57,9 @@ def build_page():
57
  else:
58
  gr.Markdown("No data available for validation split.")
59
 
60
- with gr.Accordion("📙 Citation", open=False):
61
- gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
 
62
 
63
 
64
  # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
 
57
  else:
58
  gr.Markdown("No data available for validation split.")
59
 
60
+ # hiding this for now till we have the real paper data
61
+ # with gr.Accordion("📙 Citation", open=False):
62
+ # gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
63
 
64
 
65
  # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
submission.py CHANGED
@@ -428,5 +428,6 @@ def build_page():
428
  ],
429
  [error_message, error_modal, success_modal, loading_modal],
430
  )
431
- with gr.Accordion("📙 Citation", open=False):
432
- gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
 
 
428
  ],
429
  [error_message, error_modal, success_modal, loading_modal],
430
  )
431
+ # hiding this for now till we have the real paper data
432
+ # with gr.Accordion("📙 Citation", open=False):
433
+ # gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
ui_components.py CHANGED
@@ -596,7 +596,7 @@ def create_leaderboard_display(
596
  with gr.Column(scale=3):
597
  plot_component = gr.Plot(
598
  value=scatter_plot,
599
- show_label=False
600
  )
601
  with gr.Column(scale=1):
602
  gr.HTML(value=plot_legend_html)
 
596
  with gr.Column(scale=3):
597
  plot_component = gr.Plot(
598
  value=scatter_plot,
599
+ show_label=False,
600
  )
601
  with gr.Column(scale=1):
602
  gr.HTML(value=plot_legend_html)