"""
Evaluation Leaderboard - Gradio Interface
Displays model evaluation results from HuggingFace datasets.
"""
import gradio as gr
import pandas as pd
from pathlib import Path

from data_loader import (
    load_hf_dataset_on_startup,
    get_available_leaderboards,
    get_eval_metadata,
    build_leaderboard_table,
    clear_cache,
    search_model_across_leaderboards,
    get_all_model_names,
    DATA_DIR,
)
from ui_components import (
    get_theme,
    get_custom_css,
    format_leaderboard_header,
    format_metric_details,
    format_model_card,
    format_model_comparison,
)

PAGE_SIZE = 50


def update_leaderboard_table(selected_leaderboard, search_query="", current_page=1, sort_column=None, selected_columns=None, progress=gr.Progress()):
    """Loads and aggregates data for the selected leaderboard."""
    if not selected_leaderboard:
        return (
            pd.DataFrame(),
            format_leaderboard_header(None, {}),
            format_metric_details(None, {}),
            gr.update(choices=[], value=None),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(choices=[], value=None),
            "0 / 0",
            gr.update(choices=[], value=[]),
        )

    metadata = get_eval_metadata(selected_leaderboard)

    def progress_callback(value, desc):
        progress(value, desc=desc)

    df = build_leaderboard_table(selected_leaderboard, "", progress_callback)

    # Capture all available columns BEFORE filtering (for the column selector).
    all_available_columns = list(df.columns) if not df.empty else []

    # Filter columns if any are selected (None or empty means show all columns).
    if selected_columns:
        # Ensure the Model column is always included.
        base_cols = ["Model"]
        available_cols = list(df.columns)
        cols_to_show = [col for col in base_cols if col in available_cols]
        # Append the user's selected columns, skipping any already present.
        cols_to_show.extend([col for col in selected_columns if col in available_cols and col not in cols_to_show])
        if cols_to_show:
            df = df[cols_to_show]

    if search_query and not df.empty:
        mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
        df = df[mask]

    filtered_count = len(df)

    if sort_column and sort_column in df.columns and not df.empty:
        df = df.sort_values(by=sort_column, ascending=False, na_position="last")
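
    # Ceiling division: e.g. 101 filtered rows at PAGE_SIZE = 50 gives 3 pages.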
    total_pages = max(1, (filtered_count + PAGE_SIZE - 1) // PAGE_SIZE)
    current_page = max(1, min(current_page, total_pages))
    start_idx = (current_page - 1) * PAGE_SIZE
    end_idx = start_idx + PAGE_SIZE
    df_paginated = df.iloc[start_idx:end_idx] if not df.empty else df

    page_choices = [str(i) for i in range(1, total_pages + 1)]
    page_dropdown = gr.update(choices=page_choices, value=str(current_page))
    prev_btn = gr.update(interactive=(current_page > 1))
    next_btn = gr.update(interactive=(current_page < total_pages))
    page_info = f"{current_page} / {total_pages}"

    sort_choices = list(df.columns) if not df.empty else []
    default_sort = sort_column if sort_column and sort_column in sort_choices else ("Average" if "Average" in sort_choices else (sort_choices[0] if sort_choices else None))
    sort_column_update = gr.update(choices=sort_choices, value=default_sort)

    # Build choices for the column selector from the full column list, not the
    # filtered one; Model is always shown, so it is excluded from the selector.
    column_choices = [col for col in all_available_columns if col != "Model"]
    # Preserve the current selection, defaulting to all columns if none is set.
    if not selected_columns:
        column_value = column_choices
    else:
        # Keep the user's selection, dropping any choices that no longer exist.
        column_value = [col for col in selected_columns if col in column_choices]
    column_selector_update = gr.update(choices=column_choices, value=column_value)

    return (
        df_paginated,
        format_leaderboard_header(selected_leaderboard, metadata),
        format_metric_details(selected_leaderboard, metadata),
        page_dropdown,
        prev_btn,
        next_btn,
        sort_column_update,
        page_info,
        column_selector_update,
    )
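

# NOTE: standalone helper; not currently wired to a UI event in this file.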
def search_model(model_query):
    """Search for a model and return a formatted card."""
    if not model_query or len(model_query) < 2:
        return """
        <div class="no-results">
            <h3>Search for a model</h3>
            <p>Enter a model name to see its benchmarks across all leaderboards</p>
        </div>
        """
    results, _ = search_model_across_leaderboards(model_query)
    if not results:
        return f"""
        <div class="no-results">
            <h3>No results for "{model_query}"</h3>
            <p>Try a different model name or check the spelling</p>
        </div>
        """
    # Use the first matching model.
    model_name = list(results.keys())[0]
    model_data = results[model_name]
    return format_model_card(model_name, model_data)


def compare_models(selected_models):
    """Compare multiple selected models."""
    if not selected_models:
        return """
        <div class="no-results">
            <h3>Select models to compare</h3>
            <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        </div>
        """

    # Gather data for every selected model.
    all_results = {}
    for model_name in selected_models:
        results, _ = search_model_across_leaderboards(model_name)
        if results:
            # Use the first matching model (exact match preferred).
            matched_model = list(results.keys())[0]
            all_results[matched_model] = results[matched_model]

    if len(all_results) == 1:
        # Single model: show the card view.
        model_name = list(all_results.keys())[0]
        return format_model_card(model_name, all_results[model_name])
    elif len(all_results) > 1:
        # Multiple models: show the side-by-side comparison.
        return format_model_comparison(list(all_results.keys()), all_results)
    else:
        return """
        <div class="no-results">
            <h3>No results found</h3>
            <p>Try selecting different models</p>
        </div>
        """
def get_model_suggestions(query):
    """Get model name suggestions for autocomplete."""
    if not query or len(query) < 2:
        return gr.update(choices=[])
    _, matches = search_model_across_leaderboards(query)
    return gr.update(choices=matches[:15])


# Load data at startup.
load_hf_dataset_on_startup()

# Build the interface.
with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
    # Header
    gr.HTML("""
        <div class="app-header">
            <div class="logo-mark">E³</div>
            <div class="brand">
                <h1>Every Eval Ever</h1>
                <span class="tagline">Browse and compare model benchmarks</span>
            </div>
            <div class="header-right">
                <span class="version-badge">beta</span>
            </div>
        </div>
    """)

    with gr.Tabs():
        # === TAB 1: Leaderboard View ===
        with gr.TabItem("📊 Leaderboards"):
            with gr.Row(elem_classes="controls-bar"):
                initial_choices = get_available_leaderboards()
                initial_value = initial_choices[0] if initial_choices else None
                with gr.Column(scale=2, min_width=200):
                    leaderboard_selector = gr.Dropdown(
                        choices=initial_choices,
                        value=initial_value,
                        label="Leaderboard",
                        interactive=True,
                    )
                with gr.Column(scale=3, min_width=250):
                    search_box = gr.Textbox(
                        label="Filter",
                        placeholder="Filter models...",
                        show_label=True,
                    )
                with gr.Column(scale=1, min_width=100):
                    refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")

            (
                init_df,
                init_header,
                init_metrics,
                init_page_dropdown,
                init_prev,
                init_next,
                init_sort_cols,
                init_page_info,
                init_column_selector,
            ) = update_leaderboard_table(initial_value, "", 1, "Average", None)
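
            # update_leaderboard_table() returns gr.update() payloads, which are
            # plain dicts; the hidden sort/page dropdowns and the column selector
            # below unpack their initial values from these dicts.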
            header_view = gr.HTML(value=init_header)

            # Hidden sort state (defaults to Average).
            sort_column_dropdown = gr.Dropdown(
                choices=init_sort_cols.get("choices", []) if hasattr(init_sort_cols, "get") else [],
                value=init_sort_cols.get("value") if hasattr(init_sort_cols, "get") else None,
                visible=False,
            )

            # Column selector
            with gr.Row(elem_classes="controls-bar"):
                column_selector = gr.CheckboxGroup(
                    choices=init_column_selector.get("choices", []) if isinstance(init_column_selector, dict) else [],
                    value=init_column_selector.get("value", []) if isinstance(init_column_selector, dict) else [],
                    label="Columns to Display",
                    interactive=True,
                    show_label=True,
                )

            leaderboard_table = gr.Dataframe(
                value=init_df,
                label=None,
                interactive=False,
                wrap=False,
                elem_classes="dataframe",
            )

            # Pagination below the table, centered.
            with gr.Row(elem_classes="pagination-bar"):
                prev_btn = gr.Button("←", variant="secondary", size="sm", min_width=60)
                page_info = gr.Markdown(value=init_page_info, elem_classes="page-info")
                next_btn = gr.Button("→", variant="secondary", size="sm", min_width=60)

            # Extract choices and value from the gr.update() dict, making sure
            # the value actually appears in choices.
            if isinstance(init_page_dropdown, dict):
                page_choices = init_page_dropdown.get("choices", ["1"])
                page_value = str(init_page_dropdown.get("value", "1")) if init_page_dropdown.get("value") is not None else "1"
                if page_value not in page_choices:
                    page_value = page_choices[0] if page_choices else "1"
                if not page_choices:
                    page_choices = ["1"]
            else:
                page_choices = ["1"]
                page_value = "1"
            page_dropdown = gr.Dropdown(
                choices=page_choices,
                value=page_value,
                visible=False,
            )

            metrics_view = gr.HTML(value=init_metrics)

        # === TAB 2: Model View ===
        with gr.TabItem("🔍 Model Lookup"):
            gr.Markdown("### Find and compare models across all leaderboards")
            selected_models_state = gr.State(value=[])
            default_compare_html = """
            <div class="no-results">
                <h3>Search for models to compare</h3>
                <p>Type in the dropdown above, then click a model to add it</p>
            </div>
            """

            with gr.Row(elem_classes="controls-bar"):
                with gr.Column(scale=4):
                    all_models = get_all_model_names()
                    model_dropdown = gr.Dropdown(
                        choices=all_models,
                        label="Search models to add",
                        interactive=True,
                        allow_custom_value=False,
                        filterable=True,
                    )
                with gr.Column(scale=1, min_width=100):
                    clear_models_btn = gr.Button("Clear All", variant="secondary", size="sm")

            selected_models_group = gr.CheckboxGroup(
                choices=[],
                value=[],
                label="Selected Models (click to remove)",
                interactive=True,
                elem_classes="selected-models-group",
            )
            model_card_view = gr.HTML(value=default_compare_html)

    # Submission guide
    with gr.Accordion("📤 How to Submit Data", open=False):
        gr.Markdown("""
        **Submit via GitHub Pull Request:**

        1. Fork [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
        2. Add JSON files to `data/<leaderboard>/<developer>/<model>/`
        3. Open a PR; automated validation runs on submission
        4. After merge, data syncs to HuggingFace automatically

        [Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) · [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
        """)

    # === State ===
    current_page_state = gr.State(value=1)
    sort_column_state = gr.State(value="Average")

    def go_prev(current):
        return max(1, current - 1)

    def go_next(current):
        # The upper bound is enforced by update_leaderboard_table's clamping.
        return current + 1

    def reset_page():
        return 1

    def update_table_only(selected_leaderboard, search_query, current_page, sort_column, selected_columns):
        """Update the table without touching the column selector (used when the column selection itself triggered the refresh)."""
        result = update_leaderboard_table(selected_leaderboard, search_query, current_page, sort_column, selected_columns)
        # Return all outputs except the last one (column_selector).
        return result[:-1]
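
    # Event chains: each .then() step runs only after the previous step has
    # finished, so the state resets below are committed before the table
    # reload reads them.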
    # === Leaderboard Events ===
    leaderboard_selector.change(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=lambda: "Average", outputs=[sort_column_state]
    ).then(
        fn=lambda: None, outputs=[column_selector]
    ).then(
        fn=update_leaderboard_table,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector],
    )

    search_box.input(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    sort_column_dropdown.change(
        fn=lambda col: col,
        inputs=[sort_column_dropdown],
        outputs=[sort_column_state]
    ).then(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    column_selector.change(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    page_dropdown.change(
        fn=lambda p: int(p) if p else 1,
        inputs=[page_dropdown],
        outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    prev_btn.click(
        fn=go_prev, inputs=[current_page_state], outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    next_btn.click(
        fn=go_next, inputs=[current_page_state], outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )
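
    # Refresh: re-list the leaderboards, drop cached data, reset paging, sort,
    # and columns, then rebuild the table from scratch.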
    refresh_btn.click(
        fn=lambda: gr.Dropdown(choices=get_available_leaderboards()),
        outputs=[leaderboard_selector]
    ).then(
        fn=clear_cache
    ).then(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=lambda: "Average", outputs=[sort_column_state]
    ).then(
        fn=lambda: None, outputs=[column_selector]
    ).then(
        fn=update_leaderboard_table,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector],
    )

    # === Model Search Events ===
    def add_model_and_compare(selected_model, current_selected):
        """Add a model to the selection and auto-compare."""
        if not selected_model:
            comparison_html = compare_models(current_selected) if current_selected else default_compare_html
            return (
                current_selected,
                gr.update(value=None),
                gr.update(choices=current_selected, value=current_selected),
                comparison_html,
            )
        if current_selected is None:
            current_selected = []
        if selected_model not in current_selected:
            current_selected = current_selected + [selected_model]
        comparison_html = compare_models(current_selected)
        return (
            current_selected,
            gr.update(value=None),
            gr.update(choices=current_selected, value=current_selected),
            comparison_html,
        )

    def update_selection(selected_list):
        """Update the selection from checkbox changes."""
        selected_list = selected_list or []
        comparison_html = compare_models(selected_list) if selected_list else default_compare_html
        return selected_list, comparison_html

    def clear_all_models():
        """Clear all selected models."""
        return (
            [],
            gr.update(value=None),
            gr.update(choices=[], value=[]),
            default_compare_html,
        )

    # Selecting from the dropdown adds the model and auto-compares.
    model_dropdown.select(
        fn=add_model_and_compare,
        inputs=[model_dropdown, selected_models_state],
        outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
    )

    selected_models_group.change(
        fn=update_selection,
        inputs=[selected_models_group],
        outputs=[selected_models_state, model_card_view]
    )

    clear_models_btn.click(
        fn=clear_all_models,
        outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
    )
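

# Ensure the local data directory used by data_loader exists.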
DATA_DIR.mkdir(exist_ok=True)

if __name__ == "__main__":
    demo.launch()