| """ | |
| Evaluation Leaderboard - Gradio Interface | |
| Displays model evaluation results from HuggingFace datasets. | |
| """ | |
| import gradio as gr | |
| import pandas as pd | |
from data_loader import (
    load_hf_dataset_on_startup,
    get_available_leaderboards,
    get_eval_metadata,
    build_leaderboard_table,
    clear_cache,
    search_model_across_leaderboards,
    get_all_model_names,
    DATA_DIR,
)
from ui_components import (
    get_theme,
    get_custom_css,
    format_leaderboard_header,
    format_metric_details,
    format_model_card,
    format_model_comparison,
)
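
# Rows rendered per page of the leaderboard table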
PAGE_SIZE = 50


def update_leaderboard_table(selected_leaderboard, search_query="", current_page=1, sort_column=None, selected_columns=None, progress=gr.Progress()):
    """Loads and aggregates data for the selected leaderboard."""
    if not selected_leaderboard:
        return (
            pd.DataFrame(),
            format_leaderboard_header(None, {}),
            format_metric_details(None, {}),
            gr.update(choices=[], value=None),
            gr.update(interactive=False),
            gr.update(interactive=False),
            gr.update(choices=[], value=None),
            "0 / 0",
            gr.update(choices=[], value=[]),
        )

    metadata = get_eval_metadata(selected_leaderboard)

    def progress_callback(value, desc):
        progress(value, desc=desc)

    df = build_leaderboard_table(selected_leaderboard, "", progress_callback)

    # Capture the full column list BEFORE filtering (feeds the column selector)
    all_available_columns = list(df.columns) if not df.empty else []

    # Filter columns when a selection exists (None or empty means show all)
    if selected_columns:
        # The Model column is always included
        base_cols = ["Model"]
        available_cols = list(df.columns)
        cols_to_show = [col for col in base_cols if col in available_cols]
        # Append the remaining selected columns, skipping duplicates
        cols_to_show.extend([col for col in selected_columns if col in available_cols and col not in cols_to_show])
        if cols_to_show:
            df = df[cols_to_show]
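
    # Row-level filter: case-insensitive substring match across every column,
    # so a query can hit model names, developers, or metric values alike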
    if search_query and not df.empty:
        mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
        df = df[mask]

    filtered_count = len(df)

    if sort_column and sort_column in df.columns and not df.empty:
        df = df.sort_values(by=sort_column, ascending=False, na_position="last")
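
    # Ceiling division yields the page count (e.g. 120 rows at PAGE_SIZE = 50 -> 3
    # pages); max(1, ...) keeps a single page even when the filtered table is empty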
    total_pages = max(1, (filtered_count + PAGE_SIZE - 1) // PAGE_SIZE)
    current_page = max(1, min(current_page, total_pages))
    start_idx = (current_page - 1) * PAGE_SIZE
    end_idx = start_idx + PAGE_SIZE
    df_paginated = df.iloc[start_idx:end_idx] if not df.empty else df

    page_choices = [str(i) for i in range(1, total_pages + 1)]
    page_dropdown = gr.update(choices=page_choices, value=str(current_page))
    prev_btn = gr.update(interactive=(current_page > 1))
    next_btn = gr.update(interactive=(current_page < total_pages))
    page_info = f"{current_page} / {total_pages}"

    sort_choices = list(df.columns) if not df.empty else []
    if sort_column and sort_column in sort_choices:
        default_sort = sort_column
    elif "Average" in sort_choices:
        default_sort = "Average"
    else:
        default_sort = sort_choices[0] if sort_choices else None
    sort_column_update = gr.update(choices=sort_choices, value=default_sort)

    # Build the column selector from the full (unfiltered) column list;
    # Model is excluded from the choices because it is always shown
    column_choices = [col for col in all_available_columns if col != "Model"]
    # Preserve the current selection, defaulting to all columns when empty
    if not selected_columns:
        column_value = column_choices
    else:
        # Keep the user's selection, dropping any stale choices
        column_value = [col for col in selected_columns if col in column_choices]
    column_selector_update = gr.update(choices=column_choices, value=column_value)

    return (
        df_paginated,
        format_leaderboard_header(selected_leaderboard, metadata),
        format_metric_details(selected_leaderboard, metadata),
        page_dropdown,
        prev_btn,
        next_btn,
        sort_column_update,
        page_info,
        column_selector_update,
    )


def search_model(model_query):
    """Search for a model and return a formatted card."""
    if not model_query or len(model_query) < 2:
        return """
        <div class="no-results">
            <h3>Search for a model</h3>
            <p>Enter a model name to see its benchmarks across all leaderboards</p>
        </div>
        """
    results, _ = search_model_across_leaderboards(model_query)
    if not results:
        return f"""
        <div class="no-results">
            <h3>No results for "{model_query}"</h3>
            <p>Try a different model name or check the spelling</p>
        </div>
        """
    # Use the first matching model
    model_name = list(results.keys())[0]
    model_data = results[model_name]
    return format_model_card(model_name, model_data)


def compare_models(selected_models):
    """Compare multiple selected models."""
    if not selected_models:
        return """
        <div class="no-results">
            <h3>Select models to compare</h3>
            <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        </div>
        """
    # Gather data for every selected model
    all_results = {}
    for model_name in selected_models:
        results, _ = search_model_across_leaderboards(model_name)
        if results:
            # Use the first matching model (exact match preferred)
            matched_model = list(results.keys())[0]
            all_results[matched_model] = results[matched_model]
    if len(all_results) == 1:
        # A single model gets the card view
        model_name = list(all_results.keys())[0]
        return format_model_card(model_name, all_results[model_name])
    elif len(all_results) > 1:
        # Multiple models get the side-by-side comparison
        return format_model_comparison(list(all_results.keys()), all_results)
    else:
        return """
        <div class="no-results">
            <h3>No results found</h3>
            <p>Try selecting different models</p>
        </div>
        """
def get_model_suggestions(query):
    """Get model name suggestions for autocomplete."""
    if not query or len(query) < 2:
        return gr.update(choices=[])
    _, matches = search_model_across_leaderboards(query)
    return gr.update(choices=matches[:15])


# Load data at startup
load_hf_dataset_on_startup()

# Build the interface
with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
    # Header
    gr.HTML("""
        <div class="app-header">
            <div class="logo-mark">E³</div>
            <div class="brand">
                <h1>Every Eval Ever</h1>
                <span class="tagline">Browse and compare model benchmarks</span>
            </div>
            <div class="header-right">
                <span class="version-badge">beta</span>
            </div>
        </div>
    """)

    with gr.Tabs():
        # === TAB 1: Leaderboard View ===
        with gr.TabItem("📊 Leaderboards"):
            with gr.Row(elem_classes="controls-bar"):
                initial_choices = get_available_leaderboards()
                initial_value = initial_choices[0] if initial_choices else None
                with gr.Column(scale=2, min_width=200):
                    leaderboard_selector = gr.Dropdown(
                        choices=initial_choices,
                        value=initial_value,
                        label="Leaderboard",
                        interactive=True,
                    )
                with gr.Column(scale=3, min_width=250):
                    search_box = gr.Textbox(
                        label="Filter",
                        placeholder="Filter models...",
                        show_label=True,
                    )
                with gr.Column(scale=1, min_width=100):
                    refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")

            init_df, init_header, init_metrics, init_page_dropdown, init_prev, init_next, init_sort_cols, init_page_info, init_column_selector = update_leaderboard_table(initial_value, "", 1, "Average", None)

            header_view = gr.HTML(value=init_header)

            # Hidden sort state (defaults to Average)
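            # gr.update() returns a plain dict of component kwargs (a Gradio
            # implementation detail), so the initial choices/value are unpacked
            # with isinstance(..., dict) checks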
            sort_column_dropdown = gr.Dropdown(
                choices=init_sort_cols.get("choices", []) if isinstance(init_sort_cols, dict) else [],
                value=init_sort_cols.get("value") if isinstance(init_sort_cols, dict) else None,
                visible=False,
            )

            # Column selector
            with gr.Row(elem_classes="controls-bar"):
                column_selector = gr.CheckboxGroup(
                    choices=init_column_selector.get("choices", []) if isinstance(init_column_selector, dict) else [],
                    value=init_column_selector.get("value", []) if isinstance(init_column_selector, dict) else [],
                    label="Columns to Display",
                    interactive=True,
                    show_label=True,
                )

            leaderboard_table = gr.Dataframe(
                value=init_df,
                label=None,
                interactive=False,
                wrap=False,
                elem_classes="dataframe",
            )

            # Pagination below the table, centered
            with gr.Row(elem_classes="pagination-bar"):
                prev_btn = gr.Button("←", variant="secondary", size="sm", min_width=60)
                page_info = gr.Markdown(value=init_page_info, elem_classes="page-info")
                next_btn = gr.Button("→", variant="secondary", size="sm", min_width=60)

            # Extract choices and value from the gr.update() dict, ensuring the value is in choices
            if isinstance(init_page_dropdown, dict):
                page_choices = init_page_dropdown.get("choices", ["1"])
                page_value = str(init_page_dropdown.get("value", "1")) if init_page_dropdown.get("value") is not None else "1"
                # Make sure the value exists in choices
                if page_value not in page_choices:
                    page_value = page_choices[0] if page_choices else "1"
                if not page_choices:
                    page_choices = ["1"]
            else:
                page_choices = ["1"]
                page_value = "1"
            page_dropdown = gr.Dropdown(
                choices=page_choices,
                value=page_value,
                visible=False,
            )

            metrics_view = gr.HTML(value=init_metrics)

        # === TAB 2: Model View ===
        with gr.TabItem("🔍 Model Lookup"):
            gr.Markdown("### Find and compare models across all leaderboards")

            selected_models_state = gr.State(value=[])
            default_compare_html = """
            <div class="no-results">
                <h3>Search for models to compare</h3>
                <p>Type in the dropdown above, then click a model to add it</p>
            </div>
            """

            with gr.Row(elem_classes="controls-bar"):
                with gr.Column(scale=4):
                    all_models = get_all_model_names()
                    model_dropdown = gr.Dropdown(
                        choices=all_models,
                        label="Search models to add",
                        interactive=True,
                        allow_custom_value=False,
                        filterable=True,
                    )
                with gr.Column(scale=1, min_width=100):
                    clear_models_btn = gr.Button("Clear All", variant="secondary", size="sm")

            selected_models_group = gr.CheckboxGroup(
                choices=[],
                value=[],
                label="Selected Models (click to remove)",
                interactive=True,
                elem_classes="selected-models-group",
            )

            model_card_view = gr.HTML(value=default_compare_html)

            # Submission guide
            with gr.Accordion("📤 How to Submit Data", open=False):
                gr.Markdown("""
                **Submit via GitHub Pull Request:**

                1. Fork [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
                2. Add JSON files to `data/<leaderboard>/<developer>/<model>/`
                3. Open a PR — automated validation runs on submission
                4. After merge, data syncs to HuggingFace automatically

                [Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) · [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
                """)

    # === State ===
    current_page_state = gr.State(value=1)
    sort_column_state = gr.State(value="Average")

    def go_prev(current):
        return max(1, current - 1)

    def go_next(current):
        return current + 1

    def reset_page():
        return 1

    def update_table_only(selected_leaderboard, search_query, current_page, sort_column, selected_columns):
        """Update the table without modifying the column selector (used when columns change)."""
        result = update_leaderboard_table(selected_leaderboard, search_query, current_page, sort_column, selected_columns)
        # Return all outputs except the last one (column_selector)
        return result[:-1]

    # === Leaderboard Events ===
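    # Each interaction runs as a chain: update the relevant state first (page,
    # sort column, column selection), then rebuild the table in a `.then()`
    # step, which Gradio executes after the previous handler completes.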
    leaderboard_selector.change(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=lambda: "Average", outputs=[sort_column_state]
    ).then(
        fn=lambda: None, outputs=[column_selector]
    ).then(
        fn=update_leaderboard_table,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector],
    )

    search_box.input(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    sort_column_dropdown.change(
        fn=lambda col: col,
        inputs=[sort_column_dropdown],
        outputs=[sort_column_state],
    ).then(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    column_selector.change(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    page_dropdown.change(
        fn=lambda p: int(p) if p else 1,
        inputs=[page_dropdown],
        outputs=[current_page_state],
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    prev_btn.click(
        fn=go_prev, inputs=[current_page_state], outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )

    next_btn.click(
        fn=go_next, inputs=[current_page_state], outputs=[current_page_state]
    ).then(
        fn=update_table_only,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info],
    )
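
    # Refresh: clear the cached dataset before re-reading the leaderboard list,
    # so the dropdown choices and the rebuilt table both reflect newly synced data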
    refresh_btn.click(
        fn=clear_cache
    ).then(
        fn=lambda: gr.Dropdown(choices=get_available_leaderboards()),
        outputs=[leaderboard_selector],
    ).then(
        fn=reset_page, outputs=[current_page_state]
    ).then(
        fn=lambda: "Average", outputs=[sort_column_state]
    ).then(
        fn=lambda: None, outputs=[column_selector]
    ).then(
        fn=update_leaderboard_table,
        inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
        outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector],
    )

    # === Model Search Events ===
    def add_model_and_compare(selected_model, current_selected):
        """Add a model to the selection and auto-compare."""
        if not selected_model:
            comparison_html = compare_models(current_selected) if current_selected else default_compare_html
            return (
                current_selected,
                gr.update(value=None),
                gr.update(choices=current_selected, value=current_selected),
                comparison_html,
            )
        if current_selected is None:
            current_selected = []
        if selected_model not in current_selected:
            current_selected = current_selected + [selected_model]
        comparison_html = compare_models(current_selected)
        return (
            current_selected,
            gr.update(value=None),
            gr.update(choices=current_selected, value=current_selected),
            comparison_html,
        )

    def update_selection(selected_list):
        """Update the selection when checkboxes change."""
        selected_list = selected_list or []
        comparison_html = compare_models(selected_list) if selected_list else default_compare_html
        return selected_list, comparison_html

    def clear_all_models():
        """Clear all selected models."""
        return (
            [],
            gr.update(value=None),
            gr.update(choices=[], value=[]),
            default_compare_html,
        )

    # Selecting from the dropdown adds the model and auto-compares
    model_dropdown.select(
        fn=add_model_and_compare,
        inputs=[model_dropdown, selected_models_state],
        outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view],
    )

    selected_models_group.change(
        fn=update_selection,
        inputs=[selected_models_group],
        outputs=[selected_models_state, model_card_view],
    )

    clear_models_btn.click(
        fn=clear_all_models,
        outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view],
    )

DATA_DIR.mkdir(exist_ok=True)

if __name__ == "__main__":
    demo.launch()
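
# Local run (assumption: this file is the Space entrypoint, e.g. app.py, with
# data_loader.py and ui_components.py importable alongside it):
#   python app.py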