| from datetime import datetime, timezone |
| import json |
| import os |
| from typing import Optional |
|
|
| import gradio as gr |
| import pandas as pd |
| from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns |
|
|
| from src.about import ( |
| CITATION_BUTTON_LABEL, |
| CITATION_BUTTON_TEXT, |
| INTRODUCTION_TEXT, |
| LLM_BENCHMARKS_TEXT, |
| TITLE, |
| ) |
| from src.display.css_html_js import custom_css |
| from src.envs import RESULTS_PATH, SUBMISSIONS_PATH |
| from src.leaderboard.load_results import ( |
| ResultsValidationError, |
| build_dataframe, |
| load_records, |
| validate_records, |
| ) |
| from src.leaderboard.schema import SCHEMA |
|
|
|
|
def load_leaderboard_data() -> tuple[pd.DataFrame, list[str], Optional[str]]:
    """Load and validate the results file once at startup.

    Returns ``(dataframe, column_order, error)`` where ``error`` is ``None``
    on success. On validation failure an empty frame with only the
    schema-required columns is returned so the UI can still render.
    """
    try:
        frame, columns = build_dataframe(load_records(RESULTS_PATH))
    except ResultsValidationError as exc:
        columns = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
        return pd.DataFrame(columns=columns), columns, str(exc)
    return frame, columns, None
|
|
|
|
| LEADERBOARD_DF, COLUMN_ORDER, LOAD_ERROR = load_leaderboard_data() |
|
|
# Maps each dataset's display name to the column-name prefix used in the
# results files (e.g. "Causal Chambers" metrics arrive as "CausalChambers_*").
DATASET_PREFIX_MAP = {
    "FreshRetailNet": "FreshRetailNet",
    "PSML": "PSML",
    "Causal Chambers": "CausalChambers",
    "MIMIC": "MIMIC",
}
# Derive the display list from the map keys (insertion-ordered) so the two
# constants can never drift apart.
DATASET_DISPLAY_NAMES = list(DATASET_PREFIX_MAP)
# Per-dataset metric columns are recognized by these "Prefix_" markers.
DATASET_PREFIXES = [f"{prefix}_" for prefix in DATASET_PREFIX_MAP.values()]
|
|
|
|
def is_dataset_metric(column: str) -> bool:
    """Return True when *column* carries one of the known dataset prefixes."""
    for prefix in DATASET_PREFIXES:
        if column.startswith(prefix):
            return True
    return False
|
|
|
|
# Columns every view shows: model identity fields plus always-required metrics.
BASE_COLUMNS = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
# All per-dataset metric columns, preserving the order build_dataframe produced.
ALL_DATASET_COLUMNS = [c for c in COLUMN_ORDER if is_dataset_metric(c)]


# Aggregate/forecasting metrics shown on the main leaderboard tab.
# NOTE(review): hand-maintained list — confirm it stays in sync with the
# metrics emitted by src.leaderboard.schema / build_dataframe.
AGGREGATE_FORECAST_COLUMNS = [
    "overall_mcq_acc",
    "T2_MAE",
    "T2_sMAPE",
    "T4_MAE",
    "T4_sMAPE",
    "MIMIC_T2_OW_sMAPE",
    "MIMIC_T2_OW_RMSSE",
    "MIMIC_T4_OW_sMAPE",
    "MIMIC_T4_OW_RMSSE",
]
# Only keep aggregate columns that actually exist in the loaded data, so a
# partial results file does not produce empty columns.
AGGREGATE_COLUMNS = BASE_COLUMNS + [
    c for c in AGGREGATE_FORECAST_COLUMNS if c in COLUMN_ORDER
]


# Full column set; presumably for an "all metrics" view — not referenced in
# this chunk, verify usage elsewhere before removing.
DISPLAY_ALL_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
BY_DOMAIN_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
# Cap for the "By Domain" tab so the table stays renderable in the browser.
BY_DOMAIN_MAX_COLUMNS = 40
|
|
|
|
def column_types(column_order: list[str]) -> list[str]:
    """Map each column name to the datatype label the Leaderboard widget expects.

    Identity fields render as strings; every metric column is numeric.
    """
    return [
        "str" if col in SCHEMA.identity_fields else "number"
        for col in column_order
    ]
|
|
|
|
def init_leaderboard(dataframe, column_order):
    """Build a read-only Leaderboard widget restricted to *column_order*.

    A None/empty dataframe is replaced with an empty frame so the widget
    still renders its headers; extra columns are dropped and missing ones
    added (as NaN) via reindex.
    """
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=column_order)
    dataframe = dataframe.reindex(columns=column_order)

    # Identity + required-metric columns may never be hidden by the user.
    locked = [
        col
        for col in list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
        if col in column_order
    ]
    # Free-text search is offered only over the name columns that exist.
    searchable = [col for col in ["model_name", "agent_name"] if col in column_order]

    return Leaderboard(
        value=dataframe,
        datatype=column_types(column_order),
        select_columns=SelectColumns(
            default_selection=column_order,
            cant_deselect=locked,
            label="Select Columns to Display:",
        ),
        search_columns=searchable,
        filter_columns=[
            ColumnFilter("agent_type", type="checkboxgroup", label="Agent type"),
        ],
        interactive=False,
    )
|
|
|
|
| |
|
|
|
|
def save_submission(uploaded_file) -> str:
    """Validate an uploaded results file and persist it for manual review.

    Returns a markdown status string for the UI. Validation problems are
    reported back to the submitter instead of raising.
    """
    if uploaded_file is None:
        return "Please upload a results file."

    # gr.File may hand us a tempfile-like object (with .name) or a plain path.
    file_path = getattr(uploaded_file, "name", str(uploaded_file))

    try:
        records = load_records(file_path)
        validate_records(records)
    except ResultsValidationError as exc:
        return f"**Validation error:** {exc}"

    os.makedirs(SUBMISSIONS_PATH, exist_ok=True)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = os.path.join(SUBMISSIONS_PATH, f"submission_{timestamp}.json")
    # BUG FIX: the timestamp is second-granular, so two submissions within the
    # same second previously overwrote each other; append a counter until the
    # name is free. (A filesystem race is still theoretically possible but
    # acceptable for a manually-reviewed queue.)
    suffix = 1
    while os.path.exists(out_path):
        out_path = os.path.join(
            SUBMISSIONS_PATH, f"submission_{timestamp}_{suffix}.json"
        )
        suffix += 1

    payload = {
        "submitted_at": timestamp,
        "source_filename": os.path.basename(file_path),
        "records": records,
    }
    # Explicit encoding + ensure_ascii=False keeps non-ASCII model/author
    # names readable in the stored JSON.
    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(payload, fp, indent=2, ensure_ascii=False)

    return f"Submission received for review. Saved to `{out_path}`."
|
|
|
|
def build_example_record_markdown() -> str:
    """Render the first results record as a fenced-JSON markdown snippet.

    Best-effort: any failure yields an explanatory message so the Submit tab
    still renders.
    """
    try:
        records = load_records(RESULTS_PATH)
        if not records:
            return "No example data available."
        rendered = json.dumps(records[0], indent=2)
        return f"Example record (JSON):\n```json\n{rendered}\n```"
    except Exception as exc:
        return f"Could not load example record: {exc}"
|
|
|
|
| EXAMPLE_RECORD_MD = build_example_record_markdown() |
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks(css=custom_css, analytics_enabled=False) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # Surface startup data-loading problems prominently instead of showing an
    # inexplicably empty leaderboard.
    if LOAD_ERROR:
        gr.Markdown(f"**Data validation error:** {LOAD_ERROR}", elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # BUG FIX: the original tab label contained a mis-encoded emoji whose
        # trailing byte (0x85, NEL) split the string literal across two lines
        # — a SyntaxError. Reconstructed as the medal emoji; confirm against
        # the pre-garbling original.
        with gr.TabItem("🏅 Leaderboard", elem_id="tab-leaderboard", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF, AGGREGATE_COLUMNS)

        with gr.TabItem("π§ By Domain", elem_id="tab-by-domain", id=1):
            # Cap the column count so the per-domain table stays renderable.
            by_domain_columns = BY_DOMAIN_COLUMNS[:BY_DOMAIN_MAX_COLUMNS]
            by_domain_df = LEADERBOARD_DF.reindex(columns=by_domain_columns)
            init_leaderboard(by_domain_df, by_domain_columns)

        with gr.TabItem("π€ Submit Results", elem_id="tab-submit", id=2):
            gr.Markdown(
                (
                    "Upload submission files for manual review.\n\n"
                    "Required files:\n"
                    "1. `results_on_dev_dataset.json`: task-level metrics in leaderboard format.\n"
                    "2. `results_on_test_dataset.json`: per-example test outputs with at least "
                    "`id`, `tier`, `source_dataset`, `label`, and `output` "
                    "(required when the sample contains forecasting).\n\n"
                    "Please also include model architecture code and LLM/system details for verification."
                ),
                elem_classes="markdown-text",
            )
            gr.Markdown(EXAMPLE_RECORD_MD, elem_classes="markdown-text")
            # NOTE(review): the widget accepts .zip/.rar archives, but
            # save_submission parses the upload directly with load_records —
            # confirm load_records handles archives, or restrict file_types.
            submission_file = gr.File(
                label="Submission package (.zip or .rar)",
                file_types=[".zip", ".rar"],
            )
            submit_button = gr.Button("Submit for Review")
            submission_status = gr.Markdown()
            submit_button.click(save_submission, [submission_file], submission_status)

        with gr.TabItem("π About", elem_id="tab-about", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
            gr.Markdown(f"## Citation\n{CITATION_BUTTON_LABEL}", elem_classes="markdown-text")
            gr.Markdown(f"```bibtex\n{CITATION_BUTTON_TEXT.strip()}\n```", elem_classes="markdown-text")


demo.launch()
|
|