Spaces:

AssistantBench
/

leaderboard

Running

App Files Files Community

Ori committed on Jul 13

Commit

12ca829

•

1 Parent(s): c0479ad

Update app.py

Browse files

Files changed (1) hide show

app.py +221 -181

app.py CHANGED Viewed

@@ -1,204 +1,244 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
 def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
-    )
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
                 lines=20,
                 elem_id="citation-button",
-                show_copy_button=True,
             )
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()

+import os
+import json
+import datetime
+from email.utils import parseaddr
 import gradio as gr
 import pandas as pd
+from datasets import load_dataset
 from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi
+from content import format_error, format_warning, format_log, TITLE
+# Placeholder question_scorer: a plain equality check between prediction and gold answer
+def question_scorer(prediction, gold_answer):
+    """Return 1 when ``prediction`` equals ``gold_answer``, otherwise 0."""
+    if prediction == gold_answer:
+        return 1
+    return 0
+# Constants and Configuration
+# Hub token is read from the environment; it stays None when the variable is unset.
+TOKEN = os.environ.get("TOKEN")
+OWNER = "Ori"
+YEAR_VERSION = "2024"
+DATA_DATASET = "Ori/AssistantBench_V1.0"
+RESULTS_DATASET = "Ori/results"
+SUBMISSION_DATASET = f"{OWNER}/submissions"
+LEADERBOARD_PATH = f"{OWNER}/leaderboard"
+api = HfApi()
+# Local directory that receives the scored copy of each submission.
+os.makedirs("scored", exist_ok=True)
+# Load datasets
+eval_results = load_dataset(
+    RESULTS_DATASET,
+    token=TOKEN,
+    download_mode="force_redownload",
+    ignore_verifications=True,
+    trust_remote_code=True,
+)
+gold_results = load_dataset(DATA_DATASET, token=TOKEN, trust_remote_code=True)
+# Map split name -> {task id -> gold answer}; only the "test" split is indexed.
+gold_answers = {"test": {row["id"]: row["answer"] for row in gold_results["test"]}}
+# Build the leaderboard table for one split of the results dataset
+def get_dataframe_from_results(eval_results, split):
+    """Return ``eval_results[split]`` as a DataFrame sorted by descending "Accuracy".
+
+    Every column whose name contains "score" is multiplied by 100 and rounded
+    to two decimals; all other columns are returned as loaded.
+    """
+    split_data = eval_results[split]
+    table = pd.DataFrame(split_data).sort_values(by=["Accuracy"], ascending=False)
+    score_columns = [name for name in split_data.column_names if "score" in name]
+    table[score_columns] = table[score_columns].multiply(100).round(decimals=2)
+    return table
+# Leaderboard table for the "test" split, built once at import time from the loaded results.
+eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
+# Restart hook for the Space
 def restart_space():
+    """Ask the Hub API to restart the Space identified by ``LEADERBOARD_PATH``."""
+    api.restart_space(token=TOKEN, repo_id=LEADERBOARD_PATH)
+# Column datatypes handed to the leaderboard gr.Dataframe.
+# NOTE(review): seven entries here, while add_new_eval writes nine fields per row — confirm the mapping.
+TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]
+# Function to add a new evaluation
+def add_new_eval(
+        model_name: str,
+        model_family: str,
+        url: str,
+        path_to_file: str,
+        organization: str,
+        mail: str,
+):
+    """Validate, score and record a submitted predictions file.
+
+    The raw file is uploaded to ``SUBMISSION_DATASET``, each JSONL line is scored
+    against ``gold_answers["test"]`` with ``question_scorer``, the scored copy is
+    uploaded as well, and one summary row is appended to the results dataset and
+    pushed to ``RESULTS_DATASET``.
+
+    Args:
+        model_name: Display name of the submitted model.
+        model_family: Model family, stored as-is in the results row.
+        url: Link to model information, stored as-is in the results row.
+        path_to_file: Uploaded file object from the form (its ``.name`` attribute
+            is used as the local path), or None when nothing was attached.
+        organization: Submitting organization.
+        mail: Contact e-mail; only checked for validity here.
+
+    Returns:
+        The result of ``format_warning``/``format_error`` when the submission is
+        rejected, otherwise the result of ``format_log`` with a success message.
+    """
+    _, parsed_mail = parseaddr(mail)
+    if "@" not in parsed_mail:
+        return format_warning("Please provide a valid email address.")
+    print("Adding new eval")
+    # A submission is a duplicate only when the same (model, organization) pair
+    # already exists on one row. Testing the two columns independently would
+    # reject a new model from any organization that shares a name with another
+    # organization's model.
+    submitted = eval_results["test"]
+    known_pairs = {
+        (known_model.lower(), known_org.lower())
+        for known_model, known_org in zip(submitted["Model Name"], submitted["Organization"])
+    }
+    if (model_name.lower(), organization.lower()) in known_pairs:
+        return format_warning("This model has already been submitted.")
+    if path_to_file is None:
+        return format_warning("Please attach a file.")
+    file_path = path_to_file.name
+    # One timestamp for both uploads so the raw and scored files can be matched.
+    submitted_at = datetime.datetime.today()
+    api.upload_file(
+        repo_id=SUBMISSION_DATASET,
+        path_or_fileobj=file_path,
+        path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_raw_{submitted_at}.jsonl",
+        repo_type="dataset",
+        token=TOKEN
+    )
+    # The local file name is built from user-supplied text: replace anything that
+    # is not alphanumeric, "-", "_" or "." so that names such as "org/model" or
+    # "../x" cannot point outside (or into a missing sub-directory of) "scored/".
+    safe_name = "".join(
+        ch if ch.isalnum() or ch in "-_." else "_"
+        for ch in f"{organization}_{model_name}"
+    )
+    scored_path = f"scored/{safe_name}.jsonl"
+    scores = 0
+    num_questions = 0
+    with open(scored_path, "w", encoding="utf-8") as scored_file, \
+            open(file_path, "r", encoding="utf-8") as f:
+        for ix, line in enumerate(f):
+            # Blank lines carry no prediction; skip them instead of rejecting the file.
+            if not line.strip():
+                continue
+            try:
+                task = json.loads(line)
+            except ValueError:
+                return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
+            if not isinstance(task, dict) or "answer" not in task:
+                return format_error(
+                    f"Line {ix} contains no answer key. Please fix it and resubmit your file.")
+            if "id" not in task:
+                return format_error(
+                    f"Line {ix} contains no id key. Please fix it and resubmit your file.")
+            answer = task["answer"]
+            task_id = task["id"]
+            if task_id not in gold_answers["test"]:
+                return format_error(
+                    f"{task_id} not found in test set. Are you sure you submitted the correct file?")
+            score = question_scorer(answer, gold_answers["test"][task_id])
+            scored_file.write(
+                json.dumps({
+                    "id": task_id,
+                    "model_answer": answer,
+                    "score": score
+                }) + "\n"
+            )
+            scores += score
+            num_questions += 1
+    api.upload_file(
+        repo_id=SUBMISSION_DATASET,
+        path_or_fileobj=scored_path,
+        path_in_repo=f"{organization}/{model_name}/{YEAR_VERSION}_test_scored_{submitted_at}.jsonl",
+        repo_type="dataset",
+        token=TOKEN
+    )
+    # NOTE(review): "Accuracy", "Answer rate" and "Precision" all use the same
+    # formula and "EM" is the raw score sum — these look like placeholders; confirm.
+    mean_score = scores / num_questions if num_questions > 0 else 0
+    eval_entry = {
+        "Model Name": model_name,
+        "Model Family": model_family,
+        "URL": url,
+        "Organization": organization,
+        "Accuracy": mean_score,
+        "Answer rate": mean_score,
+        "Precision": mean_score,
+        "EM": scores if num_questions > 0 else 0,
+        "Cost": 0,  # Placeholder for cost, update with actual value if needed
+    }
+    eval_results["test"] = eval_results["test"].add_item(eval_entry)
+    eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
+    return format_log(
+        f"Model {model_name} submitted by {organization} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
+# Reload the results for the Refresh button
+def refresh():
+    """Re-download the results dataset and return the rebuilt "test" leaderboard table."""
+    latest_results = load_dataset(
+        RESULTS_DATASET,
+        YEAR_VERSION,
+        token=TOKEN,
+        download_mode="force_redownload",
+        ignore_verifications=True,
+        trust_remote_code=True,
+    )
+    return get_dataframe_from_results(eval_results=latest_results, split="test")
+# Gradio interface
+# Page layout, top to bottom: title and intro text, the leaderboard table with
+# a Refresh button, the submission form, the citation box and an acknowledgement.
+demo = gr.Blocks()
+with demo:
+    gr.HTML("<h1>AssistantBench</h1>")
+    gr.Markdown("""
+        AssistantBench aims to evaluate the ability of web agents to assist with real and time-consuming tasks.
+        For more information, please check out our paper or the official website.
+        To download AssistantBench, press [here](https://huggingface.co/datasets/Ori/AssistantBench_V1.0).
+    """)
+    gr.HTML("<h2>AssistantBench Leaderboard</h2>")
+    # Read-only table showing the dataframe that was built at import time.
+    with gr.Tab("Results: Test"):
+        leaderboard_table_test = gr.Dataframe(
+            value=eval_dataframe_test, datatype=TYPES, interactive=False,
+            column_widths=["20%"]
+        )
+    # Clicking Refresh calls refresh() and writes its return value into the table.
+    refresh_button = gr.Button("Refresh")
+    refresh_button.click(
+        refresh,
+        inputs=[],
+        outputs=[
+            leaderboard_table_test,
+        ],
+    )
+    gr.HTML("<h2>Making a New Submission</h2>")
+    # Submission form: instructions, metadata fields and the predictions file.
+    with gr.Accordion("Submit a new model for evaluation"):
+        with gr.Row():
+            gr.Markdown("""
+                To make a new submission, upload a predictions file. We support JSONL files with the following format:
+                ```
+                {"id": "task_id_1", "answer": "Answer 1 from your model"}
+                {"id": "task_id_2", "answer": "Answer 2 from your model"}
+                ```
+                Our scoring function can be found [here](https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/scorer.py).
+            """)
+        with gr.Row():
             with gr.Column():
+                model_name_textbox = gr.Textbox(label="Model Name")
+                model_family_textbox = gr.Textbox(label="Model Family")
+                url_textbox = gr.Textbox(label="URL to Model Information")
+            with gr.Column():
+                organization = gr.Textbox(label="Organization")
+                mail = gr.Textbox(
+                    label="Contact Email (will be stored privately & used if there is an issue with your submission)")
+                file_output = gr.File()
+        submit_button = gr.Button("Submit Eval")
+        submission_result = gr.Markdown()
+        # The input components are listed in the same order as add_new_eval's
+        # parameters (model_name, model_family, url, path_to_file, organization, mail);
+        # the function's return value is rendered in submission_result.
+        submit_button.click(
+            add_new_eval,
+            [
+                model_name_textbox,
+                model_family_textbox,
+                url_textbox,
+                file_output,
+                organization,
+                mail
+            ],
+            submission_result,
+        )
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
+            # BibTeX entry shown in the citation box; the eprint field is still a TODO.
+            citation_text = """@article{yoran-etal-2023-assistantbench,
+    title={AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?},
+    author={Ori Yoran and Samuel Amouyal and Chaitanya Malaviya and Ben Bogin and Ofir Press and Jonathan Berant},
+    year={2024},
+    eprint={TODO},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}"""
             citation_button = gr.Textbox(
+                value=citation_text,
+                label="Citation",
                 lines=20,
                 elem_id="citation-button",
+                show_copy_button=True
             )
+    gr.HTML(
+        "<p>We would like to thank the GAIA team on which this leaderboard is based on their template and HuggingFace for hosting the leaderboard.</p>")
+# Background job: call restart_space on a fixed 3600-second interval.
 scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
+# Start the Gradio app; debug=True is passed through to launch().
+demo.launch(debug=True)