胥基 committed on
Commit 193b86e · 1 Parent(s): fefeca2

copy gaia-leaderboard

Files changed (6)
  1. .gitattributes +0 -1
  2. README.md +9 -7
  3. app.py +254 -0
  4. content.py +47 -0
  5. requirements.txt +5 -0
  6. scorer.py +101 -0
.gitattributes CHANGED
@@ -25,7 +25,6 @@
  *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
  *.wasm filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,15 @@
  ---
- title: Agent Ctf Leaderboard
- emoji: 📊
- colorFrom: purple
- colorTo: blue
+ title: GAIA Leaderboard
+ emoji: 🦾
+ colorFrom: yellow
+ colorTo: indigo
  sdk: gradio
- sdk_version: 4.24.0
+ sdk_version: 4.3.0
  app_file: app.py
- pinned: false
- license: mit
+ pinned: true
+ license: apache-2.0
+ tags:
+ - leaderboard
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,254 @@
import os
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink

TOKEN = os.environ.get("TOKEN", None)

OWNER = "gaia-benchmark"
DATA_DATASET = f"{OWNER}/GAIA"
INTERNAL_DATA_DATASET = f"{OWNER}/GAIA_internal"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
CONTACT_DATASET = f"{OWNER}/contact_info"
RESULTS_DATASET = f"{OWNER}/results_public"
LEADERBOARD_PATH = f"{OWNER}/leaderboard"
api = HfApi()

YEAR_VERSION = "2023"

os.makedirs("scored", exist_ok=True)

# Display the results
eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)

def get_dataframe_from_results(eval_results, split):
    local_df = eval_results[split]
    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["system_prompt", "url"])
    local_df = local_df.rename_column("model", "Model name")
    local_df = local_df.rename_column("model_family", "Model family")
    local_df = local_df.rename_column("score", "Average score (%)")
    for i in [1, 2, 3]:
        local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
    df = pd.DataFrame(local_df)
    df = df.sort_values(by=["Average score (%)"], ascending=False)

    numeric_cols = [c for c in local_df.column_names if "score" in c]
    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
    #df = df.style.format("{:.2%}", subset=numeric_cols)

    return df

eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")

# Gold answers
gold_results = {}
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", token=TOKEN)
gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}


def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]

def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    system_prompt: str,
    url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
):
    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")

    # Check if the model/organisation combination already exists and warn if so
    if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
        return format_warning("This model has already been submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Save submitted file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    # Compute score
    file_path = path_to_file.name
    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
        with open(file_path, 'r') as f:
            for ix, line in enumerate(f):
                try:
                    task = json.loads(line)
                except Exception:
                    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")

                if "model_answer" not in task:
                    return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
                answer = task["model_answer"]
                task_id = task["task_id"]
                try:
                    level = int(gold_results[val_or_test][task_id]["Level"])
                except KeyError:
                    return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")

                score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])

                scored_file.write(
                    json.dumps({
                        "id": task_id,
                        "model_answer": answer,
                        "score": score,
                        "level": level
                    }) + "\n"
                )

                scores["all"] += score
                scores[level] += score
                num_questions["all"] += 1
                num_questions[level] += 1

    # Save scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    # Actual submission
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "system_prompt": system_prompt,
        "url": url,
        "organisation": organisation,
        "score": scores["all"] / num_questions["all"],
        "score_level1": scores[1] / num_questions[1],
        "score_level2": scores[2] / num_questions[2],
        "score_level3": scores[3] / num_questions[3],
    }
    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
    print(eval_results)
    eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)

    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "mail": mail,
    }
    contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
    contact_infos.push_to_hub(CONTACT_DATASET, config_name=YEAR_VERSION, token=TOKEN)

    return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed.")


def refresh():
    eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
    eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
    return eval_dataframe_val, eval_dataframe_test

def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            ) #.style(show_copy_button=True)

    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )
    with gr.Tab("Results: Validation"):
        leaderboard_table_val = gr.components.Dataframe(
            value=eval_dataframe_val, datatype=TYPES, interactive=False,
            column_widths=["20%"]
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_val,
            leaderboard_table_test,
        ],
    )
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                model_name_textbox = gr.Textbox(label="Model name")
                model_family_textbox = gr.Textbox(label="Model family")
                system_prompt_textbox = gr.Textbox(label="System prompt example")
                url_textbox = gr.Textbox(label="Url to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                model_name_textbox,
                model_family_textbox,
                system_prompt_textbox,
                url_textbox,
                file_output,
                organisation,
                mail
            ],
            submission_result,
        )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True)
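Note on scoring: `add_new_eval` accumulates per-level totals, stores raw fractions (0–1) in the results dataset, and `get_dataframe_from_results` multiplies them by 100 for display. A minimal sketch of that aggregation, using hypothetical scored entries (not real GAIA data), looks like this:

```python
# Hypothetical scored entries, mirroring the "scored" JSONL lines written above.
scored_entries = [
    {"id": "task-a", "score": 1, "level": 1},
    {"id": "task-b", "score": 0, "level": 2},
    {"id": "task-c", "score": 1, "level": 2},
]

scores = {"all": 0, 1: 0, 2: 0, 3: 0}
num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
for entry in scored_entries:
    scores["all"] += entry["score"]
    scores[entry["level"]] += entry["score"]
    num_questions["all"] += 1
    num_questions[entry["level"]] += 1

# Fractions in [0, 1]; the leaderboard multiplies by 100 and rounds to 2 decimals.
average = scores["all"] / num_questions["all"]          # 2/3
level_2 = scores[2] / num_questions[2]                  # 1/2
print(round(average * 100, 2), round(level_2 * 100, 2)) # 66.67 50.0
```

Note that the real function divides by `num_questions[level]` for every level, so it implicitly assumes each level appears at least once in the submitted file.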
content.py ADDED
@@ -0,0 +1,47 @@
TITLE = """<h1 align="center" id="space-title">GAIA Leaderboard</h1>"""

INTRODUCTION_TEXT = """
GAIA is a benchmark which aims at evaluating next-generation LLMs (LLMs with augmented capabilities due to added tooling, efficient prompting, access to search, etc.). (See our [paper](https://arxiv.org/abs/2311.12983) for more details.)

## Data
GAIA is made of more than 450 non-trivial questions with an unambiguous answer, requiring different levels of tooling and autonomy to solve.
It is therefore divided into 3 levels, where level 1 should be breakable by very good LLMs, and level 3 indicates a strong jump in model capabilities. Each level is divided into a fully public dev set for validation, and a test set with private answers and metadata.

GAIA data can be found in [this dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA). Questions are contained in `metadata.jsonl`. Some questions come with an additional file, which can be found in the same folder and whose id is given in the field `file_name`.

## Submissions
Results can be submitted for both validation and test. Scores are expressed as the percentage of correct answers for a given split.

We expect submissions to be JSON-line files with the following format. The first two fields are mandatory, `reasoning_trace` is optional:
```
{"task_id": "task_id_1", "model_answer": "Answer 1 from your model", "reasoning_trace": "The different steps by which your model reached answer 1"}
{"task_id": "task_id_2", "model_answer": "Answer 2 from your model", "reasoning_trace": "The different steps by which your model reached answer 2"}
```
Submissions made by our team are labelled "GAIA authors". While we report average scores over different runs when possible in our paper, we only report the best run in the leaderboard.

**Please do not repost the public dev set, nor use it in training data for your models.**
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{mialon2023gaia,
    title={GAIA: a benchmark for General AI Assistants},
    author={Grégoire Mialon and Clémentine Fourrier and Craig Swift and Thomas Wolf and Yann LeCun and Thomas Scialom},
    year={2023},
    eprint={2311.12983},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}"""


def format_error(msg):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"

def format_warning(msg):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"

def format_log(msg):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"

def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
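For reference, a submission file matching the format described in `INTRODUCTION_TEXT` can be produced with a few lines of Python. This is only an illustrative sketch: the output file name and the `predictions` dict below are hypothetical placeholders, not real GAIA answers.

```python
import json

# Hypothetical predictions keyed by GAIA task_id (placeholders only).
predictions = {
    "task_id_1": ("Answer 1 from your model", "Steps by which your model reached answer 1"),
    "task_id_2": ("Answer 2 from your model", "Steps by which your model reached answer 2"),
}

# Write one JSON object per line; "task_id" and "model_answer" are mandatory,
# "reasoning_trace" is optional.
with open("my_submission.jsonl", "w") as f:
    for task_id, (answer, trace) in predictions.items():
        f.write(json.dumps({
            "task_id": task_id,
            "model_answer": answer,
            "reasoning_trace": trace,
        }) + "\n")
```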
requirements.txt ADDED
@@ -0,0 +1,5 @@
datasets==2.14.5
gradio==4.3.0
huggingface-hub==0.18.0
numpy==1.24.2
APScheduler==3.10.1
scorer.py ADDED
@@ -0,0 +1,101 @@
import json
import re
import string
import warnings

import numpy as np


def normalize_number_str(number_str: str) -> float:
    # we replace these common units and commas to allow
    # conversion to float
    for char in ["$", "%", ","]:
        number_str = number_str.replace(char, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    pattern = f"[{''.join(char_list)}]"
    return re.split(pattern, s)


def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    def is_float(element: any) -> bool:
        try:
            float(element)
            return True
        except ValueError:
            return False

    # if gt is a number
    if is_float(ground_truth):
        print(f"Evaluating {model_answer} as a number.")
        normalized_answer = normalize_number_str(model_answer)
        return normalized_answer == float(ground_truth)

    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        print(f"Evaluating {model_answer} as a comma separated list.")
        # question with the fish: normalization removes punct

        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn(
                "Answer lists have different lengths, returning False.", UserWarning
            )
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False)
                    == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)

    # if gt is a str
    else:
        print(f"Evaluating {model_answer} as a string.")
        return normalize_str(model_answer) == normalize_str(ground_truth)


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Remove all white spaces. Required e.g. for "seagull" vs. "sea gull"
    no_spaces = re.sub(r"\s", "", input_str)

    # Remove punctuation, if specified.
    if remove_punct:
        translator = str.maketrans("", "", string.punctuation)
        return no_spaces.lower().translate(translator)
    else:
        return no_spaces.lower()
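A quick illustration of how `question_scorer` behaves on the three ground-truth shapes it distinguishes (number, delimited list, plain string); the answers below are made-up examples, not GAIA gold data:

```python
from scorer import question_scorer

# Numeric ground truth: "$", "%", and "," are stripped from the model answer before comparing as floats.
print(question_scorer("$1,234", "1234"))       # True

# List ground truth: split on "," or ";", then compare element-wise
# (numbers as floats; strings case- and whitespace-insensitively, punctuation kept).
print(question_scorer("apple; 2", "apple;2"))  # True
print(question_scorer("apple", "apple;2"))     # False (length mismatch)

# String ground truth: lowercased, whitespace and punctuation removed.
print(question_scorer("Sea gull", "seagull"))  # True
```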