thisiszy committed on
Commit
b3920ca
β€’
1 Parent(s): 029e650

init leaderboard

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. README.md +1 -1
  3. app.py +269 -0
  4. requirements.txt +78 -0
  5. utils.py +78 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: red
6
  sdk: gradio
7
  sdk_version: 4.22.0
8
  app_file: app.py
9
- pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
6
  sdk: gradio
7
  sdk_version: 4.22.0
8
  app_file: app.py
9
+ pinned: true
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ from email_validator import validate_email, EmailNotValidError
9
+ from datasets import load_dataset, DatasetDict, Dataset, IterableDatasetDict, IterableDataset
10
+ from huggingface_hub import HfApi
11
+
12
+ from utils import (
13
+ INTRODUCTION_TEXT,
14
+ TITLE,
15
+ format_error,
16
+ format_log,
17
+ format_warning,
18
+ model_hyperlink,
19
+ read_jsonl,
20
+ )
21
+
22
+
23
+ TOKEN = os.environ.get("TOKEN", None)
24
+ OWNER="agent-studio"
25
+
26
+ RESULTS_DATASET = f"{OWNER}/public_results"
27
+ SUBMISSION_DATASET = f"{OWNER}/submitted_results"
28
+
29
+
30
class ScoreManager:
    """Manage leaderboard results for the Agent-Studio space.

    Loads the public results dataset from the hub, scores newly uploaded
    submissions, and pushes accepted submissions back to the hub.
    """

    def __init__(self) -> None:
        # Annotation only; refresh() assigns the actual dataset below.
        self.eval_results : DatasetDict | Dataset | IterableDatasetDict | IterableDataset
        self.api = HfApi()
        self.refresh()

    @staticmethod
    def calc_score(base_path: Path):
        """Compute per-app average score and self-eval accuracy.

        Args:
            base_path: Directory holding the extracted ``*.jsonl`` result
                files of one submission.

        Returns:
            dict: ``{app: {"score": float, "accuracy": float}}`` (both
            percentages rounded to one decimal) for every app whose result
            files were all present and non-empty; ``{}`` if nothing could
            be scored.
        """
        # Every listed file must exist for the app to be scored; a missing
        # file skips the whole app (partial data is discarded).
        app_files = {
            "filesystem": ["filesystem.jsonl"],
            "google": ["gcalendar.jsonl", "gmail.jsonl", "gdocs.jsonl"],
            "GUI": ["desktop_hard.jsonl", "vscode.jsonl"],
        }

        scores_per_app = {}
        for app, filenames in app_files.items():
            data = []
            try:
                for filename in filenames:
                    data += read_jsonl((base_path / filename).as_posix())
            except FileNotFoundError:
                print(f"No {app} data found")
                continue
            # Guard: result files may exist but be empty; the averages
            # below would otherwise raise ZeroDivisionError.
            if not data:
                print(f"No {app} data found")
                continue

            scores = [entry["score"] for entry in data]

            # Tally the agent's self-evaluation against the ground-truth
            # score ("positive" means score > 0).
            # NOTE(review): the fp/fn labels look swapped relative to the
            # usual convention, but only tp + tn is used (for accuracy),
            # so the result is unaffected; counts kept as originally written.
            tp = 0
            fp = 0
            tn = 0
            fn = 0
            for entry in data:
                if entry["score"] > 0:
                    if entry["self_eval"]["score"] > 0:
                        tp += 1
                    else:
                        fp += 1
                else:
                    if entry["self_eval"]["score"] > 0:
                        fn += 1
                    else:
                        tn += 1

            score = round(sum(scores) / len(scores) * 100, 1)
            accuracy = round((tp + tn) / (tp + tn + fp + fn) * 100, 1)

            scores_per_app[app] = {
                "score": score,
                "accuracy": accuracy,
            }

        return scores_per_app

    @staticmethod
    def dataset2table(dataset):
        """Convert the dataset's ``test`` split into the displayed DataFrame.

        Models with a non-empty ``url`` are rendered as HTML hyperlinks;
        non-public columns (``url``, ``organization``) are dropped and the
        remaining columns are ordered for display.
        """
        df = pd.DataFrame(data=dataset["test"])
        df['model'] = df.apply(
            lambda row: model_hyperlink(row['url'], row['model']) if row['url'] != "" else row['model'],
            axis=1
        )
        df = df.drop(columns=["url", "organization"])
        df = df[["model", "agent_type", "filesystem_score", "google_score", "GUI_score", "model_family"]]
        return df

    def refresh(self):
        """Re-download the public results dataset and rebuild the table.

        Returns:
            pd.DataFrame: The refreshed leaderboard table, also cached on
            ``self.pd_eval_results`` for the Gradio component.
        """
        self.eval_results = load_dataset(
            RESULTS_DATASET,
            token=TOKEN,
            # Always bypass the local cache so the table reflects the hub.
            download_mode="force_redownload",
        )
        self.pd_eval_results = self.dataset2table(self.eval_results)
        return self.pd_eval_results

    def add_new_eval(
        self,
        model_name: str,
        model_family: str,
        agent_type: str,
        url: str,
        uploaded_file_path: str,
        organization: str,
        mail: str,
    ):
        """Validate a submission, score its zip archive, and publish it.

        Returns:
            str: An HTML-formatted status message (error, warning, or
            success) produced by the ``format_*`` helpers.
        """
        # Mandatory fields
        if model_name == "":
            return format_error("Model name cannot be empty")
        elif model_family == "":
            return format_error("Model family cannot be empty")
        elif agent_type == "":
            return format_error("Agent type cannot be empty")
        elif uploaded_file_path == "":
            return format_error("File cannot be empty")
        elif organization == "":
            return format_error("Organization cannot be empty")
        elif mail == "":
            return format_error("Mail cannot be empty")
        # Reject duplicates: same model name AND same organization,
        # both compared case-insensitively.
        if model_name.lower() in set([m.lower() for m in self.eval_results["test"]["model"]]) \
            and organization.lower() in set([l.lower() for l in self.eval_results["test"]["organization"]]):
            return format_warning("This model has been already submitted.")
        # Check if the email is valid
        try:
            validate_email(mail, check_deliverability=True)
        except EmailNotValidError:
            return format_error("Invalid email")

        try:
            file_path = Path(uploaded_file_path)
            results_folder_path = file_path.parent / model_name
            # NOTE(review): the zip comes from an untrusted upload;
            # zipfile's extraction sanitizes member paths, but consider
            # validating member names explicitly — confirm threat model.
            with zipfile.ZipFile(file_path, 'r') as zip_file:
                zip_file.extractall(results_folder_path)
            print(results_folder_path)
            scores = self.calc_score(results_folder_path)
            if scores == {}:
                return format_error("No data found in the zip file, please make sure the file structure is correct.")
            eval_entry = {
                "model": model_name,
                "model_family": model_family,
                "agent_type": agent_type,
                "url": url,
                "organization": organization,
            }
            # Flatten per-app scores into leaderboard columns. A distinct
            # loop variable avoids clobbering the `scores` dict while it
            # is being iterated.
            for app, app_scores in scores.items():
                eval_entry[f"{app}_score"] = app_scores["score"]
            print(eval_entry)
            # NOTE(review): this rebinds eval_results from a DatasetDict
            # to a plain Dataset (the updated "test" split); refresh()
            # restores the DatasetDict on the next reload.
            self.eval_results = self.eval_results["test"].add_item(eval_entry)
            self.upload2hub(
                results_folder_path,
                model_name.lower(),
                model_family,
                organization.lower(),
                mail,
                url,
            )
        except Exception as e:
            # Surface any unexpected failure to the UI instead of crashing.
            return format_error(f"Internal Error: {e}")

        return format_log("Submitted successfully")

    def upload2hub(
        self,
        folder_path: Path,
        model_name: str,
        model_family: str,
        organization: str,
        mail: str,
        url: str,
    ) -> None:
        """Push updated results to the public dataset and archive the raw
        submission (including private contact info) to the submission repo.

        Args:
            folder_path: Extracted submission folder to upload.
            model_name: Lower-cased model name (used as repo subpath).
            model_family: Model family label.
            organization: Lower-cased organization (used as repo subpath).
            mail: Contact email; stored only in the submission dataset.
            url: Link to model information.
        """
        self.eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN)
        contact_info = {
            "model": model_name,
            "model_family": model_family,
            "url": url,
            "organization": organization,
            "mail": mail,
        }
        # Contact info travels only with the raw submission upload, not
        # with the public results dataset.
        with open(folder_path / "contact_info.json", "w") as f:
            f.write(json.dumps(contact_info))
        self.api.upload_folder(
            folder_path=folder_path,
            path_in_repo=f"{organization}/{model_name}",
            repo_id=SUBMISSION_DATASET,
            repo_type="dataset",
            token=TOKEN
        )
216
+
217
+
218
if __name__ == "__main__":
    # Loads the current leaderboard from the hub at startup (refresh() runs
    # inside the constructor).
    score_manager = ScoreManager()

    iface = gr.Blocks()
    with iface:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Tab("Results: Test"):
            # Read-only leaderboard table backed by the cached DataFrame.
            # NOTE(review): datatype lists 8 columns but dataset2table
            # produces 6 — confirm the extra entries are intentional.
            leaderboard_table = gr.components.Dataframe(
                value=score_manager.pd_eval_results,
                datatype=["str", "str", "number", "number", "number", "str", "str", "str"],
                interactive=False,
                column_widths=["16%"]
            )
            refresh_button = gr.Button("Refresh")
            # Re-downloads the results dataset and replaces the table value.
            refresh_button.click(
                score_manager.refresh,
                inputs=[],
                outputs=[
                    leaderboard_table,
                ],
            )
        with gr.Accordion("Submit a new model for evaluation (field with * are required)"):
            with gr.Row():
                with gr.Column():
                    # level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                    model_name_textbox = gr.Textbox(label="Model name*")
                    model_family_textbox = gr.Textbox(label="Model family*")
                    agent_type_textbox = gr.Textbox(label="Agent type*")
                    url_textbox = gr.Textbox(label="Url to model information")
                with gr.Column():
                    organization = gr.Textbox(label="Organisation*")
                    mail = gr.Textbox(label="Contact email* (will be stored privately, & used if there is an issue with your submission)")
                    file_output = gr.File(label="Upload model output* (one zip file)")

            submit_button = gr.Button("Submit Eval")
            # Markdown area that displays the HTML status message returned
            # by add_new_eval (error / warning / success).
            submission_result = gr.Markdown()
            # Input order must match add_new_eval's parameter order.
            submit_button.click(
                score_manager.add_new_eval,
                [
                    # level_of_test,
                    model_name_textbox,
                    model_family_textbox,
                    agent_type_textbox,
                    url_textbox,
                    file_output,
                    organization,
                    mail
                ],
                submission_result,
            )
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.9.3
3
+ aiosignal==1.3.1
4
+ altair==5.2.0
5
+ annotated-types==0.6.0
6
+ anyio==4.3.0
7
+ attrs==23.2.0
8
+ certifi==2024.2.2
9
+ charset-normalizer==3.3.2
10
+ click==8.1.7
11
+ colorama==0.4.6
12
+ contourpy==1.2.0
13
+ cycler==0.12.1
14
+ datasets==2.18.0
15
+ dill==0.3.8
16
+ dnspython==2.6.1
17
+ email_validator==2.1.1
18
+ fastapi==0.110.0
19
+ ffmpy==0.3.2
20
+ filelock==3.13.2
21
+ fonttools==4.50.0
22
+ frozenlist==1.4.1
23
+ fsspec==2024.2.0
24
+ gradio==4.22.0
25
+ gradio_client==0.13.0
26
+ h11==0.14.0
27
+ httpcore==1.0.4
28
+ httpx==0.27.0
29
+ huggingface-hub==0.22.0
30
+ idna==3.6
31
+ importlib_resources==6.4.0
32
+ Jinja2==3.1.3
33
+ jsonschema==4.21.1
34
+ jsonschema-specifications==2023.12.1
35
+ kiwisolver==1.4.5
36
+ markdown-it-py==3.0.0
37
+ MarkupSafe==2.1.5
38
+ matplotlib==3.8.3
39
+ mdurl==0.1.2
40
+ multidict==6.0.5
41
+ multiprocess==0.70.16
42
+ numpy==1.26.4
43
+ orjson==3.9.15
44
+ packaging==24.0
45
+ pandas==2.2.1
46
+ pillow==10.2.0
47
+ pyarrow==15.0.2
48
+ pyarrow-hotfix==0.6
49
+ pydantic==2.6.4
50
+ pydantic_core==2.16.3
51
+ pydub==0.25.1
52
+ Pygments==2.17.2
53
+ pyparsing==3.1.2
54
+ python-dateutil==2.9.0.post0
55
+ python-multipart==0.0.9
56
+ pytz==2024.1
57
+ PyYAML==6.0.1
58
+ referencing==0.34.0
59
+ requests==2.31.0
60
+ rich==13.7.1
61
+ rpds-py==0.18.0
62
+ ruff==0.3.4
63
+ semantic-version==2.10.0
64
+ shellingham==1.5.4
65
+ six==1.16.0
66
+ sniffio==1.3.1
67
+ starlette==0.36.3
68
+ tomlkit==0.12.0
69
+ toolz==0.12.1
70
+ tqdm==4.66.2
71
+ typer==0.10.0
72
+ typing_extensions==4.10.0
73
+ tzdata==2024.1
74
+ urllib3==2.2.1
75
+ uvicorn==0.29.0
76
+ websockets==11.0.3
77
+ xxhash==3.4.1
78
+ yarl==1.9.4
utils.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ TITLE = """<h1 align="center" id="space-title">Agent-Studio Leaderboard</h1>"""
4
+
5
+ INTRODUCTION_TEXT = """
6
+ AgentStudio is an open toolkit covering the entire lifespan of
7
+ building virtual agents that can interact with everything on digital worlds. Here, we open-source the beta of environment implementations, benchmark suite, data collection pipeline, and graphical interfaces to promote research towards generalist virtual agents of the future.
8
+
9
+ ## Submissions
10
+ You should submit a zip file containing the agent-studio output.
11
+
12
+ **Do not change the file names**. The file name is used to identify the scores of each category.
13
+
14
+ The file structure should be as follows:
15
+ ```
16
+ results.zip
17
+ β”œβ”€β”€ filesystem.jsonl
18
+ β”œβ”€β”€ gcalendar.jsonl
19
+ β”œβ”€β”€ gdocs.jsonl
20
+ β”œβ”€β”€ gmail.jsonl
21
+ β”œβ”€β”€ vscode.jsonl
22
+ β”œβ”€β”€ desktop_hard.jsonl
23
+ β”œβ”€β”€ ...
24
+ ```
25
+
26
+ """
27
+
28
def format_error(msg):
    """Render *msg* as a red, centered HTML paragraph (error banner)."""
    style = "color: red; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
30
+
31
def format_warning(msg):
    """Render *msg* as an orange, centered HTML paragraph (warning banner)."""
    style = "color: orange; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
33
+
34
def format_log(msg):
    """Render *msg* as a green, centered HTML paragraph (success banner)."""
    style = "color: green; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
36
+
37
def model_hyperlink(link, model_name):
    """Render *model_name* as a dotted-underline anchor to *link*, opening in a new tab."""
    style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{style}">{model_name}</a>'
39
+
40
+
41
+ def read_jsonl(file_path: str, start_idx: int = 0, end_idx: int | None = None) -> list:
42
+ """Reads lines from a .jsonl file between start_idx and end_idx.
43
+
44
+ Args:
45
+ file_path (str): Path to the .jsonl file
46
+ start_idx (int, optional): The starting index of lines to read
47
+ end_idx (int | None, optional): The ending index of lines to read
48
+
49
+ Returns:
50
+ list[dict]: A list of dictionaries, each dictionary is a line from
51
+ the .jsonl file
52
+ """
53
+ if end_idx is not None and start_idx > end_idx:
54
+ raise ValueError("start_idx must be less or equal to end_idx")
55
+
56
+ data = []
57
+ with open(file_path, "r") as file:
58
+ for i, line in enumerate(file):
59
+ if end_idx is not None and i >= end_idx:
60
+ break
61
+ if i >= start_idx:
62
+ data.append(json.loads(line))
63
+
64
+ return data
65
+
66
+
67
def add_jsonl(data: list, file_path: str, mode="a"):
    """Write a list of JSON-serializable objects to a .jsonl file, one per line.

    Args:
        data (list[dict]): JSON objects to append to the file
        file_path (str): Path to the .jsonl file
        mode (str, optional): File-open mode; the default "a" appends
    """
    with open(file_path, mode) as out:
        out.writelines(json.dumps(obj) + "\n" for obj in data)
78
+