Spaces:

Skywork
/

agent-studio-leaderboard

Runtime error

App Files Files Community

thisiszy committed on Mar 26

Commit

5f7cbd2

•

1 Parent(s): 2c113e5

update leaderboard

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +145 -57
utils.py +36 -6

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Agent-Studio-Leaderboard
-emoji: 🌍
 colorFrom: pink
 colorTo: red
 sdk: gradio

 ---
 title: Agent-Studio-Leaderboard
+emoji: 🏆
 colorFrom: pink
 colorTo: red
 sdk: gradio

app.py CHANGED Viewed

@@ -23,21 +23,27 @@ from utils import (
 TOKEN = os.environ.get("TOKEN", None)
 OWNER="agent-studio"
-RESULTS_FILE = f"hf://datasets/{OWNER}/submitted_results/leaderboard/results.parquet"
 SUBMISSION_DATASET = f"{OWNER}/submitted_results"
 class ScoreManager:
     def __init__(self) -> None:
-        self.apps = ["filesystem", "google", "GUI"]
         self.eval_results : pd.DataFrame
-        self.pd_grounding_results: pd.DataFrame
         self.api = HfApi(token=TOKEN)
         self.fs = HfFileSystem(token=TOKEN)
         self.refresh()
-    def calc_score(self, base_path: Path):
-        apps = self.apps
         scores_per_app = {}
         for app in apps:
@@ -101,6 +107,46 @@ class ScoreManager:
         return scores_per_app
     @staticmethod
     def to_displayed_table(df: pd.DataFrame):
         df['model'] = df.apply(
@@ -108,22 +154,30 @@ class ScoreManager:
             axis=1
         )
         df = df.drop(columns=["url", "organization"])
-        df = df[["model", "agent_type", "filesystem_score", "google_score", "GUI_score", "model_family"]]
         df = df.sort_values(by="model")
         return df
     def refresh(self):
         try:
-            with self.fs.open(RESULTS_FILE, "rb") as f:
                 self.eval_results = pd.read_parquet(f)
         except FileNotFoundError:
             self.eval_results = pd.DataFrame(
-                columns=["model", "agent_type", "filesystem_score", "google_score", "GUI_score", "organization", "url", "model_family"]
             )
-        self.pd_eval_results = self.to_displayed_table(self.eval_results)
-        return self.pd_eval_results
     def add_new_eval(
         self,
@@ -143,12 +197,12 @@ class ScoreManager:
             return format_error("Model family cannot be empty")
         elif agent_type == "":
             return format_error("Agent type cannot be empty")
-        elif uploaded_file_path == "":
-            return format_error("File cannot be empty")
         elif organization == "":
             return format_error("Organization cannot be empty")
         elif mail == "":
             return format_error("Mail cannot be empty")
         # Check if the model has been already submitted
         if model_name.lower() in set([m.lower() for m in self.eval_results["model"]]) \
             and organization.lower() in set([l.lower() for l in self.eval_results["organization"]]):
@@ -165,32 +219,73 @@ class ScoreManager:
             with zipfile.ZipFile(file_path, 'r') as zip_file:
                 zip_file.extractall(results_folder_path)
             print(results_folder_path)
-            scores = self.calc_score(results_folder_path)
-            if scores == {}:
-                return format_error("No data found in the zip file, please make sure the file structure is correct.")
-            eval_entry = {
                 "model": model_name,
                 "model_family": model_family,
-                "agent_type": agent_type,
                 "url": url,
                 "organization": organization,
             }
-            for app, scores in scores.items():
-                eval_entry[f"{app}_score"] = scores["score"]
-            print(eval_entry)
-            self.eval_results = pd.concat(
-                [self.eval_results, pd.DataFrame([eval_entry])],
-                ignore_index=True
-            )
-            self.upload2hub(
-                results_folder_path,
-                model_name.lower(),
-                model_family,
-                organization.lower(),
-                mail,
-                url,
-            )
         except Exception as e:
             return format_error(f"Internal Error: {e}")
@@ -198,27 +293,19 @@ class ScoreManager:
     def upload2hub(
         self,
         folder_path: Path,
-        model_name: str,
-        model_family: str,
-        organization: str,
-        mail: str,
-        url: str,
     ) -> None:
-        with self.fs.open(RESULTS_FILE, "wb") as f:
-            self.eval_results.to_parquet(f)
-        contact_info = {
-            "model": model_name,
-            "model_family": model_family,
-            "url": url,
-            "organization": organization,
-            "mail": mail,
-        }
         with open(folder_path / "contact_info.json", "w") as f:
             f.write(json.dumps(contact_info))
         self.api.upload_folder(
             folder_path=folder_path,
-            path_in_repo=f"origin/{organization}/{model_name}",
             repo_id=SUBMISSION_DATASET,
             repo_type="dataset",
         )
@@ -233,25 +320,26 @@ if __name__ == "__main__":
         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
         with gr.Tabs(elem_classes="main_tabs") as main_tabs:
             with gr.TabItem("🌍 Real-world tasks table", id=0):
-                leaderboard_table = gr.components.Dataframe(
-                    value=score_manager.pd_eval_results,
-                    datatype=["str", "str", "number", "number", "number", "str", "str", "str"],
                     interactive=False,
-                    column_widths=["16%"]
                 )
             with gr.TabItem("🌍 GUI grounding tasks table", id=1):
-                leaderboard_table = gr.components.Dataframe(
-                    value=score_manager.pd_grounding_results,
-                    datatype=["str", "str", "number", "number", "number", "str", "str", "str"],
                     interactive=False,
-                    column_widths=["16%"]
                 )
         refresh_button = gr.Button("Refresh")
         refresh_button.click(
             score_manager.refresh,
             inputs=[],
             outputs=[
-                leaderboard_table,
             ],
         )
         with gr.Accordion("Submit a new model for evaluation (field with * are required)"):
@@ -263,7 +351,7 @@ if __name__ == "__main__":
                     agent_type_textbox = gr.Textbox(label="Agent type*")
                     url_textbox = gr.Textbox(label="Url to model information")
                 with gr.Column():
-                    organization = gr.Textbox(label="Organisation*")
                     mail = gr.Textbox(label="Contact email* (will be stored privately, & used if there is an issue with your submission)")
                     file_output = gr.File(label="Upload model output* (one zip file)")

 TOKEN = os.environ.get("TOKEN", None)
 OWNER="agent-studio"
+REAL_WORLD_RESULTS_FILE = f"hf://datasets/{OWNER}/submitted_results/leaderboard/real_world_result.parquet"
+GUI_GROUNDING_RESULTS_FILE = f"hf://datasets/{OWNER}/submitted_results/leaderboard/gui_grounding_result.parquet"
 SUBMISSION_DATASET = f"{OWNER}/submitted_results"
+GROUNDING_FOLDER = f"hf://datasets/{OWNER}/agent-studio-data/grounding"
 class ScoreManager:
     def __init__(self) -> None:
         self.eval_results : pd.DataFrame
+        self.display_eval_results : pd.DataFrame
+        self.grounding_results: pd.DataFrame
+        self.display_grounding_results: pd.DataFrame
         self.api = HfApi(token=TOKEN)
         self.fs = HfFileSystem(token=TOKEN)
         self.refresh()
+    @staticmethod
+    def calc_real_task_scores(base_path: Path):
+        apps = ["filesystem", "google", "GUI"]
         scores_per_app = {}
         for app in apps:
         return scores_per_app
+    def calc_gui_grounding_scores(self, base_path: Path):
+        def calc_per_app_grounding_scores(result_dict, task_configs):
+            total_tasks = len(result_dict)
+            task_ids = set([task_config["task_id"] for task_config in task_configs])
+            success = 0
+            for result in result_dict:
+                if result["task_id"] not in task_ids:
+                    raise ValueError(f"Task id {result['task_id']} not found!")
+                if result["score"] == 1.0:
+                    success += 1
+            return {
+                "score": success / total_tasks * 100,
+                "total_tasks": total_tasks,
+                "success_tasks": success,
+            }
+        scores_per_os = {}
+        for os in base_path.iterdir():
+            if not os.is_dir():
+                continue
+            try:
+                scores_per_app = {}
+                for app in os.iterdir():
+                    if not app.is_dir():
+                        continue
+                    with self.fs.open(
+                        f"{GROUNDING_FOLDER}/{app.relative_to(base_path).as_posix()}/actions.jsonl",
+                        "r"
+                    ) as f:
+                        task_configs = read_jsonl(f)
+                    results_dict = read_jsonl((base_path / os / app / "results.jsonl").as_posix())
+                    results = calc_per_app_grounding_scores(results_dict, task_configs)
+                    scores_per_app[app.name] = results
+                scores_per_os[os.name] = scores_per_app
+            except FileNotFoundError:
+                print(f"No data found for {os.name}")
+                continue
+        return scores_per_os
     @staticmethod
     def to_displayed_table(df: pd.DataFrame):
         df['model'] = df.apply(
             axis=1
         )
         df = df.drop(columns=["url", "organization"])
         df = df.sort_values(by="model")
+        df = df.map(lambda x: round(x, 2) if isinstance(x, float) else x)
         return df
     def refresh(self):
         try:
+            with self.fs.open(REAL_WORLD_RESULTS_FILE, "rb") as f:
                 self.eval_results = pd.read_parquet(f)
         except FileNotFoundError:
             self.eval_results = pd.DataFrame(
+                columns=["model", "agent_type", "filesystem (%)", "google (%)", "GUI (%)", "organization", "url", "model_family"]
+            )
+        try:
+            with self.fs.open(GUI_GROUNDING_RESULTS_FILE, "rb") as f:
+                self.grounding_results = pd.read_parquet(f)
+        except FileNotFoundError:
+            self.grounding_results = pd.DataFrame(
+                columns=["model", "agent_type", "windows (%)", "linux (%)", "macos (%)", "organization", "url", "model_family"]
             )
+        self.display_eval_results = self.to_displayed_table(self.eval_results)
+        self.display_grounding_results = self.to_displayed_table(self.grounding_results)
+        return self.display_eval_results, self.display_grounding_results
     def add_new_eval(
         self,
             return format_error("Model family cannot be empty")
         elif agent_type == "":
             return format_error("Agent type cannot be empty")
         elif organization == "":
             return format_error("Organization cannot be empty")
         elif mail == "":
             return format_error("Mail cannot be empty")
+        elif uploaded_file_path == "":
+            return format_error("File cannot be empty")
         # Check if the model has been already submitted
         if model_name.lower() in set([m.lower() for m in self.eval_results["model"]]) \
             and organization.lower() in set([l.lower() for l in self.eval_results["organization"]]):
             with zipfile.ZipFile(file_path, 'r') as zip_file:
                 zip_file.extractall(results_folder_path)
             print(results_folder_path)
+            contact_info = {
                 "model": model_name,
                 "model_family": model_family,
                 "url": url,
                 "organization": organization,
+                "mail": mail,
             }
+            if dataset_selection == "Real-world tasks":
+                scores = self.calc_real_task_scores(results_folder_path)
+                if scores == {}:
+                    return format_error("No data found in the zip file, please make sure the file structure is correct.")
+                eval_entry = {
+                    "model": model_name,
+                    "model_family": model_family,
+                    "agent_type": agent_type,
+                    "url": url,
+                    "organization": organization,
+                }
+                for app, scores in scores.items():
+                    eval_entry[f"{app} (%)"] = scores["score"]
+                print(eval_entry)
+                self.eval_results = pd.concat(
+                    [self.eval_results, pd.DataFrame([eval_entry])],
+                    ignore_index=True
+                )
+                self.upload2hub(
+                    results_path=REAL_WORLD_RESULTS_FILE,
+                    results=self.eval_results,
+                    folder_path=results_folder_path,
+                    path_in_repo=f"origin/{organization}/{model_name}/real_world",
+                    contact_info=contact_info,
+                )
+            elif dataset_selection == "GUI grounding tasks":
+                scores = self.calc_gui_grounding_scores(results_folder_path)
+                if scores == {}:
+                    return format_error("No data found in the zip file, please make sure the file structure is correct.")
+                print(scores)
+                eval_entry = {
+                    "model": model_name,
+                    "model_family": model_family,
+                    "agent_type": agent_type,
+                    "url": url,
+                    "organization": organization,
+                }
+                for os, app_scores in scores.items():
+                    succ = 0
+                    total = 0
+                    for app, score in app_scores.items():
+                        succ += score["success_tasks"]
+                        total += score["total_tasks"]
+                    eval_entry[f"{os} (%)"] = succ / total * 100
+                self.grounding_results = pd.concat(
+                    [self.display_grounding_results, pd.DataFrame([eval_entry])],
+                    ignore_index=True
+                )
+                self.upload2hub(
+                    results_path=GUI_GROUNDING_RESULTS_FILE,
+                    results=self.grounding_results,
+                    folder_path=results_folder_path,
+                    path_in_repo=f"origin/{organization}/{model_name}/grounding",
+                    contact_info=contact_info,
+                )
+            else:
+                return format_error("Invalid dataset selection")
         except Exception as e:
             return format_error(f"Internal Error: {e}")
     def upload2hub(
         self,
+        results_path: str,
+        results: pd.DataFrame,
         folder_path: Path,
+        path_in_repo: str,
+        contact_info: str,
     ) -> None:
+        with self.fs.open(results_path, "wb") as f:
+            results.to_parquet(f)
         with open(folder_path / "contact_info.json", "w") as f:
             f.write(json.dumps(contact_info))
         self.api.upload_folder(
             folder_path=folder_path,
+            path_in_repo=path_in_repo,
             repo_id=SUBMISSION_DATASET,
             repo_type="dataset",
         )
         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
         with gr.Tabs(elem_classes="main_tabs") as main_tabs:
             with gr.TabItem("🌍 Real-world tasks table", id=0):
+                leaderboard_real_world_table = gr.components.Dataframe(
+                    value=score_manager.display_eval_results,
+                    datatype=["str", "str", "number", "number", "number", "str"],
                     interactive=False,
+                    column_widths=["20%"]
                 )
             with gr.TabItem("🌍 GUI grounding tasks table", id=1):
+                leaderboard_gui_grounding_table = gr.components.Dataframe(
+                    value=score_manager.display_grounding_results,
+                    datatype=["str", "str", "number", "number", "number", "str"],
                     interactive=False,
+                    column_widths=["20%"]
                 )
         refresh_button = gr.Button("Refresh")
         refresh_button.click(
             score_manager.refresh,
             inputs=[],
             outputs=[
+                leaderboard_real_world_table,
+                leaderboard_gui_grounding_table,
             ],
         )
         with gr.Accordion("Submit a new model for evaluation (field with * are required)"):
                     agent_type_textbox = gr.Textbox(label="Agent type*")
                     url_textbox = gr.Textbox(label="Url to model information")
                 with gr.Column():
+                    organization = gr.Textbox(label="Organization*")
                     mail = gr.Textbox(label="Contact email* (will be stored privately, & used if there is an issue with your submission)")
                     file_output = gr.File(label="Upload model output* (one zip file)")

utils.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import json
 TITLE = """<h1 align="center" id="space-title">Agent-Studio Leaderboard</h1>"""
@@ -11,6 +12,8 @@ You should submit a zip file containing the agent-studio output.
 **Do not change the file names**. The file name is used to identify the scores of each category.
 The file structure should be as follows:
 ```
 results.zip
@@ -23,6 +26,26 @@ results.zip
 ├── ...
 ```
 """
 def format_error(msg):
@@ -38,11 +61,11 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-def read_jsonl(file_path: str, start_idx: int = 0, end_idx: int | None = None) -> list:
     """Reads lines from a .jsonl file between start_idx and end_idx.
     Args:
-        file_path (str): Path to the .jsonl file
         start_idx (int, optional): The starting index of lines to read
         end_idx (int | None, optional): The ending index of lines to read
@@ -54,7 +77,14 @@ def read_jsonl(file_path: str, start_idx: int = 0, end_idx: int | None = None) -
         raise ValueError("start_idx must be less or equal to end_idx")
     data = []
-    with open(file_path, "r") as file:
         for i, line in enumerate(file):
             if end_idx is not None and i >= end_idx:
                 break
@@ -64,14 +94,14 @@ def read_jsonl(file_path: str, start_idx: int = 0, end_idx: int | None = None) -
     return data
-def add_jsonl(data: list, file_path: str, mode="a"):
     """Adds a list of dictionaries to a .jsonl file.
     Args:
         data (list[dict]): A list of json objects to add to the file
-        file_path (str): Path to the .jsonl file
     """
-    with open(file_path, mode) as file:
         for item in data:
             json_str = json.dumps(item)
             file.write(json_str + "\n")

 import json
+from io import TextIOWrapper
 TITLE = """<h1 align="center" id="space-title">Agent-Studio Leaderboard</h1>"""
 **Do not change the file names**. The file name is used to identify the scores of each category.
+### Real-world tasks
 The file structure should be as follows:
 ```
 results.zip
 ├── ...
 ```
+### GUI grounding tasks
+The file structure should be as follows:
+```
+results.zip
+├── linux
+│   ├── browser
+│   │   ├── results.jsonl
+│   ├── os
+│   │   ├── results.jsonl
+│   ├── ...
+├── windows
+│   ├── word
+│   │   ├── results.jsonl
+│   ├── os
+│   │   ├── results.jsonl
+│   ├── ...
+├── macos
+```
 """
 def format_error(msg):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+def read_jsonl(file: str | TextIOWrapper, start_idx: int = 0, end_idx: int | None = None) -> list:
     """Reads lines from a .jsonl file between start_idx and end_idx.
     Args:
+        file (str | TextIOWrapper): Path to the .jsonl file or an open file object
         start_idx (int, optional): The starting index of lines to read
         end_idx (int | None, optional): The ending index of lines to read
         raise ValueError("start_idx must be less or equal to end_idx")
     data = []
+    if isinstance(file, str):
+        with open(file, "r") as file:
+            for i, line in enumerate(file):
+                if end_idx is not None and i >= end_idx:
+                    break
+                if i >= start_idx:
+                    data.append(json.loads(line))
+    else:
         for i, line in enumerate(file):
             if end_idx is not None and i >= end_idx:
                 break
     return data
+def add_jsonl(data: list, file: str, mode="a"):
     """Adds a list of dictionaries to a .jsonl file.
     Args:
         data (list[dict]): A list of json objects to add to the file
+        file (str): Path to the .jsonl file
     """
+    with open(file, mode) as file:
         for item in data:
             json_str = json.dumps(item)
             file.write(json_str + "\n")