thisiszy committed on
Commit
b3920ca
β€’
1 Parent(s): 029e650

init leaderboard

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. README.md +1 -1
  3. app.py +269 -0
  4. requirements.txt +78 -0
  5. utils.py +78 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: red
6
  sdk: gradio
7
  sdk_version: 4.22.0
8
  app_file: app.py
9
- pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
6
  sdk: gradio
7
  sdk_version: 4.22.0
8
  app_file: app.py
9
+ pinned: true
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ from email_validator import validate_email, EmailNotValidError
9
+ from datasets import load_dataset, DatasetDict, Dataset, IterableDatasetDict, IterableDataset
10
+ from huggingface_hub import HfApi
11
+
12
+ from utils import (
13
+ INTRODUCTION_TEXT,
14
+ TITLE,
15
+ format_error,
16
+ format_log,
17
+ format_warning,
18
+ model_hyperlink,
19
+ read_jsonl,
20
+ )
21
+
22
+
23
+ TOKEN = os.environ.get("TOKEN", None)
24
+ OWNER="agent-studio"
25
+
26
+ RESULTS_DATASET = f"{OWNER}/public_results"
27
+ SUBMISSION_DATASET = f"{OWNER}/submitted_results"
28
+
29
+
30
class ScoreManager:
    """Manage leaderboard results for the Agent-Studio space.

    Loads the public results dataset from the hub, scores newly uploaded
    submissions, and pushes accepted submissions back to the hub.
    """

    def __init__(self) -> None:
        # Annotation only; refresh() assigns the actual dataset below.
        self.eval_results : DatasetDict | Dataset | IterableDatasetDict | IterableDataset
        self.api = HfApi()
        self.refresh()

    @staticmethod
    def calc_score(base_path: Path):
        """Compute per-app average score and self-eval accuracy.

        Args:
            base_path: Directory holding the extracted ``*.jsonl`` result
                files of one submission.

        Returns:
            dict: ``{app: {"score": float, "accuracy": float}}`` (both
            percentages rounded to one decimal) for every app whose result
            files were all present and non-empty; ``{}`` if nothing could
            be scored.
        """
        # Every listed file must exist for the app to be scored; a missing
        # file skips the whole app (partial data is discarded).
        app_files = {
            "filesystem": ["filesystem.jsonl"],
            "google": ["gcalendar.jsonl", "gmail.jsonl", "gdocs.jsonl"],
            "GUI": ["desktop_hard.jsonl", "vscode.jsonl"],
        }

        scores_per_app = {}
        for app, filenames in app_files.items():
            data = []
            try:
                for filename in filenames:
                    data += read_jsonl((base_path / filename).as_posix())
            except FileNotFoundError:
                print(f"No {app} data found")
                continue
            # Guard: result files may exist but be empty; the averages
            # below would otherwise raise ZeroDivisionError.
            if not data:
                print(f"No {app} data found")
                continue

            scores = [entry["score"] for entry in data]

            # Tally the agent's self-evaluation against the ground-truth
            # score ("positive" means score > 0).
            # NOTE(review): the fp/fn labels look swapped relative to the
            # usual convention, but only tp + tn is used (for accuracy),
            # so the result is unaffected; counts kept as originally written.
            tp = 0
            fp = 0
            tn = 0
            fn = 0
            for entry in data:
                if entry["score"] > 0:
                    if entry["self_eval"]["score"] > 0:
                        tp += 1
                    else:
                        fp += 1
                else:
                    if entry["self_eval"]["score"] > 0:
                        fn += 1
                    else:
                        tn += 1

            score = round(sum(scores) / len(scores) * 100, 1)
            accuracy = round((tp + tn) / (tp + tn + fp + fn) * 100, 1)

            scores_per_app[app] = {
                "score": score,
                "accuracy": accuracy,
            }

        return scores_per_app

    @staticmethod
    def dataset2table(dataset):
        """Convert the dataset's ``test`` split into the displayed DataFrame.

        Models with a non-empty ``url`` are rendered as HTML hyperlinks;
        non-public columns (``url``, ``organization``) are dropped and the
        remaining columns are ordered for display.
        """
        df = pd.DataFrame(data=dataset["test"])
        df['model'] = df.apply(
            lambda row: model_hyperlink(row['url'], row['model']) if row['url'] != "" else row['model'],
            axis=1
        )
        df = df.drop(columns=["url", "organization"])
        df = df[["model", "agent_type", "filesystem_score", "google_score", "GUI_score", "model_family"]]
        return df

    def refresh(self):
        """Re-download the public results dataset and rebuild the table.

        Returns:
            pd.DataFrame: The refreshed leaderboard table, also cached on
            ``self.pd_eval_results`` for the Gradio component.
        """
        self.eval_results = load_dataset(
            RESULTS_DATASET,
            token=TOKEN,
            # Always bypass the local cache so the table reflects the hub.
            download_mode="force_redownload",
        )
        self.pd_eval_results = self.dataset2table(self.eval_results)
        return self.pd_eval_results

    def add_new_eval(
        self,
        model_name: str,
        model_family: str,
        agent_type: str,
        url: str,
        uploaded_file_path: str,
        organization: str,
        mail: str,
    ):
        """Validate a submission, score its zip archive, and publish it.

        Returns:
            str: An HTML-formatted status message (error, warning, or
            success) produced by the ``format_*`` helpers.
        """
        # Mandatory fields
        if model_name == "":
            return format_error("Model name cannot be empty")
        elif model_family == "":
            return format_error("Model family cannot be empty")
        elif agent_type == "":
            return format_error("Agent type cannot be empty")
        elif uploaded_file_path == "":
            return format_error("File cannot be empty")
        elif organization == "":
            return format_error("Organization cannot be empty")
        elif mail == "":
            return format_error("Mail cannot be empty")
        # Reject duplicates: same model name AND same organization,
        # both compared case-insensitively.
        if model_name.lower() in set([m.lower() for m in self.eval_results["test"]["model"]]) \
            and organization.lower() in set([l.lower() for l in self.eval_results["test"]["organization"]]):
            return format_warning("This model has been already submitted.")
        # Check if the email is valid
        try:
            validate_email(mail, check_deliverability=True)
        except EmailNotValidError:
            return format_error("Invalid email")

        try:
            file_path = Path(uploaded_file_path)
            results_folder_path = file_path.parent / model_name
            # NOTE(review): the zip comes from an untrusted upload;
            # zipfile's extraction sanitizes member paths, but consider
            # validating member names explicitly — confirm threat model.
            with zipfile.ZipFile(file_path, 'r') as zip_file:
                zip_file.extractall(results_folder_path)
            print(results_folder_path)
            scores = self.calc_score(results_folder_path)
            if scores == {}:
                return format_error("No data found in the zip file, please make sure the file structure is correct.")
            eval_entry = {
                "model": model_name,
                "model_family": model_family,
                "agent_type": agent_type,
                "url": url,
                "organization": organization,
            }
            # Flatten per-app scores into leaderboard columns. A distinct
            # loop variable avoids clobbering the `scores` dict while it
            # is being iterated.
            for app, app_scores in scores.items():
                eval_entry[f"{app}_score"] = app_scores["score"]
            print(eval_entry)
            # NOTE(review): this rebinds eval_results from a DatasetDict
            # to a plain Dataset (the updated "test" split); refresh()
            # restores the DatasetDict on the next reload.
            self.eval_results = self.eval_results["test"].add_item(eval_entry)
            self.upload2hub(
                results_folder_path,
                model_name.lower(),
                model_family,
                organization.lower(),
                mail,
                url,
            )
        except Exception as e:
            # Surface any unexpected failure to the UI instead of crashing.
            return format_error(f"Internal Error: {e}")

        return format_log("Submitted successfully")

    def upload2hub(
        self,
        folder_path: Path,
        model_name: str,
        model_family: str,
        organization: str,
        mail: str,
        url: str,
    ) -> None:
        """Push updated results to the public dataset and archive the raw
        submission (including private contact info) to the submission repo.

        Args:
            folder_path: Extracted submission folder to upload.
            model_name: Lower-cased model name (used as repo subpath).
            model_family: Model family label.
            organization: Lower-cased organization (used as repo subpath).
            mail: Contact email; stored only in the submission dataset.
            url: Link to model information.
        """
        self.eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN)
        contact_info = {
            "model": model_name,
            "model_family": model_family,
            "url": url,
            "organization": organization,
            "mail": mail,
        }
        # Contact info travels only with the raw submission upload, not
        # with the public results dataset.
        with open(folder_path / "contact_info.json", "w") as f:
            f.write(json.dumps(contact_info))
        self.api.upload_folder(
            folder_path=folder_path,
            path_in_repo=f"{organization}/{model_name}",
            repo_id=SUBMISSION_DATASET,
            repo_type="dataset",
            token=TOKEN
        )
216
+
217
+
218
if __name__ == "__main__":
    # Loads the current leaderboard from the hub at startup (refresh() runs
    # inside the constructor).
    score_manager = ScoreManager()

    iface = gr.Blocks()
    with iface:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Tab("Results: Test"):
            # Read-only leaderboard table backed by the cached DataFrame.
            # NOTE(review): datatype lists 8 columns but dataset2table
            # produces 6 — confirm the extra entries are intentional.
            leaderboard_table = gr.components.Dataframe(
                value=score_manager.pd_eval_results,
                datatype=["str", "str", "number", "number", "number", "str", "str", "str"],
                interactive=False,
                column_widths=["16%"]
            )
            refresh_button = gr.Button("Refresh")
            # Re-downloads the results dataset and replaces the table value.
            refresh_button.click(
                score_manager.refresh,
                inputs=[],
                outputs=[
                    leaderboard_table,
                ],
            )
        with gr.Accordion("Submit a new model for evaluation (field with * are required)"):
            with gr.Row():
                with gr.Column():
                    # level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                    model_name_textbox = gr.Textbox(label="Model name*")
                    model_family_textbox = gr.Textbox(label="Model family*")
                    agent_type_textbox = gr.Textbox(label="Agent type*")
                    url_textbox = gr.Textbox(label="Url to model information")
                with gr.Column():
                    organization = gr.Textbox(label="Organisation*")
                    mail = gr.Textbox(label="Contact email* (will be stored privately, & used if there is an issue with your submission)")
                    file_output = gr.File(label="Upload model output* (one zip file)")

            submit_button = gr.Button("Submit Eval")
            # Markdown area that displays the HTML status message returned
            # by add_new_eval (error / warning / success).
            submission_result = gr.Markdown()
            # Input order must match add_new_eval's parameter order.
            submit_button.click(
                score_manager.add_new_eval,
                [
                    # level_of_test,
                    model_name_textbox,
                    model_family_textbox,
                    agent_type_textbox,
                    url_textbox,
                    file_output,
                    organization,
                    mail
                ],
                submission_result,
            )
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.9.3
3
+ aiosignal==1.3.1
4
+ altair==5.2.0
5
+ annotated-types==0.6.0
6
+ anyio==4.3.0
7
+ attrs==23.2.0
8
+ certifi==2024.2.2
9
+ charset-normalizer==3.3.2
10
+ click==8.1.7
11
+ colorama==0.4.6
12
+ contourpy==1.2.0
13
+ cycler==0.12.1
14
+ datasets==2.18.0
15
+ dill==0.3.8
16
+ dnspython==2.6.1
17
+ email_validator==2.1.1
18
+ fastapi==0.110.0
19
+ ffmpy==0.3.2
20
+ filelock==3.13.2
21
+ fonttools==4.50.0
22
+ frozenlist==1.4.1
23
+ fsspec==2024.2.0
24
+ gradio==4.22.0
25
+ gradio_client==0.13.0
26
+ h11==0.14.0
27
+ httpcore==1.0.4
28
+ httpx==0.27.0
29
+ huggingface-hub==0.22.0
30
+ idna==3.6
31
+ importlib_resources==6.4.0
32
+ Jinja2==3.1.3
33
+ jsonschema==4.21.1
34
+ jsonschema-specifications==2023.12.1
35
+ kiwisolver==1.4.5
36
+ markdown-it-py==3.0.0
37
+ MarkupSafe==2.1.5
38
+ matplotlib==3.8.3
39
+ mdurl==0.1.2
40
+ multidict==6.0.5
41
+ multiprocess==0.70.16
42
+ numpy==1.26.4
43
+ orjson==3.9.15
44
+ packaging==24.0
45
+ pandas==2.2.1
46
+ pillow==10.2.0
47
+ pyarrow==15.0.2
48
+ pyarrow-hotfix==0.6
49
+ pydantic==2.6.4
50
+ pydantic_core==2.16.3
51
+ pydub==0.25.1
52
+ Pygments==2.17.2
53
+ pyparsing==3.1.2
54
+ python-dateutil==2.9.0.post0
55
+ python-multipart==0.0.9
56
+ pytz==2024.1
57
+ PyYAML==6.0.1
58
+ referencing==0.34.0
59
+ requests==2.31.0
60
+ rich==13.7.1
61
+ rpds-py==0.18.0
62
+ ruff==0.3.4
63
+ semantic-version==2.10.0
64
+ shellingham==1.5.4
65
+ six==1.16.0
66
+ sniffio==1.3.1
67
+ starlette==0.36.3
68
+ tomlkit==0.12.0
69
+ toolz==0.12.1
70
+ tqdm==4.66.2
71
+ typer==0.10.0
72
+ typing_extensions==4.10.0
73
+ tzdata==2024.1
74
+ urllib3==2.2.1
75
+ uvicorn==0.29.0
76
+ websockets==11.0.3
77
+ xxhash==3.4.1
78
+ yarl==1.9.4
utils.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ TITLE = """<h1 align="center" id="space-title">Agent-Studio Leaderboard</h1>"""
4
+
5
+ INTRODUCTION_TEXT = """
6
+ AgentStudio is an open toolkit covering the entire lifespan of
7
+ building virtual agents that can interact with everything on digital worlds. Here, we open-source the beta of environment implementations, benchmark suite, data collection pipeline, and graphical interfaces to promote research towards generalist virtual agents of the future.
8
+
9
+ ## Submissions
10
+ You should submit a zip file containing the agent-studio output.
11
+
12
+ **Do not change the file names**. The file name is used to identify the scores of each category.
13
+
14
+ The file structure should be as follows:
15
+ ```
16
+ results.zip
17
+ β”œβ”€β”€ filesystem.jsonl
18
+ β”œβ”€β”€ gcalendar.jsonl
19
+ β”œβ”€β”€ gdocs.jsonl
20
+ β”œβ”€β”€ gmail.jsonl
21
+ β”œβ”€β”€ vscode.jsonl
22
+ β”œβ”€β”€ desktop_hard.jsonl
23
+ β”œβ”€β”€ ...
24
+ ```
25
+
26
+ """
27
+
28
def format_error(msg):
    """Render *msg* as a red, centered HTML paragraph (error banner)."""
    style = "color: red; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
30
+
31
def format_warning(msg):
    """Render *msg* as an orange, centered HTML paragraph (warning banner)."""
    style = "color: orange; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
33
+
34
def format_log(msg):
    """Render *msg* as a green, centered HTML paragraph (success banner)."""
    style = "color: green; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
36
+
37
def model_hyperlink(link, model_name):
    """Render *model_name* as a dotted-underline anchor to *link*, opening in a new tab."""
    style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{style}">{model_name}</a>'
39
+
40
+
41
+ def read_jsonl(file_path: str, start_idx: int = 0, end_idx: int | None = None) -> list:
42
+ """Reads lines from a .jsonl file between start_idx and end_idx.
43
+
44
+ Args:
45
+ file_path (str): Path to the .jsonl file
46
+ start_idx (int, optional): The starting index of lines to read
47
+ end_idx (int | None, optional): The ending index of lines to read
48
+
49
+ Returns:
50
+ list[dict]: A list of dictionaries, each dictionary is a line from
51
+ the .jsonl file
52
+ """
53
+ if end_idx is not None and start_idx > end_idx:
54
+ raise ValueError("start_idx must be less or equal to end_idx")
55
+
56
+ data = []
57
+ with open(file_path, "r") as file:
58
+ for i, line in enumerate(file):
59
+ if end_idx is not None and i >= end_idx:
60
+ break
61
+ if i >= start_idx:
62
+ data.append(json.loads(line))
63
+
64
+ return data
65
+
66
+
67
def add_jsonl(data: list, file_path: str, mode="a"):
    """Write a list of JSON-serializable objects to a .jsonl file, one per line.

    Args:
        data (list[dict]): JSON objects to append to the file
        file_path (str): Path to the .jsonl file
        mode (str, optional): File-open mode; the default "a" appends
    """
    with open(file_path, mode) as out:
        out.writelines(json.dumps(obj) + "\n" for obj in data)
78
+