thisiszy committed on
Commit
5f7cbd2
•
1 Parent(s): 2c113e5

update leaderboard

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +145 -57
  3. utils.py +36 -6
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Agent-Studio-Leaderboard
3
- emoji: 🌍
4
  colorFrom: pink
5
  colorTo: red
6
  sdk: gradio
 
1
  ---
2
  title: Agent-Studio-Leaderboard
3
+ emoji: 🏆
4
  colorFrom: pink
5
  colorTo: red
6
  sdk: gradio
app.py CHANGED
@@ -23,21 +23,27 @@ from utils import (
23
  TOKEN = os.environ.get("TOKEN", None)
24
  OWNER="agent-studio"
25
 
26
- RESULTS_FILE = f"hf://datasets/{OWNER}/submitted_results/leaderboard/results.parquet"
 
27
  SUBMISSION_DATASET = f"{OWNER}/submitted_results"
 
 
 
28
 
29
 
30
  class ScoreManager:
31
  def __init__(self) -> None:
32
- self.apps = ["filesystem", "google", "GUI"]
33
  self.eval_results : pd.DataFrame
34
- self.pd_grounding_results: pd.DataFrame
 
 
35
  self.api = HfApi(token=TOKEN)
36
  self.fs = HfFileSystem(token=TOKEN)
37
  self.refresh()
38
 
39
- def calc_score(self, base_path: Path):
40
- apps = self.apps
 
41
 
42
  scores_per_app = {}
43
  for app in apps:
@@ -101,6 +107,46 @@ class ScoreManager:
101
 
102
  return scores_per_app
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  @staticmethod
105
  def to_displayed_table(df: pd.DataFrame):
106
  df['model'] = df.apply(
@@ -108,22 +154,30 @@ class ScoreManager:
108
  axis=1
109
  )
110
  df = df.drop(columns=["url", "organization"])
111
- df = df[["model", "agent_type", "filesystem_score", "google_score", "GUI_score", "model_family"]]
112
  df = df.sort_values(by="model")
 
113
  return df
114
 
115
 
116
  def refresh(self):
117
  try:
118
- with self.fs.open(RESULTS_FILE, "rb") as f:
119
  self.eval_results = pd.read_parquet(f)
120
  except FileNotFoundError:
121
  self.eval_results = pd.DataFrame(
122
- columns=["model", "agent_type", "filesystem_score", "google_score", "GUI_score", "organization", "url", "model_family"]
 
 
 
 
 
 
 
123
  )
124
 
125
- self.pd_eval_results = self.to_displayed_table(self.eval_results)
126
- return self.pd_eval_results
 
127
 
128
  def add_new_eval(
129
  self,
@@ -143,12 +197,12 @@ class ScoreManager:
143
  return format_error("Model family cannot be empty")
144
  elif agent_type == "":
145
  return format_error("Agent type cannot be empty")
146
- elif uploaded_file_path == "":
147
- return format_error("File cannot be empty")
148
  elif organization == "":
149
  return format_error("Organization cannot be empty")
150
  elif mail == "":
151
  return format_error("Mail cannot be empty")
 
 
152
  # Check if the model has been already submitted
153
  if model_name.lower() in set([m.lower() for m in self.eval_results["model"]]) \
154
  and organization.lower() in set([l.lower() for l in self.eval_results["organization"]]):
@@ -165,32 +219,73 @@ class ScoreManager:
165
  with zipfile.ZipFile(file_path, 'r') as zip_file:
166
  zip_file.extractall(results_folder_path)
167
  print(results_folder_path)
168
- scores = self.calc_score(results_folder_path)
169
- if scores == {}:
170
- return format_error("No data found in the zip file, please make sure the file structure is correct.")
171
- eval_entry = {
172
  "model": model_name,
173
  "model_family": model_family,
174
- "agent_type": agent_type,
175
  "url": url,
176
  "organization": organization,
 
177
  }
178
- for app, scores in scores.items():
179
- eval_entry[f"{app}_score"] = scores["score"]
180
- print(eval_entry)
181
- self.eval_results = pd.concat(
182
- [self.eval_results, pd.DataFrame([eval_entry])],
183
- ignore_index=True
184
- )
185
 
186
- self.upload2hub(
187
- results_folder_path,
188
- model_name.lower(),
189
- model_family,
190
- organization.lower(),
191
- mail,
192
- url,
193
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  except Exception as e:
195
  return format_error(f"Internal Error: {e}")
196
 
@@ -198,27 +293,19 @@ class ScoreManager:
198
 
199
  def upload2hub(
200
  self,
 
 
201
  folder_path: Path,
202
- model_name: str,
203
- model_family: str,
204
- organization: str,
205
- mail: str,
206
- url: str,
207
  ) -> None:
208
- with self.fs.open(RESULTS_FILE, "wb") as f:
209
- self.eval_results.to_parquet(f)
210
- contact_info = {
211
- "model": model_name,
212
- "model_family": model_family,
213
- "url": url,
214
- "organization": organization,
215
- "mail": mail,
216
- }
217
  with open(folder_path / "contact_info.json", "w") as f:
218
  f.write(json.dumps(contact_info))
219
  self.api.upload_folder(
220
  folder_path=folder_path,
221
- path_in_repo=f"origin/{organization}/{model_name}",
222
  repo_id=SUBMISSION_DATASET,
223
  repo_type="dataset",
224
  )
@@ -233,25 +320,26 @@ if __name__ == "__main__":
233
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
234
  with gr.Tabs(elem_classes="main_tabs") as main_tabs:
235
  with gr.TabItem("🌍 Real-world tasks table", id=0):
236
- leaderboard_table = gr.components.Dataframe(
237
- value=score_manager.pd_eval_results,
238
- datatype=["str", "str", "number", "number", "number", "str", "str", "str"],
239
  interactive=False,
240
- column_widths=["16%"]
241
  )
242
  with gr.TabItem("🌍 GUI grounding tasks table", id=1):
243
- leaderboard_table = gr.components.Dataframe(
244
- value=score_manager.pd_grounding_results,
245
- datatype=["str", "str", "number", "number", "number", "str", "str", "str"],
246
  interactive=False,
247
- column_widths=["16%"]
248
  )
249
  refresh_button = gr.Button("Refresh")
250
  refresh_button.click(
251
  score_manager.refresh,
252
  inputs=[],
253
  outputs=[
254
- leaderboard_table,
 
255
  ],
256
  )
257
  with gr.Accordion("Submit a new model for evaluation (field with * are required)"):
@@ -263,7 +351,7 @@ if __name__ == "__main__":
263
  agent_type_textbox = gr.Textbox(label="Agent type*")
264
  url_textbox = gr.Textbox(label="Url to model information")
265
  with gr.Column():
266
- organization = gr.Textbox(label="Organisation*")
267
  mail = gr.Textbox(label="Contact email* (will be stored privately, & used if there is an issue with your submission)")
268
  file_output = gr.File(label="Upload model output* (one zip file)")
269
 
 
23
  TOKEN = os.environ.get("TOKEN", None)
24
  OWNER="agent-studio"
25
 
26
+ REAL_WORLD_RESULTS_FILE = f"hf://datasets/{OWNER}/submitted_results/leaderboard/real_world_result.parquet"
27
+ GUI_GROUNDING_RESULTS_FILE = f"hf://datasets/{OWNER}/submitted_results/leaderboard/gui_grounding_result.parquet"
28
  SUBMISSION_DATASET = f"{OWNER}/submitted_results"
29
+ GROUNDING_FOLDER = f"hf://datasets/{OWNER}/agent-studio-data/grounding"
30
+
31
+
32
 
33
 
34
  class ScoreManager:
35
  def __init__(self) -> None:
 
36
  self.eval_results : pd.DataFrame
37
+ self.display_eval_results : pd.DataFrame
38
+ self.grounding_results: pd.DataFrame
39
+ self.display_grounding_results: pd.DataFrame
40
  self.api = HfApi(token=TOKEN)
41
  self.fs = HfFileSystem(token=TOKEN)
42
  self.refresh()
43
 
44
+ @staticmethod
45
+ def calc_real_task_scores(base_path: Path):
46
+ apps = ["filesystem", "google", "GUI"]
47
 
48
  scores_per_app = {}
49
  for app in apps:
 
107
 
108
  return scores_per_app
109
 
110
+ def calc_gui_grounding_scores(self, base_path: Path):
111
+ def calc_per_app_grounding_scores(result_dict, task_configs):
112
+ total_tasks = len(result_dict)
113
+ task_ids = set([task_config["task_id"] for task_config in task_configs])
114
+ success = 0
115
+ for result in result_dict:
116
+ if result["task_id"] not in task_ids:
117
+ raise ValueError(f"Task id {result['task_id']} not found!")
118
+ if result["score"] == 1.0:
119
+ success += 1
120
+
121
+ return {
122
+ "score": success / total_tasks * 100,
123
+ "total_tasks": total_tasks,
124
+ "success_tasks": success,
125
+ }
126
+
127
+ scores_per_os = {}
128
+ for os in base_path.iterdir():
129
+ if not os.is_dir():
130
+ continue
131
+ try:
132
+ scores_per_app = {}
133
+ for app in os.iterdir():
134
+ if not app.is_dir():
135
+ continue
136
+ with self.fs.open(
137
+ f"{GROUNDING_FOLDER}/{app.relative_to(base_path).as_posix()}/actions.jsonl",
138
+ "r"
139
+ ) as f:
140
+ task_configs = read_jsonl(f)
141
+ results_dict = read_jsonl((base_path / os / app / "results.jsonl").as_posix())
142
+ results = calc_per_app_grounding_scores(results_dict, task_configs)
143
+ scores_per_app[app.name] = results
144
+ scores_per_os[os.name] = scores_per_app
145
+ except FileNotFoundError:
146
+ print(f"No data found for {os.name}")
147
+ continue
148
+ return scores_per_os
149
+
150
  @staticmethod
151
  def to_displayed_table(df: pd.DataFrame):
152
  df['model'] = df.apply(
 
154
  axis=1
155
  )
156
  df = df.drop(columns=["url", "organization"])
 
157
  df = df.sort_values(by="model")
158
+ df = df.map(lambda x: round(x, 2) if isinstance(x, float) else x)
159
  return df
160
 
161
 
162
  def refresh(self):
163
  try:
164
+ with self.fs.open(REAL_WORLD_RESULTS_FILE, "rb") as f:
165
  self.eval_results = pd.read_parquet(f)
166
  except FileNotFoundError:
167
  self.eval_results = pd.DataFrame(
168
+ columns=["model", "agent_type", "filesystem (%)", "google (%)", "GUI (%)", "organization", "url", "model_family"]
169
+ )
170
+ try:
171
+ with self.fs.open(GUI_GROUNDING_RESULTS_FILE, "rb") as f:
172
+ self.grounding_results = pd.read_parquet(f)
173
+ except FileNotFoundError:
174
+ self.grounding_results = pd.DataFrame(
175
+ columns=["model", "agent_type", "windows (%)", "linux (%)", "macos (%)", "organization", "url", "model_family"]
176
  )
177
 
178
+ self.display_eval_results = self.to_displayed_table(self.eval_results)
179
+ self.display_grounding_results = self.to_displayed_table(self.grounding_results)
180
+ return self.display_eval_results, self.display_grounding_results
181
 
182
  def add_new_eval(
183
  self,
 
197
  return format_error("Model family cannot be empty")
198
  elif agent_type == "":
199
  return format_error("Agent type cannot be empty")
 
 
200
  elif organization == "":
201
  return format_error("Organization cannot be empty")
202
  elif mail == "":
203
  return format_error("Mail cannot be empty")
204
+ elif uploaded_file_path == "":
205
+ return format_error("File cannot be empty")
206
  # Check if the model has been already submitted
207
  if model_name.lower() in set([m.lower() for m in self.eval_results["model"]]) \
208
  and organization.lower() in set([l.lower() for l in self.eval_results["organization"]]):
 
219
  with zipfile.ZipFile(file_path, 'r') as zip_file:
220
  zip_file.extractall(results_folder_path)
221
  print(results_folder_path)
222
+ contact_info = {
 
 
 
223
  "model": model_name,
224
  "model_family": model_family,
 
225
  "url": url,
226
  "organization": organization,
227
+ "mail": mail,
228
  }
 
 
 
 
 
 
 
229
 
230
+ if dataset_selection == "Real-world tasks":
231
+ scores = self.calc_real_task_scores(results_folder_path)
232
+ if scores == {}:
233
+ return format_error("No data found in the zip file, please make sure the file structure is correct.")
234
+ eval_entry = {
235
+ "model": model_name,
236
+ "model_family": model_family,
237
+ "agent_type": agent_type,
238
+ "url": url,
239
+ "organization": organization,
240
+ }
241
+ for app, scores in scores.items():
242
+ eval_entry[f"{app} (%)"] = scores["score"]
243
+ print(eval_entry)
244
+ self.eval_results = pd.concat(
245
+ [self.eval_results, pd.DataFrame([eval_entry])],
246
+ ignore_index=True
247
+ )
248
+
249
+ self.upload2hub(
250
+ results_path=REAL_WORLD_RESULTS_FILE,
251
+ results=self.eval_results,
252
+ folder_path=results_folder_path,
253
+ path_in_repo=f"origin/{organization}/{model_name}/real_world",
254
+ contact_info=contact_info,
255
+ )
256
+ elif dataset_selection == "GUI grounding tasks":
257
+ scores = self.calc_gui_grounding_scores(results_folder_path)
258
+ if scores == {}:
259
+ return format_error("No data found in the zip file, please make sure the file structure is correct.")
260
+ print(scores)
261
+ eval_entry = {
262
+ "model": model_name,
263
+ "model_family": model_family,
264
+ "agent_type": agent_type,
265
+ "url": url,
266
+ "organization": organization,
267
+ }
268
+ for os, app_scores in scores.items():
269
+ succ = 0
270
+ total = 0
271
+ for app, score in app_scores.items():
272
+ succ += score["success_tasks"]
273
+ total += score["total_tasks"]
274
+ eval_entry[f"{os} (%)"] = succ / total * 100
275
+ self.grounding_results = pd.concat(
276
+ [self.display_grounding_results, pd.DataFrame([eval_entry])],
277
+ ignore_index=True
278
+ )
279
+
280
+ self.upload2hub(
281
+ results_path=GUI_GROUNDING_RESULTS_FILE,
282
+ results=self.grounding_results,
283
+ folder_path=results_folder_path,
284
+ path_in_repo=f"origin/{organization}/{model_name}/grounding",
285
+ contact_info=contact_info,
286
+ )
287
+ else:
288
+ return format_error("Invalid dataset selection")
289
  except Exception as e:
290
  return format_error(f"Internal Error: {e}")
291
 
 
293
 
294
  def upload2hub(
295
  self,
296
+ results_path: str,
297
+ results: pd.DataFrame,
298
  folder_path: Path,
299
+ path_in_repo: str,
300
+ contact_info: str,
 
 
 
301
  ) -> None:
302
+ with self.fs.open(results_path, "wb") as f:
303
+ results.to_parquet(f)
 
 
 
 
 
 
 
304
  with open(folder_path / "contact_info.json", "w") as f:
305
  f.write(json.dumps(contact_info))
306
  self.api.upload_folder(
307
  folder_path=folder_path,
308
+ path_in_repo=path_in_repo,
309
  repo_id=SUBMISSION_DATASET,
310
  repo_type="dataset",
311
  )
 
320
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
321
  with gr.Tabs(elem_classes="main_tabs") as main_tabs:
322
  with gr.TabItem("🌍 Real-world tasks table", id=0):
323
+ leaderboard_real_world_table = gr.components.Dataframe(
324
+ value=score_manager.display_eval_results,
325
+ datatype=["str", "str", "number", "number", "number", "str"],
326
  interactive=False,
327
+ column_widths=["20%"]
328
  )
329
  with gr.TabItem("🌍 GUI grounding tasks table", id=1):
330
+ leaderboard_gui_grounding_table = gr.components.Dataframe(
331
+ value=score_manager.display_grounding_results,
332
+ datatype=["str", "str", "number", "number", "number", "str"],
333
  interactive=False,
334
+ column_widths=["20%"]
335
  )
336
  refresh_button = gr.Button("Refresh")
337
  refresh_button.click(
338
  score_manager.refresh,
339
  inputs=[],
340
  outputs=[
341
+ leaderboard_real_world_table,
342
+ leaderboard_gui_grounding_table,
343
  ],
344
  )
345
  with gr.Accordion("Submit a new model for evaluation (field with * are required)"):
 
351
  agent_type_textbox = gr.Textbox(label="Agent type*")
352
  url_textbox = gr.Textbox(label="Url to model information")
353
  with gr.Column():
354
+ organization = gr.Textbox(label="Organization*")
355
  mail = gr.Textbox(label="Contact email* (will be stored privately, & used if there is an issue with your submission)")
356
  file_output = gr.File(label="Upload model output* (one zip file)")
357
 
utils.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
 
3
  TITLE = """<h1 align="center" id="space-title">Agent-Studio Leaderboard</h1>"""
4
 
@@ -11,6 +12,8 @@ You should submit a zip file containing the agent-studio output.
11
 
12
  **Do not change the file names**. The file name is used to identify the scores of each category.
13
 
 
 
14
  The file structure should be as follows:
15
  ```
16
  results.zip
@@ -23,6 +26,26 @@ results.zip
23
  ├── ...
24
  ```
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  """
27
 
28
  def format_error(msg):
@@ -38,11 +61,11 @@ def model_hyperlink(link, model_name):
38
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
39
 
40
 
41
- def read_jsonl(file_path: str, start_idx: int = 0, end_idx: int | None = None) -> list:
42
  """Reads lines from a .jsonl file between start_idx and end_idx.
43
 
44
  Args:
45
- file_path (str): Path to the .jsonl file
46
  start_idx (int, optional): The starting index of lines to read
47
  end_idx (int | None, optional): The ending index of lines to read
48
 
@@ -54,7 +77,14 @@ def read_jsonl(file_path: str, start_idx: int = 0, end_idx: int | None = None) -
54
  raise ValueError("start_idx must be less or equal to end_idx")
55
 
56
  data = []
57
- with open(file_path, "r") as file:
 
 
 
 
 
 
 
58
  for i, line in enumerate(file):
59
  if end_idx is not None and i >= end_idx:
60
  break
@@ -64,14 +94,14 @@ def read_jsonl(file_path: str, start_idx: int = 0, end_idx: int | None = None) -
64
  return data
65
 
66
 
67
- def add_jsonl(data: list, file_path: str, mode="a"):
68
  """Adds a list of dictionaries to a .jsonl file.
69
 
70
  Args:
71
  data (list[dict]): A list of json objects to add to the file
72
- file_path (str): Path to the .jsonl file
73
  """
74
- with open(file_path, mode) as file:
75
  for item in data:
76
  json_str = json.dumps(item)
77
  file.write(json_str + "\n")
 
1
  import json
2
+ from io import TextIOWrapper
3
 
4
  TITLE = """<h1 align="center" id="space-title">Agent-Studio Leaderboard</h1>"""
5
 
 
12
 
13
  **Do not change the file names**. The file name is used to identify the scores of each category.
14
 
15
+ ### Real-world tasks
16
+
17
  The file structure should be as follows:
18
  ```
19
  results.zip
 
26
  ├── ...
27
  ```
28
 
29
+ ### GUI grounding tasks
30
+
31
+ The file structure should be as follows:
32
+ ```
33
+ results.zip
34
+ ├── linux
35
+ │ ├── browser
36
+ │ │ ├── results.jsonl
37
+ | ├── os
38
+ │ │ ├── results.jsonl
39
+ │ ├── ...
40
+ ├── windows
41
+ | ├── word
42
+ │ │ ├── results.jsonl
43
+ | ├── os
44
+ │ │ ├── results.jsonl
45
+ │ ├── ...
46
+ ├── macos
47
+ ```
48
+
49
  """
50
 
51
  def format_error(msg):
 
61
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
62
 
63
 
64
+ def read_jsonl(file: str | TextIOWrapper, start_idx: int = 0, end_idx: int | None = None) -> list:
65
  """Reads lines from a .jsonl file between start_idx and end_idx.
66
 
67
  Args:
68
+ file (str | TextIOWrapper): Path to the .jsonl file or an open file object
69
  start_idx (int, optional): The starting index of lines to read
70
  end_idx (int | None, optional): The ending index of lines to read
71
 
 
77
  raise ValueError("start_idx must be less or equal to end_idx")
78
 
79
  data = []
80
+ if isinstance(file, str):
81
+ with open(file, "r") as file:
82
+ for i, line in enumerate(file):
83
+ if end_idx is not None and i >= end_idx:
84
+ break
85
+ if i >= start_idx:
86
+ data.append(json.loads(line))
87
+ else:
88
  for i, line in enumerate(file):
89
  if end_idx is not None and i >= end_idx:
90
  break
 
94
  return data
95
 
96
 
97
+ def add_jsonl(data: list, file: str, mode="a"):
98
  """Adds a list of dictionaries to a .jsonl file.
99
 
100
  Args:
101
  data (list[dict]): A list of json objects to add to the file
102
+ file (str): Path to the .jsonl file
103
  """
104
+ with open(file, mode) as file:
105
  for item in data:
106
  json_str = json.dumps(item)
107
  file.write(json_str + "\n")