simonrosenberg1 commited on
Commit
bef7ade
·
verified ·
1 Parent(s): 7949205

Show ACP agent results in the leaderboard

Browse files

## Summary

The HF Space currently only loads `results/{model}/` (default OpenHands runs).
The ACP runs (`acp-claude`, `acp-codex`, `acp-gemini`, `openhands_subagents`)
live in `alternative_agents/{type}/{model}/` in the openhands-index-results
repo and were never loaded into the dataframe, so the website silently dropped
them. After OpenHands/openhands-index-results#820–#829 + #830, all the ACP
Claude Code data from the master table in OpenHands/benchmarks#576 is in
the canonical location, but the leaderboard still doesn't show it.

This PR teaches the loader to ingest `alternative_agents/` and adds an
**Agent** column to the leaderboard so OpenHands vs Claude Code vs Codex
vs Gemini CLI are visible at a glance.

## Changes

- **`setup_data.py`** — copy `alternative_agents/` alongside `results/` when fetching the index repo, so all submissions land in the data dir.
- **`simple_data_loader.py`**:
- Factor per-directory loading into `_records_from_agent_dir` and have `_load_from_agent_dirs` walk both `results/` and `alternative_agents/{type}/{model}/`.
- Default `agent_name` per `agent_type` (Claude Code / Codex / Gemini CLI / OpenHands Sub-agents), matching the `AGENT_NAME_BY_TYPE` map in `push_to_index_from_archive.py` in `OpenHands/evaluation`.
- Include `agent_name` in `agent_id` (`name_version_model`) so an OpenHands run and a Claude Code run on the same SDK version + model don't collide into one row.
- Surface `agent_name` on the transformed record.
- **`leaderboard_transformer.py`**:
- Map `agent_name` → "Agent" in `_pretty_column_name`.
- Insert "Agent" into `base_cols` between `id` and `Language Model`.

## Local verification

Cloned the latest `openhands-index-results` and pointed the loader at it.
The loader now returns 29 rows: 24 OpenHands + 2 Claude Code + 1 Codex + 2
OpenHands Sub-agents. The new Claude Code rows match the master table in
OpenHands/benchmarks#576:

```
Claude Code / claude-opus-4-6: swebench 74.4 swtbench 66.7 gaia 66.1 commit0 50.0 swe-bench-multimodal 32.4
Claude Code / claude-sonnet-4-5: swebench 74.4 swtbench 69.3 gaia 63.0 commit0 31.2 swe-bench-multimodal 35.3
```

## Test plan

- [ ] Reviewer: load the Space preview built from this PR, confirm the leaderboard table now has an **Agent** column and shows Claude Code / Codex / OpenHands Sub-agents rows.
- [ ] Confirm the existing OpenHands rows look unchanged (same scores, no missing entries).

Files changed (3) hide show
  1. leaderboard_transformer.py +6 -1
  2. setup_data.py +17 -5
  3. simple_data_loader.py +120 -56
leaderboard_transformer.py CHANGED
@@ -655,6 +655,7 @@ def _pretty_column_name(raw_col: str) -> str:
655
  # Case 1: Handle fixed, special-case mappings first.
656
  fixed_mappings = {
657
  'id': 'id',
 
658
  'SDK version': 'SDK Version',
659
  'Openhands version': 'SDK Version', # Legacy support
660
  'Language model': 'Language Model',
@@ -815,7 +816,11 @@ class DataTransformer:
815
  df_view = df_sorted.copy()
816
 
817
  # --- 3. Add Columns for Agent Openness ---
818
- base_cols = ["id","Language Model","SDK Version","Source"]
 
 
 
 
819
  new_cols = ["Openness"]
820
  ending_cols = ["Date", "Logs", "Visualization"]
821
 
 
655
  # Case 1: Handle fixed, special-case mappings first.
656
  fixed_mappings = {
657
  'id': 'id',
658
+ 'agent_name': 'Agent',
659
  'SDK version': 'SDK Version',
660
  'Openhands version': 'SDK Version', # Legacy support
661
  'Language model': 'Language Model',
 
816
  df_view = df_sorted.copy()
817
 
818
  # --- 3. Add Columns for Agent Openness ---
819
+ # "Agent" sits between id and Language Model so OpenHands vs
820
+ # alternative agents (Claude Code / Codex / Gemini CLI) are visible
821
+ # at a glance, and so the same model with two different agents
822
+ # doesn't look like a duplicate row.
823
+ base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
824
  new_cols = ["Openness"]
825
  ending_cols = ["Date", "Logs", "Visualization"]
826
 
setup_data.py CHANGED
@@ -70,27 +70,39 @@ def fetch_data_from_github():
70
 
71
  # Look for data files in the cloned repository
72
  results_source = clone_dir / "results"
73
-
74
  if not results_source.exists():
75
  print(f"Results directory not found in repository")
76
  return False
77
-
78
  # Check if there are any agent result directories
79
  result_dirs = list(results_source.iterdir())
80
  if not result_dirs:
81
  print(f"No agent results found in {results_source}")
82
  return False
83
-
84
  print(f"Found {len(result_dirs)} agent result directories")
85
-
86
  # Create target directory and copy the results structure
87
  os.makedirs(target_dir.parent, exist_ok=True)
88
  if target_dir.exists():
89
  shutil.rmtree(target_dir)
90
-
91
  # Copy the entire results directory
92
  target_results = target_dir / "results"
93
  shutil.copytree(results_source, target_results)
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
96
 
 
70
 
71
  # Look for data files in the cloned repository
72
  results_source = clone_dir / "results"
73
+
74
  if not results_source.exists():
75
  print(f"Results directory not found in repository")
76
  return False
77
+
78
  # Check if there are any agent result directories
79
  result_dirs = list(results_source.iterdir())
80
  if not result_dirs:
81
  print(f"No agent results found in {results_source}")
82
  return False
83
+
84
  print(f"Found {len(result_dirs)} agent result directories")
85
+
86
  # Create target directory and copy the results structure
87
  os.makedirs(target_dir.parent, exist_ok=True)
88
  if target_dir.exists():
89
  shutil.rmtree(target_dir)
90
+
91
  # Copy the entire results directory
92
  target_results = target_dir / "results"
93
  shutil.copytree(results_source, target_results)
94
+
95
+ # Also copy alternative_agents/ if present, so the loader can pick up
96
+ # ACP runs (acp-claude, acp-codex, acp-gemini, openhands_subagents, ...)
97
+ # alongside the default OpenHands agent results.
98
+ alt_source = clone_dir / "alternative_agents"
99
+ if alt_source.exists():
100
+ alt_target = target_dir / "alternative_agents"
101
+ shutil.copytree(alt_source, alt_target)
102
+ agent_types = sorted(p.name for p in alt_source.iterdir() if p.is_dir())
103
+ print(f"Found alternative agent types: {agent_types}")
104
+ else:
105
+ print("No alternative_agents/ directory in repository (skipping)")
106
 
107
  print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
108
 
simple_data_loader.py CHANGED
@@ -127,55 +127,109 @@ class SimpleLeaderboardViewer:
127
  if benchmark not in self.tag_map[category]:
128
  self.tag_map[category].append(benchmark)
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def _load_from_agent_dirs(self):
131
- """Load data from new agent-centric directory structure (results/YYYYMMDD_model/)."""
132
- results_dir = self.config_path / "results"
133
-
134
- if not results_dir.exists():
135
- return None # Fall back to old format
136
-
 
 
137
  all_records = []
138
  all_validation_errors = []
139
-
140
- # Iterate through each agent directory
141
- for agent_dir in results_dir.iterdir():
142
- if not agent_dir.is_dir():
143
- continue
144
-
145
- # Load and validate using pydantic models
146
- metadata, scores, errors = load_and_validate_agent_data(agent_dir)
147
-
148
- if errors:
149
  all_validation_errors.extend(errors)
150
-
151
- if metadata is None or scores is None:
152
- continue
153
-
154
- # Skip entries that are hidden from the leaderboard
155
- if metadata.get('hide_from_leaderboard', False):
156
- logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
157
- continue
158
-
159
- # Create one record per benchmark (mimicking old JSONL format)
160
- for score_entry in scores:
161
- record = {
162
- 'agent_version': metadata.get('agent_version', 'Unknown'),
163
- 'llm_base': metadata.get('model', 'unknown'),
164
- 'openness': metadata.get('openness', 'unknown'),
165
- 'submission_time': score_entry.get('submission_time', metadata.get('submission_time', '')),
166
- 'release_date': metadata.get('release_date', ''), # Model release date
167
- 'parameter_count_b': metadata.get('parameter_count_b'), # Total params in billions
168
- 'active_parameter_count_b': metadata.get('active_parameter_count_b'), # Active params for MoE
169
- 'score': score_entry.get('score'),
170
- 'metric': score_entry.get('metric', 'unknown'),
171
- 'cost_per_instance': score_entry.get('cost_per_instance'),
172
- 'average_runtime': score_entry.get('average_runtime'),
173
- 'tags': [score_entry.get('benchmark')],
174
- 'full_archive': score_entry.get('full_archive', ''), # Download URL for trajectories
175
- 'eval_visualization_page': score_entry.get('eval_visualization_page', ''), # Laminar visualization URL
176
- }
177
- all_records.append(record)
178
-
179
  # Log validation errors if any
180
  if all_validation_errors:
181
  logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
@@ -183,10 +237,10 @@ class SimpleLeaderboardViewer:
183
  logger.warning(f" - {error}")
184
  if len(all_validation_errors) > 5:
185
  logger.warning(f" ... and {len(all_validation_errors) - 5} more")
186
-
187
  if not all_records:
188
- return None # Fall back to old format
189
-
190
  return pd.DataFrame(all_records)
191
 
192
  def _load(self):
@@ -206,26 +260,36 @@ class SimpleLeaderboardViewer:
206
  # Group by agent (version + model combination) to aggregate results across datasets
207
  transformed_records = []
208
 
209
- # Create a unique identifier for each agent (version + model)
210
- df['agent_id'] = df['agent_version'] + '_' + df['llm_base']
211
-
 
 
 
 
 
 
 
 
212
  for agent_id in df['agent_id'].unique():
213
  agent_records = df[df['agent_id'] == agent_id]
214
-
215
  # Build a single record for this agent
216
  first_record = agent_records.iloc[0]
217
  agent_version = first_record['agent_version']
218
-
 
219
  # Normalize openness to "open" or "closed"
220
  from aliases import OPENNESS_MAPPING
221
  raw_openness = first_record['openness']
222
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
223
-
224
  # All 5 categories for the leaderboard
225
  ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
226
-
227
  record = {
228
  # Core agent info - use final display names
 
229
  'SDK version': agent_version, # Will become "SDK Version"
230
  'Language model': first_record['llm_base'], # Will become "Language Model"
231
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
@@ -235,7 +299,7 @@ class SimpleLeaderboardViewer:
235
  'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
236
  'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
237
  # Additional columns expected by the transformer
238
- # Use agent_id (version_model) as unique identifier for Pareto frontier calculation
239
  'id': agent_id,
240
  'source': first_record.get('source', ''), # Will become "Source"
241
  'logs': first_record.get('logs', ''), # Will become "Logs"
 
127
  if benchmark not in self.tag_map[category]:
128
  self.tag_map[category].append(benchmark)
129
 
130
+ # Default agent_name when metadata.json doesn't carry one. Matches the
131
+ # default-agent value used by push_to_index_from_archive.py so legacy
132
+ # entries (which omit the field) still group cleanly with new entries.
133
+ DEFAULT_AGENT_NAME = "OpenHands"
134
+
135
+ def _records_from_agent_dir(self, agent_dir: Path, default_agent_name: str | None = None) -> tuple[list[dict], list[str]]:
136
+ """Build per-benchmark records from a single agent directory.
137
+
138
+ Shared by ``_load_from_agent_dirs`` (default OpenHands results) and
139
+ ``_load_from_alternative_agents_dirs`` (acp-claude / acp-codex / etc.).
140
+ Returns ``(records, validation_errors)``. Returns an empty list of
141
+ records when the directory has no scores or is hidden from the
142
+ leaderboard.
143
+ """
144
+ records: list[dict] = []
145
+ metadata, scores, errors = load_and_validate_agent_data(agent_dir)
146
+
147
+ if metadata is None or scores is None:
148
+ return records, errors
149
+
150
+ if metadata.get('hide_from_leaderboard', False):
151
+ logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
152
+ return records, errors
153
+
154
+ # Resolve the agent display name. Prefer the value stamped into
155
+ # metadata.json by push-to-index; fall back to the directory's
156
+ # default (e.g. "Claude Code" for acp-claude/) and finally to
157
+ # "OpenHands" for legacy results/ entries that predate the field.
158
+ agent_name = (
159
+ metadata.get('agent_name')
160
+ or default_agent_name
161
+ or self.DEFAULT_AGENT_NAME
162
+ )
163
+
164
+ for score_entry in scores:
165
+ record = {
166
+ 'agent_name': agent_name,
167
+ 'agent_version': metadata.get('agent_version', 'Unknown'),
168
+ 'llm_base': metadata.get('model', 'unknown'),
169
+ 'openness': metadata.get('openness', 'unknown'),
170
+ 'submission_time': score_entry.get('submission_time', metadata.get('submission_time', '')),
171
+ 'release_date': metadata.get('release_date', ''),
172
+ 'parameter_count_b': metadata.get('parameter_count_b'),
173
+ 'active_parameter_count_b': metadata.get('active_parameter_count_b'),
174
+ 'score': score_entry.get('score'),
175
+ 'metric': score_entry.get('metric', 'unknown'),
176
+ 'cost_per_instance': score_entry.get('cost_per_instance'),
177
+ 'average_runtime': score_entry.get('average_runtime'),
178
+ 'tags': [score_entry.get('benchmark')],
179
+ 'full_archive': score_entry.get('full_archive', ''),
180
+ 'eval_visualization_page': score_entry.get('eval_visualization_page', ''),
181
+ }
182
+ records.append(record)
183
+ return records, errors
184
+
185
  def _load_from_agent_dirs(self):
186
+ """Load default-agent results plus any alternative_agents/ entries.
187
+
188
+ Reads ``{config}/results/{model}/`` for default OpenHands runs and
189
+ ``{config}/alternative_agents/{type}/{model}/`` for ACP agent runs
190
+ (acp-claude, acp-codex, acp-gemini, ...) so they all surface in the
191
+ same leaderboard. Returns ``None`` if neither directory yields any
192
+ records (which makes the caller render an empty-state placeholder).
193
+ """
194
  all_records = []
195
  all_validation_errors = []
196
+
197
+ # 1. Default OpenHands agent results
198
+ results_dir = self.config_path / "results"
199
+ if results_dir.exists():
200
+ for agent_dir in results_dir.iterdir():
201
+ if not agent_dir.is_dir():
202
+ continue
203
+ records, errors = self._records_from_agent_dir(agent_dir)
204
+ all_records.extend(records)
 
205
  all_validation_errors.extend(errors)
206
+
207
+ # 2. Alternative agents (one subdirectory per agent_type, then per model)
208
+ # Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE map
209
+ # in OpenHands/evaluation push_to_index_from_archive.py — keeping it
210
+ # in sync ensures rows are labelled the same way the index repo
211
+ # records them.
212
+ agent_type_default_name = {
213
+ 'acp-claude': 'Claude Code',
214
+ 'acp-codex': 'Codex',
215
+ 'acp-gemini': 'Gemini CLI',
216
+ 'openhands_subagents': 'OpenHands Sub-agents',
217
+ }
218
+ alt_dir = self.config_path / "alternative_agents"
219
+ if alt_dir.exists():
220
+ for type_dir in alt_dir.iterdir():
221
+ if not type_dir.is_dir():
222
+ continue
223
+ default_name = agent_type_default_name.get(type_dir.name)
224
+ for agent_dir in type_dir.iterdir():
225
+ if not agent_dir.is_dir():
226
+ continue
227
+ records, errors = self._records_from_agent_dir(
228
+ agent_dir, default_agent_name=default_name
229
+ )
230
+ all_records.extend(records)
231
+ all_validation_errors.extend(errors)
232
+
 
 
233
  # Log validation errors if any
234
  if all_validation_errors:
235
  logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
 
237
  logger.warning(f" - {error}")
238
  if len(all_validation_errors) > 5:
239
  logger.warning(f" ... and {len(all_validation_errors) - 5} more")
240
+
241
  if not all_records:
242
+ return None # Caller will render empty-state placeholder
243
+
244
  return pd.DataFrame(all_records)
245
 
246
  def _load(self):
 
260
  # Group by agent (version + model combination) to aggregate results across datasets
261
  transformed_records = []
262
 
263
+ # Create a unique identifier per (agent_name, agent_version, model)
264
+ # tuple. Including agent_name keeps an OpenHands run and a Claude
265
+ # Code run on the same SDK version + model from collapsing into
266
+ # one row when both submit to the leaderboard.
267
+ df['agent_name'] = df['agent_name'].fillna(self.DEFAULT_AGENT_NAME)
268
+ df['agent_id'] = (
269
+ df['agent_name'].astype(str)
270
+ + '_' + df['agent_version'].astype(str)
271
+ + '_' + df['llm_base'].astype(str)
272
+ )
273
+
274
  for agent_id in df['agent_id'].unique():
275
  agent_records = df[df['agent_id'] == agent_id]
276
+
277
  # Build a single record for this agent
278
  first_record = agent_records.iloc[0]
279
  agent_version = first_record['agent_version']
280
+ agent_name = first_record['agent_name']
281
+
282
  # Normalize openness to "open" or "closed"
283
  from aliases import OPENNESS_MAPPING
284
  raw_openness = first_record['openness']
285
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
286
+
287
  # All 5 categories for the leaderboard
288
  ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
289
+
290
  record = {
291
  # Core agent info - use final display names
292
+ 'agent_name': agent_name, # Will become "Agent"
293
  'SDK version': agent_version, # Will become "SDK Version"
294
  'Language model': first_record['llm_base'], # Will become "Language Model"
295
  'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
 
299
  'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
300
  'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
301
  # Additional columns expected by the transformer
302
+ # Use agent_id (name_version_model) as unique identifier for Pareto frontier calculation
303
  'id': agent_id,
304
  'source': first_record.get('source', ''), # Will become "Source"
305
  'logs': first_record.get('logs', ''), # Will become "Logs"