Spaces:
Running
Show ACP agent results in the leaderboard
Browse files
## Summary
The HF Space currently only loads `results/{model}/` (default OpenHands runs).
The ACP runs (`acp-claude`, `acp-codex`, `acp-gemini`, `openhands_subagents`)
live in `alternative_agents/{type}/{model}/` in the openhands-index-results
repo and never made it into the dataframe, so the website silently dropped
them. After OpenHands/openhands-index-results#820–#829 + #830, all the ACP
Claude Code data from the master table in OpenHands/benchmarks#576 is in
the canonical location, but the leaderboard still doesn't show it.
This PR teaches the loader to ingest `alternative_agents/` and adds an
**Agent** column to the leaderboard so OpenHands vs Claude Code vs Codex
vs Gemini CLI are visible at a glance.
## Changes
- **`setup_data.py`** — copy `alternative_agents/` alongside `results/` when fetching the index repo, so all submissions land in the data dir.
- **`simple_data_loader.py`**:
  - Factor per-directory loading into `_records_from_agent_dir` and have `_load_from_agent_dirs` walk both `results/` and `alternative_agents/{type}/{model}/`.
  - Default `agent_name` per `agent_type` (Claude Code / Codex / Gemini CLI / OpenHands Sub-agents), matching the `AGENT_NAME_BY_TYPE` map in `push_to_index_from_archive.py` (in the `OpenHands/evaluation` repo).
  - Include `agent_name` in `agent_id` (`name_version_model`) so an OpenHands run and a Claude Code run on the same SDK version + model don't collide into one row.
  - Surface `agent_name` on the transformed record.
- **`leaderboard_transformer.py`**:
  - Map `agent_name` → "Agent" in `_pretty_column_name`.
  - Insert "Agent" into `base_cols` between `id` and `Language Model`.
## Local verification
Cloned the latest `openhands-index-results` and pointed the loader at it.
The loader now returns 29 rows: 24 OpenHands + 2 Claude Code + 1 Codex + 2
OpenHands Sub-agents. The new Claude Code rows match the master table in
OpenHands/benchmarks#576:
```
Claude Code / claude-opus-4-6: swebench 74.4 swtbench 66.7 gaia 66.1 commit0 50.0 swe-bench-multimodal 32.4
Claude Code / claude-sonnet-4-5: swebench 74.4 swtbench 69.3 gaia 63.0 commit0 31.2 swe-bench-multimodal 35.3
```
## Test plan
- [ ] Reviewer: load the Space preview built from this PR, confirm the leaderboard table now has an **Agent** column and shows Claude Code / Codex / OpenHands Sub-agents rows.
- [ ] Confirm the existing OpenHands rows look unchanged (same scores, no missing entries).
- leaderboard_transformer.py +6 -1
- setup_data.py +17 -5
- simple_data_loader.py +120 -56
|
@@ -655,6 +655,7 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 655 |
# Case 1: Handle fixed, special-case mappings first.
|
| 656 |
fixed_mappings = {
|
| 657 |
'id': 'id',
|
|
|
|
| 658 |
'SDK version': 'SDK Version',
|
| 659 |
'Openhands version': 'SDK Version', # Legacy support
|
| 660 |
'Language model': 'Language Model',
|
|
@@ -815,7 +816,11 @@ class DataTransformer:
|
|
| 815 |
df_view = df_sorted.copy()
|
| 816 |
|
| 817 |
# --- 3. Add Columns for Agent Openness ---
|
| 818 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 819 |
new_cols = ["Openness"]
|
| 820 |
ending_cols = ["Date", "Logs", "Visualization"]
|
| 821 |
|
|
|
|
| 655 |
# Case 1: Handle fixed, special-case mappings first.
|
| 656 |
fixed_mappings = {
|
| 657 |
'id': 'id',
|
| 658 |
+
'agent_name': 'Agent',
|
| 659 |
'SDK version': 'SDK Version',
|
| 660 |
'Openhands version': 'SDK Version', # Legacy support
|
| 661 |
'Language model': 'Language Model',
|
|
|
|
| 816 |
df_view = df_sorted.copy()
|
| 817 |
|
| 818 |
# --- 3. Add Columns for Agent Openness ---
|
| 819 |
+
# "Agent" sits between id and Language Model so OpenHands vs
|
| 820 |
+
# alternative agents (Claude Code / Codex / Gemini CLI) are visible
|
| 821 |
+
# at a glance, and so the same model with two different agents
|
| 822 |
+
# doesn't look like a duplicate row.
|
| 823 |
+
base_cols = ["id", "Agent", "Language Model", "SDK Version", "Source"]
|
| 824 |
new_cols = ["Openness"]
|
| 825 |
ending_cols = ["Date", "Logs", "Visualization"]
|
| 826 |
|
|
@@ -70,27 +70,39 @@ def fetch_data_from_github():
|
|
| 70 |
|
| 71 |
# Look for data files in the cloned repository
|
| 72 |
results_source = clone_dir / "results"
|
| 73 |
-
|
| 74 |
if not results_source.exists():
|
| 75 |
print(f"Results directory not found in repository")
|
| 76 |
return False
|
| 77 |
-
|
| 78 |
# Check if there are any agent result directories
|
| 79 |
result_dirs = list(results_source.iterdir())
|
| 80 |
if not result_dirs:
|
| 81 |
print(f"No agent results found in {results_source}")
|
| 82 |
return False
|
| 83 |
-
|
| 84 |
print(f"Found {len(result_dirs)} agent result directories")
|
| 85 |
-
|
| 86 |
# Create target directory and copy the results structure
|
| 87 |
os.makedirs(target_dir.parent, exist_ok=True)
|
| 88 |
if target_dir.exists():
|
| 89 |
shutil.rmtree(target_dir)
|
| 90 |
-
|
| 91 |
# Copy the entire results directory
|
| 92 |
target_results = target_dir / "results"
|
| 93 |
shutil.copytree(results_source, target_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
|
| 96 |
|
|
|
|
| 70 |
|
| 71 |
# Look for data files in the cloned repository
|
| 72 |
results_source = clone_dir / "results"
|
| 73 |
+
|
| 74 |
if not results_source.exists():
|
| 75 |
print(f"Results directory not found in repository")
|
| 76 |
return False
|
| 77 |
+
|
| 78 |
# Check if there are any agent result directories
|
| 79 |
result_dirs = list(results_source.iterdir())
|
| 80 |
if not result_dirs:
|
| 81 |
print(f"No agent results found in {results_source}")
|
| 82 |
return False
|
| 83 |
+
|
| 84 |
print(f"Found {len(result_dirs)} agent result directories")
|
| 85 |
+
|
| 86 |
# Create target directory and copy the results structure
|
| 87 |
os.makedirs(target_dir.parent, exist_ok=True)
|
| 88 |
if target_dir.exists():
|
| 89 |
shutil.rmtree(target_dir)
|
| 90 |
+
|
| 91 |
# Copy the entire results directory
|
| 92 |
target_results = target_dir / "results"
|
| 93 |
shutil.copytree(results_source, target_results)
|
| 94 |
+
|
| 95 |
+
# Also copy alternative_agents/ if present, so the loader can pick up
|
| 96 |
+
# ACP runs (acp-claude, acp-codex, acp-gemini, openhands_subagents, ...)
|
| 97 |
+
# alongside the default OpenHands agent results.
|
| 98 |
+
alt_source = clone_dir / "alternative_agents"
|
| 99 |
+
if alt_source.exists():
|
| 100 |
+
alt_target = target_dir / "alternative_agents"
|
| 101 |
+
shutil.copytree(alt_source, alt_target)
|
| 102 |
+
agent_types = sorted(p.name for p in alt_source.iterdir() if p.is_dir())
|
| 103 |
+
print(f"Found alternative agent types: {agent_types}")
|
| 104 |
+
else:
|
| 105 |
+
print("No alternative_agents/ directory in repository (skipping)")
|
| 106 |
|
| 107 |
print(f"Successfully fetched data from GitHub. Files: {list(target_dir.glob('*'))}")
|
| 108 |
|
|
@@ -127,55 +127,109 @@ class SimpleLeaderboardViewer:
|
|
| 127 |
if benchmark not in self.tag_map[category]:
|
| 128 |
self.tag_map[category].append(benchmark)
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
def _load_from_agent_dirs(self):
|
| 131 |
-
"""Load
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
| 137 |
all_records = []
|
| 138 |
all_validation_errors = []
|
| 139 |
-
|
| 140 |
-
#
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
if errors:
|
| 149 |
all_validation_errors.extend(errors)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
all_records.append(record)
|
| 178 |
-
|
| 179 |
# Log validation errors if any
|
| 180 |
if all_validation_errors:
|
| 181 |
logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
|
|
@@ -183,10 +237,10 @@ class SimpleLeaderboardViewer:
|
|
| 183 |
logger.warning(f" - {error}")
|
| 184 |
if len(all_validation_errors) > 5:
|
| 185 |
logger.warning(f" ... and {len(all_validation_errors) - 5} more")
|
| 186 |
-
|
| 187 |
if not all_records:
|
| 188 |
-
return None #
|
| 189 |
-
|
| 190 |
return pd.DataFrame(all_records)
|
| 191 |
|
| 192 |
def _load(self):
|
|
@@ -206,26 +260,36 @@ class SimpleLeaderboardViewer:
|
|
| 206 |
# Group by agent (version + model combination) to aggregate results across datasets
|
| 207 |
transformed_records = []
|
| 208 |
|
| 209 |
-
# Create a unique identifier
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
for agent_id in df['agent_id'].unique():
|
| 213 |
agent_records = df[df['agent_id'] == agent_id]
|
| 214 |
-
|
| 215 |
# Build a single record for this agent
|
| 216 |
first_record = agent_records.iloc[0]
|
| 217 |
agent_version = first_record['agent_version']
|
| 218 |
-
|
|
|
|
| 219 |
# Normalize openness to "open" or "closed"
|
| 220 |
from aliases import OPENNESS_MAPPING
|
| 221 |
raw_openness = first_record['openness']
|
| 222 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 223 |
-
|
| 224 |
# All 5 categories for the leaderboard
|
| 225 |
ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
|
| 226 |
-
|
| 227 |
record = {
|
| 228 |
# Core agent info - use final display names
|
|
|
|
| 229 |
'SDK version': agent_version, # Will become "SDK Version"
|
| 230 |
'Language model': first_record['llm_base'], # Will become "Language Model"
|
| 231 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
|
@@ -235,7 +299,7 @@ class SimpleLeaderboardViewer:
|
|
| 235 |
'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
|
| 236 |
'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
|
| 237 |
# Additional columns expected by the transformer
|
| 238 |
-
# Use agent_id (
|
| 239 |
'id': agent_id,
|
| 240 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 241 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|
|
|
|
| 127 |
if benchmark not in self.tag_map[category]:
|
| 128 |
self.tag_map[category].append(benchmark)
|
| 129 |
|
| 130 |
+
# Default agent_name when metadata.json doesn't carry one. Matches the
|
| 131 |
+
# default-agent value used by push_to_index_from_archive.py so legacy
|
| 132 |
+
# entries (which omit the field) still group cleanly with new entries.
|
| 133 |
+
DEFAULT_AGENT_NAME = "OpenHands"
|
| 134 |
+
|
| 135 |
+
def _records_from_agent_dir(self, agent_dir: Path, default_agent_name: str | None = None) -> tuple[list[dict], list[str]]:
|
| 136 |
+
"""Build per-benchmark records from a single agent directory.
|
| 137 |
+
|
| 138 |
+
Shared by ``_load_from_agent_dirs`` (default OpenHands results) and
|
| 139 |
+
its ``alternative_agents/`` pass (acp-claude / acp-codex / etc.).
|
| 140 |
+
Returns ``(records, validation_errors)``. Returns an empty list of
|
| 141 |
+
records when the directory has no scores or is hidden from the
|
| 142 |
+
leaderboard.
|
| 143 |
+
"""
|
| 144 |
+
records: list[dict] = []
|
| 145 |
+
metadata, scores, errors = load_and_validate_agent_data(agent_dir)
|
| 146 |
+
|
| 147 |
+
if metadata is None or scores is None:
|
| 148 |
+
return records, errors
|
| 149 |
+
|
| 150 |
+
if metadata.get('hide_from_leaderboard', False):
|
| 151 |
+
logger.info(f"Skipping {agent_dir.name}: hide_from_leaderboard is True")
|
| 152 |
+
return records, errors
|
| 153 |
+
|
| 154 |
+
# Resolve the agent display name. Prefer the value stamped into
|
| 155 |
+
# metadata.json by push-to-index; fall back to the directory's
|
| 156 |
+
# default (e.g. "Claude Code" for acp-claude/) and finally to
|
| 157 |
+
# "OpenHands" for legacy results/ entries that predate the field.
|
| 158 |
+
agent_name = (
|
| 159 |
+
metadata.get('agent_name')
|
| 160 |
+
or default_agent_name
|
| 161 |
+
or self.DEFAULT_AGENT_NAME
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
for score_entry in scores:
|
| 165 |
+
record = {
|
| 166 |
+
'agent_name': agent_name,
|
| 167 |
+
'agent_version': metadata.get('agent_version', 'Unknown'),
|
| 168 |
+
'llm_base': metadata.get('model', 'unknown'),
|
| 169 |
+
'openness': metadata.get('openness', 'unknown'),
|
| 170 |
+
'submission_time': score_entry.get('submission_time', metadata.get('submission_time', '')),
|
| 171 |
+
'release_date': metadata.get('release_date', ''),
|
| 172 |
+
'parameter_count_b': metadata.get('parameter_count_b'),
|
| 173 |
+
'active_parameter_count_b': metadata.get('active_parameter_count_b'),
|
| 174 |
+
'score': score_entry.get('score'),
|
| 175 |
+
'metric': score_entry.get('metric', 'unknown'),
|
| 176 |
+
'cost_per_instance': score_entry.get('cost_per_instance'),
|
| 177 |
+
'average_runtime': score_entry.get('average_runtime'),
|
| 178 |
+
'tags': [score_entry.get('benchmark')],
|
| 179 |
+
'full_archive': score_entry.get('full_archive', ''),
|
| 180 |
+
'eval_visualization_page': score_entry.get('eval_visualization_page', ''),
|
| 181 |
+
}
|
| 182 |
+
records.append(record)
|
| 183 |
+
return records, errors
|
| 184 |
+
|
| 185 |
def _load_from_agent_dirs(self):
|
| 186 |
+
"""Load default-agent results plus any alternative_agents/ entries.
|
| 187 |
+
|
| 188 |
+
Reads ``{config}/results/{model}/`` for default OpenHands runs and
|
| 189 |
+
``{config}/alternative_agents/{type}/{model}/`` for ACP agent runs
|
| 190 |
+
(acp-claude, acp-codex, acp-gemini, ...) so they all surface in the
|
| 191 |
+
same leaderboard. Returns ``None`` if neither directory yields any
|
| 192 |
+
records (which makes the caller render an empty-state placeholder).
|
| 193 |
+
"""
|
| 194 |
all_records = []
|
| 195 |
all_validation_errors = []
|
| 196 |
+
|
| 197 |
+
# 1. Default OpenHands agent results
|
| 198 |
+
results_dir = self.config_path / "results"
|
| 199 |
+
if results_dir.exists():
|
| 200 |
+
for agent_dir in results_dir.iterdir():
|
| 201 |
+
if not agent_dir.is_dir():
|
| 202 |
+
continue
|
| 203 |
+
records, errors = self._records_from_agent_dir(agent_dir)
|
| 204 |
+
all_records.extend(records)
|
|
|
|
| 205 |
all_validation_errors.extend(errors)
|
| 206 |
+
|
| 207 |
+
# 2. Alternative agents (one subdirectory per agent_type, then per model)
|
| 208 |
+
# Default agent_name per agent_type matches the AGENT_NAME_BY_TYPE map
|
| 209 |
+
# in OpenHands/evaluation push_to_index_from_archive.py — keeping it
|
| 210 |
+
# in sync ensures rows are labelled the same way the index repo
|
| 211 |
+
# records them.
|
| 212 |
+
agent_type_default_name = {
|
| 213 |
+
'acp-claude': 'Claude Code',
|
| 214 |
+
'acp-codex': 'Codex',
|
| 215 |
+
'acp-gemini': 'Gemini CLI',
|
| 216 |
+
'openhands_subagents': 'OpenHands Sub-agents',
|
| 217 |
+
}
|
| 218 |
+
alt_dir = self.config_path / "alternative_agents"
|
| 219 |
+
if alt_dir.exists():
|
| 220 |
+
for type_dir in alt_dir.iterdir():
|
| 221 |
+
if not type_dir.is_dir():
|
| 222 |
+
continue
|
| 223 |
+
default_name = agent_type_default_name.get(type_dir.name)
|
| 224 |
+
for agent_dir in type_dir.iterdir():
|
| 225 |
+
if not agent_dir.is_dir():
|
| 226 |
+
continue
|
| 227 |
+
records, errors = self._records_from_agent_dir(
|
| 228 |
+
agent_dir, default_agent_name=default_name
|
| 229 |
+
)
|
| 230 |
+
all_records.extend(records)
|
| 231 |
+
all_validation_errors.extend(errors)
|
| 232 |
+
|
|
|
|
|
|
|
| 233 |
# Log validation errors if any
|
| 234 |
if all_validation_errors:
|
| 235 |
logger.warning(f"Schema validation errors ({len(all_validation_errors)} total):")
|
|
|
|
| 237 |
logger.warning(f" - {error}")
|
| 238 |
if len(all_validation_errors) > 5:
|
| 239 |
logger.warning(f" ... and {len(all_validation_errors) - 5} more")
|
| 240 |
+
|
| 241 |
if not all_records:
|
| 242 |
+
return None # Caller will render empty-state placeholder
|
| 243 |
+
|
| 244 |
return pd.DataFrame(all_records)
|
| 245 |
|
| 246 |
def _load(self):
|
|
|
|
| 260 |
# Group by agent (name + version + model combination) to aggregate results across datasets
|
| 261 |
transformed_records = []
|
| 262 |
|
| 263 |
+
# Create a unique identifier per (agent_name, agent_version, model)
|
| 264 |
+
# tuple. Including agent_name keeps an OpenHands run and a Claude
|
| 265 |
+
# Code run on the same SDK version + model from collapsing into
|
| 266 |
+
# one row when both submit to the leaderboard.
|
| 267 |
+
df['agent_name'] = df['agent_name'].fillna(self.DEFAULT_AGENT_NAME)
|
| 268 |
+
df['agent_id'] = (
|
| 269 |
+
df['agent_name'].astype(str)
|
| 270 |
+
+ '_' + df['agent_version'].astype(str)
|
| 271 |
+
+ '_' + df['llm_base'].astype(str)
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
for agent_id in df['agent_id'].unique():
|
| 275 |
agent_records = df[df['agent_id'] == agent_id]
|
| 276 |
+
|
| 277 |
# Build a single record for this agent
|
| 278 |
first_record = agent_records.iloc[0]
|
| 279 |
agent_version = first_record['agent_version']
|
| 280 |
+
agent_name = first_record['agent_name']
|
| 281 |
+
|
| 282 |
# Normalize openness to "open" or "closed"
|
| 283 |
from aliases import OPENNESS_MAPPING
|
| 284 |
raw_openness = first_record['openness']
|
| 285 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 286 |
+
|
| 287 |
# All 5 categories for the leaderboard
|
| 288 |
ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
|
| 289 |
+
|
| 290 |
record = {
|
| 291 |
# Core agent info - use final display names
|
| 292 |
+
'agent_name': agent_name, # Will become "Agent"
|
| 293 |
'SDK version': agent_version, # Will become "SDK Version"
|
| 294 |
'Language model': first_record['llm_base'], # Will become "Language Model"
|
| 295 |
'openness': normalized_openness, # Will become "Openness" (simplified to "open" or "closed")
|
|
|
|
| 299 |
'parameter_count_b': first_record.get('parameter_count_b'), # Total params in billions
|
| 300 |
'active_parameter_count_b': first_record.get('active_parameter_count_b'), # Active params for MoE
|
| 301 |
# Additional columns expected by the transformer
|
| 302 |
+
# Use agent_id (name_version_model) as unique identifier for Pareto frontier calculation
|
| 303 |
'id': agent_id,
|
| 304 |
'source': first_record.get('source', ''), # Will become "Source"
|
| 305 |
'logs': first_record.get('logs', ''), # Will become "Logs"
|