egor-bogomolov's picture
Fix FAIL_TO_PASS/PASS_TO_PASS parsing for SWE-bench Multilingual
e080b1d
"""Code editing benchmark adapters (SWE-bench, DebugBench, CanItEdit, CodeEditorBench)."""
from __future__ import annotations
import json
from typing import Any
from adapters import DatasetAdapter
# Injected at runtime by _set_helpers()
# Each placeholder stays None until that injection happens, so calling an
# adapter method that uses one of them before injection raises TypeError.
_highlight_code = None  # called below as _highlight_code(code) or _highlight_code(code, language=...)
_code_offset = None  # not referenced by the adapters in this section
_extract_test_classes = None  # not referenced by the adapters in this section
# ---------------------------------------------------------------------------
# SWE-bench Lite adapter (HuggingFace: princeton-nlp/SWE-bench_Lite)
# ---------------------------------------------------------------------------
class SWEBenchLiteAdapter(DatasetAdapter):
    """Adapter for SWE-bench Lite rows (HuggingFace: princeton-nlp/SWE-bench_Lite).

    Wraps an indexable dataset whose rows are mappings and exposes them as
    summary / detail dictionaries. Rows must provide ``instance_id``, ``repo``,
    ``patch``, ``problem_statement``, ``FAIL_TO_PASS`` and ``PASS_TO_PASS``;
    every other field is read with ``row.get`` and defaults to ``""``.
    """

    slug = "swebenchlite"
    display_name = "SWE-bench Lite"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Store the dataset; it must support ``len()`` and integer indexing."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row *idx*.

        ``entry_point`` is the part of ``instance_id`` after the last ``"__"``.
        """
        row = self._ds[idx]
        instance_id = row["instance_id"]
        return {
            "idx": idx,
            "task_id": instance_id,
            "entry_point": instance_id.split("__")[-1],
            "num_inputs": 0,
            "source": row["repo"],
        }

    @staticmethod
    def _github_links(instance_id: str, repo: str, base_commit: str) -> dict[str, str]:
        """Build GitHub URLs from SWE-bench instance metadata.

        Returns a dict containing any of ``repo_url``, ``issue_url`` and
        ``commit_url``. Nothing is emitted when *repo* is empty, because every
        URL is rooted at ``https://github.com/{repo}``.
        """
        links: dict[str, str] = {}
        if not repo:
            return links

        base_url = f"https://github.com/{repo}"
        links["repo_url"] = base_url

        # instance_id format: "<owner>__<repo>-<number>", e.g. "astropy__astropy-12907".
        # NOTE(review): the trailing number is used as an issue number here; if it
        # is actually a pull-request number the link relies on GitHub redirecting
        # /issues/N to /pull/N -- confirm against the dataset card.
        parts = instance_id.rsplit("-", 1)
        if len(parts) == 2 and parts[1].isdigit():
            links["issue_url"] = f"{base_url}/issues/{parts[1]}"

        if base_commit:
            links["commit_url"] = f"{base_url}/commit/{base_commit}"
        return links

    @staticmethod
    def _parse_test_list(raw: Any) -> list[Any]:
        """Normalize a FAIL_TO_PASS / PASS_TO_PASS field to a list.

        The field arrives either as a ready-made list or as a JSON-encoded
        string. Empty / ``None`` values become ``[]``. A JSON document that
        decodes to a single scalar is wrapped in a one-element list so callers
        always receive a list.

        Raises:
            json.JSONDecodeError: if a non-empty string is not valid JSON.
        """
        if isinstance(raw, list):
            return raw
        if isinstance(raw, tuple):
            return list(raw)
        if not raw:
            return []
        parsed = json.loads(raw)
        if isinstance(parsed, list):
            return parsed
        # JSON "null" carries no tests; any other scalar denotes a single test.
        return [] if parsed is None else [parsed]

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row *idx*.

        The gold ``patch`` is exposed both as ``code`` and ``patch``; no syntax
        highlighting is applied (``highlighted_code`` is empty). GitHub links
        derived from the instance metadata are merged into the result.
        """
        row = self._ds[idx]
        patch = row["patch"]
        fail_to_pass = self._parse_test_list(row["FAIL_TO_PASS"])
        pass_to_pass = self._parse_test_list(row["PASS_TO_PASS"])
        instance_id = row["instance_id"]
        repo = row["repo"]
        base_commit = row.get("base_commit", "")
        return {
            "idx": idx,
            "task_id": instance_id,
            "entry_point": instance_id.split("__")[-1],
            "code": patch,
            "highlighted_code": "",
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": repo,
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row["problem_statement"],
            "patch": patch,
            "test_patch": row.get("test_patch", ""),
            "fail_to_pass": fail_to_pass,
            "pass_to_pass": pass_to_pass,
            "hints": row.get("hints_text", ""),
            "repo": repo,
            "base_commit": base_commit,
            "version": row.get("version", ""),
            "created_at": row.get("created_at", ""),
            **self._github_links(instance_id, repo, base_commit),
        }
# ---------------------------------------------------------------------------
# SWE-bench Verified adapter (HuggingFace: princeton-nlp/SWE-bench_Verified)
# ---------------------------------------------------------------------------
class SWEBenchVerifiedAdapter(SWEBenchLiteAdapter):
    """SWE-bench Verified variant; only the slug and display name differ from the Lite adapter."""

    display_name = "SWE-bench Verified"
    slug = "swebenchverified"
class SWEBenchFullAdapter(SWEBenchLiteAdapter):
    """Adapter registered as "SWE-bench"; only the slug and display name differ from the Lite adapter."""

    display_name = "SWE-bench"
    slug = "swebenchfull"
# ---------------------------------------------------------------------------
# DebugBench adapter (HuggingFace: Rtian/DebugBench)
# ---------------------------------------------------------------------------
class DebugBenchAdapter(DatasetAdapter):
    """Adapter for DebugBench rows (HuggingFace: Rtian/DebugBench).

    Wraps an indexable dataset whose rows provide ``slug``, ``language``,
    ``category``, ``subtype``, ``level``, ``question``, ``buggy_code``,
    ``solution``, ``bug_explanation`` and ``examples``.
    """

    slug = "debugbench"
    display_name = "DebugBench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Store the dataset; it must support ``len()`` and integer indexing."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row *idx*.

        ``source`` is ``"<language>/<category>"`` and ``num_inputs`` is the
        number of entries in the row's ``examples``.
        """
        row = self._ds[idx]
        return {
            "idx": idx,
            "task_id": row["slug"],
            "entry_point": row["slug"],
            "num_inputs": len(row["examples"]),
            "source": f"{row['language']}/{row['category']}",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row *idx*.

        The fixed solution is exposed as ``code`` / ``fixed_code`` and the
        buggy version as ``buggy_code``, each with a highlighted rendering
        produced by the injected ``_highlight_code`` helper.
        """
        row = self._ds[idx]
        lang = row["language"]
        category = row["category"]
        buggy = row["buggy_code"]
        fixed = row["solution"]
        # The fixed code is shown under two keys; highlight it only once.
        fixed_html = _highlight_code(fixed, language=lang)
        buggy_html = _highlight_code(buggy, language=lang)
        return {
            "idx": idx,
            "task_id": row["slug"],
            "entry_point": row["slug"],
            "code": fixed,
            "highlighted_code": fixed_html,
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": f"{lang}/{category}",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row["question"],
            "language": lang,
            "buggy_code": buggy,
            "buggy_highlighted_code": buggy_html,
            "fixed_code": fixed,
            "fixed_highlighted_code": fixed_html,
            "bug_category": category,
            "bug_subtype": row["subtype"],
            "bug_explanation": row["bug_explanation"],
            "difficulty": row["level"],
            # Copy into a plain list so the payload does not alias the row.
            "examples": list(row["examples"]),
        }
# ---------------------------------------------------------------------------
# CanItEdit adapter (HuggingFace: nuprl/CanItEdit)
# ---------------------------------------------------------------------------
class CanItEditAdapter(DatasetAdapter):
    """Adapter for CanItEdit rows (HuggingFace: nuprl/CanItEdit).

    Wraps an indexable dataset whose rows are mappings providing ``before``
    and ``after``; all other fields (``full_name``, ``id``, ``name``,
    ``taxonomy``, ``tests``, ``instruction_descriptive``,
    ``instruction_lazy``) are optional and read with ``row.get``.
    """

    slug = "canitedit"
    display_name = "CanItEdit"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Store the dataset; it must support ``len()`` and integer indexing."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    @staticmethod
    def _taxonomy(row) -> dict[str, Any]:
        """Return the row's ``taxonomy`` mapping, or ``{}`` if absent or not a dict."""
        taxonomy = row.get("taxonomy", {})
        return taxonomy if isinstance(taxonomy, dict) else {}

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row *idx*.

        ``source`` is the taxonomy's ``change_kind``, falling back to
        ``"CanItEdit"`` when it is missing or empty.
        """
        row = self._ds[idx]
        change_kind = self._taxonomy(row).get("change_kind", "")
        return {
            "idx": idx,
            "task_id": row.get("full_name", str(row.get("id", idx))),
            "entry_point": row.get("name", f"edit_{idx}"),
            "num_inputs": 0,
            "source": change_kind or "CanItEdit",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row *idx*.

        ``before`` is exposed as the buggy code and ``after`` as the fixed
        code, each with a highlighted rendering from the injected
        ``_highlight_code`` helper (called without an explicit language).
        """
        row = self._ds[idx]
        before = row["before"]
        after = row["after"]
        taxonomy = self._taxonomy(row)
        change_kind = taxonomy.get("change_kind", "")
        # The "after" code is shown under two keys; highlight it only once.
        after_html = _highlight_code(after)
        before_html = _highlight_code(before)
        return {
            "idx": idx,
            "task_id": row.get("full_name", str(row.get("id", idx))),
            "entry_point": row.get("name", f"edit_{idx}"),
            "code": after,
            "highlighted_code": after_html,
            "inputs": [],
            "outputs": [],
            "test": row.get("tests", ""),
            "tasks": [],
            # Same fallback rule as get_problem_summary: an empty or None
            # change_kind must not produce a blank source label.
            "source": change_kind or "CanItEdit",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row.get("instruction_descriptive", ""),
            "buggy_code": before,
            "buggy_highlighted_code": before_html,
            "fixed_code": after,
            "fixed_highlighted_code": after_html,
            "bug_category": change_kind,
            "bug_subtype": taxonomy.get("topic", ""),
            "bug_explanation": row.get("instruction_lazy", ""),
        }
# ---------------------------------------------------------------------------
# CodeEditorBench adapter (HuggingFace: m-a-p/CodeEditorBench)
# ---------------------------------------------------------------------------
class CodeEditorBenchAdapter(DatasetAdapter):
    """Adapter for CodeEditorBench rows (HuggingFace: m-a-p/CodeEditorBench).

    Unlike the dataset-backed adapters, this one wraps a plain list of dicts.
    Each row is expected to carry a ``_task_type`` key selecting which fields
    hold the "before" and "after" code; every field is read with ``row.get``.
    """

    slug = "codeeditorbench"
    display_name = "CodeEditorBench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, rows: list[dict[str, Any]]):
        """Store the list of row dictionaries."""
        self._rows = rows

    def problem_count(self) -> int:
        """Return the number of rows."""
        return len(self._rows)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row *idx*."""
        row = self._rows[idx]
        return {
            "idx": idx,
            "task_id": str(row.get("idx", idx)),
            "entry_point": row.get("title", f"problem_{idx}"),
            "num_inputs": 0,
            "source": row.get("_task_type", "unknown"),
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row *idx*.

        Field selection by ``_task_type``:

        * ``code_debug``: ``incorrect_solutions`` -> ``solutions``
        * ``code_translate`` / ``code_polishment``: ``source_code`` ->
          ``solutions`` (falling back to ``source_code``)
        * anything else (code_switch): ``similar_source_code`` (falling back
          to ``source_code``) -> ``solutions`` (falling back to ``source_code``)

        Empty code is not passed to the highlighter; its highlighted form is
        ``""``.
        """
        row = self._rows[idx]
        task_type = row.get("_task_type", "unknown")
        # Fall through each candidate on empty/None values too, so a blank
        # code_language does not hide a usable source_lang.
        lang = row.get("code_language") or row.get("source_lang") or "python"
        lang_key = lang.lower()

        if task_type == "code_debug":
            buggy = row.get("incorrect_solutions", "")
            fixed = row.get("solutions", "")
        else:
            source_code = row.get("source_code", "")
            fixed = row.get("solutions", source_code)
            if task_type in ("code_translate", "code_polishment"):
                buggy = source_code
            else:  # code_switch
                buggy = row.get("similar_source_code", source_code)

        # NOTE(review): both sides are highlighted with the same language, even
        # for code_translate where source and target presumably differ -- confirm.
        # The fixed code is shown under two keys; highlight it only once.
        fixed_html = _highlight_code(fixed, language=lang_key) if fixed else ""
        buggy_html = _highlight_code(buggy, language=lang_key) if buggy else ""
        difficulty = row.get("difficulty", "")
        return {
            "idx": idx,
            "task_id": str(row.get("idx", idx)),
            "entry_point": row.get("title", f"problem_{idx}"),
            "code": fixed,
            "highlighted_code": fixed_html,
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": task_type,
            "has_ground_truth": False,
            "has_tasks": False,
            "description": "",
            "buggy_code": buggy,
            "buggy_highlighted_code": buggy_html,
            "fixed_code": fixed,
            "fixed_highlighted_code": fixed_html,
            "bug_category": task_type,
            "bug_subtype": difficulty,
            "bug_explanation": "",
            "difficulty": difficulty,
            "language": lang,
        }
# ---------------------------------------------------------------------------
# CodeXGLUE Code Refinement adapter (HuggingFace: google/code_x_glue_cc_code_refinement)
# ---------------------------------------------------------------------------
class CodeXGLUERefinementAdapter(DatasetAdapter):
    """Adapter for CodeXGLUE Code Refinement rows.

    (HuggingFace: google/code_x_glue_cc_code_refinement.) Wraps an indexable
    dataset whose rows are mappings with optional ``id``, ``buggy`` and
    ``fixed`` fields. All code is highlighted as Java.
    """

    slug = "codexgluerefinement"
    display_name = "CodeXGLUE Code Refinement"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Store the dataset; it must support ``len()`` and integer indexing."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return the number of rows in the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row *idx*.

        The row's ``id`` (or *idx* when absent) forms both the task id and
        the ``refinement_<id>`` entry point.
        """
        row = self._ds[idx]
        row_id = row.get("id", idx)
        return {
            "idx": idx,
            "task_id": str(row_id),
            "entry_point": f"refinement_{row_id}",
            "num_inputs": 0,
            "source": "CodeXGLUE",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row *idx*.

        Empty code is not passed to the highlighter; its highlighted form is
        ``""``.
        """
        row = self._ds[idx]
        row_id = row.get("id", idx)
        buggy = row.get("buggy", "")
        fixed = row.get("fixed", "")
        # The fixed code is shown under two keys; highlight it only once.
        fixed_html = _highlight_code(fixed, language="java") if fixed else ""
        buggy_html = _highlight_code(buggy, language="java") if buggy else ""
        return {
            "idx": idx,
            "task_id": str(row_id),
            "entry_point": f"refinement_{row_id}",
            "code": fixed,
            "highlighted_code": fixed_html,
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": "CodeXGLUE",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": "",
            "buggy_code": buggy,
            "buggy_highlighted_code": buggy_html,
            "fixed_code": fixed,
            "fixed_highlighted_code": fixed_html,
            "bug_category": "Code Refinement",
            "bug_subtype": "",
            "bug_explanation": "",
            "language": "Java",
        }
# ---------------------------------------------------------------------------
# CommitBench adapter (HuggingFace: Maxscha/commitbench)
# ---------------------------------------------------------------------------
class CommitBenchAdapter(DatasetAdapter):
    """Adapter for CommitBench rows (HuggingFace: Maxscha/commitbench).

    Wraps an indexable dataset whose rows are mappings with optional ``hash``,
    ``project``, ``diff``, ``message`` and ``diff_languages`` fields.
    """

    slug = "commitbench"
    display_name = "CommitBench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Keep a reference to the wrapped dataset."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return how many commits the dataset holds."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for commit *idx*.

        The task id is the first 12 characters of the commit hash, or of
        ``str(idx)`` when the row has no ``hash``.
        """
        record = self._ds[idx]
        short_hash = record.get("hash", str(idx))[:12]
        project = record.get("project", f"commit_{idx}")
        languages = record.get("diff_languages", "unknown")
        return {
            "idx": idx,
            "task_id": short_hash,
            "entry_point": project,
            "num_inputs": 0,
            "source": languages,
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for commit *idx*.

        The diff is exposed as both ``code`` and ``patch`` without
        highlighting, and the commit message serves as the description.
        """
        record = self._ds[idx]
        commit_diff = record.get("diff", "")
        commit_message = record.get("message", "")

        detail: dict[str, Any] = {
            "idx": idx,
            "task_id": record.get("hash", str(idx))[:12],
            "entry_point": record.get("project", f"commit_{idx}"),
            "code": commit_diff,
            "highlighted_code": "",
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": record.get("diff_languages", "unknown"),
            "has_ground_truth": False,
            "has_tasks": False,
            "description": commit_message,
        }
        # Commit-specific extras; these use "" rather than the listing defaults.
        detail["patch"] = commit_diff
        detail["repo"] = record.get("project", "")
        detail["commit_hash"] = record.get("hash", "")
        detail["diff_languages"] = record.get("diff_languages", "")
        return detail