| | """Code editing benchmark adapters (SWE-bench, DebugBench, CanItEdit, CodeEditorBench, CodeXGLUE Code Refinement, CommitBench).""" |
| |
|
| | from __future__ import annotations |
| |
|
| | import json |
| | from typing import Any |
| |
|
| | from adapters import DatasetAdapter |
| |
|
| | |
# Module-level placeholders, all initialised to None.
# NOTE(review): presumably rebound to real helpers by whatever loads this
# module before any adapter method runs -- confirm against the loader.
# The adapters below call it as `_highlight_code(code)` or
# `_highlight_code(code, language=...)`.
_highlight_code = None
# Not referenced by any adapter visible in this section.
_code_offset = None
# Not referenced by any adapter visible in this section.
_extract_test_classes = None
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class SWEBenchLiteAdapter(DatasetAdapter):
    """Index-addressed view over a SWE-bench Lite dataset.

    Rows must provide ``instance_id``, ``repo``, ``patch``,
    ``problem_statement``, ``FAIL_TO_PASS`` and ``PASS_TO_PASS``; every other
    field is read with ``.get`` and falls back to an empty string.
    """

    slug = "swebenchlite"
    display_name = "SWE-bench Lite"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Store the dataset object; rows are looked up on demand."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return ``len()`` of the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row ``idx``.

        ``entry_point`` is whatever follows the last ``"__"`` in the
        instance id.
        """
        record = self._ds[idx]
        instance_id = record["instance_id"]
        summary: dict[str, Any] = {"idx": idx, "task_id": instance_id}
        summary["entry_point"] = instance_id.split("__")[-1]
        summary["num_inputs"] = 0
        summary["source"] = record["repo"]
        return summary

    @staticmethod
    def _github_links(instance_id: str, repo: str, base_commit: str) -> dict[str, str]:
        """Derive GitHub URLs from instance metadata.

        Nothing is emitted when ``repo`` is empty. ``issue_url`` is added only
        when the text after the last ``-`` of ``instance_id`` is all digits;
        ``commit_url`` only when ``base_commit`` is non-empty.
        """
        urls: dict[str, str] = {}
        has_repo = bool(repo)
        if has_repo:
            urls["repo_url"] = f"https://github.com/{repo}"

        # Split on the last dash: "<prefix>-<number>" -> number.
        _, dash, number = instance_id.rpartition("-")
        if dash and number.isdigit() and has_repo:
            urls["issue_url"] = f"https://github.com/{repo}/issues/{number}"
        if base_commit and has_repo:
            urls["commit_url"] = f"https://github.com/{repo}/commit/{base_commit}"
        return urls

    @staticmethod
    def _test_names(raw):
        """Normalise a test-name field to a list.

        Lists pass through untouched, other falsy values become ``[]``, and
        anything else is decoded with ``json.loads``.
        """
        if isinstance(raw, list):
            return raw
        if not raw:
            return []
        return json.loads(raw)

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row ``idx``.

        The reference patch is exposed under both ``code`` and ``patch``;
        any GitHub links that can be derived are merged in last.
        """
        record = self._ds[idx]
        diff = record["patch"]
        failing = self._test_names(record["FAIL_TO_PASS"])
        passing = self._test_names(record["PASS_TO_PASS"])
        instance_id = record["instance_id"]
        repo = record["repo"]
        base_commit = record.get("base_commit", "")

        detail: dict[str, Any] = {
            "idx": idx,
            "task_id": instance_id,
            "entry_point": instance_id.split("__")[-1],
            "code": diff,
            "highlighted_code": "",
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": repo,
            "has_ground_truth": False,
            "has_tasks": False,
            "description": record["problem_statement"],
            "patch": diff,
            "test_patch": record.get("test_patch", ""),
            "fail_to_pass": failing,
            "pass_to_pass": passing,
            "hints": record.get("hints_text", ""),
            "repo": repo,
            "base_commit": base_commit,
            "version": record.get("version", ""),
            "created_at": record.get("created_at", ""),
        }
        detail.update(self._github_links(instance_id, repo, base_commit))
        return detail
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class SWEBenchVerifiedAdapter(SWEBenchLiteAdapter):
    """SWE-bench Verified: same behaviour as the Lite adapter, own identity."""

    display_name = "SWE-bench Verified"
    slug = "swebenchverified"
| |
|
| |
|
class SWEBenchFullAdapter(SWEBenchLiteAdapter):
    """Full SWE-bench: same behaviour as the Lite adapter, own identity."""

    display_name = "SWE-bench"
    slug = "swebenchfull"
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class DebugBenchAdapter(DatasetAdapter):
    """Index-addressed view over a DebugBench dataset.

    Rows must provide ``slug``, ``language``, ``category``, ``subtype``,
    ``level``, ``question``, ``buggy_code``, ``solution``,
    ``bug_explanation`` and ``examples``; all are read with ``row[...]`` and
    a missing key raises ``KeyError``.
    """

    slug = "debugbench"
    display_name = "DebugBench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Store the dataset object; rows are looked up on demand."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return ``len()`` of the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row ``idx``.

        ``source`` is ``"<language>/<category>"`` and ``num_inputs`` is the
        number of examples attached to the row.
        """
        row = self._ds[idx]
        return {
            "idx": idx,
            "task_id": row["slug"],
            "entry_point": row["slug"],
            "num_inputs": len(row["examples"]),
            "source": f"{row['language']}/{row['category']}",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row ``idx``.

        The fixed solution is exposed under both ``code`` and ``fixed_code``,
        with the same highlighted rendering under ``highlighted_code`` and
        ``fixed_highlighted_code``.
        """
        row = self._ds[idx]
        lang = row["language"]
        category = row["category"]
        buggy = row["buggy_code"]
        fixed = row["solution"]

        # Highlight the fixed code once; it feeds two keys of the payload.
        fixed_highlighted = _highlight_code(fixed, language=lang)
        buggy_highlighted = _highlight_code(buggy, language=lang)

        return {
            "idx": idx,
            "task_id": row["slug"],
            "entry_point": row["slug"],
            "code": fixed,
            "highlighted_code": fixed_highlighted,
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": f"{lang}/{category}",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row["question"],
            "language": lang,
            "buggy_code": buggy,
            "buggy_highlighted_code": buggy_highlighted,
            "fixed_code": fixed,
            "fixed_highlighted_code": fixed_highlighted,
            "bug_category": category,
            "bug_subtype": row["subtype"],
            "bug_explanation": row["bug_explanation"],
            "difficulty": row["level"],
            # Copy so callers get a plain list regardless of the row's type.
            "examples": list(row["examples"]),
        }
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class CanItEditAdapter(DatasetAdapter):
    """Index-addressed view over a CanItEdit dataset.

    Rows must provide ``before`` and ``after``; every other field is read
    with ``.get`` and has a fallback.
    """

    slug = "canitedit"
    display_name = "CanItEdit"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Store the dataset object; rows are looked up on demand."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return ``len()`` of the wrapped dataset."""
        return len(self._ds)

    @staticmethod
    def _taxonomy(row) -> dict[str, Any]:
        """Return the row's ``taxonomy`` mapping, or ``{}`` if it is not a dict."""
        taxonomy = row.get("taxonomy", {})
        return taxonomy if isinstance(taxonomy, dict) else {}

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row ``idx``.

        ``source`` is the taxonomy's ``change_kind``, or ``"CanItEdit"`` when
        that value is missing or empty.
        """
        row = self._ds[idx]
        change_kind = self._taxonomy(row).get("change_kind", "")
        return {
            "idx": idx,
            "task_id": row.get("full_name", str(row.get("id", idx))),
            "entry_point": row.get("name", f"edit_{idx}"),
            "num_inputs": 0,
            "source": change_kind or "CanItEdit",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row ``idx``.

        The ``after`` program is exposed under both ``code`` and
        ``fixed_code``; ``before`` is exposed as ``buggy_code``. ``source``
        uses the same fallback as :meth:`get_problem_summary` so the two
        views never disagree.
        """
        row = self._ds[idx]
        before = row["before"]
        after = row["after"]
        taxonomy = self._taxonomy(row)

        # Highlight the edited program once; it feeds two keys of the payload.
        after_highlighted = _highlight_code(after)
        before_highlighted = _highlight_code(before)

        return {
            "idx": idx,
            "task_id": row.get("full_name", str(row.get("id", idx))),
            "entry_point": row.get("name", f"edit_{idx}"),
            "code": after,
            "highlighted_code": after_highlighted,
            "inputs": [],
            "outputs": [],
            "test": row.get("tests", ""),
            "tasks": [],
            # `or` (not a .get default) so an empty/None change_kind also
            # falls back, matching the summary view.
            "source": taxonomy.get("change_kind") or "CanItEdit",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": row.get("instruction_descriptive", ""),
            "buggy_code": before,
            "buggy_highlighted_code": before_highlighted,
            "fixed_code": after,
            "fixed_highlighted_code": after_highlighted,
            "bug_category": taxonomy.get("change_kind", ""),
            "bug_subtype": taxonomy.get("topic", ""),
            "bug_explanation": row.get("instruction_lazy", ""),
        }
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class CodeEditorBenchAdapter(DatasetAdapter):
    """Index-addressed view over pre-loaded CodeEditorBench rows.

    Unlike the dataset-backed adapters, this one wraps a plain list of
    dicts. Every field is read with ``.get``; the ``_task_type`` key selects
    which fields hold the "before" and "after" code.
    """

    slug = "codeeditorbench"
    display_name = "CodeEditorBench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, rows: list[dict[str, Any]]):
        """Store the list of row dicts."""
        self._rows = rows

    def problem_count(self) -> int:
        """Return the number of rows."""
        return len(self._rows)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row ``idx``."""
        row = self._rows[idx]
        return {
            "idx": idx,
            "task_id": str(row.get("idx", idx)),
            "entry_point": row.get("title", f"problem_{idx}"),
            "num_inputs": 0,
            "source": row.get("_task_type", "unknown"),
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row ``idx``.

        Field selection by ``_task_type``:

        * ``code_debug``: ``incorrect_solutions`` -> ``solutions``
        * ``code_translate`` / ``code_polishment``: ``source_code`` ->
          ``solutions`` (falling back to ``source_code``)
        * anything else: ``similar_source_code`` (falling back to
          ``source_code``) -> ``solutions`` (falling back to ``source_code``)

        Empty or missing code is not highlighted; its highlighted field is
        ``""``.
        """
        row = self._rows[idx]
        task_type = row.get("_task_type", "unknown")

        # Chain with `or` so a present-but-empty `code_language` still falls
        # through to `source_lang` before defaulting to Python.
        lang = row.get("code_language") or row.get("source_lang") or "python"
        lang_key = lang.lower()
        # NOTE(review): for code_translate the "fixed" code may be in a
        # different language than the source; both are highlighted with
        # `lang_key` here -- confirm whether a target-language field exists.

        if task_type == "code_debug":
            buggy = row.get("incorrect_solutions", "")
            fixed = row.get("solutions", "")
        else:
            if task_type in ("code_translate", "code_polishment"):
                buggy = row.get("source_code", "")
            else:
                buggy = row.get("similar_source_code", row.get("source_code", ""))
            fixed = row.get("solutions", row.get("source_code", ""))

        # Highlight each snippet at most once; the fixed rendering feeds two keys.
        fixed_highlighted = _highlight_code(fixed, language=lang_key) if fixed else ""
        buggy_highlighted = _highlight_code(buggy, language=lang_key) if buggy else ""
        difficulty = row.get("difficulty", "")

        return {
            "idx": idx,
            "task_id": str(row.get("idx", idx)),
            "entry_point": row.get("title", f"problem_{idx}"),
            "code": fixed,
            "highlighted_code": fixed_highlighted,
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": task_type,
            "has_ground_truth": False,
            "has_tasks": False,
            "description": "",
            "buggy_code": buggy,
            "buggy_highlighted_code": buggy_highlighted,
            "fixed_code": fixed,
            "fixed_highlighted_code": fixed_highlighted,
            "bug_category": task_type,
            "bug_subtype": difficulty,
            "bug_explanation": "",
            "difficulty": difficulty,
            "language": lang,
        }
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class CodeXGLUERefinementAdapter(DatasetAdapter):
    """Index-addressed view over a CodeXGLUE code-refinement dataset.

    Rows are read with ``.get`` for ``id``, ``buggy`` and ``fixed``. Code is
    always highlighted as Java and the payload reports ``language`` as
    ``"Java"``.
    """

    slug = "codexgluerefinement"
    display_name = "CodeXGLUE Code Refinement"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Store the dataset object; rows are looked up on demand."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return ``len()`` of the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row ``idx``.

        The row's ``id`` (or ``idx`` when absent) drives both ``task_id`` and
        the ``refinement_<id>`` entry point.
        """
        row = self._ds[idx]
        row_id = row.get("id", idx)
        return {
            "idx": idx,
            "task_id": str(row_id),
            "entry_point": f"refinement_{row_id}",
            "num_inputs": 0,
            "source": "CodeXGLUE",
        }

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row ``idx``.

        Empty or missing code is not highlighted; its highlighted field is
        ``""``.
        """
        row = self._ds[idx]
        row_id = row.get("id", idx)
        buggy = row.get("buggy", "")
        fixed = row.get("fixed", "")

        # Highlight each snippet at most once; the fixed rendering feeds two keys.
        fixed_highlighted = _highlight_code(fixed, language="java") if fixed else ""
        buggy_highlighted = _highlight_code(buggy, language="java") if buggy else ""

        return {
            "idx": idx,
            "task_id": str(row_id),
            "entry_point": f"refinement_{row_id}",
            "code": fixed,
            "highlighted_code": fixed_highlighted,
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": "CodeXGLUE",
            "has_ground_truth": False,
            "has_tasks": False,
            "description": "",
            "buggy_code": buggy,
            "buggy_highlighted_code": buggy_highlighted,
            "fixed_code": fixed,
            "fixed_highlighted_code": fixed_highlighted,
            "bug_category": "Code Refinement",
            "bug_subtype": "",
            "bug_explanation": "",
            "language": "Java",
        }
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
class CommitBenchAdapter(DatasetAdapter):
    """Index-addressed view over a CommitBench dataset.

    Rows are read with ``.get`` for ``hash``, ``project``, ``diff``,
    ``message`` and ``diff_languages``. No highlighting is performed; the raw
    diff is exposed under both ``code`` and ``patch``.
    """

    slug = "commitbench"
    display_name = "CommitBench"
    has_ground_truth = False
    has_tasks = False

    def __init__(self, hf_dataset):
        """Store the dataset object; rows are looked up on demand."""
        self._ds = hf_dataset

    def problem_count(self) -> int:
        """Return ``len()`` of the wrapped dataset."""
        return len(self._ds)

    def get_problem_summary(self, idx: int) -> dict[str, Any]:
        """Return the short listing entry for row ``idx``.

        ``task_id`` is the first 12 characters of the commit hash, or of
        ``str(idx)`` when the row has no ``hash`` key.
        """
        commit = self._ds[idx]
        short_hash = commit.get("hash", str(idx))[:12]
        project = commit.get("project", f"commit_{idx}")
        languages = commit.get("diff_languages", "unknown")

        summary: dict[str, Any] = {"idx": idx}
        summary["task_id"] = short_hash
        summary["entry_point"] = project
        summary["num_inputs"] = 0
        summary["source"] = languages
        return summary

    def get_problem_detail(self, idx: int) -> dict[str, Any]:
        """Return the full detail payload for row ``idx``.

        The commit message becomes ``description``. ``repo``,
        ``commit_hash`` and ``diff_languages`` fall back to ``""`` when
        absent, whereas ``entry_point`` and ``source`` use the same
        fallbacks as the summary.
        """
        commit = self._ds[idx]
        patch_text = commit.get("diff", "")
        commit_message = commit.get("message", "")

        detail: dict[str, Any] = {
            "idx": idx,
            "task_id": commit.get("hash", str(idx))[:12],
            "entry_point": commit.get("project", f"commit_{idx}"),
            "code": patch_text,
            "highlighted_code": "",
            "inputs": [],
            "outputs": [],
            "test": None,
            "tasks": [],
            "source": commit.get("diff_languages", "unknown"),
            "has_ground_truth": False,
            "has_tasks": False,
            "description": commit_message,
        }
        # Commit-specific extras, appended after the common fields.
        detail["patch"] = patch_text
        detail["repo"] = commit.get("project", "")
        detail["commit_hash"] = commit.get("hash", "")
        detail["diff_languages"] = commit.get("diff_languages", "")
        return detail
| |
|