Spaces:

JetBrains-Research
/

ml4se-evals-visualization

Running

App Files Files Community

ml4se-evals-visualization / adapters /code_editing.py

egor-bogomolov

Fix FAIL_TO_PASS/PASS_TO_PASS parsing for SWE-bench Multilingual

e080b1d 1 day ago

raw

history blame contribute delete

14.4 kB

	"""Code editing benchmark adapters (SWE-bench, DebugBench, CanItEdit, CodeEditorBench, CodeXGLUE Code Refinement, CommitBench)."""

	from __future__ import annotations

	import json
	from typing import Any

	from adapters import DatasetAdapter

	# Injected at runtime by _set_helpers() (that function is not defined in the
	# part of the module shown here). Each name is None until injection happens,
	# so calling one of them earlier raises TypeError.
	#
	# _highlight_code is called by the adapters below as _highlight_code(code) or
	# _highlight_code(code, language=...); its result is stored under the
	# "*highlighted_code" keys of the detail dicts.
	_highlight_code = None
	# Not referenced by any adapter visible in this file.
	_code_offset = None
	# Not referenced by any adapter visible in this file.
	_extract_test_classes = None


	# ---------------------------------------------------------------------------
	# SWE-bench Lite adapter (HuggingFace: princeton-nlp/SWE-bench_Lite)
	# ---------------------------------------------------------------------------


	class SWEBenchLiteAdapter(DatasetAdapter):
	    """Adapter that turns SWE-bench-style rows into summary and detail dicts.

	    Rows must provide ``instance_id``, ``repo``, ``patch``,
	    ``problem_statement``, ``FAIL_TO_PASS`` and ``PASS_TO_PASS``. The fields
	    ``base_commit``, ``test_patch``, ``hints_text``, ``version`` and
	    ``created_at`` are read with ``row.get`` and default to ``""``.
	    """

	    slug = "swebenchlite"
	    display_name = "SWE-bench Lite"
	    has_ground_truth = False
	    has_tasks = False

	    def __init__(self, hf_dataset):
	        """Keep a reference to the dataset.

	        Args:
	            hf_dataset: Object supporting ``len()`` and integer indexing that
	                yields dict-like rows (presumably a HuggingFace dataset
	                split, going by the parameter name).
	        """
	        self._ds = hf_dataset

	    def problem_count(self) -> int:
	        """Return the number of rows in the dataset."""
	        return len(self._ds)

	    @staticmethod
	    def _entry_point(instance_id: str) -> str:
	        """Return the part of ``instance_id`` after the last ``"__"``."""
	        return instance_id.split("__")[-1]

	    @staticmethod
	    def _parse_test_list(raw: Any) -> list[Any]:
	        """Normalize a FAIL_TO_PASS / PASS_TO_PASS field to a list.

	        Args:
	            raw: Either an already-decoded list, a JSON-encoded string, or a
	                falsy value (``None``, ``""``) meaning "no tests".

	        Returns:
	            The list itself, the decoded JSON list, or ``[]`` for falsy input.
	            A JSON document that decodes to a non-list value is wrapped in a
	            one-element list so consumers can always iterate over entries.

	        Raises:
	            json.JSONDecodeError: If ``raw`` is a non-empty string that is not
	                valid JSON.
	        """
	        if isinstance(raw, list):
	            return raw
	        if not raw:
	            return []
	        decoded = json.loads(raw)
	        return decoded if isinstance(decoded, list) else [decoded]

	    def get_problem_summary(self, idx: int) -> dict[str, Any]:
	        """Return the short listing entry for row ``idx``."""
	        row = self._ds[idx]
	        instance_id = row["instance_id"]
	        return {
	            "idx": idx,
	            "task_id": instance_id,
	            "entry_point": self._entry_point(instance_id),
	            "num_inputs": 0,
	            "source": row["repo"],
	        }

	    @staticmethod
	    def _github_links(instance_id: str, repo: str, base_commit: str) -> dict[str, str]:
	        """Build GitHub URLs from SWE-bench instance metadata.

	        Args:
	            instance_id: Identifier such as ``"astropy__astropy-12907"``; the
	                digits after the last ``"-"`` are used as the issue number.
	            repo: ``"owner/name"`` repository path; when empty no links are
	                produced at all.
	            base_commit: Commit hash; when empty ``commit_url`` is omitted.

	        Returns:
	            A dict with any of the keys ``repo_url``, ``issue_url`` and
	            ``commit_url``.
	        """
	        links: dict[str, str] = {}
	        # Every link is rooted at the repository, so nothing can be built without it.
	        if not repo:
	            return links
	        repo_url = f"https://github.com/{repo}"
	        links["repo_url"] = repo_url
	        # instance_id format: "repo__issue-number" e.g. "astropy__astropy-12907"
	        # NOTE(review): the trailing number may actually be a pull-request
	        # number; confirm that the /issues/ URL resolves as intended.
	        parts = instance_id.rsplit("-", 1)
	        if len(parts) == 2 and parts[1].isdigit():
	            links["issue_url"] = f"{repo_url}/issues/{parts[1]}"
	        if base_commit:
	            links["commit_url"] = f"{repo_url}/commit/{base_commit}"
	        return links

	    def get_problem_detail(self, idx: int) -> dict[str, Any]:
	        """Return the full detail dict for row ``idx``.

	        The gold patch is exposed under both ``code`` and ``patch``; no
	        highlighted HTML is produced (``highlighted_code`` is ``""``).
	        """
	        row = self._ds[idx]
	        patch = row["patch"]
	        instance_id = row["instance_id"]
	        repo = row["repo"]
	        base_commit = row.get("base_commit", "")
	        return {
	            "idx": idx,
	            "task_id": instance_id,
	            "entry_point": self._entry_point(instance_id),
	            "code": patch,
	            "highlighted_code": "",
	            "inputs": [],
	            "outputs": [],
	            "test": None,
	            "tasks": [],
	            "source": repo,
	            "has_ground_truth": False,
	            "has_tasks": False,
	            "description": row["problem_statement"],
	            "patch": patch,
	            "test_patch": row.get("test_patch", ""),
	            # The test lists arrive either as real lists or as JSON strings.
	            "fail_to_pass": self._parse_test_list(row["FAIL_TO_PASS"]),
	            "pass_to_pass": self._parse_test_list(row["PASS_TO_PASS"]),
	            "hints": row.get("hints_text", ""),
	            "repo": repo,
	            "base_commit": base_commit,
	            "version": row.get("version", ""),
	            "created_at": row.get("created_at", ""),
	            **self._github_links(instance_id, repo, base_commit),
	        }


	# ---------------------------------------------------------------------------
	# SWE-bench Verified and full SWE-bench adapters (Verified on HuggingFace: princeton-nlp/SWE-bench_Verified)
	# ---------------------------------------------------------------------------


	class SWEBenchVerifiedAdapter(SWEBenchLiteAdapter):
	    """SWE-bench Verified variant; only the slug and display name differ from the parent."""

	    display_name = "SWE-bench Verified"
	    slug = "swebenchverified"


	class SWEBenchFullAdapter(SWEBenchLiteAdapter):
	    """Full SWE-bench variant; only the slug and display name differ from the parent."""

	    display_name = "SWE-bench"
	    slug = "swebenchfull"


	# ---------------------------------------------------------------------------
	# DebugBench adapter (HuggingFace: Rtian/DebugBench)
	# ---------------------------------------------------------------------------


	class DebugBenchAdapter(DatasetAdapter):
	    """Adapter that turns DebugBench rows into summary and detail dicts.

	    Rows are indexed with the keys ``slug``, ``language``, ``category``,
	    ``subtype``, ``level``, ``question``, ``buggy_code``, ``solution``,
	    ``bug_explanation`` and ``examples``; all of them are required.
	    """

	    slug = "debugbench"
	    display_name = "DebugBench"
	    has_ground_truth = False
	    has_tasks = False

	    def __init__(self, hf_dataset):
	        """Keep a reference to the dataset.

	        Args:
	            hf_dataset: Object supporting ``len()`` and integer indexing that
	                yields subscriptable rows.
	        """
	        self._ds = hf_dataset

	    def problem_count(self) -> int:
	        """Return the number of rows in the dataset."""
	        return len(self._ds)

	    def get_problem_summary(self, idx: int) -> dict[str, Any]:
	        """Return the short listing entry for row ``idx``."""
	        row = self._ds[idx]
	        task_slug = row["slug"]
	        return {
	            "idx": idx,
	            "task_id": task_slug,
	            "entry_point": task_slug,
	            "num_inputs": len(row["examples"]),
	            "source": f"{row['language']}/{row['category']}",
	        }

	    def get_problem_detail(self, idx: int) -> dict[str, Any]:
	        """Return the full detail dict for row ``idx``.

	        The reference solution is exposed as ``code`` / ``fixed_code`` and the
	        buggy version as ``buggy_code``, each with a highlighted counterpart
	        produced by the injected ``_highlight_code`` helper.
	        """
	        row = self._ds[idx]
	        lang = row["language"]
	        category = row["category"]
	        task_slug = row["slug"]
	        buggy = row["buggy_code"]
	        fixed = row["solution"]
	        # The same highlighted solution feeds two keys, so render it once.
	        # This assumes _highlight_code is deterministic for equal arguments.
	        fixed_html = _highlight_code(fixed, language=lang)
	        buggy_html = _highlight_code(buggy, language=lang)
	        return {
	            "idx": idx,
	            "task_id": task_slug,
	            "entry_point": task_slug,
	            "code": fixed,
	            "highlighted_code": fixed_html,
	            "inputs": [],
	            "outputs": [],
	            "test": None,
	            "tasks": [],
	            "source": f"{lang}/{category}",
	            "has_ground_truth": False,
	            "has_tasks": False,
	            "description": row["question"],
	            "language": lang,
	            "buggy_code": buggy,
	            "buggy_highlighted_code": buggy_html,
	            "fixed_code": fixed,
	            "fixed_highlighted_code": fixed_html,
	            "bug_category": category,
	            "bug_subtype": row["subtype"],
	            "bug_explanation": row["bug_explanation"],
	            "difficulty": row["level"],
	            # Copy into a plain list so the result does not alias the row's container.
	            "examples": list(row["examples"]),
	        }


	# ---------------------------------------------------------------------------
	# CanItEdit adapter (HuggingFace: nuprl/CanItEdit)
	# ---------------------------------------------------------------------------


	class CanItEditAdapter(DatasetAdapter):
	    """Adapter that turns CanItEdit rows into summary and detail dicts.

	    Rows must be dict-like (``row.get`` is used) and must provide ``before``
	    and ``after``. The fields ``full_name``, ``id``, ``name``, ``taxonomy``,
	    ``tests``, ``instruction_descriptive`` and ``instruction_lazy`` are
	    optional.
	    """

	    slug = "canitedit"
	    display_name = "CanItEdit"
	    has_ground_truth = False
	    has_tasks = False

	    def __init__(self, hf_dataset):
	        """Keep a reference to the dataset.

	        Args:
	            hf_dataset: Object supporting ``len()`` and integer indexing that
	                yields dict-like rows.
	        """
	        self._ds = hf_dataset

	    def problem_count(self) -> int:
	        """Return the number of rows in the dataset."""
	        return len(self._ds)

	    @staticmethod
	    def _taxonomy(row) -> dict[str, Any]:
	        """Return the row's ``taxonomy`` dict, or ``{}`` if it is missing or not a dict."""
	        taxonomy = row.get("taxonomy", {})
	        return taxonomy if isinstance(taxonomy, dict) else {}

	    @staticmethod
	    def _task_id(row, idx: int) -> str:
	        """Return ``full_name`` if present, else the stringified ``id`` (or ``idx``)."""
	        return row.get("full_name", str(row.get("id", idx)))

	    def get_problem_summary(self, idx: int) -> dict[str, Any]:
	        """Return the short listing entry for row ``idx``."""
	        row = self._ds[idx]
	        change_kind = self._taxonomy(row).get("change_kind", "")
	        return {
	            "idx": idx,
	            "task_id": self._task_id(row, idx),
	            "entry_point": row.get("name", f"edit_{idx}"),
	            "num_inputs": 0,
	            "source": change_kind or "CanItEdit",
	        }

	    def get_problem_detail(self, idx: int) -> dict[str, Any]:
	        """Return the full detail dict for row ``idx``.

	        ``after`` is exposed as ``code`` / ``fixed_code`` and ``before`` as
	        ``buggy_code``, each with a highlighted counterpart produced by the
	        injected ``_highlight_code`` helper.
	        """
	        row = self._ds[idx]
	        before = row["before"]
	        after = row["after"]
	        taxonomy = self._taxonomy(row)
	        change_kind = taxonomy.get("change_kind", "")
	        # The same highlighted "after" code feeds two keys, so render it once.
	        # This assumes _highlight_code is deterministic for equal arguments.
	        after_html = _highlight_code(after)
	        before_html = _highlight_code(before)
	        return {
	            "idx": idx,
	            "task_id": self._task_id(row, idx),
	            "entry_point": row.get("name", f"edit_{idx}"),
	            "code": after,
	            "highlighted_code": after_html,
	            "inputs": [],
	            "outputs": [],
	            "test": row.get("tests", ""),
	            "tasks": [],
	            # Same fallback as the summary: an empty change_kind also maps to
	            # "CanItEdit", so list and detail views agree on the source.
	            "source": change_kind or "CanItEdit",
	            "has_ground_truth": False,
	            "has_tasks": False,
	            "description": row.get("instruction_descriptive", ""),
	            "buggy_code": before,
	            "buggy_highlighted_code": before_html,
	            "fixed_code": after,
	            "fixed_highlighted_code": after_html,
	            "bug_category": change_kind,
	            "bug_subtype": taxonomy.get("topic", ""),
	            "bug_explanation": row.get("instruction_lazy", ""),
	        }


	# ---------------------------------------------------------------------------
	# CodeEditorBench adapter (HuggingFace: m-a-p/CodeEditorBench)
	# ---------------------------------------------------------------------------


	class CodeEditorBenchAdapter(DatasetAdapter):
	    """Adapter that turns CodeEditorBench rows into summary and detail dicts.

	    Unlike the other adapters in this file it wraps a plain list of dicts.
	    The ``_task_type`` key of each row (default ``"unknown"``) decides which
	    fields hold the "before" and "after" code.
	    """

	    slug = "codeeditorbench"
	    display_name = "CodeEditorBench"
	    has_ground_truth = False
	    has_tasks = False

	    def __init__(self, rows: list[dict[str, Any]]):
	        """Keep a reference to the list of row dicts."""
	        self._rows = rows

	    def problem_count(self) -> int:
	        """Return the number of rows."""
	        return len(self._rows)

	    @staticmethod
	    def _first_present(row: dict[str, Any], *keys: str, default: Any = "") -> Any:
	        """Return the first value among ``keys`` that is present and not ``None``.

	        A key whose value is ``None`` is treated like a missing key so the
	        next fallback still applies; empty strings are returned as-is.
	        """
	        for key in keys:
	            value = row.get(key)
	            if value is not None:
	                return value
	        return default

	    def get_problem_summary(self, idx: int) -> dict[str, Any]:
	        """Return the short listing entry for row ``idx``."""
	        row = self._rows[idx]
	        return {
	            "idx": idx,
	            "task_id": str(row.get("idx", idx)),
	            "entry_point": row.get("title", f"problem_{idx}"),
	            "num_inputs": 0,
	            "source": row.get("_task_type", "unknown"),
	        }

	    def get_problem_detail(self, idx: int) -> dict[str, Any]:
	        """Return the full detail dict for row ``idx``.

	        The "before" code is exposed as ``buggy_code`` and the "after" code as
	        ``code`` / ``fixed_code``. Highlighted variants are produced by the
	        injected ``_highlight_code`` helper and are ``""`` for empty code.
	        """
	        row = self._rows[idx]
	        task_type = row.get("_task_type", "unknown")
	        # Try each language field in turn. An empty code_language must not
	        # hide a usable source_lang.
	        lang = row.get("code_language") or row.get("source_lang") or "python"
	        lang_key = lang.lower()

	        if task_type == "code_debug":
	            buggy = self._first_present(row, "incorrect_solutions")
	            fixed = self._first_present(row, "solutions")
	        elif task_type in ("code_translate", "code_polishment"):
	            # NOTE(review): for code_translate the solution is presumably in a
	            # different language than the source, yet both are highlighted
	            # with the same lexer. Confirm whether the rows carry a
	            # target-language field.
	            buggy = self._first_present(row, "source_code")
	            fixed = self._first_present(row, "solutions", "source_code")
	        else:  # code_switch
	            buggy = self._first_present(row, "similar_source_code", "source_code")
	            fixed = self._first_present(row, "solutions", "source_code")

	        # The same highlighted "after" code feeds two keys, so render it once.
	        # This assumes _highlight_code is deterministic for equal arguments.
	        fixed_html = _highlight_code(fixed, language=lang_key) if fixed else ""
	        buggy_html = _highlight_code(buggy, language=lang_key) if buggy else ""
	        difficulty = row.get("difficulty", "")

	        return {
	            "idx": idx,
	            "task_id": str(row.get("idx", idx)),
	            "entry_point": row.get("title", f"problem_{idx}"),
	            "code": fixed,
	            "highlighted_code": fixed_html,
	            "inputs": [],
	            "outputs": [],
	            "test": None,
	            "tasks": [],
	            "source": task_type,
	            "has_ground_truth": False,
	            "has_tasks": False,
	            "description": "",
	            "buggy_code": buggy,
	            "buggy_highlighted_code": buggy_html,
	            "fixed_code": fixed,
	            "fixed_highlighted_code": fixed_html,
	            "bug_category": task_type,
	            "bug_subtype": difficulty,
	            "bug_explanation": "",
	            "difficulty": difficulty,
	            "language": lang,
	        }


	# ---------------------------------------------------------------------------
	# CodeXGLUE Code Refinement adapter (HuggingFace: google/code_x_glue_cc_code_refinement)
	# ---------------------------------------------------------------------------


	class CodeXGLUERefinementAdapter(DatasetAdapter):
	    """Adapter that turns CodeXGLUE code-refinement rows into summary and detail dicts.

	    Rows must be dict-like (``row.get`` is used). ``id``, ``buggy`` and
	    ``fixed`` are all optional, and code is always highlighted as Java.
	    """

	    slug = "codexgluerefinement"
	    display_name = "CodeXGLUE Code Refinement"
	    has_ground_truth = False
	    has_tasks = False

	    def __init__(self, hf_dataset):
	        """Keep a reference to the dataset.

	        Args:
	            hf_dataset: Object supporting ``len()`` and integer indexing that
	                yields dict-like rows.
	        """
	        self._ds = hf_dataset

	    def problem_count(self) -> int:
	        """Return the number of rows in the dataset."""
	        return len(self._ds)

	    def get_problem_summary(self, idx: int) -> dict[str, Any]:
	        """Return the short listing entry for row ``idx``."""
	        row = self._ds[idx]
	        # Fall back to the positional index when the row carries no "id".
	        row_id = row.get("id", idx)
	        return {
	            "idx": idx,
	            "task_id": str(row_id),
	            "entry_point": f"refinement_{row_id}",
	            "num_inputs": 0,
	            "source": "CodeXGLUE",
	        }

	    def get_problem_detail(self, idx: int) -> dict[str, Any]:
	        """Return the full detail dict for row ``idx``.

	        ``fixed`` is exposed as ``code`` / ``fixed_code`` and ``buggy`` as
	        ``buggy_code``. Highlighted variants are produced by the injected
	        ``_highlight_code`` helper and are ``""`` for empty code.
	        """
	        row = self._ds[idx]
	        row_id = row.get("id", idx)
	        buggy = row.get("buggy", "")
	        fixed = row.get("fixed", "")
	        # The same highlighted "fixed" code feeds two keys, so render it once.
	        # This assumes _highlight_code is deterministic for equal arguments.
	        fixed_html = _highlight_code(fixed, language="java") if fixed else ""
	        buggy_html = _highlight_code(buggy, language="java") if buggy else ""
	        return {
	            "idx": idx,
	            "task_id": str(row_id),
	            "entry_point": f"refinement_{row_id}",
	            "code": fixed,
	            "highlighted_code": fixed_html,
	            "inputs": [],
	            "outputs": [],
	            "test": None,
	            "tasks": [],
	            "source": "CodeXGLUE",
	            "has_ground_truth": False,
	            "has_tasks": False,
	            "description": "",
	            "buggy_code": buggy,
	            "buggy_highlighted_code": buggy_html,
	            "fixed_code": fixed,
	            "fixed_highlighted_code": fixed_html,
	            "bug_category": "Code Refinement",
	            "bug_subtype": "",
	            "bug_explanation": "",
	            "language": "Java",
	        }


	# ---------------------------------------------------------------------------
	# CommitBench adapter (HuggingFace: Maxscha/commitbench)
	# ---------------------------------------------------------------------------


	class CommitBenchAdapter(DatasetAdapter):
	    """Adapter that turns CommitBench rows into summary and detail dicts.

	    Rows must be dict-like (``row.get`` is used). The fields ``hash``,
	    ``project``, ``diff``, ``message`` and ``diff_languages`` are all
	    optional.
	    """

	    slug = "commitbench"
	    display_name = "CommitBench"
	    has_ground_truth = False
	    has_tasks = False

	    def __init__(self, hf_dataset):
	        """Keep a reference to the dataset (needs ``len()`` and integer indexing)."""
	        self._ds = hf_dataset

	    def problem_count(self) -> int:
	        """Return the number of rows in the dataset."""
	        size = len(self._ds)
	        return size

	    def get_problem_summary(self, idx: int) -> dict[str, Any]:
	        """Return the short listing entry for row ``idx``."""
	        record = self._ds[idx]
	        # Only the first 12 characters of the hash (or of the index) are shown.
	        short_hash = record.get("hash", str(idx))[:12]
	        project = record.get("project", f"commit_{idx}")
	        languages = record.get("diff_languages", "unknown")
	        return {
	            "idx": idx,
	            "task_id": short_hash,
	            "entry_point": project,
	            "num_inputs": 0,
	            "source": languages,
	        }

	    def get_problem_detail(self, idx: int) -> dict[str, Any]:
	        """Return the full detail dict for row ``idx``.

	        The diff is exposed under both ``code`` and ``patch`` and the commit
	        message as ``description``; no highlighted HTML is produced.
	        """
	        record = self._ds[idx]
	        diff_text = record.get("diff", "")
	        commit_message = record.get("message", "")
	        short_hash = record.get("hash", str(idx))[:12]
	        detail: dict[str, Any] = {
	            "idx": idx,
	            "task_id": short_hash,
	            "entry_point": record.get("project", f"commit_{idx}"),
	            "code": diff_text,
	            "highlighted_code": "",
	            "inputs": [],
	            "outputs": [],
	            "test": None,
	            "tasks": [],
	            "source": record.get("diff_languages", "unknown"),
	            "has_ground_truth": False,
	            "has_tasks": False,
	            "description": commit_message,
	            "patch": diff_text,
	            # These three use "" rather than the display fallbacks used above.
	            "repo": record.get("project", ""),
	            "commit_hash": record.get("hash", ""),
	            "diff_languages": record.get("diff_languages", ""),
	        }
	        return detail