# PatchJudge — patchjudge/models.py
# Uploaded by VD10 (commit 3bb15c1, verified)
"""Data models for PatchJudge."""
import json
from dataclasses import asdict, dataclass, field, fields
from typing import Optional
@dataclass
class PatchExample:
    """Unified format for a single patch evaluation example.

    Represents one (issue, gold patch, agent patch) triple, plus the
    metadata needed to evaluate the agent's patch against the reference.
    """
    instance_id: str
    repo: str
    problem_statement: str
    gold_patch: str            # Human-written reference patch
    agent_patch: str           # AI-generated patch
    agent_name: str            # Which agent produced this
    test_passed: bool          # Did the agent's patch pass tests?
    base_commit: str
    repo_context: dict = field(default_factory=dict)  # {filename: file_content}
    difficulty: str = ""

    def to_dict(self) -> dict:
        """Return a plain-dict (deep) copy suitable for JSON serialization."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> "PatchExample":
        """Build an example from a dict, ignoring unknown keys.

        Unknown keys are dropped (rather than raising TypeError) so that
        payloads written by a newer schema version, or enriched with extra
        metadata, can still be loaded.
        """
        known = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in d.items() if k in known})

    def to_json(self) -> str:
        """Serialize to a pretty-printed JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, s: str) -> "PatchExample":
        """Parse a JSON string produced by (or compatible with) to_json()."""
        return cls.from_dict(json.loads(s))
@dataclass
class PatchFeatures:
    """Structured features extracted from a patch.

    All fields default to "empty"/zero so a features object can be built
    incrementally by independent extractors.
    """
    # Diff statistics
    num_files_changed: int = 0
    num_lines_added: int = 0
    num_lines_removed: int = 0
    num_hunks: int = 0
    # Code structure
    added_functions: list = field(default_factory=list)      # names of newly added functions
    modified_functions: list = field(default_factory=list)   # names of functions touched by the diff
    has_error_handling: bool = False
    has_edge_case_handling: bool = False
    # Issue-patch alignment
    issue_keywords_addressed: list = field(default_factory=list)
    issue_components_mentioned: list = field(default_factory=list)
    keyword_coverage_ratio: float = 0.0   # fraction of issue keywords the patch addresses
    # Code quality signals
    has_todos: bool = False
    has_hardcoded_values: bool = False
    has_debug_statements: bool = False
    follows_project_style: bool = True
    style_violations: list = field(default_factory=list)
    # Risk signals
    modifies_core_files: bool = False
    change_scope: str = "minimal"  # minimal, moderate, extensive
    has_imports_added: bool = False
    new_imports: list = field(default_factory=list)
    touches_tests: bool = False
    # Complexity
    cyclomatic_complexity_delta: int = 0
    nesting_depth_max: int = 0

    def to_dict(self) -> dict:
        """Return a plain-dict (deep) copy via dataclasses.asdict."""
        return asdict(self)
@dataclass
class DimensionScore:
    """Score for a single evaluation dimension."""
    # Integer rating on a 0-10 scale (higher is better).
    score: int  # 0-10
    # Free-text judge explanation for the assigned score.
    reasoning: str
    # Short warning strings raised while scoring this dimension.
    flags: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a plain-dict (deep) copy via dataclasses.asdict."""
        return asdict(self)
@dataclass
class JudgeResult:
    """Complete judge evaluation output.

    ``dimension_scores`` maps a dimension name to either a
    ``DimensionScore`` instance or its plain-dict form. All accessors
    below normalize both shapes, since a result may be built in memory
    (objects) or reloaded from serialized JSON (dicts); the original
    accessors assumed dicts only and raised AttributeError on objects.
    """
    merge_score: float  # 0-100 weighted score
    dimension_scores: dict = field(default_factory=dict)  # dim_name -> DimensionScore or dict
    raw_output: str = ""
    features: Optional[PatchFeatures] = None
    model_used: str = ""

    def _dim_entry(self, name: str) -> dict:
        """Return the named dimension as a plain dict ({} when absent)."""
        entry = self.dimension_scores.get(name)
        if entry is None:
            return {}
        if isinstance(entry, dict):
            return entry
        # DimensionScore (or anything exposing to_dict) -> plain dict.
        return entry.to_dict()

    @property
    def correctness(self) -> int:
        return self._dim_entry("correctness").get("score", 0)

    @property
    def completeness(self) -> int:
        return self._dim_entry("completeness").get("score", 0)

    @property
    def code_quality(self) -> int:
        return self._dim_entry("code_quality").get("score", 0)

    @property
    def non_regression_risk(self) -> int:
        return self._dim_entry("non_regression_risk").get("score", 0)

    @property
    def merge_readiness(self) -> int:
        return self._dim_entry("merge_readiness").get("score", 0)

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (features included when set).

        DimensionScore entries are converted to plain dicts so the result
        is always json.dumps-able.
        """
        d = {
            "merge_score": self.merge_score,
            "dimension_scores": {
                name: (entry if isinstance(entry, dict) else entry.to_dict())
                for name, entry in self.dimension_scores.items()
            },
            "raw_output": self.raw_output,
            "model_used": self.model_used,
        }
        if self.features:
            d["features"] = self.features.to_dict()
        return d

    def summary(self) -> str:
        """Return a human-readable multi-line summary of all scores."""
        lines = [f"MergeScore: {self.merge_score:.1f}/100"]
        for dim in self.dimension_scores:
            entry = self._dim_entry(dim)
            score = entry.get("score", "?")
            lines.append(f" {dim}: {score}/10")
            # Surface any warning flags under the dimension line.
            for flag in entry.get("flags") or []:
                lines.append(f" ⚠ {flag}")
        return "\n".join(lines)
@dataclass
class ValidationResult:
    """Result of validating PatchJudge against ground truth."""
    # Number of examples included in this validation run.
    total_examples: int = 0
    # METR alignment: fraction of test-passing patches scoring below 50
    test_passing_below_50_pct: float = 0.0
    # Correlation metrics
    score_resolved_correlation: float = 0.0   # correlation between judge score and resolution
    mean_score_resolved: float = 0.0          # mean judge score over resolved examples
    mean_score_unresolved: float = 0.0        # mean judge score over unresolved examples
    # Known-bad detection
    known_bad_detected: int = 0
    known_bad_total: int = 0
    known_bad_detection_rate: float = 0.0     # detected / total (presumably; confirm in validator)
    # Score distribution
    score_mean: float = 0.0
    score_std: float = 0.0
    score_median: float = 0.0
    # Per-dimension stats
    dimension_stats: dict = field(default_factory=dict)  # dim_name -> aggregate stats dict

    def to_dict(self) -> dict:
        """Return a plain-dict (deep) copy via dataclasses.asdict."""
        return asdict(self)