# PatchJudge — patchjudge/models.py
# Uploaded by VD10 (commit 3bb15c1, verified)
"""Data models for PatchJudge."""
import json
from dataclasses import asdict, dataclass, field, fields
from typing import Optional
@dataclass
class PatchExample:
    """Unified format for a single patch evaluation example.

    Represents one (issue, gold patch, agent patch) triple, plus the
    metadata needed to evaluate the agent's patch against the reference.
    """
    instance_id: str
    repo: str
    problem_statement: str
    gold_patch: str            # Human-written reference patch
    agent_patch: str           # AI-generated patch
    agent_name: str            # Which agent produced this
    test_passed: bool          # Did the agent's patch pass tests?
    base_commit: str
    repo_context: dict = field(default_factory=dict)  # {filename: file_content}
    difficulty: str = ""

    def to_dict(self) -> dict:
        """Return a plain-dict (deep) copy suitable for JSON serialization."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> "PatchExample":
        """Build an example from a dict, ignoring unknown keys.

        Unknown keys are dropped (rather than raising TypeError) so that
        payloads written by a newer schema version, or enriched with extra
        metadata, can still be loaded.
        """
        known = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in d.items() if k in known})

    def to_json(self) -> str:
        """Serialize to a pretty-printed JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, s: str) -> "PatchExample":
        """Parse a JSON string produced by (or compatible with) to_json()."""
        return cls.from_dict(json.loads(s))
@dataclass
class PatchFeatures:
    """Structured features extracted from a patch.

    All fields default to "empty"/zero so a features object can be built
    incrementally by independent extractors.
    """
    # Diff statistics
    num_files_changed: int = 0
    num_lines_added: int = 0
    num_lines_removed: int = 0
    num_hunks: int = 0
    # Code structure
    added_functions: list = field(default_factory=list)      # names of newly added functions
    modified_functions: list = field(default_factory=list)   # names of functions touched by the diff
    has_error_handling: bool = False
    has_edge_case_handling: bool = False
    # Issue-patch alignment
    issue_keywords_addressed: list = field(default_factory=list)
    issue_components_mentioned: list = field(default_factory=list)
    keyword_coverage_ratio: float = 0.0   # fraction of issue keywords the patch addresses
    # Code quality signals
    has_todos: bool = False
    has_hardcoded_values: bool = False
    has_debug_statements: bool = False
    follows_project_style: bool = True
    style_violations: list = field(default_factory=list)
    # Risk signals
    modifies_core_files: bool = False
    change_scope: str = "minimal"  # minimal, moderate, extensive
    has_imports_added: bool = False
    new_imports: list = field(default_factory=list)
    touches_tests: bool = False
    # Complexity
    cyclomatic_complexity_delta: int = 0
    nesting_depth_max: int = 0

    def to_dict(self) -> dict:
        """Return a plain-dict (deep) copy via dataclasses.asdict."""
        return asdict(self)
@dataclass
class DimensionScore:
    """Score for a single evaluation dimension."""
    # Integer rating on a 0-10 scale (higher is better).
    score: int  # 0-10
    # Free-text judge explanation for the assigned score.
    reasoning: str
    # Short warning strings raised while scoring this dimension.
    flags: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a plain-dict (deep) copy via dataclasses.asdict."""
        return asdict(self)
@dataclass
class JudgeResult:
    """Complete judge evaluation output.

    ``dimension_scores`` maps a dimension name to either a
    ``DimensionScore`` instance or its plain-dict form. All accessors
    below normalize both shapes, since a result may be built in memory
    (objects) or reloaded from serialized JSON (dicts); the original
    accessors assumed dicts only and raised AttributeError on objects.
    """
    merge_score: float  # 0-100 weighted score
    dimension_scores: dict = field(default_factory=dict)  # dim_name -> DimensionScore or dict
    raw_output: str = ""
    features: Optional[PatchFeatures] = None
    model_used: str = ""

    def _dim_entry(self, name: str) -> dict:
        """Return the named dimension as a plain dict ({} when absent)."""
        entry = self.dimension_scores.get(name)
        if entry is None:
            return {}
        if isinstance(entry, dict):
            return entry
        # DimensionScore (or anything exposing to_dict) -> plain dict.
        return entry.to_dict()

    @property
    def correctness(self) -> int:
        return self._dim_entry("correctness").get("score", 0)

    @property
    def completeness(self) -> int:
        return self._dim_entry("completeness").get("score", 0)

    @property
    def code_quality(self) -> int:
        return self._dim_entry("code_quality").get("score", 0)

    @property
    def non_regression_risk(self) -> int:
        return self._dim_entry("non_regression_risk").get("score", 0)

    @property
    def merge_readiness(self) -> int:
        return self._dim_entry("merge_readiness").get("score", 0)

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict (features included when set).

        DimensionScore entries are converted to plain dicts so the result
        is always json.dumps-able.
        """
        d = {
            "merge_score": self.merge_score,
            "dimension_scores": {
                name: (entry if isinstance(entry, dict) else entry.to_dict())
                for name, entry in self.dimension_scores.items()
            },
            "raw_output": self.raw_output,
            "model_used": self.model_used,
        }
        if self.features:
            d["features"] = self.features.to_dict()
        return d

    def summary(self) -> str:
        """Return a human-readable multi-line summary of all scores."""
        lines = [f"MergeScore: {self.merge_score:.1f}/100"]
        for dim in self.dimension_scores:
            entry = self._dim_entry(dim)
            score = entry.get("score", "?")
            lines.append(f" {dim}: {score}/10")
            # Surface any warning flags under the dimension line.
            for flag in entry.get("flags") or []:
                lines.append(f" ⚠ {flag}")
        return "\n".join(lines)
@dataclass
class ValidationResult:
    """Result of validating PatchJudge against ground truth."""
    # Number of examples included in this validation run.
    total_examples: int = 0
    # METR alignment: fraction of test-passing patches scoring below 50
    test_passing_below_50_pct: float = 0.0
    # Correlation metrics
    score_resolved_correlation: float = 0.0   # correlation between judge score and resolution
    mean_score_resolved: float = 0.0          # mean judge score over resolved examples
    mean_score_unresolved: float = 0.0        # mean judge score over unresolved examples
    # Known-bad detection
    known_bad_detected: int = 0
    known_bad_total: int = 0
    known_bad_detection_rate: float = 0.0     # detected / total (presumably; confirm in validator)
    # Score distribution
    score_mean: float = 0.0
    score_std: float = 0.0
    score_median: float = 0.0
    # Per-dimension stats
    dimension_stats: dict = field(default_factory=dict)  # dim_name -> aggregate stats dict

    def to_dict(self) -> dict:
        """Return a plain-dict (deep) copy via dataclasses.asdict."""
        return asdict(self)