VD10
/

PatchJudge

+"""Data models for PatchJudge."""
+from dataclasses import dataclass, field, asdict
+from typing import Optional
+import json
@dataclass
class PatchExample:
    """One patch-evaluation record in the unified PatchJudge format.

    Holds a reference ("gold") patch and an agent-produced patch for the
    same issue, plus the metadata stored alongside them. Instances can be
    converted to and from plain dicts and JSON strings.
    """

    instance_id: str
    repo: str
    problem_statement: str
    gold_patch: str  # human-written reference patch
    agent_patch: str  # AI-generated patch
    agent_name: str  # which agent produced the patch
    test_passed: bool  # whether the agent's patch passed the tests
    base_commit: str
    repo_context: dict = field(default_factory=dict)  # {filename: file_content}
    difficulty: str = ""

    def to_dict(self) -> dict:
        """Return every field as a plain dict, keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping

    @classmethod
    def from_dict(cls, d: dict) -> "PatchExample":
        """Build an example from a dict whose keys are the field names.

        The dict is expanded into keyword arguments for the constructor,
        so a missing required field or an unknown key raises ``TypeError``.
        """
        example = cls(**d)
        return example

    def to_json(self) -> str:
        """Serialize the example to a JSON string indented by two spaces."""
        payload = self.to_dict()
        return json.dumps(payload, indent=2)

    @classmethod
    def from_json(cls, s: str) -> "PatchExample":
        """Parse a JSON object string and build an example from it."""
        decoded = json.loads(s)
        return cls.from_dict(decoded)
@dataclass
class PatchFeatures:
    """Structured features extracted from a patch.

    Every field has a default, so ``PatchFeatures()`` is a valid empty
    feature set. List-valued fields get a fresh list per instance.
    """

    # --- Diff statistics ---
    num_files_changed: int = 0
    num_lines_added: int = 0
    num_lines_removed: int = 0
    num_hunks: int = 0

    # --- Code structure ---
    added_functions: list = field(default_factory=list)
    modified_functions: list = field(default_factory=list)
    has_error_handling: bool = False
    has_edge_case_handling: bool = False

    # --- Issue-patch alignment ---
    issue_keywords_addressed: list = field(default_factory=list)
    issue_components_mentioned: list = field(default_factory=list)
    keyword_coverage_ratio: float = 0.0

    # --- Code quality signals ---
    has_todos: bool = False
    has_hardcoded_values: bool = False
    has_debug_statements: bool = False
    follows_project_style: bool = True
    style_violations: list = field(default_factory=list)

    # --- Risk signals ---
    modifies_core_files: bool = False
    change_scope: str = "minimal"  # one of: minimal, moderate, extensive
    has_imports_added: bool = False
    new_imports: list = field(default_factory=list)
    touches_tests: bool = False

    # --- Complexity ---
    cyclomatic_complexity_delta: int = 0
    nesting_depth_max: int = 0

    def to_dict(self) -> dict:
        """Return every feature as a plain dict, keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
@dataclass
class DimensionScore:
    """Score for a single evaluation dimension.

    ``score`` and ``reasoning`` are required; ``flags`` defaults to a
    fresh empty list for each instance.
    """

    score: int  # 0-10
    reasoning: str
    flags: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the score, reasoning and flags as a plain dict."""
        as_mapping = asdict(self)
        return as_mapping
@dataclass
class JudgeResult:
    """Complete judge evaluation output.

    ``dimension_scores`` maps a dimension name to that dimension's result.
    An entry may be a plain dict (``{"score": ..., "reasoning": ...,
    "flags": [...]}``) or an object exposing the same names as attributes,
    such as a ``DimensionScore``. The score properties, ``to_dict`` and
    ``summary`` accept both forms.
    """

    merge_score: float  # 0-100 weighted score
    dimension_scores: dict = field(default_factory=dict)  # dim_name -> DimensionScore or dict
    raw_output: str = ""
    # Forward reference: the class body does not need PatchFeatures to be
    # defined at class-creation time.
    features: Optional["PatchFeatures"] = None
    model_used: str = ""

    @staticmethod
    def _entry_value(entry, key, default=None):
        """Read *key* from a dimension entry given as a dict or an object."""
        if isinstance(entry, dict):
            return entry.get(key, default)
        return getattr(entry, key, default)

    def _dimension_score(self, name: str) -> int:
        """Return the score stored for *name*, or 0 when it is absent."""
        entry = self.dimension_scores.get(name)
        return self._entry_value(entry, "score", 0)

    @property
    def correctness(self) -> int:
        """Score of the ``correctness`` dimension (0 if missing)."""
        return self._dimension_score("correctness")

    @property
    def completeness(self) -> int:
        """Score of the ``completeness`` dimension (0 if missing)."""
        return self._dimension_score("completeness")

    @property
    def code_quality(self) -> int:
        """Score of the ``code_quality`` dimension (0 if missing)."""
        return self._dimension_score("code_quality")

    @property
    def non_regression_risk(self) -> int:
        """Score of the ``non_regression_risk`` dimension (0 if missing)."""
        return self._dimension_score("non_regression_risk")

    @property
    def merge_readiness(self) -> int:
        """Score of the ``merge_readiness`` dimension (0 if missing)."""
        return self._dimension_score("merge_readiness")

    def to_dict(self) -> dict:
        """Return the result as a plain dict.

        Dimension entries that provide their own ``to_dict`` are converted
        through it, so the result contains only plain containers; dict
        entries are passed through unchanged. ``features`` is included
        only when it is set.
        """
        d = {
            "merge_score": self.merge_score,
            "dimension_scores": {
                name: entry.to_dict() if hasattr(entry, "to_dict") else entry
                for name, entry in self.dimension_scores.items()
            },
            "raw_output": self.raw_output,
            "model_used": self.model_used,
        }
        if self.features:
            d["features"] = self.features.to_dict()
        return d

    def summary(self) -> str:
        """Return a multi-line, human-readable summary of the scores.

        The first line is the merge score; each dimension follows on its
        own line, with one indented warning line per flag.
        """
        lines = [f"MergeScore: {self.merge_score:.1f}/100"]
        for dim, entry in self.dimension_scores.items():
            score = self._entry_value(entry, "score", "?")
            lines.append(f"  {dim}: {score}/10")
            # A missing or empty flag list produces no warning lines.
            for flag in self._entry_value(entry, "flags") or ():
                lines.append(f"    ⚠ {flag}")
        return "\n".join(lines)
@dataclass
class ValidationResult:
    """Result of validating PatchJudge against ground truth.

    Every field has a default, so an empty result can be created with
    ``ValidationResult()`` and filled in afterwards.
    """

    total_examples: int = 0

    # --- METR alignment ---
    # Fraction of test-passing patches scoring below 50.
    test_passing_below_50_pct: float = 0.0

    # --- Correlation metrics ---
    score_resolved_correlation: float = 0.0
    mean_score_resolved: float = 0.0
    mean_score_unresolved: float = 0.0

    # --- Known-bad detection ---
    known_bad_detected: int = 0
    known_bad_total: int = 0
    known_bad_detection_rate: float = 0.0

    # --- Score distribution ---
    score_mean: float = 0.0
    score_std: float = 0.0
    score_median: float = 0.0

    # --- Per-dimension stats ---
    dimension_stats: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return every metric as a plain dict, keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping