| """Data models for PatchJudge.""" |
|
|
import json
from dataclasses import asdict, dataclass, field, fields
from typing import Optional
|
|
|
|
@dataclass
class PatchExample:
    """Unified format for a single patch evaluation example.

    Bundles the issue, the reference (gold) patch, and one agent's
    candidate patch, plus enough metadata to reproduce the evaluation.
    """
    instance_id: str          # unique id for this task instance
    repo: str                 # "owner/name" of the target repository
    problem_statement: str    # issue text the agent was asked to fix
    gold_patch: str           # reference diff that resolves the issue
    agent_patch: str          # candidate diff produced by the agent
    agent_name: str           # which agent/model produced agent_patch
    test_passed: bool         # did the repo's tests pass with agent_patch applied
    base_commit: str          # commit the patches apply against
    repo_context: dict = field(default_factory=dict)  # optional extra repo info
    difficulty: str = ""      # optional difficulty label

    def to_dict(self) -> dict:
        """Serialize to a plain dict (mutable fields are deep-copied by asdict)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> "PatchExample":
        """Build an example from a dict.

        Unknown keys are ignored so that dicts serialized by older or
        newer schema versions (or carrying extra metadata) still load
        instead of raising TypeError in the constructor.
        """
        known = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in d.items() if k in known})

    def to_json(self) -> str:
        """Serialize to a pretty-printed JSON string."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, s: str) -> "PatchExample":
        """Parse a JSON string produced by to_json (extra keys tolerated)."""
        return cls.from_dict(json.loads(s))
|
|
|
|
@dataclass
class PatchFeatures:
    """Structured features extracted from a patch.

    Plain feature container: all fields default to "empty/zero" so a
    PatchFeatures() with no arguments represents "nothing extracted".
    """

    # --- Diff size / shape ---
    num_files_changed: int = 0
    num_lines_added: int = 0
    num_lines_removed: int = 0
    num_hunks: int = 0

    # --- Code-structure signals ---
    added_functions: list = field(default_factory=list)     # names of functions the patch adds
    modified_functions: list = field(default_factory=list)  # names of functions the patch touches
    has_error_handling: bool = False
    has_edge_case_handling: bool = False

    # --- Alignment with the issue text ---
    issue_keywords_addressed: list = field(default_factory=list)
    issue_components_mentioned: list = field(default_factory=list)
    keyword_coverage_ratio: float = 0.0  # fraction of issue keywords the patch addresses

    # --- Code-quality smells ---
    has_todos: bool = False
    has_hardcoded_values: bool = False
    has_debug_statements: bool = False
    follows_project_style: bool = True
    style_violations: list = field(default_factory=list)

    # --- Scope / blast radius of the change ---
    modifies_core_files: bool = False
    change_scope: str = "minimal"  # qualitative scope label; "minimal" is the smallest
    has_imports_added: bool = False
    new_imports: list = field(default_factory=list)
    touches_tests: bool = False

    # --- Complexity metrics ---
    cyclomatic_complexity_delta: int = 0  # complexity change introduced by the patch
    nesting_depth_max: int = 0

    def to_dict(self) -> dict:
        """Serialize to a plain dict (lists are copied by asdict)."""
        return asdict(self)
|
|
|
|
@dataclass
class DimensionScore:
    """Score for a single evaluation dimension.

    Holds the numeric score, the judge's free-text justification, and
    any warning flags raised for this dimension.
    """
    score: int       # 0-10 rating for this dimension
    reasoning: str   # judge's explanation for the score
    flags: list = field(default_factory=list)  # warning strings, may be empty

    def to_dict(self) -> dict:
        """Serialize to a plain dict; asdict also copies the flags list."""
        as_mapping = asdict(self)
        return as_mapping
|
|
|
|
@dataclass
class JudgeResult:
    """Complete judge evaluation output.

    dimension_scores maps a dimension name to a dict of the shape
    {"score": int, "reasoning": str, "flags": [str, ...]} (see
    DimensionScore.to_dict elsewhere in this module).
    """
    merge_score: float                                 # overall 0-100 score
    dimension_scores: dict = field(default_factory=dict)
    raw_output: str = ""                               # unparsed judge/model output
    # Forward-ref string so this class does not evaluate the sibling
    # PatchFeatures name at class-creation time.
    features: Optional["PatchFeatures"] = None
    model_used: str = ""                               # judge model identifier

    def _dimension_score(self, name: str) -> int:
        """Return the score for dimension *name*, or 0 if missing/unscored."""
        return self.dimension_scores.get(name, {}).get("score", 0)

    @property
    def correctness(self) -> int:
        return self._dimension_score("correctness")

    @property
    def completeness(self) -> int:
        return self._dimension_score("completeness")

    @property
    def code_quality(self) -> int:
        return self._dimension_score("code_quality")

    @property
    def non_regression_risk(self) -> int:
        return self._dimension_score("non_regression_risk")

    @property
    def merge_readiness(self) -> int:
        return self._dimension_score("merge_readiness")

    def to_dict(self) -> dict:
        """Serialize to a plain dict; "features" is included only when set."""
        d = {
            "merge_score": self.merge_score,
            "dimension_scores": self.dimension_scores,
            "raw_output": self.raw_output,
            "model_used": self.model_used,
        }
        if self.features is not None:
            d["features"] = self.features.to_dict()
        return d

    def summary(self) -> str:
        """Return a short human-readable report: overall score, then one
        line per dimension with any flags indented beneath it."""
        lines = [f"MergeScore: {self.merge_score:.1f}/100"]
        for dim, data in self.dimension_scores.items():
            score = data.get("score", "?")
            lines.append(f"  {dim}: {score}/10")
            if data.get("flags"):
                for flag in data["flags"]:
                    lines.append(f"    ⚠ {flag}")
        return "\n".join(lines)
|
|
|
|
@dataclass
class ValidationResult:
    """Result of validating PatchJudge against ground truth.

    Aggregate statistics over a set of scored examples; all fields
    default to zero/empty so an empty run is representable.
    """
    total_examples: int = 0  # number of examples evaluated

    # Fraction of test-passing patches that scored below 50.
    test_passing_below_50_pct: float = 0.0

    # --- Relationship between score and resolved status ---
    score_resolved_correlation: float = 0.0
    mean_score_resolved: float = 0.0
    mean_score_unresolved: float = 0.0

    # --- Detection of known-bad patches ---
    known_bad_detected: int = 0
    known_bad_total: int = 0
    known_bad_detection_rate: float = 0.0  # detected / total

    # --- Overall score distribution ---
    score_mean: float = 0.0
    score_std: float = 0.0
    score_median: float = 0.0

    # Per-dimension statistics, keyed by dimension name.
    dimension_stats: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict (nested containers copied by asdict)."""
        return asdict(self)
|
|