VD10 committed on
Commit
3bb15c1
·
verified ·
1 Parent(s): a06a32d

Upload patchjudge/models.py

Browse files
Files changed (1) hide show
  1. patchjudge/models.py +163 -0
patchjudge/models.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data models for PatchJudge."""
2
+
3
+ from dataclasses import dataclass, field, asdict
4
+ from typing import Optional
5
+ import json
6
+
7
+
8
+ @dataclass
9
+ class PatchExample:
10
+ """Unified format for a single patch evaluation example."""
11
+ instance_id: str
12
+ repo: str
13
+ problem_statement: str
14
+ gold_patch: str # Human-written reference patch
15
+ agent_patch: str # AI-generated patch
16
+ agent_name: str # Which agent produced this
17
+ test_passed: bool # Did the agent's patch pass tests?
18
+ base_commit: str
19
+ repo_context: dict = field(default_factory=dict) # {filename: file_content}
20
+ difficulty: str = ""
21
+
22
+ def to_dict(self) -> dict:
23
+ return asdict(self)
24
+
25
+ @classmethod
26
+ def from_dict(cls, d: dict) -> "PatchExample":
27
+ return cls(**d)
28
+
29
+ def to_json(self) -> str:
30
+ return json.dumps(self.to_dict(), indent=2)
31
+
32
+ @classmethod
33
+ def from_json(cls, s: str) -> "PatchExample":
34
+ return cls.from_dict(json.loads(s))
35
+
36
+
37
+ @dataclass
38
+ class PatchFeatures:
39
+ """Structured features extracted from a patch."""
40
+ # Diff statistics
41
+ num_files_changed: int = 0
42
+ num_lines_added: int = 0
43
+ num_lines_removed: int = 0
44
+ num_hunks: int = 0
45
+
46
+ # Code structure
47
+ added_functions: list = field(default_factory=list)
48
+ modified_functions: list = field(default_factory=list)
49
+ has_error_handling: bool = False
50
+ has_edge_case_handling: bool = False
51
+
52
+ # Issue-patch alignment
53
+ issue_keywords_addressed: list = field(default_factory=list)
54
+ issue_components_mentioned: list = field(default_factory=list)
55
+ keyword_coverage_ratio: float = 0.0
56
+
57
+ # Code quality signals
58
+ has_todos: bool = False
59
+ has_hardcoded_values: bool = False
60
+ has_debug_statements: bool = False
61
+ follows_project_style: bool = True
62
+ style_violations: list = field(default_factory=list)
63
+
64
+ # Risk signals
65
+ modifies_core_files: bool = False
66
+ change_scope: str = "minimal" # minimal, moderate, extensive
67
+ has_imports_added: bool = False
68
+ new_imports: list = field(default_factory=list)
69
+ touches_tests: bool = False
70
+
71
+ # Complexity
72
+ cyclomatic_complexity_delta: int = 0
73
+ nesting_depth_max: int = 0
74
+
75
+ def to_dict(self) -> dict:
76
+ return asdict(self)
77
+
78
+
79
+ @dataclass
80
+ class DimensionScore:
81
+ """Score for a single evaluation dimension."""
82
+ score: int # 0-10
83
+ reasoning: str
84
+ flags: list = field(default_factory=list)
85
+
86
+ def to_dict(self) -> dict:
87
+ return asdict(self)
88
+
89
+
90
@dataclass
class JudgeResult:
    """Complete judge evaluation output for one patch.

    ``dimension_scores`` maps a dimension name (e.g. "correctness") to either
    a plain dict of the form ``{"score": int, "reasoning": str, "flags": list}``
    or an object exposing ``to_dict()`` (such as DimensionScore). The original
    accessors assumed plain dicts and crashed on DimensionScore values even
    though that was the documented type; all accessors now normalize first.
    """

    merge_score: float  # 0-100 weighted aggregate score
    dimension_scores: dict = field(default_factory=dict)  # dim_name -> DimensionScore or dict
    raw_output: str = ""  # raw judge-model output, kept for debugging
    # Quoted annotation: avoids evaluating PatchFeatures at class-creation time.
    features: "PatchFeatures | None" = None  # optional extracted patch features
    model_used: str = ""  # identifier of the judge model

    def _dim_entry(self, name: str) -> dict:
        """Return the entry for *name* normalized to a plain dict ({} if absent)."""
        data = self.dimension_scores.get(name)
        if data is None:
            return {}
        if isinstance(data, dict):
            return data
        # DimensionScore (or any object with a to_dict() method).
        return data.to_dict()

    def _dim_score(self, name: str) -> int:
        """Score (0-10) for dimension *name*; 0 when the dimension is absent."""
        return self._dim_entry(name).get("score", 0)

    @property
    def correctness(self) -> int:
        return self._dim_score("correctness")

    @property
    def completeness(self) -> int:
        return self._dim_score("completeness")

    @property
    def code_quality(self) -> int:
        return self._dim_score("code_quality")

    @property
    def non_regression_risk(self) -> int:
        return self._dim_score("non_regression_risk")

    @property
    def merge_readiness(self) -> int:
        return self._dim_score("merge_readiness")

    def to_dict(self) -> dict:
        """Serialize to a plain, JSON-friendly dict.

        DimensionScore-like values are converted via to_dict() so the result
        is always serializable with json.dumps. ``features`` is included only
        when set.
        """
        d = {
            "merge_score": self.merge_score,
            "dimension_scores": {
                name: (entry if isinstance(entry, dict) else entry.to_dict())
                for name, entry in self.dimension_scores.items()
            },
            "raw_output": self.raw_output,
            "model_used": self.model_used,
        }
        if self.features:
            d["features"] = self.features.to_dict()
        return d

    def summary(self) -> str:
        """Human-readable multi-line summary of the scores and any flags."""
        lines = [f"MergeScore: {self.merge_score:.1f}/100"]
        for dim in self.dimension_scores:
            entry = self._dim_entry(dim)
            score = entry.get("score", "?")
            lines.append(f"  {dim}: {score}/10")
            for flag in entry.get("flags") or []:
                lines.append(f"    ⚠ {flag}")
        return "\n".join(lines)
139
+
140
+
141
+ @dataclass
142
+ class ValidationResult:
143
+ """Result of validating PatchJudge against ground truth."""
144
+ total_examples: int = 0
145
+ # METR alignment: fraction of test-passing patches scoring below 50
146
+ test_passing_below_50_pct: float = 0.0
147
+ # Correlation metrics
148
+ score_resolved_correlation: float = 0.0
149
+ mean_score_resolved: float = 0.0
150
+ mean_score_unresolved: float = 0.0
151
+ # Known-bad detection
152
+ known_bad_detected: int = 0
153
+ known_bad_total: int = 0
154
+ known_bad_detection_rate: float = 0.0
155
+ # Score distribution
156
+ score_mean: float = 0.0
157
+ score_std: float = 0.0
158
+ score_median: float = 0.0
159
+ # Per-dimension stats
160
+ dimension_stats: dict = field(default_factory=dict)
161
+
162
+ def to_dict(self) -> dict:
163
+ return asdict(self)