Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- models.py +4 -13
- server/python_env_environment.py +12 -12
models.py
CHANGED
|
@@ -184,14 +184,11 @@ class RewardSummary(BaseModel):
|
|
| 184 |
class PythonReviewAction(Action):
|
| 185 |
"""Structured review action emitted by a model or trainer."""
|
| 186 |
|
| 187 |
-
# Primary UI Fields (
|
| 188 |
operation: str = Field(default="submit_findings", description="The operation to perform.")
|
| 189 |
-
|
| 190 |
-
patched_code: Optional[str] = Field(default=None, description="
|
| 191 |
-
note: Optional[str] = Field(default=None, description="Optional note about the review.")
|
| 192 |
-
findings: List[ReviewFinding] = Field(default_factory=list, description="The structured findings list.")
|
| 193 |
|
| 194 |
-
# Optional Review Fields (for benchmark compatibility)
|
| 195 |
action_type: ActionType = ActionType.ADD_COMMENT
|
| 196 |
line_number: Optional[int] = Field(default=None, ge=1)
|
| 197 |
issue_type: Optional[IssueType] = None
|
|
@@ -203,15 +200,9 @@ class PythonReviewAction(Action):
|
|
| 203 |
@model_validator(mode="after")
|
| 204 |
def validate_action_shape(self) -> "PythonReviewAction":
|
| 205 |
"""Require the right fields for each action type."""
|
| 206 |
-
|
| 207 |
-
# Synchronize code and patched_code
|
| 208 |
-
if self.code is None and self.patched_code is not None:
|
| 209 |
-
self.code = self.patched_code
|
| 210 |
-
elif self.patched_code is None and self.code is not None:
|
| 211 |
-
self.patched_code = self.code
|
| 212 |
|
| 213 |
# Bypass benchmark validation if using the template 'operation' style (e.g. submit_findings)
|
| 214 |
-
if self.operation !=
|
| 215 |
return self
|
| 216 |
|
| 217 |
if self.action_type == ActionType.ADD_COMMENT:
|
|
|
|
| 184 |
class PythonReviewAction(Action):
|
| 185 |
"""Structured review action emitted by a model or trainer."""
|
| 186 |
|
| 187 |
+
# Primary UI Fields (7 benchmark + 3 template = 10 total)
|
| 188 |
operation: str = Field(default="submit_findings", description="The operation to perform.")
|
| 189 |
+
findings: List[ReviewFinding] = Field(default_factory=list, description="The findings list.")
|
| 190 |
+
patched_code: Optional[str] = Field(default=None, description="The fixed source code.")
|
|
|
|
|
|
|
| 191 |
|
|
|
|
| 192 |
action_type: ActionType = ActionType.ADD_COMMENT
|
| 193 |
line_number: Optional[int] = Field(default=None, ge=1)
|
| 194 |
issue_type: Optional[IssueType] = None
|
|
|
|
| 200 |
@model_validator(mode="after")
|
| 201 |
def validate_action_shape(self) -> "PythonReviewAction":
|
| 202 |
"""Require the right fields for each action type."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
# Bypass benchmark validation if using the template 'operation' style (e.g. submit_findings)
|
| 205 |
+
if self.operation != "ADD_COMMENT":
|
| 206 |
return self
|
| 207 |
|
| 208 |
if self.action_type == ActionType.ADD_COMMENT:
|
server/python_env_environment.py
CHANGED
|
@@ -64,7 +64,7 @@ class ReviewTask:
|
|
| 64 |
descriptor: TaskDescriptor
|
| 65 |
references: tuple[ReferenceFinding, ...]
|
| 66 |
hint: str
|
| 67 |
-
|
| 68 |
|
| 69 |
|
| 70 |
TASK_BANK: Dict[str, ReviewTask] = {
|
|
@@ -96,7 +96,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
|
|
| 96 |
),
|
| 97 |
),
|
| 98 |
hint="Look for state that survives between separate function calls.",
|
| 99 |
-
|
| 100 |
"def add_tag(tag, tags=None):\n"
|
| 101 |
" if tags is None:\n"
|
| 102 |
" tags = []\n"
|
|
@@ -132,7 +132,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
|
|
| 132 |
),
|
| 133 |
),
|
| 134 |
hint="Check how external commands are invoked and whether user input is escaped.",
|
| 135 |
-
|
| 136 |
"import subprocess\n\n"
|
| 137 |
"def run_backup(path):\n"
|
| 138 |
" subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
|
|
@@ -184,7 +184,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
|
|
| 184 |
),
|
| 185 |
),
|
| 186 |
hint="Consider what happens to the final error after the retry loop finishes.",
|
| 187 |
-
|
| 188 |
"import time\n\n"
|
| 189 |
"def fetch_with_retry(client, url, retries=3):\n"
|
| 190 |
" last_error = None\n"
|
|
@@ -274,12 +274,12 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 274 |
if operation == "request_hint":
|
| 275 |
self._hints_used += 1
|
| 276 |
feedback = self._current_task.hint
|
| 277 |
-
evaluation = self._evaluate(self._submitted_findings, action.
|
| 278 |
reward = evaluation.score
|
| 279 |
else:
|
| 280 |
if action.findings:
|
| 281 |
self._submitted_findings.extend(action.findings)
|
| 282 |
-
evaluation = self._evaluate(self._submitted_findings, action.
|
| 283 |
reward = evaluation.score
|
| 284 |
if operation == "finalize":
|
| 285 |
done = True
|
|
@@ -307,7 +307,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 307 |
feedback=feedback,
|
| 308 |
reward=reward,
|
| 309 |
done=done,
|
| 310 |
-
|
| 311 |
)
|
| 312 |
|
| 313 |
def _build_observation(
|
|
@@ -316,10 +316,10 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 316 |
feedback: str,
|
| 317 |
reward: float,
|
| 318 |
done: bool,
|
| 319 |
-
|
| 320 |
) -> PythonObservation:
|
| 321 |
assert self._current_task is not None
|
| 322 |
-
evaluation = self._evaluate(self._submitted_findings,
|
| 323 |
attempts_remaining = max(
|
| 324 |
self._max_steps() - self._state.step_count,
|
| 325 |
0,
|
|
@@ -345,7 +345,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 345 |
def _evaluate(
|
| 346 |
self,
|
| 347 |
findings: Iterable[ReviewFinding],
|
| 348 |
-
|
| 349 |
) -> TaskEvaluation:
|
| 350 |
assert self._current_task is not None
|
| 351 |
|
|
@@ -372,9 +372,9 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
|
| 372 |
weighted_recall = min(matched_weight / total_weight, 1.0)
|
| 373 |
|
| 374 |
patch_score = 0.0
|
| 375 |
-
if self._current_task.
|
| 376 |
patch_score = float(
|
| 377 |
-
_normalize_code(
|
| 378 |
)
|
| 379 |
|
| 380 |
raw_score = (
|
|
|
|
| 64 |
descriptor: TaskDescriptor
|
| 65 |
references: tuple[ReferenceFinding, ...]
|
| 66 |
hint: str
|
| 67 |
+
patched_code: Optional[str] = None
|
| 68 |
|
| 69 |
|
| 70 |
TASK_BANK: Dict[str, ReviewTask] = {
|
|
|
|
| 96 |
),
|
| 97 |
),
|
| 98 |
hint="Look for state that survives between separate function calls.",
|
| 99 |
+
patched_code=(
|
| 100 |
"def add_tag(tag, tags=None):\n"
|
| 101 |
" if tags is None:\n"
|
| 102 |
" tags = []\n"
|
|
|
|
| 132 |
),
|
| 133 |
),
|
| 134 |
hint="Check how external commands are invoked and whether user input is escaped.",
|
| 135 |
+
patched_code=(
|
| 136 |
"import subprocess\n\n"
|
| 137 |
"def run_backup(path):\n"
|
| 138 |
" subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
|
|
|
|
| 184 |
),
|
| 185 |
),
|
| 186 |
hint="Consider what happens to the final error after the retry loop finishes.",
|
| 187 |
+
patched_code=(
|
| 188 |
"import time\n\n"
|
| 189 |
"def fetch_with_retry(client, url, retries=3):\n"
|
| 190 |
" last_error = None\n"
|
|
|
|
| 274 |
if operation == "request_hint":
|
| 275 |
self._hints_used += 1
|
| 276 |
feedback = self._current_task.hint
|
| 277 |
+
evaluation = self._evaluate(self._submitted_findings, action.patched_code)
|
| 278 |
reward = evaluation.score
|
| 279 |
else:
|
| 280 |
if action.findings:
|
| 281 |
self._submitted_findings.extend(action.findings)
|
| 282 |
+
evaluation = self._evaluate(self._submitted_findings, action.patched_code)
|
| 283 |
reward = evaluation.score
|
| 284 |
if operation == "finalize":
|
| 285 |
done = True
|
|
|
|
| 307 |
feedback=feedback,
|
| 308 |
reward=reward,
|
| 309 |
done=done,
|
| 310 |
+
patched_code=action.patched_code,
|
| 311 |
)
|
| 312 |
|
| 313 |
def _build_observation(
|
|
|
|
| 316 |
feedback: str,
|
| 317 |
reward: float,
|
| 318 |
done: bool,
|
| 319 |
+
patched_code: Optional[str] = None,
|
| 320 |
) -> PythonObservation:
|
| 321 |
assert self._current_task is not None
|
| 322 |
+
evaluation = self._evaluate(self._submitted_findings, patched_code)
|
| 323 |
attempts_remaining = max(
|
| 324 |
self._max_steps() - self._state.step_count,
|
| 325 |
0,
|
|
|
|
| 345 |
def _evaluate(
|
| 346 |
self,
|
| 347 |
findings: Iterable[ReviewFinding],
|
| 348 |
+
patched_code: Optional[str],
|
| 349 |
) -> TaskEvaluation:
|
| 350 |
assert self._current_task is not None
|
| 351 |
|
|
|
|
| 372 |
weighted_recall = min(matched_weight / total_weight, 1.0)
|
| 373 |
|
| 374 |
patch_score = 0.0
|
| 375 |
+
if self._current_task.patched_code and patched_code:
|
| 376 |
patch_score = float(
|
| 377 |
+
_normalize_code(patched_code) == _normalize_code(self._current_task.patched_code)
|
| 378 |
)
|
| 379 |
|
| 380 |
raw_score = (
|