darshanajudiya7 committed on
Commit
b577709
·
verified ·
1 Parent(s): c6da15a

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. models.py +4 -13
  2. server/python_env_environment.py +12 -12
models.py CHANGED
@@ -184,14 +184,11 @@ class RewardSummary(BaseModel):
184
  class PythonReviewAction(Action):
185
  """Structured review action emitted by a model or trainer."""
186
 
187
- # Primary UI Fields (matches CodingEnv style)
188
  operation: str = Field(default="submit_findings", description="The operation to perform.")
189
- code: Optional[str] = Field(default=None, description="The fixed source code.")
190
- patched_code: Optional[str] = Field(default=None, description="Compatibility alias for code.")
191
- note: Optional[str] = Field(default=None, description="Optional note about the review.")
192
- findings: List[ReviewFinding] = Field(default_factory=list, description="The structured findings list.")
193
 
194
- # Optional Review Fields (for benchmark compatibility)
195
  action_type: ActionType = ActionType.ADD_COMMENT
196
  line_number: Optional[int] = Field(default=None, ge=1)
197
  issue_type: Optional[IssueType] = None
@@ -203,15 +200,9 @@ class PythonReviewAction(Action):
203
  @model_validator(mode="after")
204
  def validate_action_shape(self) -> "PythonReviewAction":
205
  """Require the right fields for each action type."""
206
-
207
- # Synchronize code and patched_code
208
- if self.code is None and self.patched_code is not None:
209
- self.code = self.patched_code
210
- elif self.patched_code is None and self.code is not None:
211
- self.patched_code = self.code
212
 
213
  # Bypass benchmark validation if using the template 'operation' style (e.g. submit_findings)
214
- if self.operation != ActionType.ADD_COMMENT:
215
  return self
216
 
217
  if self.action_type == ActionType.ADD_COMMENT:
 
184
  class PythonReviewAction(Action):
185
  """Structured review action emitted by a model or trainer."""
186
 
187
+ # Primary UI Fields (7 benchmark + 3 template = 10 total)
188
  operation: str = Field(default="submit_findings", description="The operation to perform.")
189
+ findings: List[ReviewFinding] = Field(default_factory=list, description="The findings list.")
190
+ patched_code: Optional[str] = Field(default=None, description="The fixed source code.")
 
 
191
 
 
192
  action_type: ActionType = ActionType.ADD_COMMENT
193
  line_number: Optional[int] = Field(default=None, ge=1)
194
  issue_type: Optional[IssueType] = None
 
200
  @model_validator(mode="after")
201
  def validate_action_shape(self) -> "PythonReviewAction":
202
  """Require the right fields for each action type."""
 
 
 
 
 
 
203
 
204
  # Bypass benchmark validation if using the template 'operation' style (e.g. submit_findings)
205
+ if self.operation != "ADD_COMMENT":
206
  return self
207
 
208
  if self.action_type == ActionType.ADD_COMMENT:
server/python_env_environment.py CHANGED
@@ -64,7 +64,7 @@ class ReviewTask:
64
  descriptor: TaskDescriptor
65
  references: tuple[ReferenceFinding, ...]
66
  hint: str
67
- code: Optional[str] = None
68
 
69
 
70
  TASK_BANK: Dict[str, ReviewTask] = {
@@ -96,7 +96,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
96
  ),
97
  ),
98
  hint="Look for state that survives between separate function calls.",
99
- code=(
100
  "def add_tag(tag, tags=None):\n"
101
  " if tags is None:\n"
102
  " tags = []\n"
@@ -132,7 +132,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
132
  ),
133
  ),
134
  hint="Check how external commands are invoked and whether user input is escaped.",
135
- code=(
136
  "import subprocess\n\n"
137
  "def run_backup(path):\n"
138
  " subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
@@ -184,7 +184,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
184
  ),
185
  ),
186
  hint="Consider what happens to the final error after the retry loop finishes.",
187
- code=(
188
  "import time\n\n"
189
  "def fetch_with_retry(client, url, retries=3):\n"
190
  " last_error = None\n"
@@ -274,12 +274,12 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
274
  if operation == "request_hint":
275
  self._hints_used += 1
276
  feedback = self._current_task.hint
277
- evaluation = self._evaluate(self._submitted_findings, action.code)
278
  reward = evaluation.score
279
  else:
280
  if action.findings:
281
  self._submitted_findings.extend(action.findings)
282
- evaluation = self._evaluate(self._submitted_findings, action.code)
283
  reward = evaluation.score
284
  if operation == "finalize":
285
  done = True
@@ -307,7 +307,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
307
  feedback=feedback,
308
  reward=reward,
309
  done=done,
310
- code=action.code,
311
  )
312
 
313
  def _build_observation(
@@ -316,10 +316,10 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
316
  feedback: str,
317
  reward: float,
318
  done: bool,
319
- code: Optional[str] = None,
320
  ) -> PythonObservation:
321
  assert self._current_task is not None
322
- evaluation = self._evaluate(self._submitted_findings, code)
323
  attempts_remaining = max(
324
  self._max_steps() - self._state.step_count,
325
  0,
@@ -345,7 +345,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
345
  def _evaluate(
346
  self,
347
  findings: Iterable[ReviewFinding],
348
- code: Optional[str],
349
  ) -> TaskEvaluation:
350
  assert self._current_task is not None
351
 
@@ -372,9 +372,9 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
372
  weighted_recall = min(matched_weight / total_weight, 1.0)
373
 
374
  patch_score = 0.0
375
- if self._current_task.code and code:
376
  patch_score = float(
377
- _normalize_code(code) == _normalize_code(self._current_task.code)
378
  )
379
 
380
  raw_score = (
 
64
  descriptor: TaskDescriptor
65
  references: tuple[ReferenceFinding, ...]
66
  hint: str
67
+ patched_code: Optional[str] = None
68
 
69
 
70
  TASK_BANK: Dict[str, ReviewTask] = {
 
96
  ),
97
  ),
98
  hint="Look for state that survives between separate function calls.",
99
+ patched_code=(
100
  "def add_tag(tag, tags=None):\n"
101
  " if tags is None:\n"
102
  " tags = []\n"
 
132
  ),
133
  ),
134
  hint="Check how external commands are invoked and whether user input is escaped.",
135
+ patched_code=(
136
  "import subprocess\n\n"
137
  "def run_backup(path):\n"
138
  " subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
 
184
  ),
185
  ),
186
  hint="Consider what happens to the final error after the retry loop finishes.",
187
+ patched_code=(
188
  "import time\n\n"
189
  "def fetch_with_retry(client, url, retries=3):\n"
190
  " last_error = None\n"
 
274
  if operation == "request_hint":
275
  self._hints_used += 1
276
  feedback = self._current_task.hint
277
+ evaluation = self._evaluate(self._submitted_findings, action.patched_code)
278
  reward = evaluation.score
279
  else:
280
  if action.findings:
281
  self._submitted_findings.extend(action.findings)
282
+ evaluation = self._evaluate(self._submitted_findings, action.patched_code)
283
  reward = evaluation.score
284
  if operation == "finalize":
285
  done = True
 
307
  feedback=feedback,
308
  reward=reward,
309
  done=done,
310
+ patched_code=action.patched_code,
311
  )
312
 
313
  def _build_observation(
 
316
  feedback: str,
317
  reward: float,
318
  done: bool,
319
+ patched_code: Optional[str] = None,
320
  ) -> PythonObservation:
321
  assert self._current_task is not None
322
+ evaluation = self._evaluate(self._submitted_findings, patched_code)
323
  attempts_remaining = max(
324
  self._max_steps() - self._state.step_count,
325
  0,
 
345
  def _evaluate(
346
  self,
347
  findings: Iterable[ReviewFinding],
348
+ patched_code: Optional[str],
349
  ) -> TaskEvaluation:
350
  assert self._current_task is not None
351
 
 
372
  weighted_recall = min(matched_weight / total_weight, 1.0)
373
 
374
  patch_score = 0.0
375
+ if self._current_task.patched_code and patched_code:
376
  patch_score = float(
377
+ _normalize_code(patched_code) == _normalize_code(self._current_task.patched_code)
378
  )
379
 
380
  raw_score = (