Spaces:

darshanajudiya7
/

python_env

Sleeping

App Files Files Community

darshanajudiya7 committed 14 days ago

Commit

b577709

verified ·

1 Parent(s): c6da15a

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

models.py +4 -13
server/python_env_environment.py +12 -12

models.py CHANGED Viewed

@@ -184,14 +184,11 @@ class RewardSummary(BaseModel):
 class PythonReviewAction(Action):
     """Structured review action emitted by a model or trainer."""
-    # Primary UI Fields (matches CodingEnv style)
     operation: str = Field(default="submit_findings", description="The operation to perform.")
-    code: Optional[str] = Field(default=None, description="The fixed source code.")
-    patched_code: Optional[str] = Field(default=None, description="Compatibility alias for code.")
-    note: Optional[str] = Field(default=None, description="Optional note about the review.")
-    findings: List[ReviewFinding] = Field(default_factory=list, description="The structured findings list.")
-    # Optional Review Fields (for benchmark compatibility)
     action_type: ActionType = ActionType.ADD_COMMENT
     line_number: Optional[int] = Field(default=None, ge=1)
     issue_type: Optional[IssueType] = None
@@ -203,15 +200,9 @@ class PythonReviewAction(Action):
     @model_validator(mode="after")
     def validate_action_shape(self) -> "PythonReviewAction":
         """Require the right fields for each action type."""
-        # Synchronize code and patched_code
-        if self.code is None and self.patched_code is not None:
-            self.code = self.patched_code
-        elif self.patched_code is None and self.code is not None:
-            self.patched_code = self.code
         # Bypass benchmark validation if using the template 'operation' style (e.g. submit_findings)
-        if self.operation != ActionType.ADD_COMMENT:
              return self
         if self.action_type == ActionType.ADD_COMMENT:

 class PythonReviewAction(Action):
     """Structured review action emitted by a model or trainer."""
+    # Primary UI Fields (7 benchmark + 3 template = 10 total)
     operation: str = Field(default="submit_findings", description="The operation to perform.")
+    findings: List[ReviewFinding] = Field(default_factory=list, description="The findings list.")
+    patched_code: Optional[str] = Field(default=None, description="The fixed source code.")
     action_type: ActionType = ActionType.ADD_COMMENT
     line_number: Optional[int] = Field(default=None, ge=1)
     issue_type: Optional[IssueType] = None
     @model_validator(mode="after")
     def validate_action_shape(self) -> "PythonReviewAction":
         """Require the right fields for each action type."""
         # Bypass benchmark validation if using the template 'operation' style (e.g. submit_findings)
+        if self.operation != "ADD_COMMENT":
              return self
         if self.action_type == ActionType.ADD_COMMENT:

server/python_env_environment.py CHANGED Viewed

@@ -64,7 +64,7 @@ class ReviewTask:
     descriptor: TaskDescriptor
     references: tuple[ReferenceFinding, ...]
     hint: str
-    code: Optional[str] = None
 TASK_BANK: Dict[str, ReviewTask] = {
@@ -96,7 +96,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
             ),
         ),
         hint="Look for state that survives between separate function calls.",
-        code=(
             "def add_tag(tag, tags=None):\n"
             "    if tags is None:\n"
             "        tags = []\n"
@@ -132,7 +132,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
             ),
         ),
         hint="Check how external commands are invoked and whether user input is escaped.",
-        code=(
             "import subprocess\n\n"
             "def run_backup(path):\n"
             "    subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
@@ -184,7 +184,7 @@ TASK_BANK: Dict[str, ReviewTask] = {
             ),
         ),
         hint="Consider what happens to the final error after the retry loop finishes.",
-        code=(
             "import time\n\n"
             "def fetch_with_retry(client, url, retries=3):\n"
             "    last_error = None\n"
@@ -274,12 +274,12 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
         if operation == "request_hint":
             self._hints_used += 1
             feedback = self._current_task.hint
-            evaluation = self._evaluate(self._submitted_findings, action.code)
             reward = evaluation.score
         else:
             if action.findings:
                 self._submitted_findings.extend(action.findings)
-            evaluation = self._evaluate(self._submitted_findings, action.code)
             reward = evaluation.score
             if operation == "finalize":
                 done = True
@@ -307,7 +307,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
             feedback=feedback,
             reward=reward,
             done=done,
-            code=action.code,
         )
     def _build_observation(
@@ -316,10 +316,10 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
         feedback: str,
         reward: float,
         done: bool,
-        code: Optional[str] = None,
     ) -> PythonObservation:
         assert self._current_task is not None
-        evaluation = self._evaluate(self._submitted_findings, code)
         attempts_remaining = max(
             self._max_steps() - self._state.step_count,
             0,
@@ -345,7 +345,7 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
     def _evaluate(
         self,
         findings: Iterable[ReviewFinding],
-        code: Optional[str],
     ) -> TaskEvaluation:
         assert self._current_task is not None
@@ -372,9 +372,9 @@ class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
         weighted_recall = min(matched_weight / total_weight, 1.0)
         patch_score = 0.0
-        if self._current_task.code and code:
             patch_score = float(
-                _normalize_code(code) == _normalize_code(self._current_task.code)
             )
         raw_score = (

     descriptor: TaskDescriptor
     references: tuple[ReferenceFinding, ...]
     hint: str
+    patched_code: Optional[str] = None
 TASK_BANK: Dict[str, ReviewTask] = {
             ),
         ),
         hint="Look for state that survives between separate function calls.",
+        patched_code=(
             "def add_tag(tag, tags=None):\n"
             "    if tags is None:\n"
             "        tags = []\n"
             ),
         ),
         hint="Check how external commands are invoked and whether user input is escaped.",
+        patched_code=(
             "import subprocess\n\n"
             "def run_backup(path):\n"
             "    subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
             ),
         ),
         hint="Consider what happens to the final error after the retry loop finishes.",
+        patched_code=(
             "import time\n\n"
             "def fetch_with_retry(client, url, retries=3):\n"
             "    last_error = None\n"
         if operation == "request_hint":
             self._hints_used += 1
             feedback = self._current_task.hint
+            evaluation = self._evaluate(self._submitted_findings, action.patched_code)
             reward = evaluation.score
         else:
             if action.findings:
                 self._submitted_findings.extend(action.findings)
+            evaluation = self._evaluate(self._submitted_findings, action.patched_code)
             reward = evaluation.score
             if operation == "finalize":
                 done = True
             feedback=feedback,
             reward=reward,
             done=done,
+            patched_code=action.patched_code,
         )
     def _build_observation(
         feedback: str,
         reward: float,
         done: bool,
+        patched_code: Optional[str] = None,
     ) -> PythonObservation:
         assert self._current_task is not None
+        evaluation = self._evaluate(self._submitted_findings, patched_code)
         attempts_remaining = max(
             self._max_steps() - self._state.step_count,
             0,
     def _evaluate(
         self,
         findings: Iterable[ReviewFinding],
+        patched_code: Optional[str],
     ) -> TaskEvaluation:
         assert self._current_task is not None
         weighted_recall = min(matched_weight / total_weight, 1.0)
         patch_score = 0.0
+        if self._current_task.patched_code and patched_code:
             patch_score = float(
+                _normalize_code(patched_code) == _normalize_code(self._current_task.patched_code)
             )
         raw_score = (