Spaces:

SolusOps
/

tracefix_rl

Sleeping

App Files Files Community

databoysu committed on Apr 8

Commit

985e10f

1 Parent(s): a1e4e94

protect sandbox

Browse files

Files changed (9) hide show

__pycache__/__init__.cpython-312.pyc +0 -0
__pycache__/environment.cpython-312.pyc +0 -0
__pycache__/sandbox.cpython-312.pyc +0 -0
context.py +32 -0
environment.py +16 -4
inference.py +9 -1
models.py +4 -3
sandbox.py +132 -7
uv.lock +1 -1

__pycache__/__init__.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/__init__.cpython-312.pyc and b/__pycache__/__init__.cpython-312.pyc differ

__pycache__/environment.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/environment.cpython-312.pyc and b/__pycache__/environment.cpython-312.pyc differ

__pycache__/sandbox.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/sandbox.cpython-312.pyc and b/__pycache__/sandbox.cpython-312.pyc differ

context.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from __future__ import annotations
 from typing import List, Optional
 WINDOW_LINES: int = 10
@@ -9,6 +10,37 @@ WINDOW_LINES: int = 10
 MAX_CONTEXT_CHARS: int = 2_000
 def get_localized_context(
     code_lines: List[str],
     anchor_line: Optional[int],

 from __future__ import annotations
+import re
 from typing import List, Optional
 WINDOW_LINES: int = 10
 MAX_CONTEXT_CHARS: int = 2_000
_TRACEBACK_FILE_LINE_RE = re.compile(r'File "([^"]+)", line (\d+)')
_SYNTAX_LINE_RE = re.compile(r"SyntaxError at line (\d+)")


def extract_error_line(traceback_str: str) -> Optional[int]:
    """
    Pick the line number that best locates a failure in sandbox output.

    Candidates are tried in this order:
    1) the last-listed frame whose file is an agent pseudo-file
       (``<agent_code>`` or ``<string>``);
    2) the last ``File "...", line N`` frame, whatever its file;
    3) a ``SyntaxError at line N`` message, used only when the text
       contains no frame at all.

    Returns ``None`` for empty input or when nothing matches.
    """
    if not traceback_str:
        return None

    frames = _TRACEBACK_FILE_LINE_RE.findall(traceback_str)
    if not frames:
        syntax_hit = _SYNTAX_LINE_RE.search(traceback_str)
        return int(syntax_hit.group(1)) if syntax_hit else None

    agent_files = ("<agent_code>", "<string>")
    agent_line_numbers = [
        number for name, number in frames if name in agent_files
    ]
    # Frames are listed outermost-first, so the last entry is the one
    # closest to the actual failure.
    chosen = agent_line_numbers[-1] if agent_line_numbers else frames[-1][1]
    return int(chosen)
 def get_localized_context(
     code_lines: List[str],
     anchor_line: Optional[int],

environment.py CHANGED Viewed

@@ -7,12 +7,12 @@ import uuid
 from typing import Any, Dict, List, Optional, Tuple
 try:
-    from .context import get_localized_context
     from .models import CodeAction, CodeObservation, TestResult
     from .sandbox import check_syntax, run_code_with_tests
     from .tasks import ALL_TASKS, TASKS_BY_DIFFICULTY
 except ImportError:
-    from context import get_localized_context
     from models import CodeAction, CodeObservation, TestResult
     from sandbox import check_syntax, run_code_with_tests
     from tasks import ALL_TASKS, TASKS_BY_DIFFICULTY
@@ -147,6 +147,7 @@ class TraceFixRLGym:
         self._original_code: List[str] = []
         self._edit_history: List[List[str]] = []
         self.training_step: int = 0
     def _sample_task(self, task_override=None) -> Dict[str, Any]:
@@ -217,6 +218,7 @@ class TraceFixRLGym:
         self._edit_history  = []
         self._last_action: Optional[str] = None
         self._consecutive_count: int = 0
         obs = self._build_observation(reward=0.0)
@@ -336,11 +338,13 @@ class TraceFixRLGym:
         if syntax_err:
             reward += R_SYNTAX_ERROR
         else:
             current_pass = sum(1 for t in results if t.passed)
             new_passes   = max(0, current_pass - self._prev_pass_count)
             reward       += new_passes * R_PER_NEW_PASS
             self._prev_pass_count = current_pass
         return reward
@@ -483,10 +487,18 @@ class TraceFixRLGym:
     def _build_observation(self, reward: float) -> CodeObservation:
         syntax_valid, _ = check_syntax(self._source())
-        localized = get_localized_context(self._code_lines, self._last_edited_line)
         return CodeObservation(
-            code_lines            = list(self._code_lines),
             localized_context     = localized,
             last_execution_output = self._last_output,
             syntax_error          = not syntax_valid,

 from typing import Any, Dict, List, Optional, Tuple
 try:
+    from .context import extract_error_line, get_localized_context
     from .models import CodeAction, CodeObservation, TestResult
     from .sandbox import check_syntax, run_code_with_tests
     from .tasks import ALL_TASKS, TASKS_BY_DIFFICULTY
 except ImportError:
+    from context import extract_error_line, get_localized_context
     from models import CodeAction, CodeObservation, TestResult
     from sandbox import check_syntax, run_code_with_tests
     from tasks import ALL_TASKS, TASKS_BY_DIFFICULTY
         self._original_code: List[str] = []
         self._edit_history: List[List[str]] = []
         self.training_step: int = 0
+        self._last_run_all_passed: bool = False
     def _sample_task(self, task_override=None) -> Dict[str, Any]:
         self._edit_history  = []
         self._last_action: Optional[str] = None
         self._consecutive_count: int = 0
+        self._last_run_all_passed = False
         obs = self._build_observation(reward=0.0)
         if syntax_err:
             reward += R_SYNTAX_ERROR
+            self._last_run_all_passed = False
         else:
             current_pass = sum(1 for t in results if t.passed)
             new_passes   = max(0, current_pass - self._prev_pass_count)
             reward       += new_passes * R_PER_NEW_PASS
             self._prev_pass_count = current_pass
+            self._last_run_all_passed = all(t.passed for t in results)
         return reward
     def _build_observation(self, reward: float) -> CodeObservation:
         syntax_valid, _ = check_syntax(self._source())
+        context_anchor = self._last_edited_line
+        if self._last_action == "RUN_TESTS" and not self._last_run_all_passed:
+            extracted_line = extract_error_line(self._last_output)
+            if extracted_line is not None:
+                context_anchor = extracted_line
+        localized = get_localized_context(self._code_lines, context_anchor)
         return CodeObservation(
+            code_dict             = {
+                idx + 1: line for idx, line in enumerate(self._code_lines)
+            },
             localized_context     = localized,
             last_execution_output = self._last_output,
             syntax_error          = not syntax_valid,

inference.py CHANGED Viewed

@@ -134,7 +134,15 @@ def _extract_json(text: str) -> dict[str, Any]:
 def _build_observation_text(observation: Any) -> str:
-    code_preview = "\n".join(observation.code_lines[:30]) if observation.code_lines else ""
     return (
         f"step_count={observation.step_count}\n"
         f"steps_remaining={observation.steps_remaining}\n"

 def _build_observation_text(observation: Any) -> str:
+    code_dict = getattr(observation, "code_dict", {}) or {}
+    sorted_items = sorted(
+        ((int(line_num), text) for line_num, text in code_dict.items()),
+        key=lambda x: x[0],
+    )
+    code_preview = "\n".join(
+        f"{line_num} | {text}"
+        for line_num, text in sorted_items[:30]
+    )
     return (
         f"step_count={observation.step_count}\n"
         f"steps_remaining={observation.steps_remaining}\n"

models.py CHANGED Viewed

@@ -95,7 +95,7 @@ class TestResult(BaseModel):
 class CodeObservation(Observation):
     """Full observation returned after each step."""
-    code_lines: List[str] = Field(default_factory=list)
     localized_context: str = Field(default="")
     last_execution_output: str = Field(default="")
     syntax_error: bool = Field(default=False)
@@ -107,8 +107,9 @@ class CodeObservation(Observation):
     def render_code(self) -> str:
         """Render source with 1-indexed line numbers for prompts."""
-        if not self.code_lines:
             return "<empty>"
         return "\n".join(
-            f"{idx + 1:>3} | {line}" for idx, line in enumerate(self.code_lines)
         )

 class CodeObservation(Observation):
     """Full observation returned after each step."""
+    code_dict: Dict[int, str] = Field(default_factory=dict)
     localized_context: str = Field(default="")
     last_execution_output: str = Field(default="")
     syntax_error: bool = Field(default=False)
     def render_code(self) -> str:
         """Render source with 1-indexed line numbers for prompts."""
+        if not self.code_dict:
             return "<empty>"
         return "\n".join(
+            f"{line_num:>3} | {self.code_dict[line_num]}"
+            for line_num in sorted(self.code_dict.keys())
         )

sandbox.py CHANGED Viewed

@@ -32,11 +32,12 @@ import ast
 import io
 import inspect
 import multiprocessing
 import signal
 import sys
 import textwrap
 import traceback
-from typing import Any, Callable, Dict, List, Tuple
 try:
     from .models import TestResult
@@ -62,7 +63,20 @@ def _make_safe_stub(name: str) -> Callable:
     return _stub
-_SAFE_BUILTINS: Dict[str, Any] = {
     "int": int, "float": float, "str": str, "bool": bool,
     "list": list, "dict": dict, "set": set, "tuple": tuple,
     "bytes": bytes, "bytearray": bytearray, "frozenset": frozenset,
@@ -104,6 +118,102 @@ _SAFE_BUILTINS: Dict[str, Any] = {
 }
 def _tail_truncate(s: str, limit: int = MAX_OUTPUT_CHARS) -> str:
     """
@@ -150,9 +260,15 @@ def _worker(
             result_queue.put((_tail_truncate(err), [], True))
             return
-        namespace: Dict[str, Any] = {"__builtins__": __builtins__}
         try:
-            exec(code_obj, namespace)  # noqa: S102
         except Exception:  # noqa: BLE001
             tb = traceback.format_exc()
             sys.stdout, sys.stderr = old_stdout, old_stderr
@@ -162,15 +278,24 @@ def _worker(
         for test_src in test_sources:
             fn_name = "<unknown>"
             try:
-                exec(test_src, namespace)  # noqa: S102
                 fn_name = [
                     ln.split("(")[0].replace("def ", "").strip()
-                    for ln in test_src.splitlines()
                     if ln.startswith("def ")
                 ][-1]
-                namespace[fn_name](namespace)
                 test_results.append({"test_name": fn_name, "passed": True})
             except AssertionError as exc:

 import io
 import inspect
 import multiprocessing
+import importlib
 import signal
 import sys
 import textwrap
 import traceback
+from typing import Any, Callable, Dict, List, Set, Tuple
 try:
     from .models import TestResult
     return _stub
+TEST_SUITE_ALLOWED_MODULES: Set[str] = {
+    "bisect",
+    "collections",
+    "functools",
+    "heapq",
+    "itertools",
+    "math",
+    "re",
+    "string",
+    "typing",
+}
+SAFE_BUILTINS: Dict[str, Any] = {
     "int": int, "float": float, "str": str, "bool": bool,
     "list": list, "dict": dict, "set": set, "tuple": tuple,
     "bytes": bytes, "bytearray": bytearray, "frozenset": frozenset,
 }
+def _sanitize_imports_and_prepare_bindings(
+    source: str,
+    allowed_modules: Set[str],
+) -> Tuple[str, List[Tuple[str, str, str]], List[Tuple[str, str]]]:
+    """
+    Parse source, validate imports against allowlist, and strip import statements.
+    Returns
+    -------
+    sanitized_source:
+      Source with all import statements removed (so code never calls __import__).
+    module_alias_bindings:
+      List[(local_name, module_name, attribute_name)].
+      `attribute_name == ""` means bind module object itself.
+    modules_to_preload:
+      List[(root_name, import_target)] pairs.
+    """
+    tree = ast.parse(source)
+    blocked_lines: Set[int] = set()
+    module_alias_bindings: List[Tuple[str, str, str]] = []
+    modules_to_preload: Set[Tuple[str, str]] = set()
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                module_name = alias.name
+                root_name = module_name.split(".")[0]
+                if root_name not in allowed_modules:
+                    raise ImportError(
+                        f"Import of '{root_name}' is not allowed in this sandbox."
+                    )
+                local_name = alias.asname or root_name
+                module_alias_bindings.append((local_name, module_name, ""))
+                modules_to_preload.add((root_name, module_name))
+            if hasattr(node, "lineno") and hasattr(node, "end_lineno"):
+                blocked_lines.update(range(node.lineno, node.end_lineno + 1))
+        if isinstance(node, ast.ImportFrom):
+            if node.level != 0 or not node.module:
+                raise ImportError(
+                    "Relative imports are not allowed in this sandbox."
+                )
+            module_name = node.module
+            root_name = module_name.split(".")[0]
+            if root_name not in allowed_modules:
+                raise ImportError(
+                    f"Import of '{root_name}' is not allowed in this sandbox."
+                )
+            for alias in node.names:
+                if alias.name == "*":
+                    raise ImportError(
+                        "Wildcard imports are not allowed in this sandbox."
+                    )
+                local_name = alias.asname or alias.name
+                module_alias_bindings.append((local_name, module_name, alias.name))
+            modules_to_preload.add((root_name, module_name))
+            if hasattr(node, "lineno") and hasattr(node, "end_lineno"):
+                blocked_lines.update(range(node.lineno, node.end_lineno + 1))
+    sanitized_lines = [
+        line
+        for i, line in enumerate(source.splitlines(), start=1)
+        if i not in blocked_lines
+    ]
+    return "\n".join(sanitized_lines), module_alias_bindings, sorted(modules_to_preload)
def _build_local_env_for_source(
    source: str,
    allowed_modules: Set[str],
) -> Tuple[str, Dict[str, Any]]:
    """
    Build a local env with preloaded authorized modules/symbols.

    The source is first passed through
    ``_sanitize_imports_and_prepare_bindings``; the modules it reports are
    imported here, on the trusted side, and the names the original import
    statements would have created are returned as a plain dict.

    Parameters
    ----------
    source:
      Python source text whose imports should be resolved.
    allowed_modules:
      Top-level module names the source is permitted to import.

    Returns
    -------
    (sanitized_source, local_env):
      The sanitised source and a mapping ``local name -> module or attribute``
      meant to be merged into the namespace the source is executed in.

    Raises
    ------
    ImportError
      If an import is rejected by the sanitiser, or if a
      ``from module import name`` target is neither an attribute nor an
      importable submodule of ``module``.
    """
    sanitized_source, bindings, modules_to_preload = (
        _sanitize_imports_and_prepare_bindings(source, allowed_modules)
    )

    loaded_modules: Dict[str, Any] = {}
    for root_name, import_target in modules_to_preload:
        # Load both the exact target (e.g. "a.b") and its root package ("a"):
        # bindings may refer to either.
        for module_name in (import_target, root_name):
            if module_name not in loaded_modules:
                loaded_modules[module_name] = importlib.import_module(module_name)

    # NOTE(review): module objects are handed to sandboxed code as-is. Any
    # attribute reachable from them (including other modules they happen to
    # reference internally) is reachable from the sandbox too -- confirm the
    # allowlist is acceptable with that in mind.
    local_env: Dict[str, Any] = {}
    for local_name, module_name, attribute_name in bindings:
        module_obj = loaded_modules[module_name]
        if not attribute_name:
            local_env[local_name] = module_obj
            continue
        try:
            local_env[local_name] = getattr(module_obj, attribute_name)
        except AttributeError:
            # Mirror `from pkg import name`: the name may be a submodule that
            # has not been imported yet; otherwise report an ImportError,
            # as the real import statement would.
            try:
                local_env[local_name] = importlib.import_module(
                    f"{module_name}.{attribute_name}"
                )
            except ImportError as exc:
                raise ImportError(
                    f"cannot import name {attribute_name!r} from {module_name!r}"
                ) from exc
    return sanitized_source, local_env
 def _tail_truncate(s: str, limit: int = MAX_OUTPUT_CHARS) -> str:
     """
             result_queue.put((_tail_truncate(err), [], True))
             return
         try:
+            sanitized_source, local_env = _build_local_env_for_source(
+                source,
+                TEST_SUITE_ALLOWED_MODULES,
+            )
+            exec_env: Dict[str, Any] = {"__builtins__": SAFE_BUILTINS}
+            exec_env.update(local_env)
+            code_obj = compile(sanitized_source, "<agent_code>", "exec")
+            exec(code_obj, exec_env, exec_env)  # noqa: S102
         except Exception:  # noqa: BLE001
             tb = traceback.format_exc()
             sys.stdout, sys.stderr = old_stdout, old_stderr
         for test_src in test_sources:
             fn_name = "<unknown>"
             try:
+                sanitized_test_src, test_env_injections = _build_local_env_for_source(
+                    test_src,
+                    TEST_SUITE_ALLOWED_MODULES,
+                )
+                exec_env.update(test_env_injections)
+                exec(
+                    compile(sanitized_test_src, "<sandbox_test>", "exec"),
+                    exec_env,
+                    exec_env,
+                )  # noqa: S102
                 fn_name = [
                     ln.split("(")[0].replace("def ", "").strip()
+                    for ln in sanitized_test_src.splitlines()
                     if ln.startswith("def ")
                 ][-1]
+                exec_env[fn_name](exec_env)
                 test_results.append({"test_name": fn_name, "passed": True})
             except AssertionError as exc:

uv.lock CHANGED Viewed

@@ -1599,7 +1599,7 @@ core = [
 ]
 [[package]]
-name = "openenv-python-debugging-gym"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [

 ]
 [[package]]
+name = "openenv-tracefix-rl"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [