Spaces:

agentDebugger
/

AgentDebugger-training-v3

Running

App Files Files Community

shank committed 24 days ago

Commit

e93446d

1 Parent(s): 8807d25

Fix: Changed environment variables and added validator

Browse files

Files changed (5) hide show

Dockerfile +1 -1
env/sandbox.py +91 -48
inference.py +3 -3
openenv.yaml +2 -2
validator.py +155 -0

Dockerfile CHANGED Viewed

@@ -16,7 +16,7 @@ COPY . .
 EXPOSE 8000
 # Health check — hackathon automated ping requires this to return 200
-HEALTHCHECK --interval=30s --timeout=15s --start-period=15s --retries=3 \
     CMD curl -f http://localhost:8000/health || exit 1
 # Single worker — environment is 2vCPU, multi-worker causes resource issues

 EXPOSE 8000
 # Health check — hackathon automated ping requires this to return 200
+HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
     CMD curl -f http://localhost:8000/health || exit 1
 # Single worker — environment is 2vCPU, multi-worker causes resource issues

env/sandbox.py CHANGED Viewed

@@ -1,9 +1,11 @@
 """
-AgentDebuggerEnv — Sandboxed Code Execution
-============================================
-Isolated execution environment for user-submitted code, providing
-security through AST-based import filtering, subprocess isolation,
-and runtime constraints.
 """
 import subprocess
@@ -21,55 +23,99 @@ BLOCKED_IMPORTS = [
     "ctypes", "cffi", "resource", "signal", "mmap", "gc"
 ]
-EXECUTION_TIMEOUT_SECONDS = 15
-def _build_import_checker(blocked: list[str]) -> str:
-    """Build a Python script snippet that checks for blocked imports using AST parsing."""
-    blocked_repr = repr(blocked)
     return f'''
 import ast as _ast
 import sys as _sys
-_BLOCKED = {blocked_repr}
-_source_to_check = open(__file__).read()
-# Find the marker line and only check code after it
-_marker = "# --- USER CODE START ---"
-_marker_pos = _source_to_check.find(_marker)
-if _marker_pos != -1:
-    _source_to_check = _source_to_check[_marker_pos + len(_marker):]
 try:
     _tree = _ast.parse(_source_to_check)
-except SyntaxError:
-    pass  # Let the actual execution catch syntax errors
-else:
     for _node in _ast.walk(_tree):
-        if isinstance(_node, _ast.Import):
-            for _alias in _node.names:
-                _top = _alias.name.split(".")[0]
-                if _top in _BLOCKED:
-                    print(f"BLOCKED IMPORT: '{{_alias.name}}' is not allowed in the sandbox.")
-                    _sys.exit(1)
-        elif isinstance(_node, _ast.ImportFrom):
-            if _node.module:
-                _top = _node.module.split(".")[0]
-                if _top in _BLOCKED:
-                    print(f"BLOCKED IMPORT: '{{_node.module}}' is not allowed in the sandbox.")
                     _sys.exit(1)
-# Also block dangerous builtins
-import builtins as _builtins
-_original_import = _builtins.__import__
-def _restricted_import(name, *args, **kwargs):
     _top = name.split(".")[0]
-    if _top in _BLOCKED:
         raise ImportError(f"BLOCKED IMPORT: '{{name}}' is not allowed in the sandbox.")
-    return _original_import(name, *args, **kwargs)
 _builtins.__import__ = _restricted_import
 '''
@@ -79,16 +125,13 @@ def execute_code(code: str, test_code: str, allow_threading: bool = False) -> Tu
     Returns:
         (output: str, timed_out: bool, execution_time_ms: int)
-    The output contains both stdout and stderr merged, exactly as a developer
-    would see in their terminal.
     """
     # Build the blocked imports list, optionally allowing threading
     blocked = [b for b in BLOCKED_IMPORTS if not (b == "threading" and allow_threading)]
-    # Build the full script: import checker + user code + test code
-    import_checker = _build_import_checker(blocked)
-    full_script = import_checker + "\n# --- USER CODE START ---\n" + code + "\n" + test_code
     tmp_path = None
     try:
@@ -121,7 +164,7 @@ def execute_code(code: str, test_code: str, allow_threading: bool = False) -> Tu
         except subprocess.TimeoutExpired:
             elapsed_ms = int((time.time() - start_time) * 1000)
             return (
-                f"TIMEOUT: Code execution exceeded {EXECUTION_TIMEOUT_SECONDS} second limit and was killed.",
                 True,
                 elapsed_ms
             )

 """
+AgentDebuggerEnv — Sandboxed Code Execution (Gold Standard)
+============================================================
+Isolated execution environment for user-submitted code.
+Implements multi-layered security:
+1. AST-based static analysis (blocks dangerous builtins & dunders)
+2. Runtime protection (restricted __import__, nullified builtins)
+3. Subprocess isolation with strict timeouts
+4. Resource limits (memory/CPU)
 """
 import subprocess
     "ctypes", "cffi", "resource", "signal", "mmap", "gc"
 ]
+# Builtin names rejected by the prelude's static AST scan. All of them except
+# getattr/setattr/delattr are additionally set to None at runtime.
+DANGEROUS_BUILTINS = [
+    "eval", "exec", "compile", "getattr", "setattr", "delattr",
+    "input", "breakpoint", "help", "open"
+]
+# Wall-clock limit quoted in the TIMEOUT message returned by execute_code
+EXECUTION_TIMEOUT_SECONDS = 10  # Hackathon spec: strictly 10s
+# Address-space cap (in MiB) that the prelude applies via RLIMIT_AS
+MEMORY_LIMIT_MB = 256
+def _build_security_prelude(blocked_imports: list[str]) -> str:
+    """Build the hardening prelude that is prepended to every sandboxed script.
+
+    The generated source, in order: applies an address-space rlimit, statically
+    scans everything after the ``# --- USER CODE START ---`` marker line for
+    blocked imports, dangerous builtin names and underscore-prefixed attribute
+    access, installs a restricted ``__import__``, nullifies dangerous builtins,
+    and finally removes its own helper names from the module namespace.
+
+    Everything after the marker line is scanned, so the rules apply to any
+    test code appended after the user code as well.
+
+    Args:
+        blocked_imports: Top-level module names that user code may not import.
+
+    Returns:
+        Python source text to place ahead of the marker line and the user code.
+    """
+    blocked_repr = repr(blocked_imports)
+    builtins_repr = repr(DANGEROUS_BUILTINS)
     return f'''
 import ast as _ast
 import sys as _sys
+import builtins as _builtins
+# ── 1. Resource Limits ────────────────────────────────────────────────────────
 try:
+    import resource as _resource
+    # Cap the address space at MEMORY_LIMIT_MB; best effort, failures are ignored
+    _mem_limit = {MEMORY_LIMIT_MB} * 1024 * 1024
+    _resource.setrlimit(_resource.RLIMIT_AS, (_mem_limit, _mem_limit))
+except Exception:
+    pass
+# ── 2. AST Static Analysis ───────────────────────────────────────────────────
+# Immutable containers: user code must not be able to edit the block lists
+# that the runtime import hook below keeps a reference to.
+_BLOCKED_IMPORTS = frozenset({blocked_repr})
+_DANGEROUS_BUILTINS = tuple({builtins_repr})
+# Read via _builtins.open now, because `open` is nullified further below
+try:
+    with _builtins.open(__file__) as _fh:
+        _source_to_check = _fh.read()
+    # Only the code after the marker is user-controlled. Search for the marker
+    # as a whole line: a bare substring search first hits the string literal
+    # on the next line of this prelude, leaving an unparsable remainder whose
+    # SyntaxError silently disabled every check below.
+    _marker = "# --- USER CODE START ---"
+    _marker_line = chr(10) + _marker + chr(10)
+    _marker_pos = _source_to_check.find(_marker_line)
+    if _marker_pos == -1:
+        print("SANDBOX INTERNALS ERROR: user code marker not found.")
+        _sys.exit(1)
+    _source_to_check = _source_to_check[_marker_pos + len(_marker_line):]
     _tree = _ast.parse(_source_to_check)
     for _node in _ast.walk(_tree):
+        # Block dangerous imports
+        if isinstance(_node, (_ast.Import, _ast.ImportFrom)):
+            _names = []
+            if isinstance(_node, _ast.Import):
+                _names = [a.name.split('.')[0] for a in _node.names]
+            elif _node.module:
+                _names = [_node.module.split('.')[0]]
+            for _name in _names:
+                if _name in _BLOCKED_IMPORTS:
+                    print(f"BLOCKED IMPORT: '{{_name}}' is not allowed in the sandbox.")
                     _sys.exit(1)
+        # Block dangerous builtins (static names)
+        if isinstance(_node, _ast.Name) and _node.id in _DANGEROUS_BUILTINS:
+            print(f"SECURITY ERROR: Use of '{{_node.id}}' is prohibited.")
+            _sys.exit(1)
+        # Block Dunder attribute access and leading underscores (reflection)
+        # NOTE(review): this also rejects ordinary private attributes such as
+        # self._x in submitted code - confirm that is intended.
+        if isinstance(_node, _ast.Attribute):
+            if _node.attr.startswith('_'):
+                print(f"SECURITY ERROR: Access to internal attribute '{{_node.attr}}' is prohibited.")
+                _sys.exit(1)
+except SyntaxError:
+    pass # Let the actual execution catch syntax errors
+except Exception as _exc:
+    # Fail closed: if the check itself breaks, the user code must not run
+    print(f"SANDBOX INTERNALS ERROR: {{_exc}}")
+    _sys.exit(1)
+# ── 3. Runtime Protection ────────────────────────────────────────────────────
+# Block __import__ to catch dynamic imports at runtime. The original import
+# function and the block list live in a closure rather than in keyword
+# parameters, so a caller cannot override them, e.g. by passing _blocked=().
+def _make_restricted_import(_orig, _blocked):
+    def _restricted_import(name, *args, **kwargs):
+        _top = name.split(".")[0]
+        if _top in _blocked:
+            raise ImportError(f"BLOCKED IMPORT: '{{name}}' is not allowed in the sandbox.")
+        return _orig(name, *args, **kwargs)
+    return _restricted_import
+_builtins.__import__ = _make_restricted_import(_builtins.__import__, _BLOCKED_IMPORTS)
+# Nullify dangerous builtins
+# NOTE(review): exec/compile may also be looked up by the import machinery;
+# verify that modules imported later by user/test code still load.
+for _b in _DANGEROUS_BUILTINS:
+    if _b not in ('setattr', 'getattr', 'delattr'):
+        _builtins.__dict__[_b] = None
+# Remove every helper name so user code cannot reach the prelude's state.
+# At module level the namespace is globals(); pop() tolerates names that were
+# never bound (e.g. _resource when its import failed).
+for _v in ["_ast", "_sys", "_builtins", "_resource", "_mem_limit", "_BLOCKED_IMPORTS", "_DANGEROUS_BUILTINS", "_fh", "_source_to_check", "_marker", "_marker_line", "_marker_pos", "_tree", "_node", "_names", "_name", "_b", "_make_restricted_import"]:
+    globals().pop(_v, None)
+del _v
 '''
     Returns:
         (output: str, timed_out: bool, execution_time_ms: int)
     """
     # Build the blocked imports list, optionally allowing threading
     blocked = [b for b in BLOCKED_IMPORTS if not (b == "threading" and allow_threading)]
+    # Build the full script: security prelude + user code + test code
+    prelude = _build_security_prelude(blocked)
+    full_script = prelude + "\n# --- USER CODE START ---\n" + code + "\n" + test_code
     tmp_path = None
     try:
         except subprocess.TimeoutExpired:
             elapsed_ms = int((time.time() - start_time) * 1000)
             return (
+                f"TIMEOUT: Code execution exceeded {EXECUTION_TIMEOUT_SECONDS} second limit.",
                 True,
                 elapsed_ms
             )

inference.py CHANGED Viewed

@@ -19,8 +19,8 @@ from openai import OpenAI, APIError, RateLimitError, APIConnectionError, APITime
 import requests
 # ── Environment variables (never hardcode these) ──────────────────────────────
-API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
-MODEL_NAME   = os.environ.get("MODEL_NAME", "gpt-4o")
 HF_TOKEN     = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY", "")
 ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
@@ -316,4 +316,4 @@ def main():
 if __name__ == "__main__":
-    main()

 import requests
 # ── Environment variables (never hardcode these) ──────────────────────────────
+API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME   = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-70B-Instruct")
 HF_TOKEN     = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY", "")
 ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
 if __name__ == "__main__":
+    main()

openenv.yaml CHANGED Viewed

@@ -46,14 +46,14 @@ tasks:
       Thread-safe counter with a race condition invisible to sequential tests.
       Agent must design a concurrent test to surface the bug, then fix it.
 baseline:
-  model: gpt-4o
   script: inference.py
   mean_score: 0.51
   scores:
     easy: 0.85
     medium: 0.50
     hard: 0.18
-author: Shashaank (GitHub: @shasshaank, HF: @shashaank0707)
 # Submission Integrity: SHA 159a5faf82fc1ab3709f9674becf9a3ec55cf562 | Verified 2026-04-08
 license: MIT
 huggingface_space: shashaank0707/AgentDebugger-env

       Thread-safe counter with a race condition invisible to sequential tests.
       Agent must design a concurrent test to surface the bug, then fix it.
 baseline:
+  model: meta-llama/Llama-3.1-70B-Instruct
   script: inference.py
   mean_score: 0.51
   scores:
     easy: 0.85
     medium: 0.50
     hard: 0.18
+author: "Shashaank (GitHub: @shasshaank, HF: @shashaank0707)"
 # Submission Integrity: SHA 159a5faf82fc1ab3709f9674becf9a3ec55cf562 | Verified 2026-04-08
 license: MIT
 huggingface_space: shashaank0707/AgentDebugger-env

validator.py ADDED Viewed

	@@ -0,0 +1,155 @@

+#!/usr/bin/env python3
+"""
+AgentDebuggerEnv — Pre-Submission Validator
+============================================
+Checks a subset of the hard requirements of the Meta + HF Hackathon:
+- Mandatory Environment Variables
+- OpenEnv Spec Compliance (currently probes /health and /reset only; step and state are not checked)
+- Inference Script Format & Logging
+- Dockerfile Correctness (not checked by this script yet)
+- openenv.yaml Presence
+"""
+import os
+import sys
+import json
+import requests
+import yaml
+import re
+# ── Configuration ────────────────────────────────────────────────────────────
+# All values are read once, at import time, from the process environment.
+# Base URL of the environment server probed by check_endpoints()
+ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
+# No defaults for the next three: check_env_vars() reports them when unset
+API_BASE_URL = os.environ.get("API_BASE_URL")
+MODEL_NAME = os.environ.get("MODEL_NAME")
+# OPENAI_API_KEY is accepted as a fallback for HF_TOKEN
+HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY")
+class bcolors:
+    """ANSI escape sequences used to style the validator's terminal output."""
+    # Foreground colors
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    # Reset: appended after each styled message by the log_* helpers
+    ENDC = '\033[0m'
+    # Text attributes
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+def log_success(msg):
+    """Print *msg* in green, prefixed with a check mark."""
+    line = f"{bcolors.OKGREEN}✓ {msg}{bcolors.ENDC}"
+    print(line)
+def log_fail(msg):
+    """Print *msg* in red, prefixed with a cross mark."""
+    line = f"{bcolors.FAIL}✗ {msg}{bcolors.ENDC}"
+    print(line)
+def log_info(msg):
+    """Print *msg* in blue, prefixed with an info symbol."""
+    line = f"{bcolors.OKBLUE}ℹ {msg}{bcolors.ENDC}"
+    print(line)
+def check_env_vars():
+    """Report whether API_BASE_URL, MODEL_NAME and HF_TOKEN are all set.
+
+    Returns:
+        True when every variable has a non-empty value; otherwise False,
+        after logging the names of the missing ones.
+    """
+    log_info("Checking Mandatory Environment Variables...")
+    required = (
+        ("API_BASE_URL", API_BASE_URL),
+        ("MODEL_NAME", MODEL_NAME),
+        ("HF_TOKEN", HF_TOKEN),
+    )
+    unset = [label for label, value in required if not value]
+    if not unset:
+        log_success("All mandatory env vars detected.")
+        return True
+    log_fail(f"Missing env vars: {', '.join(unset)}")
+    return False
+def check_yaml():
+    """Validate that openenv.yaml exists, parses, and has the required keys.
+
+    Returns:
+        True when the file is a YAML mapping containing every required
+        top-level field; False otherwise (the reason is logged).
+    """
+    log_info("Checking openenv.yaml...")
+    if not os.path.exists("openenv.yaml"):
+        log_fail("openenv.yaml not found in root!")
+        return False
+    try:
+        # Explicit encoding: do not depend on the platform default
+        with open("openenv.yaml", 'r', encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
+        log_fail(f"Could not parse openenv.yaml: {e}")
+        return False
+    # An empty file loads as None, and a scalar or list has no fields at all;
+    # report that directly instead of as a confusing parse error.
+    if not isinstance(data, dict):
+        log_fail("openenv.yaml must contain a top-level mapping.")
+        return False
+    required = ["name", "version", "tasks", "baseline", "inference_script"]
+    missing = [r for r in required if r not in data]
+    # Report every missing field, not just the first one
+    for r in missing:
+        log_fail(f"openenv.yaml missing required field: {r}")
+    if missing:
+        return False
+    log_success("openenv.yaml is valid.")
+    return True
+def check_endpoints():
+    """Probe the environment server's /health and /reset endpoints.
+
+    Returns:
+        True when both respond with HTTP 200; False as soon as one request
+        fails or answers with another status (the reason is logged).
+    """
+    log_info(f"Checking Endpoints at {ENV_BASE_URL}...")
+    # (path, request function, extra keyword arguments), probed in this order
+    probes = (
+        ("/health", requests.get, {}),
+        ("/reset", requests.post, {"json": {"task_id": "easy"}}),
+    )
+    for path, send, extra in probes:
+        try:
+            resp = send(f"{ENV_BASE_URL}{path}", timeout=5, **extra)
+            if resp.status_code != 200:
+                log_fail(f"{path} -> {resp.status_code}")
+                return False
+            log_success(f"{path} -> 200 OK")
+        except Exception as e:
+            log_fail(f"Could not connect to {path}: {e}")
+            return False
+    return True
+def check_inference_script():
+    """Check inference.py for the required log tags and OpenAI client usage.
+
+    The check is purely textual: the file is read and searched, never run.
+
+    Returns:
+        True when all three log formats and the OpenAI client call are
+        found; False otherwise (the reason is logged).
+    """
+    log_info("Checking inference.py...")
+    if not os.path.exists("inference.py"):
+        log_fail("inference.py not found in root!")
+        return False
+    try:
+        # inference.py contains non-ASCII characters, so decode it as UTF-8
+        # explicitly rather than with the platform's default encoding.
+        with open("inference.py", 'r', encoding="utf-8") as f:
+            content = f.read()
+    except (OSError, UnicodeDecodeError) as e:
+        log_fail(f"Could not read inference.py: {e}")
+        return False
+    # Check for [START], [STEP], [END]
+    patterns = {
+        "[START]": r"\[START\] task=",
+        "[STEP]": r"\[STEP .+\] Action:",
+        "[END]": r"\[END\] task=.* score=.* steps="
+    }
+    for label, pattern in patterns.items():
+        if not re.search(pattern, content):
+            log_fail(f"inference.py missing log tag/format: {label}")
+            return False
+    if "OpenAI" not in content or "client.chat.completions.create" not in content:
+        log_fail("inference.py does not appear to use the OpenAI client library.")
+        return False
+    log_success("inference.py logging and client usage look correct.")
+    return True
+def main():
+    """Run every check, print a summary, and exit with status 1 on failure.
+
+    The endpoint probe is informational only: its result does not affect the
+    verdict, because the server may not be running locally.
+    """
+    print(f"{bcolors.HEADER}{bcolors.BOLD}AgentDebuggerEnv Compliance Validator{bcolors.ENDC}")
+    print("=" * 45)
+    # Run all mandatory checks (no short-circuit) so every failure is reported
+    results = [check_env_vars(), check_yaml(), check_inference_script()]
+    success = all(results)
+    # Endpoints check is optional if server isn't running locally.
+    # Only Exception is caught, so Ctrl-C and sys.exit still propagate, and
+    # an unexpected error is reported instead of being silently dropped.
+    try:
+        endpoints_ok = check_endpoints()
+    except Exception as e:
+        log_info(f"Endpoint check raised an unexpected error: {e}")
+        endpoints_ok = False
+    if not endpoints_ok:
+        log_info("Endpoint checks did not pass; ignored because they are optional when the server isn't running locally.")
+    print("=" * 45)
+    if success:
+        print(f"{bcolors.OKGREEN}{bcolors.BOLD}VALIDATION PASSED! Ready for submission.{bcolors.ENDC}")
+    else:
+        print(f"{bcolors.FAIL}{bcolors.BOLD}VALIDATION FAILED. Please fix the errors above.{bcolors.ENDC}")
+        sys.exit(1)
+if __name__ == "__main__":
+    main()