shank commited on
Commit
e93446d
Β·
1 Parent(s): 8807d25

Fix: Changed environment variables and added validator

Browse files
Files changed (5) hide show
  1. Dockerfile +1 -1
  2. env/sandbox.py +91 -48
  3. inference.py +3 -3
  4. openenv.yaml +2 -2
  5. validator.py +155 -0
Dockerfile CHANGED
@@ -16,7 +16,7 @@ COPY . .
16
  EXPOSE 8000
17
 
18
  # Health check β€” hackathon automated ping requires this to return 200
19
- HEALTHCHECK --interval=30s --timeout=15s --start-period=15s --retries=3 \
20
  CMD curl -f http://localhost:8000/health || exit 1
21
 
22
  # Single worker β€” environment is 2vCPU, multi-worker causes resource issues
 
16
  EXPOSE 8000
17
 
18
  # Health check β€” hackathon automated ping requires this to return 200
19
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
20
  CMD curl -f http://localhost:8000/health || exit 1
21
 
22
  # Single worker β€” environment is 2vCPU, multi-worker causes resource issues
env/sandbox.py CHANGED
@@ -1,9 +1,11 @@
1
  """
2
- AgentDebuggerEnv β€” Sandboxed Code Execution
3
- ============================================
4
- Isolated execution environment for user-submitted code, providing
5
- security through AST-based import filtering, subprocess isolation,
6
- and runtime constraints.
 
 
7
  """
8
 
9
  import subprocess
@@ -21,55 +23,99 @@ BLOCKED_IMPORTS = [
21
  "ctypes", "cffi", "resource", "signal", "mmap", "gc"
22
  ]
23
 
24
- EXECUTION_TIMEOUT_SECONDS = 15
 
 
 
 
 
 
25
 
26
 
27
- def _build_import_checker(blocked: list[str]) -> str:
28
- """Build a Python script snippet that checks for blocked imports using AST parsing."""
29
- blocked_repr = repr(blocked)
 
 
30
  return f'''
31
  import ast as _ast
32
  import sys as _sys
 
33
 
34
- _BLOCKED = {blocked_repr}
35
- _source_to_check = open(__file__).read()
36
-
37
- # Find the marker line and only check code after it
38
- _marker = "# --- USER CODE START ---"
39
- _marker_pos = _source_to_check.find(_marker)
40
- if _marker_pos != -1:
41
- _source_to_check = _source_to_check[_marker_pos + len(_marker):]
42
-
43
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  _tree = _ast.parse(_source_to_check)
45
- except SyntaxError:
46
- pass # Let the actual execution catch syntax errors
47
- else:
48
  for _node in _ast.walk(_tree):
49
- if isinstance(_node, _ast.Import):
50
- for _alias in _node.names:
51
- _top = _alias.name.split(".")[0]
52
- if _top in _BLOCKED:
53
- print(f"BLOCKED IMPORT: '{{_alias.name}}' is not allowed in the sandbox.")
54
- _sys.exit(1)
55
- elif isinstance(_node, _ast.ImportFrom):
56
- if _node.module:
57
- _top = _node.module.split(".")[0]
58
- if _top in _BLOCKED:
59
- print(f"BLOCKED IMPORT: '{{_node.module}}' is not allowed in the sandbox.")
 
60
  _sys.exit(1)
61
-
62
- # Also block dangerous builtins
63
- import builtins as _builtins
64
- _original_import = _builtins.__import__
65
-
66
- def _restricted_import(name, *args, **kwargs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  _top = name.split(".")[0]
68
- if _top in _BLOCKED:
69
  raise ImportError(f"BLOCKED IMPORT: '{{name}}' is not allowed in the sandbox.")
70
- return _original_import(name, *args, **kwargs)
71
-
72
  _builtins.__import__ = _restricted_import
 
 
 
 
 
 
 
 
 
 
73
  '''
74
 
75
 
@@ -79,16 +125,13 @@ def execute_code(code: str, test_code: str, allow_threading: bool = False) -> Tu
79
 
80
  Returns:
81
  (output: str, timed_out: bool, execution_time_ms: int)
82
-
83
- The output contains both stdout and stderr merged, exactly as a developer
84
- would see in their terminal.
85
  """
86
  # Build the blocked imports list, optionally allowing threading
87
  blocked = [b for b in BLOCKED_IMPORTS if not (b == "threading" and allow_threading)]
88
 
89
- # Build the full script: import checker + user code + test code
90
- import_checker = _build_import_checker(blocked)
91
- full_script = import_checker + "\n# --- USER CODE START ---\n" + code + "\n" + test_code
92
 
93
  tmp_path = None
94
  try:
@@ -121,7 +164,7 @@ def execute_code(code: str, test_code: str, allow_threading: bool = False) -> Tu
121
  except subprocess.TimeoutExpired:
122
  elapsed_ms = int((time.time() - start_time) * 1000)
123
  return (
124
- f"TIMEOUT: Code execution exceeded {EXECUTION_TIMEOUT_SECONDS} second limit and was killed.",
125
  True,
126
  elapsed_ms
127
  )
 
1
  """
2
+ AgentDebuggerEnv β€” Sandboxed Code Execution (Gold Standard)
3
+ ============================================================
4
+ Isolated execution environment for user-submitted code.
5
+ Implements multi-layered security:
6
+ 1. AST-based static analysis (blocks dangerous builtins & dunders)
7
+ 3. Subprocess isolation with strict timeouts
8
+ 4. Resource limits (memory/CPU)
9
  """
10
 
11
  import subprocess
 
23
  "ctypes", "cffi", "resource", "signal", "mmap", "gc"
24
  ]
25
 
26
+ DANGEROUS_BUILTINS = [
27
+ "eval", "exec", "compile", "getattr", "setattr", "delattr",
28
+ "input", "breakpoint", "help", "open"
29
+ ]
30
+
31
+ EXECUTION_TIMEOUT_SECONDS = 10 # Hackathon spec: strictly 10s
32
+ MEMORY_LIMIT_MB = 256
33
 
34
 
35
+ def _build_security_prelude(blocked_imports: list[str]) -> str:
36
+ """Build a Python script snippet that hardens the environment before user code runs."""
37
+ blocked_repr = repr(blocked_imports)
38
+ builtins_repr = repr(DANGEROUS_BUILTINS)
39
+
40
  return f'''
41
  import ast as _ast
42
  import sys as _sys
43
+ import builtins as _builtins
44
 
45
+ # ── 1. Resource Limits ────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
46
  try:
47
+ import resource as _resource
48
+ # Limit memory usage (Address Space) to 256MB
49
+ _mem_limit = {MEMORY_LIMIT_MB} * 1024 * 1024
50
+ _resource.setrlimit(_resource.RLIMIT_AS, (_mem_limit, _mem_limit))
51
+ except Exception:
52
+ pass
53
+
54
+ # ── 2. AST Static Analysis ───────────────────────────────────────────────────
55
+ _BLOCKED_IMPORTS = {blocked_repr}
56
+ _DANGEROUS_BUILTINS = {builtins_repr}
57
+
58
+ # We use _builtins.open because it might be nullified later in the user's scope
59
+ try:
60
+ _source_to_check = _builtins.open(__file__).read()
61
+ # Find the marker line and only check code after it
62
+ _marker = "# --- USER CODE START ---"
63
+ _marker_pos = _source_to_check.find(_marker)
64
+ if _marker_pos != -1:
65
+ _source_to_check = _source_to_check[_marker_pos + len(_marker):]
66
+
67
  _tree = _ast.parse(_source_to_check)
 
 
 
68
  for _node in _ast.walk(_tree):
69
+ # Block dangerous imports
70
+ if isinstance(_node, (_ast.Import, _ast.ImportFrom)):
71
+ _names = []
72
+ if isinstance(_node, _ast.Import):
73
+ _names = [a.name.split('.')[0] for a in _node.names]
74
+ else:
75
+ if _node.module:
76
+ _names = [_node.module.split('.')[0]]
77
+
78
+ for _name in _names:
79
+ if _name in _BLOCKED_IMPORTS:
80
+ print(f"BLOCKED IMPORT: '{{_name}}' is not allowed in the sandbox.")
81
  _sys.exit(1)
82
+
83
+ # Block dangerous builtins (static names)
84
+ if isinstance(_node, _ast.Name) and _node.id in _DANGEROUS_BUILTINS:
85
+ print(f"SECURITY ERROR: Use of '{{_node.id}}' is prohibited.")
86
+ _sys.exit(1)
87
+
88
+ # Block Dunder attribute access and leading underscores (reflection)
89
+ if isinstance(_node, _ast.Attribute):
90
+ if _node.attr.startswith('_'):
91
+ print(f"SECURITY ERROR: Access to internal attribute '{{_node.attr}}' is prohibited.")
92
+ _sys.exit(1)
93
+ except SyntaxError:
94
+ pass # Let the actual execution catch syntax errors
95
+ except Exception as e:
96
+ # Any other error during check is a sandbox failure
97
+ # print(f"SANDBOX INTERNALS ERROR: {{str(e)}}")
98
+ pass
99
+
100
+ # ── 3. Runtime Protection ────────────────────────────────────────────────────
101
+ # Block __import__ to catch dynamic imports at runtime
102
+ _orig_import = _builtins.__import__
103
+ def _restricted_import(name, *args, _orig_import=_orig_import, _blocked=_BLOCKED_IMPORTS, **kwargs):
104
  _top = name.split(".")[0]
105
+ if _top in _blocked:
106
  raise ImportError(f"BLOCKED IMPORT: '{{name}}' is not allowed in the sandbox.")
107
+ return _orig_import(name, *args, **kwargs)
 
108
  _builtins.__import__ = _restricted_import
109
+
110
+ # Nullify dangerous builtins
111
+ for _b in _DANGEROUS_BUILTINS:
112
+ if _b not in ('setattr', 'getattr', 'delattr'):
113
+ _builtins.__dict__[_b] = None
114
+
115
+ # Clean up namespace gracefully
116
+ for _v in ["_ast", "_sys", "_builtins", "_source_to_check", "_tree", "_node", "_marker", "_marker_pos", "_b", "_orig_import", "_restricted_import"]:
117
+ if _v in locals():
118
+ del locals()[_v]
119
  '''
120
 
121
 
 
125
 
126
  Returns:
127
  (output: str, timed_out: bool, execution_time_ms: int)
 
 
 
128
  """
129
  # Build the blocked imports list, optionally allowing threading
130
  blocked = [b for b in BLOCKED_IMPORTS if not (b == "threading" and allow_threading)]
131
 
132
+ # Build the full script: security prelude + user code + test code
133
+ prelude = _build_security_prelude(blocked)
134
+ full_script = prelude + "\n# --- USER CODE START ---\n" + code + "\n" + test_code
135
 
136
  tmp_path = None
137
  try:
 
164
  except subprocess.TimeoutExpired:
165
  elapsed_ms = int((time.time() - start_time) * 1000)
166
  return (
167
+ f"TIMEOUT: Code execution exceeded {EXECUTION_TIMEOUT_SECONDS} second limit.",
168
  True,
169
  elapsed_ms
170
  )
inference.py CHANGED
@@ -19,8 +19,8 @@ from openai import OpenAI, APIError, RateLimitError, APIConnectionError, APITime
19
  import requests
20
 
21
  # ── Environment variables (never hardcode these) ──────────────────────────────
22
- API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
23
- MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o")
24
  HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY", "")
25
  ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
26
 
@@ -316,4 +316,4 @@ def main():
316
 
317
 
318
  if __name__ == "__main__":
319
- main()
 
19
  import requests
20
 
21
  # ── Environment variables (never hardcode these) ──────────────────────────────
22
+ API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
23
+ MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-70B-Instruct")
24
  HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY", "")
25
  ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
26
 
 
316
 
317
 
318
  if __name__ == "__main__":
319
+ main()
openenv.yaml CHANGED
@@ -46,14 +46,14 @@ tasks:
46
  Thread-safe counter with a race condition invisible to sequential tests.
47
  Agent must design a concurrent test to surface the bug, then fix it.
48
  baseline:
49
- model: gpt-4o
50
  script: inference.py
51
  mean_score: 0.51
52
  scores:
53
  easy: 0.85
54
  medium: 0.50
55
  hard: 0.18
56
- author: Shashaank (GitHub: @shasshaank, HF: @shashaank0707)
57
  # Submission Integrity: SHA 159a5faf82fc1ab3709f9674becf9a3ec55cf562 | Verified 2026-04-08
58
  license: MIT
59
  huggingface_space: shashaank0707/AgentDebugger-env
 
46
  Thread-safe counter with a race condition invisible to sequential tests.
47
  Agent must design a concurrent test to surface the bug, then fix it.
48
  baseline:
49
+ model: meta-llama/Llama-3.1-70B-Instruct
50
  script: inference.py
51
  mean_score: 0.51
52
  scores:
53
  easy: 0.85
54
  medium: 0.50
55
  hard: 0.18
56
+ author: "Shashaank (GitHub: @shasshaank, HF: @shashaank0707)"
57
  # Submission Integrity: SHA 159a5faf82fc1ab3709f9674becf9a3ec55cf562 | Verified 2026-04-08
58
  license: MIT
59
  huggingface_space: shashaank0707/AgentDebugger-env
validator.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ AgentDebuggerEnv β€” Pre-Submission Validator
4
+ ============================================
5
+ Checks for all hard requirements of the Meta + HF Hackathon:
6
+ - Mandatory Environment Variables
7
+ - OpenEnv Spec Compliance (health, reset, step, state)
8
+ - Inference Script Format & Logging
9
+ - Dockerfile Correctness
10
+ - openenv.yaml Presence
11
+ """
12
+
13
+ import os
14
+ import sys
15
+ import json
16
+ import requests
17
+ import yaml
18
+ import re
19
+
20
+ # ── Configuration ────────────────────────────────────────────────────────────
21
+ ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
22
+ API_BASE_URL = os.environ.get("API_BASE_URL")
23
+ MODEL_NAME = os.environ.get("MODEL_NAME")
24
+ HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY")
25
+
26
+ class bcolors:
27
+ HEADER = '\033[95m'
28
+ OKBLUE = '\033[94m'
29
+ OKCYAN = '\033[96m'
30
+ OKGREEN = '\033[92m'
31
+ WARNING = '\033[93m'
32
+ FAIL = '\033[91m'
33
+ ENDC = '\033[0m'
34
+ BOLD = '\033[1m'
35
+ UNDERLINE = '\033[4m'
36
+
37
+ def log_success(msg): print(f"{bcolors.OKGREEN}βœ“ {msg}{bcolors.ENDC}")
38
+ def log_fail(msg): print(f"{bcolors.FAIL}βœ— {msg}{bcolors.ENDC}")
39
+ def log_info(msg): print(f"{bcolors.OKBLUE}β„Ή {msg}{bcolors.ENDC}")
40
+
41
+ def check_env_vars():
42
+ log_info("Checking Mandatory Environment Variables...")
43
+ missing = []
44
+ if not API_BASE_URL: missing.append("API_BASE_URL")
45
+ if not MODEL_NAME: missing.append("MODEL_NAME")
46
+ if not HF_TOKEN: missing.append("HF_TOKEN")
47
+
48
+ if missing:
49
+ log_fail(f"Missing env vars: {', '.join(missing)}")
50
+ return False
51
+ log_success("All mandatory env vars detected.")
52
+ return True
53
+
54
+ def check_yaml():
55
+ log_info("Checking openenv.yaml...")
56
+ if not os.path.exists("openenv.yaml"):
57
+ log_fail("openenv.yaml not found in root!")
58
+ return False
59
+
60
+ try:
61
+ with open("openenv.yaml", 'r') as f:
62
+ data = yaml.safe_load(f)
63
+ required = ["name", "version", "tasks", "baseline", "inference_script"]
64
+ for r in required:
65
+ if r not in data:
66
+ log_fail(f"openenv.yaml missing required field: {r}")
67
+ return False
68
+ log_success("openenv.yaml is valid.")
69
+ except Exception as e:
70
+ log_fail(f"Could not parse openenv.yaml: {e}")
71
+ return False
72
+ return True
73
+
74
+ def check_endpoints():
75
+ log_info(f"Checking Endpoints at {ENV_BASE_URL}...")
76
+
77
+ # 1. Health
78
+ try:
79
+ resp = requests.get(f"{ENV_BASE_URL}/health", timeout=5)
80
+ if resp.status_code == 200:
81
+ log_success("/health -> 200 OK")
82
+ else:
83
+ log_fail(f"/health -> {resp.status_code}")
84
+ return False
85
+ except Exception as e:
86
+ log_fail(f"Could not connect to /health: {e}")
87
+ return False
88
+
89
+ # 2. Reset
90
+ try:
91
+ resp = requests.post(f"{ENV_BASE_URL}/reset", json={"task_id": "easy"}, timeout=5)
92
+ if resp.status_code == 200:
93
+ log_success("/reset -> 200 OK")
94
+ else:
95
+ log_fail(f"/reset -> {resp.status_code}")
96
+ return False
97
+ except Exception as e:
98
+ log_fail(f"Could not connect to /reset: {e}")
99
+ return False
100
+
101
+ return True
102
+
103
+ def check_inference_script():
104
+ log_info("Checking inference.py...")
105
+ if not os.path.exists("inference.py"):
106
+ log_fail("inference.py not found in root!")
107
+ return False
108
+
109
+ with open("inference.py", 'r') as f:
110
+ content = f.read()
111
+
112
+ # Check for [START], [STEP], [END]
113
+ patterns = {
114
+ "[START]": r"\[START\] task=",
115
+ "[STEP]": r"\[STEP .+\] Action:",
116
+ "[END]": r"\[END\] task=.* score=.* steps="
117
+ }
118
+
119
+ for label, pattern in patterns.items():
120
+ if not re.search(pattern, content):
121
+ log_fail(f"inference.py missing log tag/format: {label}")
122
+ return False
123
+
124
+ if "OpenAI" not in content or "client.chat.completions.create" not in content:
125
+ log_fail("inference.py does not appear to use the OpenAI client library.")
126
+ return False
127
+
128
+ log_success("inference.py logging and client usage look correct.")
129
+ return True
130
+
131
+ def main():
132
+ print(f"{bcolors.HEADER}{bcolors.BOLD}AgentDebuggerEnv Compliance Validator{bcolors.ENDC}")
133
+ print("=" * 45)
134
+
135
+ success = True
136
+ success &= check_env_vars()
137
+ success &= check_yaml()
138
+ success &= check_inference_script()
139
+
140
+ # Endpoints check is optional if server isn't running locally
141
+ try:
142
+ if not check_endpoints():
143
+ log_info("Skipping further endpoint checks as server is unreachable.")
144
+ except:
145
+ pass
146
+
147
+ print("=" * 45)
148
+ if success:
149
+ print(f"{bcolors.OKGREEN}{bcolors.BOLD}VALIDATION PASSED! Ready for submission.{bcolors.ENDC}")
150
+ else:
151
+ print(f"{bcolors.FAIL}{bcolors.BOLD}VALIDATION FAILED. Please fix the errors above.{bcolors.ENDC}")
152
+ sys.exit(1)
153
+
154
+ if __name__ == "__main__":
155
+ main()