shank committed on
Commit ·
e93446d
1
Parent(s): 8807d25
Fix: Changed environment variables and added validator
Browse files
- Dockerfile +1 -1
- env/sandbox.py +91 -48
- inference.py +3 -3
- openenv.yaml +2 -2
- validator.py +155 -0
Dockerfile
CHANGED
|
@@ -16,7 +16,7 @@ COPY . .
|
|
| 16 |
EXPOSE 8000
|
| 17 |
|
| 18 |
# Health check — hackathon automated ping requires this to return 200
|
| 19 |
-
HEALTHCHECK --interval=30s --timeout=
|
| 20 |
CMD curl -f http://localhost:8000/health || exit 1
|
| 21 |
|
| 22 |
# Single worker — environment is 2vCPU, multi-worker causes resource issues
|
|
|
|
| 16 |
EXPOSE 8000
|
| 17 |
|
| 18 |
# Health check — hackathon automated ping requires this to return 200
|
| 19 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
|
| 20 |
CMD curl -f http://localhost:8000/health || exit 1
|
| 21 |
|
| 22 |
# Single worker — environment is 2vCPU, multi-worker causes resource issues
|
env/sandbox.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
"""
|
| 2 |
-
AgentDebuggerEnv — Sandboxed Code Execution
|
| 3 |
-
============================================
|
| 4 |
-
Isolated execution environment for user-submitted code
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
import subprocess
|
|
@@ -21,55 +23,99 @@ BLOCKED_IMPORTS = [
|
|
| 21 |
"ctypes", "cffi", "resource", "signal", "mmap", "gc"
|
| 22 |
]
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
-
def
|
| 28 |
-
"""Build a Python script snippet that
|
| 29 |
-
blocked_repr = repr(
|
|
|
|
|
|
|
| 30 |
return f'''
|
| 31 |
import ast as _ast
|
| 32 |
import sys as _sys
|
|
|
|
| 33 |
|
| 34 |
-
|
| 35 |
-
_source_to_check = open(__file__).read()
|
| 36 |
-
|
| 37 |
-
# Find the marker line and only check code after it
|
| 38 |
-
_marker = "# --- USER CODE START ---"
|
| 39 |
-
_marker_pos = _source_to_check.find(_marker)
|
| 40 |
-
if _marker_pos != -1:
|
| 41 |
-
_source_to_check = _source_to_check[_marker_pos + len(_marker):]
|
| 42 |
-
|
| 43 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
_tree = _ast.parse(_source_to_check)
|
| 45 |
-
except SyntaxError:
|
| 46 |
-
pass # Let the actual execution catch syntax errors
|
| 47 |
-
else:
|
| 48 |
for _node in _ast.walk(_tree):
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
| 60 |
_sys.exit(1)
|
| 61 |
-
|
| 62 |
-
#
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
_top = name.split(".")[0]
|
| 68 |
-
if _top in
|
| 69 |
raise ImportError(f"BLOCKED IMPORT: '{{name}}' is not allowed in the sandbox.")
|
| 70 |
-
return
|
| 71 |
-
|
| 72 |
_builtins.__import__ = _restricted_import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
'''
|
| 74 |
|
| 75 |
|
|
@@ -79,16 +125,13 @@ def execute_code(code: str, test_code: str, allow_threading: bool = False) -> Tu
|
|
| 79 |
|
| 80 |
Returns:
|
| 81 |
(output: str, timed_out: bool, execution_time_ms: int)
|
| 82 |
-
|
| 83 |
-
The output contains both stdout and stderr merged, exactly as a developer
|
| 84 |
-
would see in their terminal.
|
| 85 |
"""
|
| 86 |
# Build the blocked imports list, optionally allowing threading
|
| 87 |
blocked = [b for b in BLOCKED_IMPORTS if not (b == "threading" and allow_threading)]
|
| 88 |
|
| 89 |
-
# Build the full script:
|
| 90 |
-
|
| 91 |
-
full_script =
|
| 92 |
|
| 93 |
tmp_path = None
|
| 94 |
try:
|
|
@@ -121,7 +164,7 @@ def execute_code(code: str, test_code: str, allow_threading: bool = False) -> Tu
|
|
| 121 |
except subprocess.TimeoutExpired:
|
| 122 |
elapsed_ms = int((time.time() - start_time) * 1000)
|
| 123 |
return (
|
| 124 |
-
f"TIMEOUT: Code execution exceeded {EXECUTION_TIMEOUT_SECONDS} second limit
|
| 125 |
True,
|
| 126 |
elapsed_ms
|
| 127 |
)
|
|
|
|
| 1 |
"""
|
| 2 |
+
AgentDebuggerEnv — Sandboxed Code Execution (Gold Standard)
|
| 3 |
+
============================================================
|
| 4 |
+
Isolated execution environment for user-submitted code.
|
| 5 |
+
Implements multi-layered security:
|
| 6 |
+
1. AST-based static analysis (blocks dangerous builtins & dunders)
|
| 7 |
+
2. Subprocess isolation with strict timeouts
|
| 8 |
+
3. Resource limits (memory/CPU)
|
| 9 |
"""
|
| 10 |
|
| 11 |
import subprocess
|
|
|
|
| 23 |
"ctypes", "cffi", "resource", "signal", "mmap", "gc"
|
| 24 |
]
|
| 25 |
|
| 26 |
+
DANGEROUS_BUILTINS = [
|
| 27 |
+
"eval", "exec", "compile", "getattr", "setattr", "delattr",
|
| 28 |
+
"input", "breakpoint", "help", "open"
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
EXECUTION_TIMEOUT_SECONDS = 10 # Hackathon spec: strictly 10s
|
| 32 |
+
MEMORY_LIMIT_MB = 256
|
| 33 |
|
| 34 |
|
| 35 |
+
def _build_security_prelude(blocked_imports: list[str]) -> str:
|
| 36 |
+
"""Build a Python script snippet that hardens the environment before user code runs."""
|
| 37 |
+
blocked_repr = repr(blocked_imports)
|
| 38 |
+
builtins_repr = repr(DANGEROUS_BUILTINS)
|
| 39 |
+
|
| 40 |
return f'''
|
| 41 |
import ast as _ast
|
| 42 |
import sys as _sys
|
| 43 |
+
import builtins as _builtins
|
| 44 |
|
| 45 |
+
# ── 1. Resource Limits ────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
try:
|
| 47 |
+
import resource as _resource
|
| 48 |
+
# Limit memory usage (Address Space) to 256MB
|
| 49 |
+
_mem_limit = {MEMORY_LIMIT_MB} * 1024 * 1024
|
| 50 |
+
_resource.setrlimit(_resource.RLIMIT_AS, (_mem_limit, _mem_limit))
|
| 51 |
+
except Exception:
|
| 52 |
+
pass
|
| 53 |
+
|
| 54 |
+
# ── 2. AST Static Analysis ────────────────────────────────────────────────────
|
| 55 |
+
_BLOCKED_IMPORTS = {blocked_repr}
|
| 56 |
+
_DANGEROUS_BUILTINS = {builtins_repr}
|
| 57 |
+
|
| 58 |
+
# We use _builtins.open because it might be nullified later in the user's scope
|
| 59 |
+
try:
|
| 60 |
+
_source_to_check = _builtins.open(__file__).read()
|
| 61 |
+
# Find the marker line and only check code after it
|
| 62 |
+
_marker = "# --- USER CODE START ---"
|
| 63 |
+
_marker_pos = _source_to_check.find(_marker)
|
| 64 |
+
if _marker_pos != -1:
|
| 65 |
+
_source_to_check = _source_to_check[_marker_pos + len(_marker):]
|
| 66 |
+
|
| 67 |
_tree = _ast.parse(_source_to_check)
|
|
|
|
|
|
|
|
|
|
| 68 |
for _node in _ast.walk(_tree):
|
| 69 |
+
# Block dangerous imports
|
| 70 |
+
if isinstance(_node, (_ast.Import, _ast.ImportFrom)):
|
| 71 |
+
_names = []
|
| 72 |
+
if isinstance(_node, _ast.Import):
|
| 73 |
+
_names = [a.name.split('.')[0] for a in _node.names]
|
| 74 |
+
else:
|
| 75 |
+
if _node.module:
|
| 76 |
+
_names = [_node.module.split('.')[0]]
|
| 77 |
+
|
| 78 |
+
for _name in _names:
|
| 79 |
+
if _name in _BLOCKED_IMPORTS:
|
| 80 |
+
print(f"BLOCKED IMPORT: '{{_name}}' is not allowed in the sandbox.")
|
| 81 |
_sys.exit(1)
|
| 82 |
+
|
| 83 |
+
# Block dangerous builtins (static names)
|
| 84 |
+
if isinstance(_node, _ast.Name) and _node.id in _DANGEROUS_BUILTINS:
|
| 85 |
+
print(f"SECURITY ERROR: Use of '{{_node.id}}' is prohibited.")
|
| 86 |
+
_sys.exit(1)
|
| 87 |
+
|
| 88 |
+
# Block Dunder attribute access and leading underscores (reflection)
|
| 89 |
+
if isinstance(_node, _ast.Attribute):
|
| 90 |
+
if _node.attr.startswith('_'):
|
| 91 |
+
print(f"SECURITY ERROR: Access to internal attribute '{{_node.attr}}' is prohibited.")
|
| 92 |
+
_sys.exit(1)
|
| 93 |
+
except SyntaxError:
|
| 94 |
+
pass # Let the actual execution catch syntax errors
|
| 95 |
+
except Exception as e:
|
| 96 |
+
# Any other error during check is a sandbox failure
|
| 97 |
+
# print(f"SANDBOX INTERNALS ERROR: {{str(e)}}")
|
| 98 |
+
pass
|
| 99 |
+
|
| 100 |
+
# ── 3. Runtime Protection ─────────────────────────────────────────────────────
|
| 101 |
+
# Block __import__ to catch dynamic imports at runtime
|
| 102 |
+
_orig_import = _builtins.__import__
|
| 103 |
+
def _restricted_import(name, *args, _orig_import=_orig_import, _blocked=_BLOCKED_IMPORTS, **kwargs):
|
| 104 |
_top = name.split(".")[0]
|
| 105 |
+
if _top in _blocked:
|
| 106 |
raise ImportError(f"BLOCKED IMPORT: '{{name}}' is not allowed in the sandbox.")
|
| 107 |
+
return _orig_import(name, *args, **kwargs)
|
|
|
|
| 108 |
_builtins.__import__ = _restricted_import
|
| 109 |
+
|
| 110 |
+
# Nullify dangerous builtins
|
| 111 |
+
for _b in _DANGEROUS_BUILTINS:
|
| 112 |
+
if _b not in ('setattr', 'getattr', 'delattr'):
|
| 113 |
+
_builtins.__dict__[_b] = None
|
| 114 |
+
|
| 115 |
+
# Clean up namespace gracefully
|
| 116 |
+
for _v in ["_ast", "_sys", "_builtins", "_source_to_check", "_tree", "_node", "_marker", "_marker_pos", "_b", "_orig_import", "_restricted_import"]:
|
| 117 |
+
if _v in locals():
|
| 118 |
+
del locals()[_v]
|
| 119 |
'''
|
| 120 |
|
| 121 |
|
|
|
|
| 125 |
|
| 126 |
Returns:
|
| 127 |
(output: str, timed_out: bool, execution_time_ms: int)
|
|
|
|
|
|
|
|
|
|
| 128 |
"""
|
| 129 |
# Build the blocked imports list, optionally allowing threading
|
| 130 |
blocked = [b for b in BLOCKED_IMPORTS if not (b == "threading" and allow_threading)]
|
| 131 |
|
| 132 |
+
# Build the full script: security prelude + user code + test code
|
| 133 |
+
prelude = _build_security_prelude(blocked)
|
| 134 |
+
full_script = prelude + "\n# --- USER CODE START ---\n" + code + "\n" + test_code
|
| 135 |
|
| 136 |
tmp_path = None
|
| 137 |
try:
|
|
|
|
| 164 |
except subprocess.TimeoutExpired:
|
| 165 |
elapsed_ms = int((time.time() - start_time) * 1000)
|
| 166 |
return (
|
| 167 |
+
f"TIMEOUT: Code execution exceeded {EXECUTION_TIMEOUT_SECONDS} second limit.",
|
| 168 |
True,
|
| 169 |
elapsed_ms
|
| 170 |
)
|
inference.py
CHANGED
|
@@ -19,8 +19,8 @@ from openai import OpenAI, APIError, RateLimitError, APIConnectionError, APITime
|
|
| 19 |
import requests
|
| 20 |
|
| 21 |
# ── Environment variables (never hardcode these) ──────────────────────────────
|
| 22 |
-
API_BASE_URL = os.environ.get("API_BASE_URL", "https://
|
| 23 |
-
MODEL_NAME = os.environ.get("MODEL_NAME", "
|
| 24 |
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY", "")
|
| 25 |
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
|
| 26 |
|
|
@@ -316,4 +316,4 @@ def main():
|
|
| 316 |
|
| 317 |
|
| 318 |
if __name__ == "__main__":
|
| 319 |
-
main()
|
|
|
|
| 19 |
import requests
|
| 20 |
|
| 21 |
# ── Environment variables (never hardcode these) ──────────────────────────────
|
| 22 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 23 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-70B-Instruct")
|
| 24 |
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY", "")
|
| 25 |
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
|
| 26 |
|
|
|
|
| 316 |
|
| 317 |
|
| 318 |
if __name__ == "__main__":
|
| 319 |
+
main()
|
openenv.yaml
CHANGED
|
@@ -46,14 +46,14 @@ tasks:
|
|
| 46 |
Thread-safe counter with a race condition invisible to sequential tests.
|
| 47 |
Agent must design a concurrent test to surface the bug, then fix it.
|
| 48 |
baseline:
|
| 49 |
-
model:
|
| 50 |
script: inference.py
|
| 51 |
mean_score: 0.51
|
| 52 |
scores:
|
| 53 |
easy: 0.85
|
| 54 |
medium: 0.50
|
| 55 |
hard: 0.18
|
| 56 |
-
author: Shashaank (GitHub: @shasshaank, HF: @shashaank0707)
|
| 57 |
# Submission Integrity: SHA 159a5faf82fc1ab3709f9674becf9a3ec55cf562 | Verified 2026-04-08
|
| 58 |
license: MIT
|
| 59 |
huggingface_space: shashaank0707/AgentDebugger-env
|
|
|
|
| 46 |
Thread-safe counter with a race condition invisible to sequential tests.
|
| 47 |
Agent must design a concurrent test to surface the bug, then fix it.
|
| 48 |
baseline:
|
| 49 |
+
model: meta-llama/Llama-3.1-70B-Instruct
|
| 50 |
script: inference.py
|
| 51 |
mean_score: 0.51
|
| 52 |
scores:
|
| 53 |
easy: 0.85
|
| 54 |
medium: 0.50
|
| 55 |
hard: 0.18
|
| 56 |
+
author: "Shashaank (GitHub: @shasshaank, HF: @shashaank0707)"
|
| 57 |
# Submission Integrity: SHA 159a5faf82fc1ab3709f9674becf9a3ec55cf562 | Verified 2026-04-08
|
| 58 |
license: MIT
|
| 59 |
huggingface_space: shashaank0707/AgentDebugger-env
|
validator.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
AgentDebuggerEnv — Pre-Submission Validator
|
| 4 |
+
============================================
|
| 5 |
+
Checks for all hard requirements of the Meta + HF Hackathon:
|
| 6 |
+
- Mandatory Environment Variables
|
| 7 |
+
- OpenEnv Spec Compliance (health, reset, step, state)
|
| 8 |
+
- Inference Script Format & Logging
|
| 9 |
+
- Dockerfile Correctness
|
| 10 |
+
- openenv.yaml Presence
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
import json
|
| 16 |
+
import requests
|
| 17 |
+
import yaml
|
| 18 |
+
import re
|
| 19 |
+
|
| 20 |
+
# ── Configuration ─────────────────────────────────────────────────────────────
|
| 21 |
+
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
|
| 22 |
+
API_BASE_URL = os.environ.get("API_BASE_URL")
|
| 23 |
+
MODEL_NAME = os.environ.get("MODEL_NAME")
|
| 24 |
+
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY")
|
| 25 |
+
|
| 26 |
+
class bcolors:
|
| 27 |
+
HEADER = '\033[95m'
|
| 28 |
+
OKBLUE = '\033[94m'
|
| 29 |
+
OKCYAN = '\033[96m'
|
| 30 |
+
OKGREEN = '\033[92m'
|
| 31 |
+
WARNING = '\033[93m'
|
| 32 |
+
FAIL = '\033[91m'
|
| 33 |
+
ENDC = '\033[0m'
|
| 34 |
+
BOLD = '\033[1m'
|
| 35 |
+
UNDERLINE = '\033[4m'
|
| 36 |
+
|
| 37 |
+
def log_success(msg): print(f"{bcolors.OKGREEN}✓ {msg}{bcolors.ENDC}")
|
| 38 |
+
def log_fail(msg): print(f"{bcolors.FAIL}✗ {msg}{bcolors.ENDC}")
|
| 39 |
+
def log_info(msg): print(f"{bcolors.OKBLUE}ℹ {msg}{bcolors.ENDC}")
|
| 40 |
+
|
| 41 |
+
def check_env_vars():
|
| 42 |
+
log_info("Checking Mandatory Environment Variables...")
|
| 43 |
+
missing = []
|
| 44 |
+
if not API_BASE_URL: missing.append("API_BASE_URL")
|
| 45 |
+
if not MODEL_NAME: missing.append("MODEL_NAME")
|
| 46 |
+
if not HF_TOKEN: missing.append("HF_TOKEN")
|
| 47 |
+
|
| 48 |
+
if missing:
|
| 49 |
+
log_fail(f"Missing env vars: {', '.join(missing)}")
|
| 50 |
+
return False
|
| 51 |
+
log_success("All mandatory env vars detected.")
|
| 52 |
+
return True
|
| 53 |
+
|
| 54 |
+
def check_yaml():
|
| 55 |
+
log_info("Checking openenv.yaml...")
|
| 56 |
+
if not os.path.exists("openenv.yaml"):
|
| 57 |
+
log_fail("openenv.yaml not found in root!")
|
| 58 |
+
return False
|
| 59 |
+
|
| 60 |
+
try:
|
| 61 |
+
with open("openenv.yaml", 'r') as f:
|
| 62 |
+
data = yaml.safe_load(f)
|
| 63 |
+
required = ["name", "version", "tasks", "baseline", "inference_script"]
|
| 64 |
+
for r in required:
|
| 65 |
+
if r not in data:
|
| 66 |
+
log_fail(f"openenv.yaml missing required field: {r}")
|
| 67 |
+
return False
|
| 68 |
+
log_success("openenv.yaml is valid.")
|
| 69 |
+
except Exception as e:
|
| 70 |
+
log_fail(f"Could not parse openenv.yaml: {e}")
|
| 71 |
+
return False
|
| 72 |
+
return True
|
| 73 |
+
|
| 74 |
+
def check_endpoints():
|
| 75 |
+
log_info(f"Checking Endpoints at {ENV_BASE_URL}...")
|
| 76 |
+
|
| 77 |
+
# 1. Health
|
| 78 |
+
try:
|
| 79 |
+
resp = requests.get(f"{ENV_BASE_URL}/health", timeout=5)
|
| 80 |
+
if resp.status_code == 200:
|
| 81 |
+
log_success("/health -> 200 OK")
|
| 82 |
+
else:
|
| 83 |
+
log_fail(f"/health -> {resp.status_code}")
|
| 84 |
+
return False
|
| 85 |
+
except Exception as e:
|
| 86 |
+
log_fail(f"Could not connect to /health: {e}")
|
| 87 |
+
return False
|
| 88 |
+
|
| 89 |
+
# 2. Reset
|
| 90 |
+
try:
|
| 91 |
+
resp = requests.post(f"{ENV_BASE_URL}/reset", json={"task_id": "easy"}, timeout=5)
|
| 92 |
+
if resp.status_code == 200:
|
| 93 |
+
log_success("/reset -> 200 OK")
|
| 94 |
+
else:
|
| 95 |
+
log_fail(f"/reset -> {resp.status_code}")
|
| 96 |
+
return False
|
| 97 |
+
except Exception as e:
|
| 98 |
+
log_fail(f"Could not connect to /reset: {e}")
|
| 99 |
+
return False
|
| 100 |
+
|
| 101 |
+
return True
|
| 102 |
+
|
| 103 |
+
def check_inference_script():
|
| 104 |
+
log_info("Checking inference.py...")
|
| 105 |
+
if not os.path.exists("inference.py"):
|
| 106 |
+
log_fail("inference.py not found in root!")
|
| 107 |
+
return False
|
| 108 |
+
|
| 109 |
+
with open("inference.py", 'r') as f:
|
| 110 |
+
content = f.read()
|
| 111 |
+
|
| 112 |
+
# Check for [START], [STEP], [END]
|
| 113 |
+
patterns = {
|
| 114 |
+
"[START]": r"\[START\] task=",
|
| 115 |
+
"[STEP]": r"\[STEP .+\] Action:",
|
| 116 |
+
"[END]": r"\[END\] task=.* score=.* steps="
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
for label, pattern in patterns.items():
|
| 120 |
+
if not re.search(pattern, content):
|
| 121 |
+
log_fail(f"inference.py missing log tag/format: {label}")
|
| 122 |
+
return False
|
| 123 |
+
|
| 124 |
+
if "OpenAI" not in content or "client.chat.completions.create" not in content:
|
| 125 |
+
log_fail("inference.py does not appear to use the OpenAI client library.")
|
| 126 |
+
return False
|
| 127 |
+
|
| 128 |
+
log_success("inference.py logging and client usage look correct.")
|
| 129 |
+
return True
|
| 130 |
+
|
| 131 |
+
def main():
|
| 132 |
+
print(f"{bcolors.HEADER}{bcolors.BOLD}AgentDebuggerEnv Compliance Validator{bcolors.ENDC}")
|
| 133 |
+
print("=" * 45)
|
| 134 |
+
|
| 135 |
+
success = True
|
| 136 |
+
success &= check_env_vars()
|
| 137 |
+
success &= check_yaml()
|
| 138 |
+
success &= check_inference_script()
|
| 139 |
+
|
| 140 |
+
# Endpoints check is optional if server isn't running locally
|
| 141 |
+
try:
|
| 142 |
+
if not check_endpoints():
|
| 143 |
+
log_info("Skipping further endpoint checks as server is unreachable.")
|
| 144 |
+
except:
|
| 145 |
+
pass
|
| 146 |
+
|
| 147 |
+
print("=" * 45)
|
| 148 |
+
if success:
|
| 149 |
+
print(f"{bcolors.OKGREEN}{bcolors.BOLD}VALIDATION PASSED! Ready for submission.{bcolors.ENDC}")
|
| 150 |
+
else:
|
| 151 |
+
print(f"{bcolors.FAIL}{bcolors.BOLD}VALIDATION FAILED. Please fix the errors above.{bcolors.ENDC}")
|
| 152 |
+
sys.exit(1)
|
| 153 |
+
|
| 154 |
+
if __name__ == "__main__":
|
| 155 |
+
main()
|