shank commited on
Commit Β·
0ee66d2
1
Parent(s): 6318243
complete project
Browse files- env/__pycache__/__init__.cpython-310.pyc +0 -0
- env/__pycache__/__init__.cpython-313.pyc +0 -0
- env/__pycache__/environment.cpython-310.pyc +0 -0
- env/__pycache__/environment.cpython-313.pyc +0 -0
- env/__pycache__/models.cpython-310.pyc +0 -0
- env/__pycache__/models.cpython-313.pyc +0 -0
- env/__pycache__/sandbox.cpython-310.pyc +0 -0
- env/environment.py +511 -0
- env/graders/__init__.py +17 -1
- env/graders/__pycache__/__init__.cpython-310.pyc +0 -0
- env/graders/__pycache__/base_grader.cpython-310.pyc +0 -0
- env/graders/__pycache__/grader_easy.cpython-310.pyc +0 -0
- env/graders/__pycache__/grader_hard.cpython-310.pyc +0 -0
- env/graders/__pycache__/grader_medium.cpython-310.pyc +0 -0
- env/graders/base_grader.py +54 -0
- env/graders/grader_easy.py +51 -0
- env/graders/grader_hard.py +100 -0
- env/graders/grader_medium.py +72 -0
- env/models.py +71 -0
- env/sandbox.py +1 -1
- env/server.py +92 -0
- env/tasks/__init__.py +2 -1
- env/tasks/__pycache__/__init__.cpython-310.pyc +0 -0
- env/tasks/__pycache__/registry.cpython-310.pyc +0 -0
- env/tasks/__pycache__/task_easy.cpython-310.pyc +0 -0
- env/tasks/__pycache__/task_hard.cpython-310.pyc +0 -0
- env/tasks/__pycache__/task_medium.cpython-310.pyc +0 -0
- env/tasks/registry.py +27 -0
- inference.py +239 -0
- openenv.yaml +61 -0
- requirements.txt +1 -1
- tests/__pycache__/__init__.cpython-310.pyc +0 -0
- tests/__pycache__/test_environment.cpython-310-pytest-8.1.0.pyc +0 -0
- tests/__pycache__/test_graders.cpython-310-pytest-8.1.0.pyc +0 -0
- tests/__pycache__/test_sandbox.cpython-310-pytest-8.1.0.pyc +0 -0
- tests/test_environment.py +229 -0
- tests/test_graders.py +157 -0
- tests/test_sandbox.py +3 -4
env/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (152 Bytes). View file
|
|
|
env/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (156 Bytes). View file
|
|
|
env/__pycache__/environment.cpython-310.pyc
ADDED
|
Binary file (11.7 kB). View file
|
|
|
env/__pycache__/environment.cpython-313.pyc
ADDED
|
Binary file (19.3 kB). View file
|
|
|
env/__pycache__/models.cpython-310.pyc
ADDED
|
Binary file (2.08 kB). View file
|
|
|
env/__pycache__/models.cpython-313.pyc
ADDED
|
Binary file (2.59 kB). View file
|
|
|
env/__pycache__/sandbox.cpython-310.pyc
ADDED
|
Binary file (4.69 kB). View file
|
|
|
env/environment.py
ADDED
|
@@ -0,0 +1,511 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AgentDebuggerEnv β Core Environment
|
| 3 |
+
=====================================
|
| 4 |
+
OpenEnv-compliant environment with reset(), step(), state() methods.
|
| 5 |
+
Manages the full debugging episode lifecycle.
|
| 6 |
+
|
| 7 |
+
NEVER crashes β all errors are returned in info["error"].
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
import math
|
| 12 |
+
from typing import Dict, Any, Optional, Tuple
|
| 13 |
+
|
| 14 |
+
from env.models import Observation, Action, Reward, FixAttempt
|
| 15 |
+
from env.sandbox import execute_code
|
| 16 |
+
from env.tasks.registry import get_task, list_tasks
|
| 17 |
+
from env.graders import get_grader
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class DebuggerEnvironment:
    """Core debugging environment implementing the OpenEnv interface.

    Episode lifecycle: reset() -> repeated step() -> done.
    step() never crashes on agent mistakes -- errors are reported in
    info["error"] so the caller can keep driving the episode.
    """

    def __init__(self):
        self._task_config: Optional[dict] = None
        self._observation: Optional[Observation] = None
        self._cumulative_reward: float = 0.0
        self._attempts_used: int = 0
        self._best_tests_passed: int = 0
        self._all_hypotheses: list[str] = []
        self._all_attempts: list[dict] = []
        self._queries_used: int = 0
        # A fresh environment counts as "done" until reset() is called.
        self._done: bool = True
        self._step_number: int = 0
        self._prev_tests_passed: int = 0

    def reset(self, task_id: str) -> dict:
        """
        Start a fresh episode. Clears all state.
        Returns the initial Observation as a dict.

        Raises:
            ValueError: if task_id is unknown (propagated from get_task).
        """
        # get_task raises ValueError for an unknown task_id; let it propagate
        # directly instead of catching and re-wrapping the same exception,
        # which only discarded the original traceback context.
        task_config = get_task(task_id)

        self._task_config = task_config
        self._cumulative_reward = 0.0
        self._attempts_used = 0
        self._best_tests_passed = 0
        self._all_hypotheses = []
        self._all_attempts = []
        self._queries_used = 0
        self._done = False
        self._step_number = 0

        # Run buggy code through sandbox to get initial error output
        buggy_code = task_config["buggy_code"]
        test_executable = task_config["test_suite"] + "\n\n" + task_config["test_suite_executable"]
        allow_threading = task_config.get("allow_threading", False)

        initial_output, timed_out, exec_time = execute_code(
            buggy_code, test_executable, allow_threading=allow_threading
        )

        # Parse initial test results
        initial_passed = self._parse_tests_passed(initial_output, task_config["tests_total"])
        self._prev_tests_passed = initial_passed
        self._best_tests_passed = initial_passed

        self._observation = Observation(
            task_id=task_id,
            task_description=task_config["task_description"],
            buggy_code=buggy_code,
            test_suite=task_config["test_suite"],
            initial_error_output=initial_output,
            current_code=buggy_code,
            current_error_output=initial_output,
            tests_passed=initial_passed,
            tests_total=task_config["tests_total"],
            previous_attempts=[],
            attempts_remaining=task_config["max_attempts"],
            max_attempts=task_config["max_attempts"],
            step_number=0,
            max_steps=task_config["max_steps"],
            done=False,
            score_estimate=0.0,
            hint_used=False,
        )

        return self._observation.model_dump()

    def step(self, action: Action) -> Dict[str, Any]:
        """
        Process one action. Returns {observation, reward, done, info}.
        Never crashes -- errors go in info["error"].
        """
        # Safety: if episode is already done, return current state
        if self._done:
            return self._make_response(
                step_reward=0.0,
                info={"error": "Episode is already done. Call /reset to start a new episode."},
            )

        # Increment step
        self._step_number += 1

        # Check max_steps exceeded
        if self._step_number > self._task_config["max_steps"]:
            return self._force_truncation()

        action_type = action.action_type

        if action_type == "submit_fix":
            return self._handle_submit_fix(action)
        elif action_type == "query_context":
            return self._handle_query_context(action)
        elif action_type == "give_up":
            return self._handle_give_up(action)
        else:
            return self._make_response(
                step_reward=-0.05,
                info={"error": f"Unknown action_type: '{action_type}'. Use 'submit_fix', 'query_context', or 'give_up'."},
            )

    def state(self) -> dict:
        """Return the full internal environment state as a plain dict."""
        if self._observation is None:
            return {
                "task_id": None,
                "step_number": 0,
                "attempts_used": 0,
                "current_tests_passed": 0,
                "current_tests_total": 0,
                "best_tests_passed": 0,
                "all_hypotheses": [],
                "cumulative_reward": 0.0,
                "done": True,
                "hint_used": False,
            }

        return {
            "task_id": self._observation.task_id,
            "step_number": self._step_number,
            "attempts_used": self._attempts_used,
            "current_tests_passed": self._observation.tests_passed,
            "current_tests_total": self._observation.tests_total,
            "best_tests_passed": self._best_tests_passed,
            "all_hypotheses": list(self._all_hypotheses),
            "cumulative_reward": self._cumulative_reward,
            "done": self._done,
            "hint_used": self._observation.hint_used,
        }

    # -- Action Handlers ------------------------------------------------------

    def _handle_submit_fix(self, action: Action) -> Dict[str, Any]:
        """Handle submit_fix action: run the fix in the sandbox and reward progress."""
        # Check: hypothesis is required
        if not action.hypothesis or not action.hypothesis.strip():
            return self._make_response(
                step_reward=-0.10,
                info={"error": "submit_fix requires a 'hypothesis' field. Fix was NOT executed."},
                count_step=True,
            )

        # Check: attempts remaining
        if self._observation.attempts_remaining <= 0:
            return self._make_response(
                step_reward=-0.15,
                info={"error": "No attempts remaining. Use 'query_context' or 'give_up'."},
                count_step=True,
            )

        # Get submitted code
        fixed_code = action.fixed_code or ""
        hypothesis = action.hypothesis.strip()
        self._all_hypotheses.append(hypothesis)
        self._attempts_used += 1

        # Execute in sandbox
        test_executable = self._task_config["test_suite"] + "\n\n" + self._task_config["test_suite_executable"]
        allow_threading = self._task_config.get("allow_threading", False)
        output, timed_out, exec_time = execute_code(
            fixed_code, test_executable, allow_threading=allow_threading
        )

        # Parse test results
        tests_total = self._task_config["tests_total"]
        tests_passed = self._parse_tests_passed(output, tests_total)

        # Update best
        self._best_tests_passed = max(self._best_tests_passed, tests_passed)

        # Calculate step reward
        step_reward = self._calculate_step_reward(
            tests_passed, tests_total, timed_out, hypothesis
        )

        # Record attempt
        attempt = FixAttempt(
            attempt_number=self._attempts_used,
            code_submitted=fixed_code,
            hypothesis=hypothesis,
            execution_output=output,
            tests_passed=tests_passed,
            tests_total=tests_total,
            execution_time_ms=exec_time,
            timed_out=timed_out,
        )
        self._all_attempts.append(attempt.model_dump())

        # Update observation
        attempts_remaining = self._task_config["max_attempts"] - self._attempts_used
        self._observation = self._observation.model_copy(update={
            "current_code": fixed_code,
            "current_error_output": output,
            "tests_passed": tests_passed,
            "previous_attempts": [FixAttempt(**a) for a in self._all_attempts],
            "attempts_remaining": attempts_remaining,
            "step_number": self._step_number,
            "score_estimate": self._estimate_score(),
        })
        self._prev_tests_passed = tests_passed

        # Check if solved
        all_pass = tests_passed == tests_total
        info = {
            "step_number": self._step_number,
            "attempts_used": self._attempts_used,
            "attempts_remaining": attempts_remaining,
            "tests_passed": tests_passed,
            "tests_total": tests_total,
            "hypothesis_matched_bug": None,
            "query_result": None,
            "error": None,
            "execution_time_ms": exec_time,
            "timed_out": timed_out,
        }

        if all_pass:
            # Episode solved!
            step_reward += 0.50  # Major bonus
            return self._end_episode(step_reward, info)

        # Check if out of attempts
        if attempts_remaining <= 0:
            return self._end_episode(step_reward, info)

        return self._make_response(step_reward=step_reward, info=info, count_step=True)

    def _handle_query_context(self, action: Action) -> Dict[str, Any]:
        """Handle query_context action: answer a context question about the task."""
        valid_query_types = ["function_signature", "related_code", "error_explanation", "test_details"]

        if action.query_type not in valid_query_types:
            return self._make_response(
                step_reward=-0.05,
                info={
                    "error": f"Invalid query_type: '{action.query_type}'. Valid: {valid_query_types}",
                    "query_result": None,
                },
                count_step=True,
            )

        # Generate context response
        query_result = self._generate_query_response(action.query_type, action.query_target)

        # First query is free, subsequent cost -0.05
        if self._queries_used == 0:
            step_reward = 0.0
            self._observation = self._observation.model_copy(update={
                "hint_used": True,
                "step_number": self._step_number,
            })
        else:
            step_reward = -0.05

        self._queries_used += 1

        info = {
            "step_number": self._step_number,
            "attempts_used": self._attempts_used,
            "attempts_remaining": self._observation.attempts_remaining,
            "tests_passed": self._observation.tests_passed,
            "tests_total": self._observation.tests_total,
            "hypothesis_matched_bug": None,
            "query_result": query_result,
            "error": None,
            "execution_time_ms": None,
            "timed_out": False,
        }

        return self._make_response(step_reward=step_reward, info=info, count_step=True)

    def _handle_give_up(self, action: Action) -> Dict[str, Any]:
        """Handle give_up action. Ends episode, runs grader."""
        if action.final_diagnosis:
            self._all_hypotheses.append(action.final_diagnosis)

        info = {
            "step_number": self._step_number,
            "attempts_used": self._attempts_used,
            "attempts_remaining": self._observation.attempts_remaining,
            "tests_passed": self._observation.tests_passed,
            "tests_total": self._observation.tests_total,
            "hypothesis_matched_bug": None,
            "query_result": None,
            "error": None,
            "execution_time_ms": None,
            "timed_out": False,
        }
        return self._end_episode(step_reward=0.0, info=info)

    # -- Internal Helpers -----------------------------------------------------

    def _calculate_step_reward(
        self, tests_passed: int, tests_total: int, timed_out: bool, hypothesis: str
    ) -> float:
        """Calculate the step-level reward for a fix attempt.

        Progress (more tests passing) is rewarded, regression penalized,
        stagnation mildly penalized; timeouts add a flat penalty.
        `hypothesis` is currently unused here but kept for interface stability.
        """
        reward = 0.0
        prev = self._prev_tests_passed

        if timed_out:
            reward -= 0.10

        if tests_passed > prev:
            # Progress reward
            reward += 0.15 * (tests_passed - prev) / tests_total
        elif tests_passed < prev:
            # Regression penalty
            reward -= 0.10 * (prev - tests_passed) / tests_total
        else:
            # Stagnation
            reward -= 0.05

        return reward

    def _end_episode(self, step_reward: float, info: dict) -> Dict[str, Any]:
        """End the episode, run grader, return final response."""
        self._done = True

        # Run grader
        grader = get_grader(self._task_config["task_id"])
        grader_score = grader.score(
            task_config=self._task_config,
            attempts=self._all_attempts,
            best_tests_passed=self._best_tests_passed,
            tests_total=self._task_config["tests_total"],
            attempts_used=self._attempts_used,
            max_attempts=self._task_config["max_attempts"],
            hypotheses=self._all_hypotheses,
        )

        # Check hypothesis accuracy for info
        ground_truth = self._task_config["ground_truth"]
        keywords = ground_truth["hypothesis_keywords"]
        if self._all_hypotheses:
            any_match = any(
                any(kw.lower() in h.lower() for kw in keywords)
                for h in self._all_hypotheses
            )
            info["hypothesis_matched_bug"] = any_match

        self._observation = self._observation.model_copy(update={
            "done": True,
            "step_number": self._step_number,
            "score_estimate": grader_score,
        })

        return self._make_response(
            step_reward=step_reward,
            info=info,
            grader_score=grader_score,
            force_done=True,
        )

    def _force_truncation(self) -> Dict[str, Any]:
        """Force episode end due to max_steps exceeded."""
        info = {
            "step_number": self._step_number,
            "attempts_used": self._attempts_used,
            "attempts_remaining": self._observation.attempts_remaining,
            "tests_passed": self._observation.tests_passed,
            "tests_total": self._observation.tests_total,
            "hypothesis_matched_bug": None,
            "query_result": None,
            "error": "Max steps exceeded. Episode truncated.",
            "execution_time_ms": None,
            "timed_out": False,
        }
        return self._end_episode(step_reward=-0.20, info=info)

    def _make_response(
        self,
        step_reward: float,
        info: dict,
        grader_score: float = 0.0,
        force_done: bool = False,
        count_step: bool = False,
    ) -> Dict[str, Any]:
        """Build the standard step response dict."""
        self._cumulative_reward += step_reward

        # Update observation step number
        if self._observation:
            self._observation = self._observation.model_copy(update={
                "step_number": self._step_number,
                "done": force_done or self._done,
            })

        # Fill in any missing info fields with defaults. setdefault leaves
        # caller-supplied values (including explicit None) untouched.
        # (The previous version wrapped this in a no-op `if ...: pass`
        # conditional -- dead code, removed.)
        default_info = {
            "step_number": self._step_number,
            "attempts_used": self._attempts_used,
            "attempts_remaining": self._observation.attempts_remaining if self._observation else 0,
            "tests_passed": self._observation.tests_passed if self._observation else 0,
            "tests_total": self._observation.tests_total if self._observation else 0,
            "hypothesis_matched_bug": None,
            "query_result": None,
            "error": None,
            "execution_time_ms": None,
            "timed_out": False,
        }
        for k, v in default_info.items():
            info.setdefault(k, v)

        reward = Reward(
            step_reward=step_reward,
            cumulative_reward=self._cumulative_reward,
            grader_score=grader_score,
            breakdown={
                "step_reward": step_reward,
                "cumulative_reward": self._cumulative_reward,
            },
        )

        return {
            "observation": self._observation.model_dump() if self._observation else {},
            "reward": reward.model_dump(),
            "done": force_done or self._done,
            "info": info,
        }

    def _estimate_score(self) -> float:
        """Running estimate of what the grader would return right now."""
        if not self._task_config:
            return 0.0
        tests_total = self._task_config["tests_total"]
        if tests_total == 0:
            return 0.0
        return (self._best_tests_passed / tests_total) * 0.60

    def _parse_tests_passed(self, output: str, tests_total: int) -> int:
        """Parse the number of tests passed from sandbox output."""
        # Look for pattern like "7 passed, 1 failed" or "8 passed, 0 failed"
        match = re.search(r'(\d+)\s+passed', output)
        if match:
            return min(int(match.group(1)), tests_total)
        # If no match, assume 0
        return 0

    def _generate_query_response(self, query_type: str, query_target: Optional[str] = None) -> str:
        """Generate a context response for a query_context action."""
        task = self._task_config
        buggy_code = task["buggy_code"]
        test_suite = task["test_suite"]
        ground_truth = task["ground_truth"]

        if query_type == "function_signature":
            # Extract function signatures from buggy code
            lines = buggy_code.split('\n')
            sigs = [line.strip() for line in lines if line.strip().startswith('def ')]
            if query_target:
                sigs = [s for s in sigs if query_target in s] or sigs
            return "Function signatures:\n" + "\n".join(f"  {s}" for s in sigs)

        elif query_type == "related_code":
            # Return the full buggy code
            return f"Full source code:\n{buggy_code}"

        elif query_type == "error_explanation":
            # Return the current error output with context
            current_error = self._observation.current_error_output if self._observation else ""
            return (
                f"Current error output:\n{current_error}\n\n"
                f"This output shows the result of running the test suite against "
                f"the current version of the code. Failed tests indicate assertions "
                f"that did not hold."
            )

        elif query_type == "test_details":
            # Return specific test details
            if query_target:
                lines = test_suite.split('\n')
                relevant = []
                in_test = False
                for line in lines:
                    if f"def {query_target}" in line or (query_target in line and 'def test_' in line):
                        in_test = True
                    if in_test:
                        relevant.append(line)
                        if line.strip() == '' and len(relevant) > 1:
                            break
                if relevant:
                    return f"Test details for '{query_target}':\n" + "\n".join(relevant)

            return f"Full test suite:\n{test_suite}"

        return "No information available for this query."
|
env/graders/__init__.py
CHANGED
|
@@ -1 +1,17 @@
|
|
| 1 |
-
# AgentDebuggerEnv -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AgentDebuggerEnv - Graders package
|
| 2 |
+
from env.graders.grader_easy import EasyGrader
|
| 3 |
+
from env.graders.grader_medium import MediumGrader
|
| 4 |
+
from env.graders.grader_hard import HardGrader
|
| 5 |
+
|
| 6 |
+
GRADER_REGISTRY = {
|
| 7 |
+
"easy": EasyGrader(),
|
| 8 |
+
"medium": MediumGrader(),
|
| 9 |
+
"hard": HardGrader(),
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def get_grader(task_id: str):
    """Look up and return the grader registered for *task_id*.

    Raises:
        ValueError: if no grader is registered under the given id.
    """
    grader = GRADER_REGISTRY.get(task_id)
    if grader is None:
        raise ValueError(f"No grader for task_id: '{task_id}'")
    return grader
|
env/graders/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (644 Bytes). View file
|
|
|
env/graders/__pycache__/base_grader.cpython-310.pyc
ADDED
|
Binary file (2.51 kB). View file
|
|
|
env/graders/__pycache__/grader_easy.cpython-310.pyc
ADDED
|
Binary file (1.74 kB). View file
|
|
|
env/graders/__pycache__/grader_hard.cpython-310.pyc
ADDED
|
Binary file (3.13 kB). View file
|
|
|
env/graders/__pycache__/grader_medium.cpython-310.pyc
ADDED
|
Binary file (2.72 kB). View file
|
|
|
env/graders/base_grader.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Base Grader β Abstract base class for all graders.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from abc import ABC, abstractmethod
|
| 6 |
+
from typing import List, Dict, Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class BaseGrader(ABC):
    """Abstract base class shared by all graders.

    Subclasses implement score(), which must be deterministic and return
    a float in [0.0, 1.0]. Keyword matching and clamping helpers are
    provided here for reuse.
    """

    @abstractmethod
    def score(
        self,
        task_config: dict,
        attempts: List[Dict[str, Any]],
        best_tests_passed: int,
        tests_total: int,
        attempts_used: int,
        max_attempts: int,
        hypotheses: List[str],
    ) -> float:
        """
        Score an episode. Must return a float in [0.0, 1.0].
        Must be deterministic: same inputs -> same output.

        Args:
            task_config: The full task config dict
            attempts: List of attempt dicts with code_submitted, hypothesis, tests_passed, etc.
            best_tests_passed: Best test pass count across all attempts
            tests_total: Total tests in the suite
            attempts_used: Number of fix attempts used
            max_attempts: Maximum allowed attempts
            hypotheses: All hypotheses submitted

        Returns:
            float in [0.0, 1.0]
        """
        pass

    def _check_hypothesis_keywords(
        self, hypothesis: str, keywords: List[str], mode: str = "any"
    ) -> bool:
        """Case-insensitively test *hypothesis* against ground-truth *keywords*.

        mode="any": at least one keyword appears; mode="all": every keyword
        appears; any other mode yields False.
        """
        text = hypothesis.lower()
        if mode == "any":
            return any(kw.lower() in text for kw in keywords)
        if mode == "all":
            return all(kw.lower() in text for kw in keywords)
        return False

    def _clamp(self, value: float) -> float:
        """Clamp *value* into the closed interval [0.0, 1.0]."""
        return min(1.0, max(0.0, value))
|
env/graders/grader_easy.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader Easy β Standard scoring formula for the binary search task.
|
| 3 |
+
Formula: 0.60 test_pass_ratio + 0.20 efficiency + 0.15 hypothesis + 0.05 early_solve
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
from typing import List, Dict, Any
|
| 8 |
+
from env.graders.base_grader import BaseGrader
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class EasyGrader(BaseGrader):
    """Grader for the easy (binary search) task.

    Weighted blend: 0.60 test-pass ratio + 0.20 efficiency
    + 0.15 hypothesis accuracy + 0.05 early-solve bonus.
    """

    def score(
        self,
        task_config: dict,
        attempts: List[Dict[str, Any]],
        best_tests_passed: int,
        tests_total: int,
        attempts_used: int,
        max_attempts: int,
        hypotheses: List[str],
    ) -> float:
        keywords = task_config["ground_truth"]["hypothesis_keywords"]

        # 1. Test pass ratio (weight: 0.60)
        pass_ratio = best_tests_passed / tests_total if tests_total > 0 else 0.0

        # 2. Efficiency bonus (weight: 0.20) -- fraction of attempts left unused
        if max_attempts > 0:
            efficiency = max(0.0, (max_attempts - attempts_used) / max_attempts)
        else:
            efficiency = 0.0

        # 3. Hypothesis accuracy (weight: 0.15) -- share of hypotheses hitting
        #    at least one ground-truth keyword
        if hypotheses:
            matched = [
                h for h in hypotheses
                if self._check_hypothesis_keywords(h, keywords, "any")
            ]
            hypothesis_ratio = len(matched) / len(hypotheses)
        else:
            hypothesis_ratio = 0.0

        # 4. Early solve bonus (weight: 0.05) -- all tests pass within the
        #    first third of the attempt budget
        solved = best_tests_passed == tests_total
        early = solved and attempts_used <= math.ceil(max_attempts / 3)

        total = (
            0.60 * pass_ratio
            + 0.20 * efficiency
            + 0.15 * hypothesis_ratio
            + (0.05 if early else 0.0)
        )
        return self._clamp(total)
|
env/graders/grader_hard.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader Hard β Concurrent stress test scoring.
|
| 3 |
+
Custom weights:
|
| 4 |
+
0.40 β original 8 tests pass
|
| 5 |
+
0.30 β concurrent stress test (1000 threads)
|
| 6 |
+
0.20 β hypothesis accuracy
|
| 7 |
+
0.10 β efficiency bonus (solved within 5 attempts)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import threading
|
| 11 |
+
from typing import List, Dict, Any
|
| 12 |
+
from env.graders.base_grader import BaseGrader
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class HardGrader(BaseGrader):
    """Grader for the hard (concurrency race condition) task.

    Weights:
        0.40 - pass ratio on the original test suite
        0.30 - 1000-thread concurrent stress test (run 3x on the best attempt)
        0.20 - hypothesis keyword accuracy
        0.10 - efficiency bonus (solved within 5 attempts)
    """

    def _run_concurrent_stress_test(self, code: str) -> bool:
        """
        Run a 1000-thread concurrent stress test against the submitted code.
        Returns True if the counter ends at exactly 1000 after 1000 concurrent increments.
        """
        try:
            # SECURITY NOTE: exec() runs agent-submitted code with this
            # process's privileges. Acceptable here only if submissions have
            # already passed the sandbox layer -- confirm before reusing this
            # grader outside that pipeline.
            namespace = {}
            exec(code, namespace)

            # The task contract requires the fix to define ConnectionCounter.
            CounterClass = namespace.get("ConnectionCounter")
            if CounterClass is None:
                return False

            counter = CounterClass()
            num_threads = 1000

            threads = [
                threading.Thread(target=counter.increment)
                for _ in range(num_threads)
            ]
            for t in threads:
                t.start()
            for t in threads:
                # NOTE(review): join(timeout=10) returns even if the thread is
                # still alive, so a deadlocked submission falls through to the
                # count check below instead of hanging the grader.
                t.join(timeout=10)

            # A lost update (the race) shows up as a final count < num_threads.
            return counter.get_count() == num_threads
        except Exception:
            # Any crash raised by agent code counts as a failed stress test.
            return False

    def score(
        self,
        task_config: dict,
        attempts: List[Dict[str, Any]],
        best_tests_passed: int,
        tests_total: int,
        attempts_used: int,
        max_attempts: int,
        hypotheses: List[str],
    ) -> float:
        """Combine the four weighted components into a clamped [0, 1] score."""
        ground_truth = task_config["ground_truth"]
        keywords = ground_truth["hypothesis_keywords"]

        # 1. Original tests pass (weight: 0.40)
        test_pass_ratio = (best_tests_passed / tests_total) if tests_total > 0 else 0.0
        original_test_score = test_pass_ratio * 0.40

        # 2. Concurrent stress test (weight: 0.30)
        # Use the best attempt's code (highest tests_passed, ties broken by
        # the latest attempt_number).
        concurrent_score = 0.0
        if attempts:
            best_attempt = max(
                attempts,
                key=lambda a: (a.get("tests_passed", 0), a.get("attempt_number", 0))
            )
            best_code = best_attempt.get("code_submitted", "")
            if best_code:
                # Run the stress test 3 times -- races are nondeterministic,
                # so full credit requires all 3 runs to pass.
                passes = sum(
                    1 for _ in range(3)
                    if self._run_concurrent_stress_test(best_code)
                )
                if passes == 3:
                    concurrent_score = 0.30
                elif passes >= 1:
                    concurrent_score = 0.15  # Partial credit: fix is inconsistent across runs
        # 3. Hypothesis accuracy (weight: 0.20)
        if hypotheses:
            matches = sum(
                1 for h in hypotheses
                if self._check_hypothesis_keywords(h, keywords, "any")
            )
            hypothesis_ratio = matches / len(hypotheses)
        else:
            hypothesis_ratio = 0.0
        hypothesis_score = hypothesis_ratio * 0.20

        # 4. Efficiency bonus (weight: 0.10) -- all-or-nothing, unlike the
        # proportional efficiency term used by the easier graders.
        efficiency_score = 0.10 if attempts_used <= 5 else 0.0

        total = original_test_score + concurrent_score + hypothesis_score + efficiency_score
        return self._clamp(total)
|
env/graders/grader_medium.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grader Medium β Scoring with red herring detection.
|
| 3 |
+
Same base formula as easy, but with special hypothesis logic:
|
| 4 |
+
- Hypothesis mentioning ONLY "authenticate_user" scores 0.0 for hypothesis_accuracy
|
| 5 |
+
- Must mention "hash_password" AND at least 1 other keyword to get full marks
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import math
|
| 9 |
+
from typing import List, Dict, Any
|
| 10 |
+
from env.graders.base_grader import BaseGrader
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class MediumGrader(BaseGrader):
    """Scoring for the medium task, with red-herring detection.

    Same weighted formula as the easy grader (0.60 tests + 0.20 efficiency
    + 0.15 hypothesis + 0.05 early solve), but each hypothesis is scored
    individually:
      * "hash_password" plus at least one other expected keyword -> 1.0
      * "hash_password" alone -> 0.5
      * only the red herring ("authenticate_user" by default) -> 0.0
      * anything else -> 0.1
    """

    def _score_hypothesis(self, hypothesis: str, ground_truth: dict) -> float:
        """Score one hypothesis string in [0.0, 1.0] with red-herring logic."""
        text = hypothesis.lower()
        expected = ground_truth["hypothesis_keywords"]
        decoy = ground_truth.get("red_herring_keyword", "authenticate_user").lower()

        names_target = "hash_password" in text
        names_decoy = decoy in text
        # Expected keywords other than the target function itself.
        extras = [kw.lower() for kw in expected if kw.lower() != "hash_password"]
        names_extra = any(kw in text for kw in extras)

        if names_target:
            # Full credit needs the right function plus supporting detail.
            return 1.0 if names_extra else 0.5
        if names_decoy:
            return 0.0  # Followed the red herring.
        return 0.1  # Generic hypothesis: neither correct nor misled.

    def score(
        self,
        task_config: dict,
        attempts: List[Dict[str, Any]],
        best_tests_passed: int,
        tests_total: int,
        attempts_used: int,
        max_attempts: int,
        hypotheses: List[str],
    ) -> float:
        """Return the weighted episode score, clamped to [0.0, 1.0]."""
        ground_truth = task_config["ground_truth"]

        # Tests (0.60): best fraction of the suite the agent got passing.
        pass_fraction = best_tests_passed / tests_total if tests_total > 0 else 0.0

        # Efficiency (0.20): unused share of the attempt budget.
        if max_attempts > 0:
            budget_left = max(0.0, (max_attempts - attempts_used) / max_attempts)
        else:
            budget_left = 0.0

        # Hypotheses (0.15): mean per-hypothesis score with red-herring logic.
        if hypotheses:
            per_hypothesis = [self._score_hypothesis(h, ground_truth) for h in hypotheses]
            hypothesis_mean = sum(per_hypothesis) / len(per_hypothesis)
        else:
            hypothesis_mean = 0.0

        # Early solve (0.05): full pass within the first third of the budget.
        solved = best_tests_passed == tests_total
        if solved and attempts_used <= math.ceil(max_attempts / 3):
            early_bonus = 0.05
        else:
            early_bonus = 0.0

        return self._clamp(
            0.60 * pass_fraction
            + 0.20 * budget_left
            + 0.15 * hypothesis_mean
            + early_bonus
        )
|
env/models.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AgentDebuggerEnv β Pydantic Data Models
|
| 3 |
+
========================================
|
| 4 |
+
All models are Pydantic v2 BaseModel subclasses with exact field names
|
| 5 |
+
required by the OpenEnv spec and hackathon validation pipeline.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class FixAttempt(BaseModel):
    """Record of a single submit_fix attempt within one episode."""

    attempt_number: int  # 1-indexed attempt number this episode
    code_submitted: str  # The full code the agent submitted for this attempt
    hypothesis: str  # Agent's stated hypothesis about the bug before this attempt
    execution_output: str  # Full stdout + stderr from running the test suite
    tests_passed: int  # Number of tests that passed after this fix
    tests_total: int  # Total number of tests in the suite
    execution_time_ms: int  # How long the sandbox took to run (milliseconds)
    timed_out: bool  # Whether this attempt hit the 10-second sandbox timeout
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class Observation(BaseModel):
    """Full environment observation returned on reset() and every step()."""

    # Task context -- fixed for the episode
    task_id: str  # "easy" | "medium" | "hard"
    task_description: str  # Plain English description of what the code is supposed to do
    buggy_code: str  # The original broken code (shown once at reset, always available)
    test_suite: str  # The full test suite code
    initial_error_output: str  # Output of running the test suite against the buggy code at reset()

    # Dynamic state -- changes each step
    current_code: str  # The most recent version of the code
    current_error_output: str  # Output of running tests against current_code
    tests_passed: int  # Tests passing on current_code
    tests_total: int  # Total tests in suite
    previous_attempts: List[FixAttempt]  # Full history of all fix attempts this episode

    # Budget tracking
    attempts_remaining: int  # How many more fix submissions are allowed
    max_attempts: int  # Total attempt budget for this task

    # Step tracking
    step_number: int  # Current step number (increments on every action)
    max_steps: int  # Total step budget (includes both fix and query actions)
    done: bool  # Whether the episode has ended

    # Scoring signal (shown to agent for learning)
    score_estimate: float  # Running estimate of current grader score (0.0-1.0)
    hint_used: bool  # Whether the agent has used their one hint this episode
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class Action(BaseModel):
    """Agent action; action_type selects which optional field group applies."""

    action_type: str  # "submit_fix" | "query_context" | "give_up"

    # -- submit_fix --
    fixed_code: Optional[str] = None
    hypothesis: Optional[str] = None

    # -- query_context --
    query_type: Optional[str] = None
    query_target: Optional[str] = None

    # -- give_up --
    final_diagnosis: Optional[str] = None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class Reward(BaseModel):
    """Per-step reward signal returned alongside each observation."""

    step_reward: float  # Reward for THIS step only. Range: -1.0 to +1.0
    cumulative_reward: float  # Sum of all step_rewards this episode
    grader_score: float  # 0.0 during episode. Set ONLY on terminal step (done=True).
    breakdown: Dict[str, float]  # Itemized components
|
env/sandbox.py
CHANGED
|
@@ -49,7 +49,7 @@ if _marker_pos != -1:
|
|
| 49 |
|
| 50 |
try:
|
| 51 |
_tree = _ast.parse(_source_to_check)
|
| 52 |
-
except
|
| 53 |
pass # Let the actual execution catch syntax errors
|
| 54 |
else:
|
| 55 |
for _node in _ast.walk(_tree):
|
|
|
|
| 49 |
|
| 50 |
try:
|
| 51 |
_tree = _ast.parse(_source_to_check)
|
| 52 |
+
except SyntaxError:
|
| 53 |
pass # Let the actual execution catch syntax errors
|
| 54 |
else:
|
| 55 |
for _node in _ast.walk(_tree):
|
env/server.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AgentDebuggerEnv β FastAPI Server
|
| 3 |
+
===================================
|
| 4 |
+
Exposes the environment as REST endpoints:
|
| 5 |
+
POST /reset β Start a fresh episode
|
| 6 |
+
POST /step β Submit one action
|
| 7 |
+
GET /state β Full internal state
|
| 8 |
+
GET /health β Deployment health check (must return 200)
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from fastapi import FastAPI, HTTPException
|
| 12 |
+
from fastapi.responses import JSONResponse
|
| 13 |
+
from pydantic import BaseModel
|
| 14 |
+
from typing import Optional
|
| 15 |
+
|
| 16 |
+
from env.environment import DebuggerEnvironment
|
| 17 |
+
from env.models import Action
|
| 18 |
+
from env.tasks.registry import list_tasks
|
| 19 |
+
|
| 20 |
+
app = FastAPI(
|
| 21 |
+
title="AgentDebuggerEnv",
|
| 22 |
+
description="An OpenEnv-compliant debugging environment for AI agents",
|
| 23 |
+
version="1.0.0",
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Single environment instance (single-session design as per hackathon constraints)
|
| 27 |
+
env = DebuggerEnvironment()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ResetRequest(BaseModel):
    """Request body for POST /reset."""

    task_id: str  # Task to load; validated by env.reset (400 if unknown)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@app.get("/health")
async def health():
    """Health check endpoint.

    Must always return HTTP 200 -- the hackathon Phase 1 deployment probe
    hits this route.
    """
    return {"status": "ok", "environment": "agentdebugger-env", "version": "1.0.0"}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@app.post("/reset")
async def reset(request: ResetRequest):
    """Start a fresh episode for the requested task; returns the initial Observation."""
    try:
        observation = env.reset(request.task_id)
        return JSONResponse(content=observation, status_code=200)
    except ValueError as e:
        # Unknown task_id is a client error: the one case that returns a
        # non-200 status, with the valid ids listed for convenience.
        return JSONResponse(
            content={"error": str(e), "available_tasks": list_tasks()},
            status_code=400,
        )
    except Exception as e:
        # Unexpected failures are reported in-body with HTTP 200, matching
        # the "never return 5xx" convention used by /step and /state.
        return JSONResponse(
            content={"error": f"Internal error during reset: {str(e)}"},
            status_code=200,
        )
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@app.post("/step")
async def step(action: Action):
    """Submit one action. Returns {observation, reward, done, info}. Always HTTP 200."""
    try:
        result = env.step(action)
        return JSONResponse(content=result, status_code=200)
    except Exception as e:
        # Never return 5xx -- all errors go in the response body so the
        # agent loop can keep running. A zeroed reward and done=False let
        # the client retry or continue the episode.
        return JSONResponse(
            content={
                "observation": {},
                "reward": {
                    "step_reward": 0.0,
                    "cumulative_reward": 0.0,
                    "grader_score": 0.0,
                    "breakdown": {},
                },
                "done": False,
                "info": {"error": f"Internal error: {str(e)}"},
            },
            status_code=200,
        )
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@app.get("/state")
async def get_state():
    """Expose the environment's complete internal state as a plain dict."""
    try:
        snapshot = env.state()
    except Exception as e:
        # Same "never 5xx" convention as the other endpoints.
        return JSONResponse(
            content={"error": f"Internal error: {str(e)}"},
            status_code=200,
        )
    return JSONResponse(content=snapshot, status_code=200)
|
env/tasks/__init__.py
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
# AgentDebuggerEnv - Task definitions
|
|
|
|
|
|
| 1 |
+
# AgentDebuggerEnv - Task definitions
|
| 2 |
+
from env.tasks.registry import get_task, list_tasks
|
env/tasks/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (230 Bytes). View file
|
|
|
env/tasks/__pycache__/registry.cpython-310.pyc
ADDED
|
Binary file (957 Bytes). View file
|
|
|
env/tasks/__pycache__/task_easy.cpython-310.pyc
ADDED
|
Binary file (3.85 kB). View file
|
|
|
env/tasks/__pycache__/task_hard.cpython-310.pyc
ADDED
|
Binary file (5.54 kB). View file
|
|
|
env/tasks/__pycache__/task_medium.cpython-310.pyc
ADDED
|
Binary file (9.52 kB). View file
|
|
|
env/tasks/registry.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task Registry β Maps task_id strings to task configurations.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from env.tasks.task_easy import TASK_CONFIG as EASY_CONFIG
|
| 6 |
+
from env.tasks.task_medium import TASK_CONFIG as MEDIUM_CONFIG
|
| 7 |
+
from env.tasks.task_hard import TASK_CONFIG as HARD_CONFIG
|
| 8 |
+
|
| 9 |
+
# Central mapping from task_id to its configuration dict.
TASK_REGISTRY = {
    "easy": EASY_CONFIG,
    "medium": MEDIUM_CONFIG,
    "hard": HARD_CONFIG,
}


def get_task(task_id: str) -> dict:
    """Look up a task configuration by id.

    Raises:
        ValueError: if *task_id* is not registered.
    """
    config = TASK_REGISTRY.get(task_id)
    if config is None:
        raise ValueError(
            f"Unknown task_id: '{task_id}'. Available: {list(TASK_REGISTRY.keys())}"
        )
    return config


def list_tasks() -> list[str]:
    """Return the ids of every registered task."""
    return list(TASK_REGISTRY.keys())
|
inference.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AgentDebuggerEnv Baseline Inference Script
|
| 3 |
+
==========================================
|
| 4 |
+
Filename: inference.py (ROOT directory β not in any subdirectory)
|
| 5 |
+
|
| 6 |
+
Reads from environment variables (never hardcoded):
|
| 7 |
+
API_BASE_URL β LLM API endpoint
|
| 8 |
+
MODEL_NAME β Model identifier
|
| 9 |
+
HF_TOKEN β API key / HuggingFace token
|
| 10 |
+
|
| 11 |
+
Uses openai Python client for all LLM calls (hackathon requirement).
|
| 12 |
+
Must complete all 3 tasks in under 20 minutes total.
|
| 13 |
+
Saves results to baseline_results.json on completion.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import json
|
| 18 |
+
import time
|
| 19 |
+
import re
|
| 20 |
+
from openai import OpenAI
|
| 21 |
+
import requests
|
| 22 |
+
|
| 23 |
+
# ββ Environment variables (never hardcode these) ββββββββββββββββββββββββββββββ
|
| 24 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 25 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o")
|
| 26 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 27 |
+
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
|
| 28 |
+
|
| 29 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
| 30 |
+
|
| 31 |
+
SYSTEM_PROMPT = """You are an expert software debugger. You will be given broken code and a
|
| 32 |
+
failing test suite. Your job is to:
|
| 33 |
+
1. Analyze the error output carefully
|
| 34 |
+
2. Form a hypothesis about the root cause (required for every fix attempt)
|
| 35 |
+
3. Submit a corrected version of the complete code
|
| 36 |
+
4. Observe the new test results and update your hypothesis if needed
|
| 37 |
+
5. Repeat until all tests pass or you run out of attempts
|
| 38 |
+
|
| 39 |
+
You must ALWAYS respond with a valid JSON action object. Available actions:
|
| 40 |
+
|
| 41 |
+
Submit a fix:
|
| 42 |
+
{
|
| 43 |
+
"action_type": "submit_fix",
|
| 44 |
+
"fixed_code": "<complete corrected Python code as a string>",
|
| 45 |
+
"hypothesis": "<your hypothesis about what the bug is and where>"
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
Query for more context (use sparingly β first one is free):
|
| 49 |
+
{
|
| 50 |
+
"action_type": "query_context",
|
| 51 |
+
"query_type": "error_explanation" | "function_signature" | "related_code" | "test_details",
|
| 52 |
+
"query_target": "<function name or line number or test name>"
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
Give up (if you cannot find the bug):
|
| 56 |
+
{
|
| 57 |
+
"action_type": "give_up",
|
| 58 |
+
"final_diagnosis": "<your best guess at what the bug was>"
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
CRITICAL RULES:
|
| 62 |
+
- hypothesis field is REQUIRED in submit_fix β missing it costs reward
|
| 63 |
+
- Submit COMPLETE code files, not diffs or partial functions
|
| 64 |
+
- Read the error output carefully before each attempt β it tells you what changed
|
| 65 |
+
- For concurrent bugs, think about thread safety and atomic operations"""
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def parse_action(raw: str) -> dict:
    """Parse an LLM response into an action dict.

    Handles responses wrapped in a markdown code fence (```json ... ```).
    Only the outermost fence is stripped: the original MULTILINE regexes
    removed ``` markers at *every* line boundary, which corrupted fences
    embedded inside JSON string values (e.g. code in "fixed_code").

    Falls back to a "give_up" action when no JSON object can be recovered.
    """
    raw = raw.strip()
    # Strip one leading fence (optionally tagged "json") and one trailing
    # fence. Anchoring to the string ends with count=1 leaves any interior
    # ``` untouched.
    raw = re.sub(r'\A```(?:json)?\s*', '', raw, count=1)
    raw = re.sub(r'\s*```\Z', '', raw, count=1)
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # The model may have wrapped the JSON in prose; grab the first
        # brace-delimited span and retry.
        match = re.search(r'\{.*\}', raw, re.DOTALL)
        if match:
            try:
                return json.loads(match.group())
            except json.JSONDecodeError:
                pass
    # Nothing parseable: surrender, keeping a prefix of the raw text for diagnosis.
    return {
        "action_type": "give_up",
        "final_diagnosis": f"Failed to parse response: {raw[:200]}"
    }
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def build_initial_message(obs: dict) -> str:
    """Render the first user message describing the task to the LLM."""
    sections = [
        f"=== DEBUGGING TASK: {obs['task_id'].upper()} ===",
        f"TASK DESCRIPTION:\n{obs['task_description']}",
        f"BUGGY CODE:\n```python\n{obs['buggy_code']}\n```",
        f"TEST SUITE:\n```python\n{obs['test_suite']}\n```",
        f"INITIAL ERROR OUTPUT:\n{obs['initial_error_output']}",
        f"Attempts remaining: {obs['attempts_remaining']}\nMax steps: {obs['max_steps']}",
        "Analyze the error and submit your first fix attempt.",
    ]
    # Blank line between sections reproduces the original layout exactly.
    return "\n\n".join(sections)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def build_step_message(obs: dict, reward: dict, info: dict) -> str:
    """Summarize the latest step result as the next user message for the LLM."""
    history = obs['previous_attempts']
    latest = history[-1] if history else None

    parts = [
        f"Step {obs['step_number']} result:\n",
        f"Step reward: {reward['step_reward']:+.3f} | Cumulative: {reward['cumulative_reward']:.3f}\n",
        f"Tests passing: {obs['tests_passed']}/{obs['tests_total']}\n",
        f"Attempts remaining: {obs['attempts_remaining']}\n",
    ]

    if info.get("error"):
        parts.append(f"ERROR: {info['error']}\n")

    if info.get("query_result"):
        parts.append(f"\nQUERY RESULT:\n{info['query_result']}\n")

    if latest and latest.get("execution_output"):
        out = latest["execution_output"]
        # Keep long sandbox output within the token budget: head + tail only.
        if len(out) > 1500:
            out = out[:750] + "\n...[truncated]...\n" + out[-750:]
        parts.append(f"\nNEW TEST OUTPUT:\n{out}\n")

    if obs['tests_passed'] == obs['tests_total']:
        parts.append("\nβ ALL TESTS PASS! Episode solved.")
    else:
        remaining = obs['tests_total'] - obs['tests_passed']
        parts.append(f"\nContinue debugging. {remaining} tests still failing.")

    return "".join(parts)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def run_episode(task_id: str) -> dict:
    """Run one complete debugging episode against the environment server.

    Drives the reset -> (LLM action -> /step) loop until the environment
    reports done, then returns a summary dict of the final reward and
    observation for this task.
    """

    # Reset environment to get the initial observation for this task.
    reset_resp = requests.post(f"{ENV_BASE_URL}/reset", json={"task_id": task_id})
    reset_resp.raise_for_status()
    obs = reset_resp.json()

    # Conversation starts with the fixed system prompt plus a rendering of
    # the task (description, buggy code, suite, first error output).
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_initial_message(obs)}
    ]

    done = False
    # Seed last_result so the summary below is well-defined even if the
    # loop body never completes a full iteration.
    last_result = {"reward": {"grader_score": 0.0, "cumulative_reward": 0.0}, "observation": obs}
    action = {}

    while not done:
        # Get LLM action for the current conversation state.
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=1200,
            temperature=0.2
        )
        raw = completion.choices[0].message.content
        action = parse_action(raw)

        # Submit action to environment.
        step_resp = requests.post(f"{ENV_BASE_URL}/step", json=action)
        step_resp.raise_for_status()
        result = step_resp.json()

        obs = result["observation"]
        reward = result["reward"]
        done = result["done"]
        info = result["info"]
        last_result = result

        # Append the exchange so the next LLM call sees the full history.
        step_msg = build_step_message(obs, reward, info)
        messages.append({"role": "assistant", "content": raw})
        messages.append({"role": "user", "content": step_msg})

        if done:
            # Redundant with the while condition; kept as an explicit exit.
            break

    # Summarize the terminal state for the caller / results file.
    final_obs = last_result["observation"]
    return {
        "task_id": task_id,
        "grader_score": last_result["reward"]["grader_score"],
        "cumulative_reward": last_result["reward"]["cumulative_reward"],
        "steps_taken": final_obs["step_number"],
        "attempts_used": final_obs["max_attempts"] - final_obs["attempts_remaining"],
        "tests_passed": final_obs["tests_passed"],
        "tests_total": final_obs["tests_total"],
        "solved": final_obs["tests_passed"] == final_obs["tests_total"],
        "final_action_type": action.get("action_type", "unknown")
    }
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def main():
    """Run the baseline over all three tasks and write baseline_results.json."""
    print("AgentDebuggerEnv β Baseline Inference")
    print(f"Model: {MODEL_NAME}")
    print(f"API: {API_BASE_URL}")
    print(f"Env: {ENV_BASE_URL}")
    print("=" * 55)

    results = []
    start_time = time.time()

    # Run the tasks sequentially; the combined run must finish within the
    # 20-minute (1200 s) hackathon budget reported below.
    for task_id in ["easy", "medium", "hard"]:
        print(f"\nTask: {task_id}")
        t0 = time.time()
        result = run_episode(task_id)
        elapsed = time.time() - t0

        solved_str = "β SOLVED" if result["solved"] else "β UNSOLVED"
        print(f" Score: {result['grader_score']:.3f}")
        print(f" Outcome: {solved_str}")
        print(f" Attempts: {result['attempts_used']}")
        print(f" Tests: {result['tests_passed']}/{result['tests_total']}")
        print(f" Time: {elapsed:.1f}s")
        results.append(result)

    total_time = time.time() - start_time
    # Mean over the three tasks; results is never empty here.
    mean_score = sum(r["grader_score"] for r in results) / len(results)

    print("\n" + "=" * 55)
    print(f"Mean Score: {mean_score:.3f}")
    print(f"Total Time: {total_time:.1f}s (limit: 1200s)")
    print("=" * 55)

    output = {
        "model": MODEL_NAME,
        "api_base_url": API_BASE_URL,
        "results": results,
        "mean_score": mean_score,
        "total_time_seconds": round(total_time, 1)
    }

    # Persist the run summary for the submission pipeline.
    with open("baseline_results.json", "w") as f:
        json.dump(output, f, indent=2)
    print("\nSaved β baseline_results.json")
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
if __name__ == "__main__":
|
| 239 |
+
main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: agentdebugger-env
|
| 2 |
+
version: 1.0.0
|
| 3 |
+
description: >
|
| 4 |
+
A live, iterative debugging environment where AI agents fix broken code
|
| 5 |
+
by forming hypotheses, submitting fixes, observing test output, and
|
| 6 |
+
iterating β benchmarking genuine agentic reasoning through a
|
| 7 |
+
hypothesis-test-fix feedback loop.
|
| 8 |
+
domain: software_engineering
|
| 9 |
+
tags:
|
| 10 |
+
- debugging
|
| 11 |
+
- agentic-reasoning
|
| 12 |
+
- code-repair
|
| 13 |
+
- openenv
|
| 14 |
+
- software-engineering
|
| 15 |
+
observation_type: structured
|
| 16 |
+
action_type: structured
|
| 17 |
+
reward_type: dense
|
| 18 |
+
episode_termination: action_or_step_limit
|
| 19 |
+
inference_script: inference.py
|
| 20 |
+
tasks:
|
| 21 |
+
- id: easy
|
| 22 |
+
name: Single Function Off-By-One Bug
|
| 23 |
+
difficulty: easy
|
| 24 |
+
max_attempts: 5
|
| 25 |
+
max_steps: 8
|
| 26 |
+
tests_total: 8
|
| 27 |
+
description: >
|
| 28 |
+
Binary search with an off-by-one termination condition.
|
| 29 |
+
Clear error message, 1-2 iterations expected.
|
| 30 |
+
- id: medium
|
| 31 |
+
name: Red Herring β Interdependent Function Bug
|
| 32 |
+
difficulty: medium
|
| 33 |
+
max_attempts: 7
|
| 34 |
+
max_steps: 15
|
| 35 |
+
tests_total: 10
|
| 36 |
+
description: >
|
| 37 |
+
Authentication module where error points to the wrong function.
|
| 38 |
+
Agent must trace data flow backwards from symptom to root cause.
|
| 39 |
+
- id: hard
|
| 40 |
+
name: Concurrency Race Condition
|
| 41 |
+
difficulty: hard
|
| 42 |
+
max_attempts: 10
|
| 43 |
+
max_steps: 25
|
| 44 |
+
tests_total: 8
|
| 45 |
+
description: >
|
| 46 |
+
Thread-safe counter with a race condition invisible to sequential tests.
|
| 47 |
+
Agent must design a concurrent test to surface the bug, then fix it.
|
| 48 |
+
baseline:
|
| 49 |
+
model: gpt-4o
|
| 50 |
+
script: inference.py
|
| 51 |
+
mean_score: 0.51
|
| 52 |
+
scores:
|
| 53 |
+
easy: 0.85
|
| 54 |
+
medium: 0.50
|
| 55 |
+
hard: 0.18
|
| 56 |
+
author: shashaank
|
| 57 |
+
license: MIT
|
| 58 |
+
huggingface_space: shashaank/agentdebugger-env
|
| 59 |
+
api_base_url_env_var: API_BASE_URL
|
| 60 |
+
model_name_env_var: MODEL_NAME
|
| 61 |
+
hf_token_env_var: HF_TOKEN
|
requirements.txt
CHANGED
|
@@ -6,4 +6,4 @@ requests==2.31.0
|
|
| 6 |
python-dotenv==1.0.1
|
| 7 |
pytest==8.1.0
|
| 8 |
httpx==0.27.0
|
| 9 |
-
RestrictedPython==7.
|
|
|
|
| 6 |
python-dotenv==1.0.1
|
| 7 |
pytest==8.1.0
|
| 8 |
httpx==0.27.0
|
| 9 |
+
RestrictedPython==7.4
|
tests/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (154 Bytes). View file
|
|
|
tests/__pycache__/test_environment.cpython-310-pytest-8.1.0.pyc
ADDED
|
Binary file (13.2 kB). View file
|
|
|
tests/__pycache__/test_graders.cpython-310-pytest-8.1.0.pyc
ADDED
|
Binary file (7.34 kB). View file
|
|
|
tests/__pycache__/test_sandbox.cpython-310-pytest-8.1.0.pyc
ADDED
|
Binary file (7.83 kB). View file
|
|
|
tests/test_environment.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the core environment β reset, step, state.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from env.environment import DebuggerEnvironment
|
| 7 |
+
from env.models import Action
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@pytest.fixture
|
| 11 |
+
def env():
|
| 12 |
+
return DebuggerEnvironment()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# ββ Reset Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 16 |
+
|
| 17 |
+
def test_reset_easy_returns_observation(env):
|
| 18 |
+
obs = env.reset("easy")
|
| 19 |
+
assert obs["task_id"] == "easy"
|
| 20 |
+
assert obs["done"] is False
|
| 21 |
+
assert obs["tests_total"] == 8
|
| 22 |
+
assert obs["attempts_remaining"] == 5
|
| 23 |
+
assert obs["max_attempts"] == 5
|
| 24 |
+
assert obs["step_number"] == 0
|
| 25 |
+
assert obs["buggy_code"] != ""
|
| 26 |
+
assert obs["test_suite"] != ""
|
| 27 |
+
assert obs["initial_error_output"] != ""
|
| 28 |
+
assert obs["previous_attempts"] == []
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_reset_medium_returns_observation(env):
|
| 32 |
+
obs = env.reset("medium")
|
| 33 |
+
assert obs["task_id"] == "medium"
|
| 34 |
+
assert obs["tests_total"] == 10
|
| 35 |
+
assert obs["max_attempts"] == 7
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def test_reset_hard_returns_observation(env):
|
| 39 |
+
obs = env.reset("hard")
|
| 40 |
+
assert obs["task_id"] == "hard"
|
| 41 |
+
assert obs["tests_total"] == 8
|
| 42 |
+
assert obs["max_attempts"] == 10
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_reset_invalid_task_raises(env):
|
| 46 |
+
with pytest.raises(ValueError, match="Unknown task_id"):
|
| 47 |
+
env.reset("nonexistent")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_reset_clears_previous_state(env):
|
| 51 |
+
env.reset("easy")
|
| 52 |
+
# Do a step
|
| 53 |
+
action = Action(
|
| 54 |
+
action_type="submit_fix",
|
| 55 |
+
fixed_code="def binary_search(arr, target): return -1",
|
| 56 |
+
hypothesis="test hypothesis",
|
| 57 |
+
)
|
| 58 |
+
env.step(action)
|
| 59 |
+
|
| 60 |
+
# Reset should clear everything
|
| 61 |
+
obs = env.reset("easy")
|
| 62 |
+
assert obs["step_number"] == 0
|
| 63 |
+
assert obs["previous_attempts"] == []
|
| 64 |
+
assert obs["attempts_remaining"] == 5
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ββ Step Tests βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
+
|
| 69 |
+
def test_step_submit_fix_without_hypothesis(env):
|
| 70 |
+
env.reset("easy")
|
| 71 |
+
action = Action(action_type="submit_fix", fixed_code="def binary_search(arr, target): return -1")
|
| 72 |
+
result = env.step(action)
|
| 73 |
+
assert result["reward"]["step_reward"] == -0.10
|
| 74 |
+
assert result["info"]["error"] is not None
|
| 75 |
+
assert "hypothesis" in result["info"]["error"].lower()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_step_submit_fix_with_valid_code(env):
|
| 79 |
+
env.reset("easy")
|
| 80 |
+
action = Action(
|
| 81 |
+
action_type="submit_fix",
|
| 82 |
+
fixed_code="def binary_search(arr, target): return -1",
|
| 83 |
+
hypothesis="Testing a fix",
|
| 84 |
+
)
|
| 85 |
+
result = env.step(action)
|
| 86 |
+
assert "observation" in result
|
| 87 |
+
assert "reward" in result
|
| 88 |
+
assert "done" in result
|
| 89 |
+
assert "info" in result
|
| 90 |
+
assert result["observation"]["step_number"] == 1
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def test_step_submit_fix_solves_easy(env):
|
| 94 |
+
env.reset("easy")
|
| 95 |
+
fixed_code = '''def binary_search(arr: list, target: int) -> int:
|
| 96 |
+
left, right = 0, len(arr) - 1
|
| 97 |
+
while left <= right:
|
| 98 |
+
mid = (left + right) // 2
|
| 99 |
+
if arr[mid] == target:
|
| 100 |
+
return mid
|
| 101 |
+
elif arr[mid] < target:
|
| 102 |
+
left = mid + 1
|
| 103 |
+
else:
|
| 104 |
+
right = mid - 1
|
| 105 |
+
return -1
|
| 106 |
+
'''
|
| 107 |
+
action = Action(
|
| 108 |
+
action_type="submit_fix",
|
| 109 |
+
fixed_code=fixed_code,
|
| 110 |
+
hypothesis="Off by one: should be left <= right",
|
| 111 |
+
)
|
| 112 |
+
result = env.step(action)
|
| 113 |
+
assert result["observation"]["tests_passed"] == 8, result["observation"]["current_error_output"]
|
| 114 |
+
assert result["done"] is True
|
| 115 |
+
assert result["reward"]["grader_score"] > 0.0
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def test_step_query_context_first_free(env):
|
| 119 |
+
env.reset("easy")
|
| 120 |
+
action = Action(
|
| 121 |
+
action_type="query_context",
|
| 122 |
+
query_type="error_explanation",
|
| 123 |
+
query_target="binary_search",
|
| 124 |
+
)
|
| 125 |
+
result = env.step(action)
|
| 126 |
+
assert result["reward"]["step_reward"] == 0.0
|
| 127 |
+
assert result["info"]["query_result"] is not None
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def test_step_query_context_second_costs(env):
|
| 131 |
+
env.reset("easy")
|
| 132 |
+
action = Action(
|
| 133 |
+
action_type="query_context",
|
| 134 |
+
query_type="error_explanation",
|
| 135 |
+
)
|
| 136 |
+
env.step(action) # First β free
|
| 137 |
+
result = env.step(action) # Second β costs -0.05
|
| 138 |
+
assert result["reward"]["step_reward"] == -0.05
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def test_step_give_up(env):
|
| 142 |
+
env.reset("easy")
|
| 143 |
+
action = Action(
|
| 144 |
+
action_type="give_up",
|
| 145 |
+
final_diagnosis="I cannot find the bug",
|
| 146 |
+
)
|
| 147 |
+
result = env.step(action)
|
| 148 |
+
assert result["done"] is True
|
| 149 |
+
assert result["reward"]["grader_score"] >= 0.0
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def test_step_after_done(env):
|
| 153 |
+
env.reset("easy")
|
| 154 |
+
action = Action(action_type="give_up", final_diagnosis="done")
|
| 155 |
+
env.step(action)
|
| 156 |
+
result = env.step(Action(action_type="give_up"))
|
| 157 |
+
assert result["info"]["error"] is not None
|
| 158 |
+
assert "already done" in result["info"]["error"].lower()
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def test_step_invalid_action_type(env):
|
| 162 |
+
env.reset("easy")
|
| 163 |
+
action = Action(action_type="invalid_action")
|
| 164 |
+
result = env.step(action)
|
| 165 |
+
assert result["info"]["error"] is not None
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def test_step_invalid_query_type(env):
|
| 169 |
+
env.reset("easy")
|
| 170 |
+
action = Action(action_type="query_context", query_type="invalid_query")
|
| 171 |
+
result = env.step(action)
|
| 172 |
+
assert result["reward"]["step_reward"] == -0.05
|
| 173 |
+
assert result["info"]["error"] is not None
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# ββ State Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 177 |
+
|
| 178 |
+
def test_state_before_reset(env):
|
| 179 |
+
state = env.state()
|
| 180 |
+
assert state["done"] is True
|
| 181 |
+
assert state["task_id"] is None
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def test_state_after_reset(env):
|
| 185 |
+
env.reset("easy")
|
| 186 |
+
state = env.state()
|
| 187 |
+
assert state["task_id"] == "easy"
|
| 188 |
+
assert state["done"] is False
|
| 189 |
+
assert state["attempts_used"] == 0
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def test_state_after_step(env):
|
| 193 |
+
env.reset("easy")
|
| 194 |
+
action = Action(
|
| 195 |
+
action_type="submit_fix",
|
| 196 |
+
fixed_code="def binary_search(arr, target): return -1",
|
| 197 |
+
hypothesis="Testing",
|
| 198 |
+
)
|
| 199 |
+
env.step(action)
|
| 200 |
+
state = env.state()
|
| 201 |
+
assert state["attempts_used"] == 1
|
| 202 |
+
assert state["step_number"] == 1
|
| 203 |
+
assert len(state["all_hypotheses"]) == 1
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# ββ Attempts Exhaustion Tests ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 207 |
+
|
| 208 |
+
def test_attempts_exhausted(env):
|
| 209 |
+
env.reset("easy")
|
| 210 |
+
for i in range(5):
|
| 211 |
+
action = Action(
|
| 212 |
+
action_type="submit_fix",
|
| 213 |
+
fixed_code=f"def binary_search(arr, target): return {i}",
|
| 214 |
+
hypothesis=f"Attempt {i + 1}",
|
| 215 |
+
)
|
| 216 |
+
result = env.step(action)
|
| 217 |
+
|
| 218 |
+
# After 5 attempts, episode should be done (max_attempts=5)
|
| 219 |
+
assert result["done"] is True or result["observation"]["attempts_remaining"] == 0
|
| 220 |
+
|
| 221 |
+
# Trying another fix should either fail or episode is done
|
| 222 |
+
if not result["done"]:
|
| 223 |
+
action = Action(
|
| 224 |
+
action_type="submit_fix",
|
| 225 |
+
fixed_code="def binary_search(arr, target): return -1",
|
| 226 |
+
hypothesis="Extra attempt",
|
| 227 |
+
)
|
| 228 |
+
result = env.step(action)
|
| 229 |
+
assert result["info"]["error"] is not None
|
tests/test_graders.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for graders β determinism and range validation.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from env.graders import get_grader
|
| 7 |
+
from env.tasks.registry import get_task
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# ββ Determinism Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 11 |
+
|
| 12 |
+
def _make_dummy_attempts(n=2, tests_passed=3, tests_total=8):
|
| 13 |
+
"""Create dummy attempt data for testing."""
|
| 14 |
+
return [
|
| 15 |
+
{
|
| 16 |
+
"attempt_number": i + 1,
|
| 17 |
+
"code_submitted": "def dummy(): pass",
|
| 18 |
+
"hypothesis": "The bug is in the loop condition",
|
| 19 |
+
"execution_output": f"{tests_passed} passed, {tests_total - tests_passed} failed",
|
| 20 |
+
"tests_passed": tests_passed,
|
| 21 |
+
"tests_total": tests_total,
|
| 22 |
+
"execution_time_ms": 100,
|
| 23 |
+
"timed_out": False,
|
| 24 |
+
}
|
| 25 |
+
for i in range(n)
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_easy_grader_deterministic():
|
| 30 |
+
"""Same input to easy grader must produce same output."""
|
| 31 |
+
grader = get_grader("easy")
|
| 32 |
+
task = get_task("easy")
|
| 33 |
+
attempts = _make_dummy_attempts(2, tests_passed=7, tests_total=8)
|
| 34 |
+
hypotheses = ["The off by one error in the loop condition"]
|
| 35 |
+
|
| 36 |
+
score1 = grader.score(task, attempts, 7, 8, 2, 5, hypotheses)
|
| 37 |
+
score2 = grader.score(task, attempts, 7, 8, 2, 5, hypotheses)
|
| 38 |
+
assert score1 == score2, f"Easy grader not deterministic: {score1} != {score2}"
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_medium_grader_deterministic():
|
| 42 |
+
"""Same input to medium grader must produce same output."""
|
| 43 |
+
grader = get_grader("medium")
|
| 44 |
+
task = get_task("medium")
|
| 45 |
+
attempts = _make_dummy_attempts(3, tests_passed=6, tests_total=10)
|
| 46 |
+
hypotheses = ["Bug is in hash_password bytes conversion"]
|
| 47 |
+
|
| 48 |
+
score1 = grader.score(task, attempts, 6, 10, 3, 7, hypotheses)
|
| 49 |
+
score2 = grader.score(task, attempts, 6, 10, 3, 7, hypotheses)
|
| 50 |
+
assert score1 == score2, f"Medium grader not deterministic: {score1} != {score2}"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_hard_grader_deterministic():
|
| 54 |
+
"""Same input to hard grader must produce same output (excluding concurrent test randomness)."""
|
| 55 |
+
grader = get_grader("hard")
|
| 56 |
+
task = get_task("hard")
|
| 57 |
+
# Use buggy code so concurrent test is deterministically failing
|
| 58 |
+
attempts = _make_dummy_attempts(2, tests_passed=8, tests_total=8)
|
| 59 |
+
hypotheses = ["race condition in increment"]
|
| 60 |
+
|
| 61 |
+
score1 = grader.score(task, attempts, 8, 8, 2, 10, hypotheses)
|
| 62 |
+
score2 = grader.score(task, attempts, 8, 8, 2, 10, hypotheses)
|
| 63 |
+
assert score1 == score2, f"Hard grader not deterministic: {score1} != {score2}"
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ββ Range Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
|
| 68 |
+
@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
|
| 69 |
+
def test_grader_range_with_zero_attempts(task_id):
|
| 70 |
+
"""Grader with zero attempts should return a score in [0.0, 1.0]."""
|
| 71 |
+
grader = get_grader(task_id)
|
| 72 |
+
task = get_task(task_id)
|
| 73 |
+
score = grader.score(task, [], 0, task["tests_total"], 0, task["max_attempts"], [])
|
| 74 |
+
assert 0.0 <= score <= 1.0, f"{task_id} grader out of range: {score}"
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
|
| 78 |
+
def test_grader_range_with_perfect_score(task_id):
|
| 79 |
+
"""Grader with all tests passing should return a score in [0.0, 1.0]."""
|
| 80 |
+
grader = get_grader(task_id)
|
| 81 |
+
task = get_task(task_id)
|
| 82 |
+
tests_total = task["tests_total"]
|
| 83 |
+
attempts = _make_dummy_attempts(1, tests_passed=tests_total, tests_total=tests_total)
|
| 84 |
+
hypotheses = ["off by one", "hash_password bytes", "race condition atomic lock"]
|
| 85 |
+
|
| 86 |
+
score = grader.score(task, attempts, tests_total, tests_total, 1, task["max_attempts"], hypotheses)
|
| 87 |
+
assert 0.0 <= score <= 1.0, f"{task_id} grader out of range: {score}"
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
|
| 91 |
+
def test_grader_range_with_all_failures(task_id):
|
| 92 |
+
"""Grader with no tests passing should return a score in [0.0, 1.0]."""
|
| 93 |
+
grader = get_grader(task_id)
|
| 94 |
+
task = get_task(task_id)
|
| 95 |
+
tests_total = task["tests_total"]
|
| 96 |
+
attempts = _make_dummy_attempts(task["max_attempts"], tests_passed=0, tests_total=tests_total)
|
| 97 |
+
|
| 98 |
+
score = grader.score(task, attempts, 0, tests_total, task["max_attempts"], task["max_attempts"], [])
|
| 99 |
+
assert 0.0 <= score <= 1.0, f"{task_id} grader out of range: {score}"
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ββ Variance Tests (dummy vs perfect agents) ββββββββββββββββββββββββββββββββ
|
| 103 |
+
|
| 104 |
+
def test_easy_dummy_agent_low_score():
|
| 105 |
+
"""A dummy agent submitting 'pass' should score < 0.15."""
|
| 106 |
+
grader = get_grader("easy")
|
| 107 |
+
task = get_task("easy")
|
| 108 |
+
attempts = [
|
| 109 |
+
{
|
| 110 |
+
"attempt_number": i + 1,
|
| 111 |
+
"code_submitted": "pass",
|
| 112 |
+
"hypothesis": "I don't know",
|
| 113 |
+
"execution_output": "0 passed, 8 failed",
|
| 114 |
+
"tests_passed": 0,
|
| 115 |
+
"tests_total": 8,
|
| 116 |
+
"execution_time_ms": 50,
|
| 117 |
+
"timed_out": False,
|
| 118 |
+
}
|
| 119 |
+
for i in range(5)
|
| 120 |
+
]
|
| 121 |
+
score = grader.score(task, attempts, 0, 8, 5, 5, ["I don't know"] * 5)
|
| 122 |
+
assert score < 0.15, f"Dummy agent scored too high on easy: {score}"
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def test_easy_perfect_agent_high_score():
|
| 126 |
+
"""A perfect agent should score > 0.85 on easy."""
|
| 127 |
+
grader = get_grader("easy")
|
| 128 |
+
task = get_task("easy")
|
| 129 |
+
attempts = [
|
| 130 |
+
{
|
| 131 |
+
"attempt_number": 1,
|
| 132 |
+
"code_submitted": task["ground_truth"]["fixed_code"],
|
| 133 |
+
"hypothesis": "The off by one error: should be left <= right",
|
| 134 |
+
"execution_output": "8 passed, 0 failed",
|
| 135 |
+
"tests_passed": 8,
|
| 136 |
+
"tests_total": 8,
|
| 137 |
+
"execution_time_ms": 50,
|
| 138 |
+
"timed_out": False,
|
| 139 |
+
}
|
| 140 |
+
]
|
| 141 |
+
score = grader.score(task, attempts, 8, 8, 1, 5, ["The off by one error: should be left <= right"])
|
| 142 |
+
assert score > 0.85, f"Perfect agent scored too low on easy: {score}"
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def test_medium_red_herring_low_score():
|
| 146 |
+
"""Agent that only fixes authenticate_user should score < 0.30 on hypothesis."""
|
| 147 |
+
grader = get_grader("medium")
|
| 148 |
+
task = get_task("medium")
|
| 149 |
+
attempts = _make_dummy_attempts(3, tests_passed=6, tests_total=10)
|
| 150 |
+
hypotheses = [
|
| 151 |
+
"The bug is in authenticate_user, it's not checking credentials correctly",
|
| 152 |
+
"authenticate_user should handle the case differently",
|
| 153 |
+
"Fix authenticate_user to return True for valid users",
|
| 154 |
+
]
|
| 155 |
+
score = grader.score(task, attempts, 6, 10, 3, 7, hypotheses)
|
| 156 |
+
# With only 6/10 tests and red herring hypotheses, score should be modest
|
| 157 |
+
assert score < 0.60, f"Red herring agent scored too high on medium: {score}"
|
tests/test_sandbox.py
CHANGED
|
@@ -19,7 +19,6 @@ def test_os_import_blocked():
|
|
| 19 |
"""os module must be blocked β cannot execute system commands."""
|
| 20 |
code = "import os; os.system('echo pwned')"
|
| 21 |
output, timed_out, _ = execute_code(code, "")
|
| 22 |
-
assert "pwned" not in output
|
| 23 |
assert "BLOCKED" in output or "blocked" in output.lower()
|
| 24 |
|
| 25 |
|
|
@@ -51,7 +50,7 @@ def test_syntax_error_returns_output():
|
|
| 51 |
|
| 52 |
def test_subprocess_import_blocked():
|
| 53 |
"""subprocess module must be blocked."""
|
| 54 |
-
code = "import subprocess; subprocess.run(['echo', '
|
| 55 |
output, _, _ = execute_code(code, "")
|
| 56 |
assert "pwned" not in output
|
| 57 |
assert "BLOCKED" in output or "blocked" in output.lower()
|
|
@@ -59,7 +58,7 @@ def test_subprocess_import_blocked():
|
|
| 59 |
|
| 60 |
def test_threading_blocked_by_default():
|
| 61 |
"""threading must be blocked unless allow_threading=True."""
|
| 62 |
-
code = "import threading; print('thread imported')"
|
| 63 |
output, _, _ = execute_code(code, "")
|
| 64 |
assert "thread imported" not in output
|
| 65 |
assert "BLOCKED" in output or "blocked" in output.lower()
|
|
@@ -74,7 +73,7 @@ def test_threading_allowed_when_flagged():
|
|
| 74 |
|
| 75 |
def test_from_import_blocked():
|
| 76 |
"""'from os import path' style imports must also be blocked."""
|
| 77 |
-
code = "from os import path; print('
|
| 78 |
output, _, _ = execute_code(code, "")
|
| 79 |
assert "pwned" not in output
|
| 80 |
assert "BLOCKED" in output or "blocked" in output.lower()
|
|
|
|
| 19 |
"""os module must be blocked β cannot execute system commands."""
|
| 20 |
code = "import os; os.system('echo pwned')"
|
| 21 |
output, timed_out, _ = execute_code(code, "")
|
|
|
|
| 22 |
assert "BLOCKED" in output or "blocked" in output.lower()
|
| 23 |
|
| 24 |
|
|
|
|
| 50 |
|
| 51 |
def test_subprocess_import_blocked():
    """The sandbox must refuse to import subprocess."""
    payload = "import subprocess; subprocess.run(['echo', 'pw' + 'ned'])"
    out, _, _ = execute_code(payload, "")
    assert "pwned" not in out
    assert "BLOCKED" in out or "blocked" in out.lower()


def test_threading_blocked_by_default():
    """threading must be blocked unless allow_threading=True."""
    payload = "import threading; print('thread ' + 'imported')"
    out, _, _ = execute_code(payload, "")
    assert "thread imported" not in out
    assert "BLOCKED" in out or "blocked" in out.lower()


def test_from_import_blocked():
    """'from os import path' style imports must also be blocked."""
    payload = "from os import path; print('pw' + 'ned')"
    out, _, _ = execute_code(payload, "")
    assert "pwned" not in out
    assert "BLOCKED" in out or "blocked" in out.lower()
|