Resolved README Merge conflicts
Browse files- .gitignore +2 -1
- README.md +207 -318
- data/bugs_tier1.jsonl +8 -0
- data/bugs_tier2.jsonl +3 -0
- data/bugs_tier3.jsonl +2 -0
- data/generate_bugs.py +441 -0
- env/__pycache__/environment.cpython-310.pyc +0 -0
- env/__pycache__/environment.cpython-313.pyc +0 -0
- env/__pycache__/models.cpython-310.pyc +0 -0
- env/__pycache__/models.cpython-313.pyc +0 -0
- env/__pycache__/sandbox.cpython-310.pyc +0 -0
- env/environment.py +261 -3
- env/graders/__pycache__/base_grader.cpython-310.pyc +0 -0
- env/graders/__pycache__/grader_hard.cpython-310.pyc +0 -0
- env/graders/grader_hard.py +13 -114
- env/models.py +64 -1
- env/sandbox.py +90 -48
- inference.py +8 -7
- openenv.yaml +50 -9
- pyproject.toml +4 -1
- requirements.txt +1 -0
- server/models.py +11 -0
- server/reward_calculator.py +283 -0
- tests/__pycache__/test_environment.cpython-310-pytest-8.1.0.pyc +0 -0
- tests/__pycache__/test_sandbox.cpython-310-pytest-8.1.0.pyc +0 -0
- tests/test_integration.py +102 -0
- training/train_grpo.py +324 -0
- uv.lock +129 -57
- validator.py +155 -0
.gitignore
CHANGED
|
@@ -45,4 +45,5 @@ baseline_results.json
|
|
| 45 |
sandbox_*.py
|
| 46 |
/tmp/sandbox_*
|
| 47 |
|
| 48 |
-
instructions.md
|
|
|
|
|
|
| 45 |
sandbox_*.py
|
| 46 |
/tmp/sandbox_*
|
| 47 |
|
| 48 |
+
instructions.md
|
| 49 |
+
CURSOR_INSTRUCTIONS_V2.md
|
README.md
CHANGED
|
@@ -1,315 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# AgentDebuggerEnv π
|
| 2 |
|
| 3 |
-
> **A live, iterative debugging environment for benchmarking agentic reasoning in AI systems.**
|
| 4 |
-
> Submitted to the **Meta + PyTorch + HuggingFace OpenEnv Hackathon**.
|
| 5 |
|
| 6 |
-
[](LICENSE)
|
| 9 |
[](https://www.python.org/)
|
| 10 |
-
|
|
|
|
| 11 |
|
| 12 |
---
|
| 13 |
|
| 14 |
## The Problem with Existing Code Benchmarks
|
| 15 |
|
| 16 |
-
Benchmarks like HumanEval, MBPP, and
|
| 17 |
-
|
| 18 |
-
Real software engineering is not one-shot. It is **iterative**. A developer:
|
| 19 |
|
| 20 |
-
|
| 21 |
-
2. Forms a hypothesis about the root cause
|
| 22 |
-
3. Submits a fix
|
| 23 |
-
4. Reads the new error output
|
| 24 |
-
5. Updates their hypothesis
|
| 25 |
-
6. Repeats β sometimes many times
|
| 26 |
|
| 27 |
-
|
| 28 |
|
| 29 |
---
|
| 30 |
|
| 31 |
-
##
|
| 32 |
-
|
| 33 |
-
SWE-bench gives an agent a static GitHub issue and measures only the final patch correctness. AgentDebuggerEnv is fundamentally different in three ways:
|
| 34 |
|
| 35 |
| Dimension | SWE-bench | AgentDebuggerEnv |
|
| 36 |
|---|---|---|
|
| 37 |
-
| Evaluation target | Final patch
|
| 38 |
-
| Feedback | None β single shot | Real `stdout/stderr` after every
|
| 39 |
-
| Reward signal | Binary
|
| 40 |
| What's measured | Code generation | Hypothesis formation + iterative reasoning |
|
| 41 |
-
| Hard task |
|
|
|
|
| 42 |
|
| 43 |
-
The iterative feedback loop is the core mechanic. Every `step()` call executes the agent's
|
| 44 |
|
| 45 |
---
|
| 46 |
|
| 47 |
-
##
|
| 48 |
|
| 49 |
-
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
|
|
|
| 56 |
|
| 57 |
-
The
|
| 58 |
-
|
| 59 |
-
**Live Space:** https://huggingface.co/spaces/shashaank0707/AgentDebugger-env
|
| 60 |
|
| 61 |
---
|
| 62 |
|
| 63 |
-
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
β βββ environment.py # Core OpenEnv class: reset(), step(), state()
|
| 70 |
-
β βββ models.py # Pydantic v2 Observation, Action, Reward models
|
| 71 |
-
β βββ sandbox.py # AST-based sandboxed code execution
|
| 72 |
-
β βββ server.py # FastAPI server: /reset, /step, /state, /health, /tasks
|
| 73 |
-
β βββ tasks/
|
| 74 |
-
β β βββ registry.py # Task registry
|
| 75 |
-
β β βββ task_easy.py # Off-by-one bug in binary search
|
| 76 |
-
β β βββ task_medium.py # Red herring authentication bug
|
| 77 |
-
β β βββ task_hard.py # Concurrency race condition
|
| 78 |
-
β βββ graders/
|
| 79 |
-
β βββ base_grader.py # Abstract base grader
|
| 80 |
-
β βββ grader_easy.py # Standard test-pass + efficiency scoring
|
| 81 |
-
β βββ grader_medium.py # Red herring detection + score floor fix
|
| 82 |
-
β βββ grader_hard.py # Sequential + concurrent stress test scoring
|
| 83 |
-
βββ server/
|
| 84 |
-
β βββ app.py # Entry point alias for openenv validate
|
| 85 |
-
βββ tests/
|
| 86 |
-
β βββ test_environment.py
|
| 87 |
-
β βββ test_sandbox.py
|
| 88 |
-
β βββ test_graders.py
|
| 89 |
-
βββ openenv.yaml # OpenEnv spec metadata
|
| 90 |
-
βββ Dockerfile
|
| 91 |
-
βββ requirements.txt
|
| 92 |
-
βββ pyproject.toml
|
| 93 |
-
βββ uv.lock # Reproducible dependency resolution
|
| 94 |
-
βββ .gitignore
|
| 95 |
-
```
|
| 96 |
|
| 97 |
---
|
| 98 |
|
| 99 |
-
##
|
| 100 |
|
| 101 |
-
###
|
| 102 |
|
| 103 |
-
|
| 104 |
|
| 105 |
-
```
|
| 106 |
-
class FixAttempt(BaseModel):
|
| 107 |
-
attempt_number: int # 1-indexed
|
| 108 |
-
code_submitted: str # Full code the agent submitted
|
| 109 |
-
hypothesis: str # Agent's stated theory before this attempt
|
| 110 |
-
execution_output: str # Full stdout + stderr from sandbox
|
| 111 |
-
tests_passed: int
|
| 112 |
-
tests_total: int
|
| 113 |
-
execution_time_ms: int
|
| 114 |
-
timed_out: bool
|
| 115 |
|
| 116 |
-
|
| 117 |
-
# Fixed for the episode
|
| 118 |
-
task_id: str # "easy" | "medium" | "hard"
|
| 119 |
-
task_description: str
|
| 120 |
-
buggy_code: str # Original broken code β always visible
|
| 121 |
-
test_suite: str # Full test file β agent can read requirements
|
| 122 |
-
initial_error_output: str # Sandbox output on the buggy code at reset()
|
| 123 |
-
|
| 124 |
-
# Changes each step
|
| 125 |
-
current_code: str # Most recent submitted code
|
| 126 |
-
current_error_output: str # Test output on current_code
|
| 127 |
-
tests_passed: int
|
| 128 |
-
tests_total: int
|
| 129 |
-
previous_attempts: List[FixAttempt] # Full episode history
|
| 130 |
-
|
| 131 |
-
# Budget tracking
|
| 132 |
-
attempts_remaining: int
|
| 133 |
-
max_attempts: int
|
| 134 |
-
step_number: int
|
| 135 |
-
max_steps: int
|
| 136 |
-
done: bool
|
| 137 |
-
score_estimate: float # Running grader estimate shown to agent
|
| 138 |
-
hint_used: bool
|
| 139 |
-
```
|
| 140 |
|
| 141 |
-
|
| 142 |
|
| 143 |
-
|
| 144 |
|
| 145 |
-
|
| 146 |
-
class Action(BaseModel):
|
| 147 |
-
action_type: str # "submit_fix" | "query_context" | "give_up"
|
| 148 |
|
| 149 |
-
|
| 150 |
-
fixed_code: Optional[str] = None # Complete corrected code file
|
| 151 |
-
hypothesis: Optional[str] = None # REQUIRED β missing costs -0.10 reward
|
| 152 |
|
| 153 |
-
|
| 154 |
-
query_type: Optional[str] = None # "function_signature" | "related_code"
|
| 155 |
-
# | "error_explanation" | "test_details"
|
| 156 |
-
query_target: Optional[str] = None
|
| 157 |
|
| 158 |
-
|
| 159 |
-
final_diagnosis: Optional[str] = None
|
| 160 |
-
```
|
| 161 |
|
| 162 |
-
|
|
|
|
|
|
|
| 163 |
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
```python
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
| 172 |
```
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
---
|
| 175 |
|
| 176 |
-
## Reward Function
|
| 177 |
|
| 178 |
-
The reward function
|
| 179 |
|
| 180 |
### Step-Level Rewards
|
| 181 |
|
| 182 |
| Event | Reward | Reasoning |
|
| 183 |
|---|---|---|
|
| 184 |
-
| Fix increases tests passing | `+0.15 Γ (Ξpassed / total)` | Scaled progress
|
| 185 |
| Fix decreases tests passing | `-0.10 Γ (Ξfailed / total)` | Regression penalty |
|
| 186 |
-
| Fix makes no change | `-0.05` | Stagnation penalty
|
| 187 |
-
| All tests pass | `+0.50` | Major bonus on top of progress
|
| 188 |
-
|
|
| 189 |
-
| `submit_fix` without hypothesis | `-0.10` | Hypothesis is required |
|
| 190 |
-
|
|
|
|
|
| 191 |
| Episode truncated at max_steps | `-0.20` | Penalizes indecision |
|
| 192 |
|
| 193 |
-
### Episode-Level Grader Score
|
| 194 |
|
| 195 |
```
|
| 196 |
-
grader_score = test_pass_ratio
|
| 197 |
-
+ efficiency_bonus
|
| 198 |
+ hypothesis_accuracy Γ 0.15
|
| 199 |
-
+ early_solve_bonus
|
| 200 |
-
|
| 201 |
-
where:
|
| 202 |
-
test_pass_ratio = agent_best_tests_passed / tests_total
|
| 203 |
-
(from agent submissions only β not initial buggy code)
|
| 204 |
-
efficiency_bonus = max(0, (max_attempts - attempts_used) / max_attempts)
|
| 205 |
-
hypothesis_accuracy = fraction of hypotheses correctly identifying bug location
|
| 206 |
-
early_solve_bonus = 0.05 if all tests pass within ceil(max_attempts / 3) attempts
|
| 207 |
-
```
|
| 208 |
-
|
| 209 |
-
**Score floor design:** `test_pass_ratio` is calculated only from the agent's submitted attempts β never from the initial buggy code run. This guarantees that a dummy agent that submits nothing scores 0.0, not an inflated baseline.
|
| 210 |
-
|
| 211 |
-
---
|
| 212 |
-
|
| 213 |
-
## Tasks
|
| 214 |
-
|
| 215 |
-
### Task 1 β Easy: Off-by-One Bug
|
| 216 |
-
|
| 217 |
-
**Difficulty:** π’ Easy | **Max attempts:** 5 | **Max steps:** 8 | **Tests:** 8
|
| 218 |
-
|
| 219 |
-
A binary search implementation with a single-character bug: the while loop uses `left < right` instead of `left <= right`. This causes the function to miss the target when it is the last element in the array. The failing test produces a high-signal error message directly indicating the problem.
|
| 220 |
-
|
| 221 |
-
**Why it's easy:** The error message names the failing assertion. Reading the while condition reveals the bug. One to two iterations expected for any competent agent.
|
| 222 |
-
|
| 223 |
-
**What the grader checks:** Did the agent fix all 8 tests? Did the hypothesis mention the termination condition or off-by-one logic? Was it solved efficiently?
|
| 224 |
-
|
| 225 |
-
**Expected GPT-4o baseline:** ~0.85
|
| 226 |
-
|
| 227 |
-
---
|
| 228 |
-
|
| 229 |
-
### Task 2 β Medium: Red Herring Authentication Bug
|
| 230 |
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
**Why it's medium:** The agent must resist following the error message and instead reason about data flow between functions. GPT-4o follows this red herring approximately 40% of the time.
|
| 238 |
-
|
| 239 |
-
**Red herring detection in grader:** A hypothesis that mentions only `authenticate_user` scores 0.0 for hypothesis accuracy. A hypothesis that correctly identifies `hash_password` with supporting detail scores 1.0.
|
| 240 |
-
|
| 241 |
-
**Expected GPT-4o baseline:** ~0.50
|
| 242 |
-
|
| 243 |
-
---
|
| 244 |
-
|
| 245 |
-
### Task 3 β Hard: Concurrency Race Condition
|
| 246 |
-
|
| 247 |
-
**Difficulty:** π΄ Hard | **Max attempts:** 10 | **Max steps:** 25 | **Tests:** 8 (all 8 pass on buggy code)
|
| 248 |
-
|
| 249 |
-
A `ConnectionCounter` class used in a web server to track active connections. It uses `threading.Lock` and appears to be correctly implemented. All 8 sequential unit tests pass. The bug is a classic TOCTOU (time-of-check to time-of-use) race condition: `increment()` and `decrement()` split the read-modify-write cycle across two separate lock acquisitions, leaving a window between the read and write where another thread can interleave.
|
| 250 |
-
|
| 251 |
-
```python
|
| 252 |
-
def increment(self):
|
| 253 |
-
with self._lock:
|
| 254 |
-
current = self.count # read β lock released here
|
| 255 |
-
new_val = current + 1 # modify β no lock held
|
| 256 |
-
with self._lock:
|
| 257 |
-
self.count = new_val # write β race window exploited
|
| 258 |
```
|
| 259 |
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
**Why it's hard:** Race conditions are non-deterministic. The bug does not manifest in sequential execution. The agent must demonstrate meta-reasoning about the limits of the existing test suite β a capability current frontier models lack most of the time.
|
| 263 |
-
|
| 264 |
-
**Hard task grader breakdown:**
|
| 265 |
-
- Sequential tests pass: 0.40 (agent submissions only)
|
| 266 |
-
- 1000-thread concurrent stress test passes: 0.30 (run 3Γ β must pass all 3 for full credit)
|
| 267 |
-
- Hypothesis accuracy (mentions "race condition", "atomic", "lock"): 0.20
|
| 268 |
-
- Efficiency bonus (fixed within 5 attempts): 0.10
|
| 269 |
-
|
| 270 |
-
**Expected GPT-4o baseline:** ~0.18
|
| 271 |
|
| 272 |
---
|
| 273 |
|
| 274 |
## Security Sandbox
|
| 275 |
|
| 276 |
-
Every `submit_fix` action executes agent-generated Python code.
|
| 277 |
|
| 278 |
-
|
| 279 |
|
| 280 |
-
**Layer
|
| 281 |
|
| 282 |
-
**Layer
|
| 283 |
|
| 284 |
-
**Layer
|
| 285 |
|
| 286 |
-
**
|
| 287 |
-
|
| 288 |
-
**Threading exception:** The hard task requires `threading` to create the race condition and to verify the fix. The sandbox accepts a `allow_threading=True` flag that removes `threading` from the blocked list for that task only. All other tasks have threading blocked.
|
| 289 |
|
| 290 |
---
|
| 291 |
|
| 292 |
-
##
|
| 293 |
|
| 294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
-
|
| 297 |
-
|
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
| `/reset` | POST | Start a new episode. Body: `{"task_id": "easy"}` |
|
| 302 |
-
| `/step` | POST | Submit one action. Body: Action JSON |
|
| 303 |
-
| `/state` | GET | Full internal episode state |
|
| 304 |
|
| 305 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
---
|
| 308 |
|
| 309 |
-
## OpenEnv Compliance
|
| 310 |
|
| 311 |
```yaml
|
| 312 |
-
# openenv.yaml
|
| 313 |
name: agentdebugger-env
|
| 314 |
version: 1.0.0
|
| 315 |
domain: software_engineering
|
|
@@ -318,52 +216,51 @@ action_type: structured
|
|
| 318 |
reward_type: dense
|
| 319 |
episode_termination: action_or_step_limit
|
| 320 |
tasks:
|
| 321 |
-
- id: easy
|
| 322 |
-
- id: medium
|
| 323 |
-
- id: hard
|
| 324 |
```
|
| 325 |
|
| 326 |
-
|
| 327 |
-
```
|
| 328 |
-
β openenv.yaml valid
|
| 329 |
-
β GET /health β 200
|
| 330 |
-
β POST /reset β valid Observation
|
| 331 |
-
β POST /step β (Observation, Reward, bool, dict)
|
| 332 |
-
β GET /state β dict
|
| 333 |
-
β 3 tasks registered: easy, medium, hard
|
| 334 |
-
β grader_easy: score in [0.0, 1.0] β PASS
|
| 335 |
-
β grader_medium: score in [0.0, 1.0] β PASS
|
| 336 |
-
β grader_hard: score in [0.0, 1.0] β PASS
|
| 337 |
-
β inference.py present in root directory
|
| 338 |
-
openenv validate: PASSED
|
| 339 |
-
```
|
| 340 |
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
|
| 343 |
-
|
| 344 |
|
| 345 |
-
|
| 346 |
|
| 347 |
-
|
| 348 |
-
|---|---|---|---|---|---|---|
|
| 349 |
-
| Off-by-One Bug | Easy | 0.85 | Β±0.04 | 100% | 1.8 | 4.2 |
|
| 350 |
-
| Red Herring Auth | Medium | 0.50 | Β±0.10 | 60% | 4.2 | 10.6 |
|
| 351 |
-
| Race Condition | Hard | 0.18 | Β±0.09 | 20% | 8.7 | 22.1 |
|
| 352 |
-
| **Overall Mean** | | **0.51** | | **60%** | | |
|
| 353 |
|
| 354 |
-
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
-
|
|
|
|
| 357 |
|
| 358 |
-
|
|
|
|
| 359 |
|
| 360 |
-
|
|
|
|
|
|
|
| 361 |
|
| 362 |
-
|
| 363 |
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
-
###
|
| 367 |
|
| 368 |
```bash
|
| 369 |
git clone https://github.com/shasshaank/AgentDebuggerEnv
|
|
@@ -380,87 +277,71 @@ curl http://localhost:8000/health
|
|
| 380 |
# Run baseline inference
|
| 381 |
export API_BASE_URL="https://api.openai.com/v1"
|
| 382 |
export MODEL_NAME="gpt-4o"
|
| 383 |
-
export HF_TOKEN="
|
| 384 |
export ENV_BASE_URL="http://localhost:8000"
|
| 385 |
python inference.py
|
| 386 |
```
|
| 387 |
|
| 388 |
-
|
| 389 |
|
| 390 |
```bash
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
# Run with inference against the containerized environment
|
| 398 |
-
docker run -p 8000:8000 \
|
| 399 |
-
-e API_BASE_URL="https://api.openai.com/v1" \
|
| 400 |
-
-e MODEL_NAME="gpt-4o" \
|
| 401 |
-
-e HF_TOKEN="your_key" \
|
| 402 |
-
agentdebugger-env
|
| 403 |
-
```
|
| 404 |
-
|
| 405 |
-
### Quick API Test
|
| 406 |
-
|
| 407 |
-
```bash
|
| 408 |
-
# Reset the easy task
|
| 409 |
-
curl -X POST http://localhost:8000/reset \
|
| 410 |
-
-H "Content-Type: application/json" \
|
| 411 |
-
-d '{"task_id": "easy"}'
|
| 412 |
-
|
| 413 |
-
# Submit a fix with hypothesis
|
| 414 |
-
curl -X POST http://localhost:8000/step \
|
| 415 |
-
-H "Content-Type: application/json" \
|
| 416 |
-
-d '{
|
| 417 |
-
"action_type": "submit_fix",
|
| 418 |
-
"fixed_code": "def binary_search(arr, target):\n left, right = 0, len(arr) - 1\n while left <= right:\n mid = (left + right) // 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n left = mid + 1\n else:\n right = mid - 1\n return -1",
|
| 419 |
-
"hypothesis": "The while loop uses left < right instead of left <= right, causing it to skip the last element."
|
| 420 |
-
}'
|
| 421 |
```
|
| 422 |
|
| 423 |
---
|
| 424 |
|
| 425 |
-
##
|
| 426 |
-
|
| 427 |
-
Four specific failure modes in LLM agents are measurable and scorable here for the first time:
|
| 428 |
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
| 432 |
|
| 433 |
-
|
| 434 |
|
| 435 |
-
|
| 436 |
|
| 437 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
---
|
| 440 |
|
| 441 |
## Design Decisions
|
| 442 |
|
| 443 |
-
**Why
|
| 444 |
|
| 445 |
-
**Why
|
| 446 |
|
| 447 |
-
**Why run the concurrent stress test
|
| 448 |
|
| 449 |
-
**Why not use pytest directly?** Using pytest as the test runner
|
| 450 |
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
## Environment Configuration
|
| 454 |
-
|
| 455 |
-
```bash
|
| 456 |
-
# Required for inference.py
|
| 457 |
-
API_BASE_URL # LLM API endpoint (e.g. https://api.openai.com/v1)
|
| 458 |
-
MODEL_NAME # Model identifier (e.g. gpt-4o)
|
| 459 |
-
HF_TOKEN # API key / HuggingFace token
|
| 460 |
-
|
| 461 |
-
# Optional β defaults to localhost:8000
|
| 462 |
-
ENV_BASE_URL # Environment server URL
|
| 463 |
-
```
|
| 464 |
|
| 465 |
---
|
| 466 |
|
|
@@ -468,8 +349,16 @@ ENV_BASE_URL # Environment server URL
|
|
| 468 |
|
| 469 |
**License:** MIT β see [LICENSE](LICENSE)
|
| 470 |
|
| 471 |
-
**
|
|
|
|
|
|
|
| 472 |
|
| 473 |
**Submitted to:** Meta + PyTorch + HuggingFace OpenEnv Hackathon
|
| 474 |
|
| 475 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: AgentDebugger-Env π
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
+
pinned: true
|
| 9 |
+
license: mit
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
# AgentDebuggerEnv π
|
| 13 |
|
| 14 |
+
> **A live, iterative debugging environment for benchmarking genuine agentic reasoning in AI systems.**
|
|
|
|
| 15 |
|
| 16 |
+
[](https://huggingface.co/spaces/shashaank0707/AgentDebugger-env)
|
| 17 |
+
[](#openenv-api-compliance)
|
| 18 |
[](LICENSE)
|
| 19 |
[](https://www.python.org/)
|
| 20 |
+
|
| 21 |
+
*Submitted to the **Meta + PyTorch + HuggingFace OpenEnv Hackathon.***
|
| 22 |
|
| 23 |
---
|
| 24 |
|
| 25 |
## The Problem with Existing Code Benchmarks
|
| 26 |
|
| 27 |
+
Benchmarks like HumanEval, MBPP, and SWE-bench share a fundamental limitation: they are **one-shot**. A model reads a problem, generates code, and is scored on the final output. This measures code generation β not debugging ability.
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
Real software engineering is not one-shot. It is **iterative**. A developer reads failing tests, forms a hypothesis, submits a fix, reads the new error output, updates their theory, and repeats. No existing OpenEnv environment benchmarks this loop.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
+
**AgentDebuggerEnv does.**
|
| 32 |
|
| 33 |
---
|
| 34 |
|
| 35 |
+
## How It's Different from SWE-bench
|
|
|
|
|
|
|
| 36 |
|
| 37 |
| Dimension | SWE-bench | AgentDebuggerEnv |
|
| 38 |
|---|---|---|
|
| 39 |
+
| Evaluation target | Final patch correctness | Full reasoning trajectory |
|
| 40 |
+
| Feedback to agent | None β single shot | Real `stdout/stderr` after every attempt |
|
| 41 |
+
| Reward signal | Binary end-of-episode | Dense β every step scored |
|
| 42 |
| What's measured | Code generation | Hypothesis formation + iterative reasoning |
|
| 43 |
+
| Hard task | Apply patch to existing issue | Must design a test to surface a hidden bug |
|
| 44 |
+
| Agent failure modes | Not tracked | 4 distinct measurable failure modes |
|
| 45 |
|
| 46 |
+
The iterative feedback loop is the core mechanic. Every `step()` call executes the agent's code in a live sandbox and returns actual test output. The agent must update its theory and try again β exactly like a real developer at a terminal.
|
| 47 |
|
| 48 |
---
|
| 49 |
|
| 50 |
+
## Baseline Performance
|
| 51 |
|
| 52 |
+
Evaluated using `gpt-4o` with zero-shot prompting. Each task run 5 times independently, scores averaged.
|
| 53 |
|
| 54 |
+
| Task | Difficulty | Mean Score | Std Dev | Solved % | Avg Attempts |
|
| 55 |
+
|---|---|---|---|---|---|
|
| 56 |
+
| Off-by-One Bug | π’ Easy | 0.85 | Β±0.04 | 100% | 1.8 |
|
| 57 |
+
| Red Herring Auth Bug | π‘ Medium | 0.50 | Β±0.10 | 60% | 4.2 |
|
| 58 |
+
| Race Condition | π΄ Hard | 0.18 | Β±0.09 | 20% | 8.7 |
|
| 59 |
+
| **Overall Mean** | | **0.51** | | **60%** | |
|
| 60 |
|
| 61 |
+
The hard task is specifically designed so that frontier models fail most of the time. GPT-4o almost never spontaneously recognizes that a race condition can exist when all sequential tests pass β which is exactly the reasoning gap this environment is built to measure.
|
|
|
|
|
|
|
| 62 |
|
| 63 |
---
|
| 64 |
|
| 65 |
+
All four failure modes produce distinct, interpretable score components in the `breakdown` field of every `Reward` response:
|
| 66 |
|
| 67 |
+
* **Red Herring Susceptibility**: Does the agent overtrust error messages (Medium Task symptom) or trace data flow to the root?
|
| 68 |
+
* **Stagnation**: Does the agent repeat failed fixes? Prohibited by the `-0.05` stagnation penalty.
|
| 69 |
+
* **Exploration/Exploitation**: Measures if agents query for context productively before attempting fixes.
|
| 70 |
+
* **Test-Suite Overconfidence**: Detects if an agent fails to reason about concurrency when sequential tests pass (Hard Task).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
---
|
| 73 |
|
| 74 |
+
## Task Suite
|
| 75 |
|
| 76 |
+
### π’ Task 1 β Easy: Off-by-One Bug
|
| 77 |
|
| 78 |
+
**Max attempts:** 5 | **Max steps:** 8 | **Tests:** 8
|
| 79 |
|
| 80 |
+
A binary search implementation with a single-character bug: the while loop uses `left < right` instead of `left <= right`. This causes the function to miss the target when it is the last element. The failing test produces a high-signal error message pointing directly at the problem.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
+
**Why it's easy:** The error message names the failing assertion with expected vs actual values. Reading the while condition reveals the bug. 1β2 iterations expected.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
+
**What the grader checks:** Did all 8 tests pass? Did the hypothesis mention the termination condition or off-by-one logic? Was it efficient?
|
| 85 |
|
| 86 |
+
---
|
| 87 |
|
| 88 |
+
### π‘ Task 2 β Medium: Red Herring Authentication Bug
|
|
|
|
|
|
|
| 89 |
|
| 90 |
+
**Max attempts:** 7 | **Max steps:** 15 | **Tests:** 10 (6 pass, 4 fail on buggy code)
|
|
|
|
|
|
|
| 91 |
|
| 92 |
+
An authentication module with three interdependent functions: `hash_password`, `validate_password`, and `authenticate_user`. All 4 failing tests report that `authenticate_user` returns `False` when it should return `True`. But `authenticate_user` is completely correct. So is `validate_password`. The bug is in `hash_password`, which wraps the MD5 hex digest in `str(bytes(...))` β producing a `"b'...'"` prefix that makes the computed hash never match the stored hash.
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
**The red herring:** Every surface reading of the error points to `authenticate_user`. The agent must trace data flow backwards through `validate_password` to find the actual corruption in `hash_password`.
|
|
|
|
|
|
|
| 95 |
|
| 96 |
+
**Red herring detection in grader:** A hypothesis mentioning only `authenticate_user` scores 0.0 for hypothesis accuracy. Correctly identifying `hash_password` with supporting detail scores 1.0. GPT-4o follows the red herring ~40% of the time.
|
| 97 |
+
|
| 98 |
+
---
|
| 99 |
|
| 100 |
+
### π΄ Task 3 β Hard: Concurrency Race Condition
|
| 101 |
+
|
| 102 |
+
**Max attempts:** 10 | **Max steps:** 25 | **Tests:** 8 (ALL 8 pass on the buggy code)
|
| 103 |
+
|
| 104 |
+
A `ConnectionCounter` class used in a web server to track active connections. It uses `threading.Lock` and appears correctly implemented. All 8 sequential unit tests pass. The bug is a TOCTOU race condition: `increment()` and `decrement()` split the read-modify-write cycle across two separate lock acquisitions, leaving a window between read and write where another thread can interleave.
|
| 105 |
|
| 106 |
```python
|
| 107 |
+
def increment(self):
|
| 108 |
+
with self._lock:
|
| 109 |
+
current = self.count # read β lock released here
|
| 110 |
+
new_val = current + 1 # modify β NO lock held
|
| 111 |
+
with self._lock:
|
| 112 |
+
self.count = new_val # write β race window
|
| 113 |
```
|
| 114 |
|
| 115 |
+
The agent must: recognize that 8/8 passing tests do not prove correctness for concurrent code, reason about thread interleaving, design a concurrent stress test that surfaces the race, fix the atomicity issue by collapsing read-modify-write into a single lock scope, and verify the fix survives a 1000-thread stress test.
|
| 116 |
+
|
| 117 |
+
**Hard task grader breakdown:**
|
| 118 |
+
- Sequential tests pass (agent submissions only): **0.40**
|
| 119 |
+
- 1000-thread concurrent stress test passes (run 5Γ, must pass >=4 for full credit): **0.30**
|
| 120 |
+
- Hypothesis accuracy (mentions "race condition", "atomic", "lock"): **0.20**
|
| 121 |
+
- Efficiency bonus (fixed within 5 attempts): **0.10**
|
| 122 |
+
|
| 123 |
---
|
| 124 |
|
| 125 |
+
## Reward Function Design
|
| 126 |
|
| 127 |
+
The reward function provides dense signal at every step so an RL agent can learn from every action β not just the final outcome.
|
| 128 |
|
| 129 |
### Step-Level Rewards
|
| 130 |
|
| 131 |
| Event | Reward | Reasoning |
|
| 132 |
|---|---|---|
|
| 133 |
+
| Fix increases tests passing | `+0.15 Γ (Ξpassed / total)` | Scaled progress |
|
| 134 |
| Fix decreases tests passing | `-0.10 Γ (Ξfailed / total)` | Regression penalty |
|
| 135 |
+
| Fix makes no change to passing count | `-0.05` | Stagnation penalty |
|
| 136 |
+
| All tests pass | `+0.50` | Major bonus on top of progress |
|
| 137 |
+
| Submitted code times out in sandbox | `-0.10` | Penalizes infinite loops |
|
| 138 |
+
| `submit_fix` without hypothesis field | `-0.10` | Hypothesis is required |
|
| 139 |
+
| First `query_context` use | `0.00` | Free |
|
| 140 |
+
| Subsequent `query_context` uses | `-0.05` each | Diminishing returns |
|
| 141 |
| Episode truncated at max_steps | `-0.20` | Penalizes indecision |
|
| 142 |
|
| 143 |
+
### Episode-Level Grader Score
|
| 144 |
|
| 145 |
```
|
| 146 |
+
grader_score = test_pass_ratio Γ 0.60
|
| 147 |
+
+ efficiency_bonus Γ 0.20
|
| 148 |
+ hypothesis_accuracy Γ 0.15
|
| 149 |
+
+ early_solve_bonus Γ 0.05
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
+
test_pass_ratio = agent_best_tests_passed / tests_total
|
| 152 |
+
(from agent submissions only β never the initial buggy code run)
|
| 153 |
+
efficiency_bonus = max(0, (max_attempts - attempts_used) / max_attempts)
|
| 154 |
+
hypothesis_accuracy = fraction of hypotheses correctly identifying the bug
|
| 155 |
+
early_solve_bonus = 0.05 if solved within ceil(max_attempts / 3) attempts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
```
|
| 157 |
|
| 158 |
+
**Score floor design:** `test_pass_ratio` uses only the agent's submitted attempts β never the initial buggy code run. The medium buggy code passes 6/10 tests and the hard buggy code passes 8/8 tests sequentially. Without this design, a dummy agent that submits nothing would score 0.36 and 0.40 for free respectively. The grader recalculates from the `attempts` list to guarantee the score floor is 0.0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
---
|
| 161 |
|
| 162 |
## Security Sandbox
|
| 163 |
|
| 164 |
+
Every `submit_fix` action executes agent-generated Python code. All execution routes through `env/sandbox.py` β never via raw `exec()` anywhere in the codebase.
|
| 165 |
|
| 166 |
+
**Layer 1 β AST Import & Attribute Filtering:** Before execution, an AST walk detects blocked imports and prevents access to any attribute starting with an underscore (`_`). This blocks private member access and dunder escapes (like `__class__`).
|
| 167 |
|
| 168 |
+
**Layer 2 β Subprocess Isolation:** Code runs in a child subprocess with a stripped environment and no network access.
|
| 169 |
|
| 170 |
+
**Layer 3 β Hard Timeout:** Every execution killed after 10 seconds. Infinite loops in submitted code return `timed_out: True` and a `-0.10` step reward.
|
| 171 |
|
| 172 |
+
**Layer 4 β Memory Limit:** 256MB per execution.
|
| 173 |
|
| 174 |
+
**Threading exception:** The hard task requires `threading` to create and verify the race condition. The sandbox accepts `allow_threading=True` for that task only. All other tasks block threading entirely.
|
|
|
|
|
|
|
| 175 |
|
| 176 |
---
|
| 177 |
|
| 178 |
+
## Data Models
|
| 179 |
|
| 180 |
+
```python
|
| 181 |
+
class Observation(BaseModel):
|
| 182 |
+
task_id: str # "easy" | "medium" | "hard"
|
| 183 |
+
buggy_code: str # Original broken code
|
| 184 |
+
test_suite: str # Full test file content
|
| 185 |
+
current_code: str # Most recent submitted code
|
| 186 |
+
current_error_output: str # Sandbox stdout/stderr output
|
| 187 |
+
tests_passed: int
|
| 188 |
+
attempts_remaining: int
|
| 189 |
+
max_attempts: int
|
| 190 |
+
done: bool
|
| 191 |
+
score_estimate: float # Running grader estimate
|
| 192 |
|
| 193 |
+
class Action(BaseModel):
|
| 194 |
+
action_type: str # "submit_fix" | "query_context" | "give_up"
|
| 195 |
+
fixed_code: Optional[str] # Complete corrected code
|
| 196 |
+
hypothesis: Optional[str] # Theory about the bug (required for submit)
|
| 197 |
+
query_type: Optional[str] # "function_signature" | "error_explanation" etc.
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
+
class Reward(BaseModel):
|
| 200 |
+
step_reward: float # Dense signal: range -1.0 to +1.0
|
| 201 |
+
cumulative_reward: float
|
| 202 |
+
grader_score: float # Official score (terminal step only)
|
| 203 |
+
breakdown: Dict[str, float] # Itemized components
|
| 204 |
+
```
|
| 205 |
|
| 206 |
---
|
| 207 |
|
| 208 |
+
## OpenEnv API Compliance
|
| 209 |
|
| 210 |
```yaml
|
|
|
|
| 211 |
name: agentdebugger-env
|
| 212 |
version: 1.0.0
|
| 213 |
domain: software_engineering
|
|
|
|
| 216 |
reward_type: dense
|
| 217 |
episode_termination: action_or_step_limit
|
| 218 |
tasks:
|
| 219 |
+
- {id: easy, difficulty: easy, max_steps: 8, max_attempts: 5}
|
| 220 |
+
- {id: medium, difficulty: medium, max_steps: 15, max_attempts: 7}
|
| 221 |
+
- {id: hard, difficulty: hard, max_steps: 25, max_attempts: 10}
|
| 222 |
```
|
| 223 |
|
| 224 |
+
Application-level errors are returned in `info.error` inside the response body. Core evaluation endpoints are designed to avoid 4xx/5xx status codes for agent-level mistakes, ensuring the evaluation flow is never interrupted by network-level exceptions.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
+
| Endpoint | Method | Description |
|
| 227 |
+
|---|---|---|
|
| 228 |
+
| `/` | GET | API overview — lists all endpoints and tasks |
|
| 229 |
+
| `/health` | GET | Health check — always HTTP 200 |
|
| 230 |
+
| `/tasks` | GET | All tasks with metadata |
|
| 231 |
+
| `/reset` | POST | Start episode. Body: `{"task_id": "easy"}` |
|
| 232 |
+
| `/step` | POST | Submit one action |
|
| 233 |
+
| `/state` | GET | Full internal episode state |
|
| 234 |
|
| 235 |
+
---
|
| 236 |
|
| 237 |
+
## Installation & Usage
|
| 238 |
|
| 239 |
+
### Local Setup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
+
```bash
|
| 242 |
+
git clone https://github.com/shasshaank/AgentDebuggerEnv
|
| 243 |
+
cd AgentDebuggerEnv
|
| 244 |
+
pip install -r requirements.txt
|
| 245 |
|
| 246 |
+
# Start the environment server
|
| 247 |
+
uvicorn env.server:app --reload --port 8000
|
| 248 |
|
| 249 |
+
# Verification: Run the pre-submission validator
|
| 250 |
+
python validator.py
|
| 251 |
|
| 252 |
+
# Verify it's running
|
| 253 |
+
curl http://localhost:8000/health
|
| 254 |
+
```
|
| 255 |
|
| 256 |
+
### Docker
|
| 257 |
|
| 258 |
+
```bash
|
| 259 |
+
docker build -t agentdebugger-env .
|
| 260 |
+
docker run -p 8000:8000 agentdebugger-env
|
| 261 |
+
```
|
| 262 |
|
| 263 |
+
### Running the Baseline Inference Script
|
| 264 |
|
| 265 |
```bash
|
| 266 |
git clone https://github.com/shasshaank/AgentDebuggerEnv
|
|
|
|
| 277 |
# Run baseline inference
|
| 278 |
export API_BASE_URL="https://api.openai.com/v1"
|
| 279 |
export MODEL_NAME="gpt-4o"
|
| 280 |
+
export HF_TOKEN="your_api_key"
|
| 281 |
export ENV_BASE_URL="http://localhost:8000"
|
| 282 |
python inference.py
|
| 283 |
```
|
| 284 |
|
| 285 |
+
Using Meta-Llama via HuggingFace (Recommended):
|
| 286 |
|
| 287 |
```bash
|
| 288 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 289 |
+
export MODEL_NAME="meta-llama/Llama-3.1-70B-Instruct"
|
| 290 |
+
export HF_TOKEN="your_huggingface_token"
|
| 291 |
+
export ENV_BASE_URL="http://localhost:8000"
|
| 292 |
+
python inference.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
```
|
| 294 |
|
| 295 |
---
|
| 296 |
|
| 297 |
+
## Environment Variables
|
|
|
|
|
|
|
| 298 |
|
| 299 |
+
| Variable | Description | Default |
|
| 300 |
+
|---|---|---|
|
| 301 |
+
| `API_BASE_URL` | LLM API endpoint | `https://router.huggingface.co/v1` |
|
| 302 |
+
| `MODEL_NAME` | Model identifier | `meta-llama/Llama-3.1-70B-Instruct` |
|
| 303 |
+
| `HF_TOKEN` | Hugging Face Token (Read) | — (required) |
|
| 304 |
+
| `ENV_BASE_URL` | Environment server address | `http://localhost:8000` |
|
| 305 |
|
| 306 |
+
---
|
| 307 |
|
| 308 |
+
## Project Structure
|
| 309 |
|
| 310 |
+
```
|
| 311 |
+
AgentDebuggerEnv/
|
| 312 |
+
βββ inference.py # Baseline script (root β hackathon requirement)
|
| 313 |
+
βββ env/
|
| 314 |
+
β βββ environment.py # Core OpenEnv: reset(), step(), state()
|
| 315 |
+
β βββ models.py # Pydantic v2 Observation, Action, Reward
|
| 316 |
+
β βββ sandbox.py # AST-based sandboxed code execution
|
| 317 |
+
β βββ server.py # FastAPI: /reset /step /state /health /tasks
|
| 318 |
+
β βββ tasks/
|
| 319 |
+
β β βββ task_easy.py # Off-by-one in binary search
|
| 320 |
+
β β βββ task_medium.py # Red herring authentication bug
|
| 321 |
+
β β βββ task_hard.py # Concurrency race condition
|
| 322 |
+
β βββ graders/
|
| 323 |
+
β βββ grader_easy.py # Test pass + efficiency scoring
|
| 324 |
+
β βββ grader_medium.py # Red herring detection + score floor fix
|
| 325 |
+
β βββ grader_hard.py # Sequential + concurrent stress test
|
| 326 |
+
βββ openenv.yaml
|
| 327 |
+
βββ Dockerfile
|
| 328 |
+
βββ requirements.txt
|
| 329 |
+
βββ uv.lock # Reproducible dependency resolution
|
| 330 |
+
```
|
| 331 |
|
| 332 |
---
|
| 333 |
|
| 334 |
## Design Decisions
|
| 335 |
|
| 336 |
+
**Why is hypothesis mandatory?** Requiring a hypothesis on every `submit_fix` prevents degenerate strategies of submitting random code until something passes. It also enables the grader to score `hypothesis_accuracy` independently from `test_pass_ratio` — measuring reasoning quality separately from outcome quality.
|
| 337 |
|
| 338 |
+
**Why recalculate `test_pass_ratio` from the attempts list?** The medium buggy code passes 6/10 tests and the hard buggy code passes 8/8 tests sequentially. If the grader used the environment's `best_tests_passed` (which includes the initial buggy code run at reset), a dummy agent that submits nothing would score 0.36 and 0.40 for free. Recalculating from the `attempts` list guarantees the score floor is 0.0.
|
| 339 |
|
| 340 |
+
**Why run the concurrent stress test 5 times?** Race conditions are non-deterministic. A partial fix that narrows the race window may pass once by luck. Requiring 4 of 5 runs to pass provides a robust statistical threshold that filters out lucky partial fixes while allowing for minor runner jitter. Passing 2 of 5 gives 0.15 — partial credit for progress.
|
| 341 |
|
| 342 |
+
**Why not use pytest directly?** Using pytest as the test runner makes output parsing dependent on pytest's version and output format. The environment uses a lightweight custom test runner embedded as a Python string, producing a consistent `"N passed, M failed"` format that `_parse_tests_passed()` can reliably parse across all platforms and environments.
|
| 343 |
|
| 344 |
+
**Why does `query_context` cost reward after the first use?** Free unlimited context queries would allow agents to trivially read all available information before attempting any fix. The cost structure forces agents to make strategic decisions about when additional information is worth spending a step on — which is a core part of real debugging under time pressure.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
|
| 346 |
---
|
| 347 |
|
|
|
|
| 349 |
|
| 350 |
**License:** MIT β see [LICENSE](LICENSE)
|
| 351 |
|
| 352 |
+
**Author:** Shashaank | GitHub: [@shasshaank](https://github.com/shasshaank) | HF: [@shashaank0707](https://huggingface.co/shashaank0707)
|
| 353 |
+
|
| 354 |
+
**Live Environment:** https://huggingface.co/spaces/shashaank0707/AgentDebugger-env
|
| 355 |
|
| 356 |
**Submitted to:** Meta + PyTorch + HuggingFace OpenEnv Hackathon
|
| 357 |
|
| 358 |
+
---
|
| 359 |
+
|
| 360 |
+
## Submission Integrity
|
| 361 |
+
|
| 362 |
+
- **Commit SHA:** `5c507c313ff2c209d7b770af6f08cf6ed6ab1568`
|
| 363 |
+
- **Last Verified Sync:** 2026-04-09
|
| 364 |
+
- **Platform Match:** GitHub and HF Space are in sync at this HEAD
|
data/bugs_tier1.jsonl
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"id": "t1_001", "difficulty": 1, "bug_type": "off_by_one", "function_name": "binary_search", "buggy_code": "def binary_search(arr, target):\n left, right = 0, len(arr)\n while left < right:\n mid = (left + right) // 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n left = mid + 1\n else:\n right = mid\n return -1", "original_code": "def binary_search(arr, target):\n left, right = 0, len(arr) - 1\n while left <= right:\n mid = (left + right) // 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n left = mid + 1\n else:\n right = mid - 1\n return -1", "initial_error": "IndexError: list index out of range on line 5", "bug_location": {"function": "binary_search", "line_start": 2}, "test_cases": [{"input": [[1, 3, 5, 7, 9], 5], "expected_output": 2}, {"input": [[1, 3, 5, 7, 9], 1], "expected_output": 0}, {"input": [[1, 3, 5, 7, 9], 9], "expected_output": 4}, {"input": [[1, 3, 5, 7, 9], 4], "expected_output": -1}]}
|
| 2 |
+
{"id": "t1_002", "difficulty": 1, "bug_type": "wrong_operator", "function_name": "is_palindrome", "buggy_code": "def is_palindrome(s):\n return s == s[::-1] and len(s) > 0", "original_code": "def is_palindrome(s):\n return s == s[::-1]", "initial_error": "AssertionError: is_palindrome('') expected True, got False", "bug_location": {"function": "is_palindrome", "line_start": 2}, "test_cases": [{"input": "racecar", "expected_output": true}, {"input": "hello", "expected_output": false}, {"input": "", "expected_output": true}, {"input": "a", "expected_output": true}]}
|
| 3 |
+
{"id": "t1_003", "difficulty": 1, "bug_type": "off_by_one", "function_name": "find_max", "buggy_code": "def find_max(nums):\n max_val = nums[0]\n for i in range(1, len(nums) + 1):\n if nums[i] > max_val:\n max_val = nums[i]\n return max_val", "original_code": "def find_max(nums):\n max_val = nums[0]\n for i in range(1, len(nums)):\n if nums[i] > max_val:\n max_val = nums[i]\n return max_val", "initial_error": "IndexError: list index out of range on line 4", "bug_location": {"function": "find_max", "line_start": 3}, "test_cases": [{"input": [3, 1, 4, 1, 5, 9], "expected_output": 9}, {"input": [1], "expected_output": 1}, {"input": [-5, -1, -3], "expected_output": -1}, {"input": [7, 7, 7], "expected_output": 7}]}
|
| 4 |
+
{"id": "t1_004", "difficulty": 1, "bug_type": "wrong_operator", "function_name": "count_vowels", "buggy_code": "def count_vowels(s):\n count = 0\n for ch in s:\n if ch in 'aeiou':\n count += 1\n return count", "original_code": "def count_vowels(s):\n count = 0\n for ch in s.lower():\n if ch in 'aeiou':\n count += 1\n return count", "initial_error": "AssertionError: count_vowels('Hello') expected 2, got 1", "bug_location": {"function": "count_vowels", "line_start": 3}, "test_cases": [{"input": "hello", "expected_output": 2}, {"input": "Hello", "expected_output": 2}, {"input": "AEIOU", "expected_output": 5}, {"input": "xyz", "expected_output": 0}]}
|
| 5 |
+
{"id": "t1_005", "difficulty": 1, "bug_type": "off_by_one", "function_name": "sum_list", "buggy_code": "def sum_list(nums):\n total = 0\n for i in range(len(nums) - 1):\n total += nums[i]\n return total", "original_code": "def sum_list(nums):\n total = 0\n for i in range(len(nums)):\n total += nums[i]\n return total", "initial_error": "AssertionError: sum_list([1,2,3]) expected 6, got 3", "bug_location": {"function": "sum_list", "line_start": 3}, "test_cases": [{"input": [1, 2, 3], "expected_output": 6}, {"input": [0], "expected_output": 0}, {"input": [10, 20, 30, 40], "expected_output": 100}, {"input": [], "expected_output": 0}]}
|
| 6 |
+
{"id": "t1_006", "difficulty": 1, "bug_type": "wrong_comparison", "function_name": "is_sorted", "buggy_code": "def is_sorted(lst):\n for i in range(len(lst) - 1):\n if lst[i] > lst[i + 1]:\n return True\n return False", "original_code": "def is_sorted(lst):\n for i in range(len(lst) - 1):\n if lst[i] > lst[i + 1]:\n return False\n return True", "initial_error": "AssertionError: is_sorted([1,2,3]) expected True, got False", "bug_location": {"function": "is_sorted", "line_start": 4}, "test_cases": [{"input": [1, 2, 3], "expected_output": true}, {"input": [3, 1, 2], "expected_output": false}, {"input": [1], "expected_output": true}, {"input": [2, 2, 2], "expected_output": true}]}
|
| 7 |
+
{"id": "t1_007", "difficulty": 1, "bug_type": "wrong_operator", "function_name": "factorial", "buggy_code": "def factorial(n):\n if n == 0:\n return 0\n result = 1\n for i in range(1, n + 1):\n result *= i\n return result", "original_code": "def factorial(n):\n if n == 0:\n return 1\n result = 1\n for i in range(1, n + 1):\n result *= i\n return result", "initial_error": "AssertionError: factorial(0) expected 1, got 0", "bug_location": {"function": "factorial", "line_start": 3}, "test_cases": [{"input": 0, "expected_output": 1}, {"input": 1, "expected_output": 1}, {"input": 5, "expected_output": 120}, {"input": 3, "expected_output": 6}]}
|
| 8 |
+
{"id": "t1_008", "difficulty": 1, "bug_type": "logic_inversion", "function_name": "is_even", "buggy_code": "def is_even(n):\n return n % 2 != 0", "original_code": "def is_even(n):\n return n % 2 == 0", "initial_error": "AssertionError: is_even(4) expected True, got False", "bug_location": {"function": "is_even", "line_start": 2}, "test_cases": [{"input": 4, "expected_output": true}, {"input": 3, "expected_output": false}, {"input": 0, "expected_output": true}, {"input": -2, "expected_output": true}]}
|
data/bugs_tier2.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"id": "t2_001", "difficulty": 2, "bug_type": "wrong_variable", "function_name": "two_sum", "buggy_code": "def two_sum(nums, target):\n seen = {}\n for i, num in enumerate(nums):\n complement = target - num\n if complement in seen:\n return [seen[complement], i]\n seen[num] = num\n return []", "original_code": "def two_sum(nums, target):\n seen = {}\n for i, num in enumerate(nums):\n complement = target - num\n if complement in seen:\n return [seen[complement], i]\n seen[num] = i\n return []", "initial_error": "AssertionError: two_sum([2,7,11,15], 9) expected [0,1], got [2,1]", "bug_location": {"function": "two_sum", "line_start": 7}, "test_cases": [{"input": [[2, 7, 11, 15], 9], "expected_output": [0, 1]}, {"input": [[3, 2, 4], 6], "expected_output": [1, 2]}, {"input": [[3, 3], 6], "expected_output": [0, 1]}]}
|
| 2 |
+
{"id": "t2_002", "difficulty": 2, "bug_type": "missing_base_case", "function_name": "fibonacci", "buggy_code": "def fibonacci(n):\n if n == 0:\n return 0\n return fibonacci(n - 1) + fibonacci(n - 2)", "original_code": "def fibonacci(n):\n if n == 0:\n return 0\n if n == 1:\n return 1\n return fibonacci(n - 1) + fibonacci(n - 2)", "initial_error": "RecursionError: maximum recursion depth exceeded", "bug_location": {"function": "fibonacci", "line_start": 4}, "test_cases": [{"input": 0, "expected_output": 0}, {"input": 1, "expected_output": 1}, {"input": 5, "expected_output": 5}, {"input": 7, "expected_output": 13}]}
|
| 3 |
+
{"id": "t2_003", "difficulty": 2, "bug_type": "wrong_accumulator", "function_name": "flatten", "buggy_code": "def flatten(lst):\n result = []\n for item in lst:\n if isinstance(item, list):\n result.append(flatten(item))\n else:\n result.append(item)\n return result", "original_code": "def flatten(lst):\n result = []\n for item in lst:\n if isinstance(item, list):\n result.extend(flatten(item))\n else:\n result.append(item)\n return result", "initial_error": "AssertionError: flatten([[1,[2]],3]) expected [1,2,3], got [1,[2],3]", "bug_location": {"function": "flatten", "line_start": 5}, "test_cases": [{"input": [[1, [2]], 3], "expected_output": [1, 2, 3]}, {"input": [1, 2, 3], "expected_output": [1, 2, 3]}, {"input": [[1, 2], [3, [4, 5]]], "expected_output": [1, 2, 3, 4, 5]}]}
|
data/bugs_tier3.jsonl
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"id": "t3_001", "difficulty": 3, "bug_type": "edge_case_only", "function_name": "merge_sorted", "buggy_code": "def merge_sorted(a, b):\n result = []\n i = j = 0\n while i < len(a) and j < len(b):\n if a[i] <= b[j]:\n result.append(a[i])\n i += 1\n else:\n result.append(b[j])\n j += 1\n return result", "original_code": "def merge_sorted(a, b):\n result = []\n i = j = 0\n while i < len(a) and j < len(b):\n if a[i] <= b[j]:\n result.append(a[i])\n i += 1\n else:\n result.append(b[j])\n j += 1\n result.extend(a[i:])\n result.extend(b[j:])\n return result", "initial_error": "AssertionError: merge_sorted([1,3],[2,4,5]) expected [1,2,3,4,5], got [1,2,3]", "bug_location": {"function": "merge_sorted", "line_start": 11}, "test_cases": [{"input": [[1, 3], [2, 4, 5]], "expected_output": [1, 2, 3, 4, 5]}, {"input": [[], [1, 2]], "expected_output": [1, 2]}, {"input": [[1, 2], []], "expected_output": [1, 2]}, {"input": [[1], [2]], "expected_output": [1, 2]}]}
|
| 2 |
+
{"id": "t3_002", "difficulty": 3, "bug_type": "subtle_logic", "function_name": "rotate_matrix", "buggy_code": "def rotate_matrix(matrix):\n n = len(matrix)\n for i in range(n):\n for j in range(i, n):\n matrix[i][j], matrix[j][i] = matrix[j][i], matrix[i][j]\n return matrix", "original_code": "def rotate_matrix(matrix):\n n = len(matrix)\n for i in range(n):\n for j in range(i, n):\n matrix[i][j], matrix[j][i] = matrix[j][i], matrix[i][j]\n for row in matrix:\n row.reverse()\n return matrix", "initial_error": "AssertionError: rotate_matrix([[1,2],[3,4]]) expected [[3,1],[4,2]], got [[1,3],[2,4]]", "bug_location": {"function": "rotate_matrix", "line_start": 6}, "test_cases": [{"input": [[1, 2], [3, 4]], "expected_output": [[3, 1], [4, 2]]}, {"input": [[1, 2, 3], [4, 5, 6], [7, 8, 9]], "expected_output": [[7, 4, 1], [8, 5, 2], [9, 6, 3]]}]}
|
data/generate_bugs.py
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AgentDebuggerEnv β Bug Dataset Generator
|
| 3 |
+
|
| 4 |
+
Generates three tiers of buggy Python functions for curriculum learning:
|
| 5 |
+
Tier 1 (easy): Off-by-one errors, wrong operators, simple logic inversions
|
| 6 |
+
Tier 2 (medium): Incorrect algorithm logic, wrong variable references, subtle type errors
|
| 7 |
+
Tier 3 (hard): Multi-bug interactions, concurrency, edge-case-only failures
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python data/generate_bugs.py
|
| 11 |
+
|
| 12 |
+
Outputs:
|
| 13 |
+
data/bugs_tier1.jsonl (~40 bugs)
|
| 14 |
+
data/bugs_tier2.jsonl (~30 bugs)
|
| 15 |
+
data/bugs_tier3.jsonl (~20 bugs)
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import json
|
| 19 |
+
import os
|
| 20 |
+
|
| 21 |
+
# Tier 1 bug catalogue: single-edit defects (off-by-one bounds, wrong
# operator, inverted logic) in small self-contained functions.
#
# Record schema:
#   id             stable identifier ("t1_NNN")
#   difficulty     curriculum tier (1 = easy)
#   bug_type       coarse defect category
#   function_name  entry point defined by both code strings
#   buggy_code     source containing exactly one planted defect
#   original_code  reference (correct) implementation
#   initial_error  first failure observed when the tests run the buggy code
#   bug_location   function name + 1-based line of the defect in buggy_code
#   test_cases     input/expected_output pairs used by the grader
#
# NOTE(review): t1_001 previously shipped "buggy" code that was in fact a
# correct `left < right` / `right = mid` binary-search variant: it passed
# all four test cases and could never raise the claimed IndexError (mid is
# always strictly less than right <= len(arr)). It now plants a genuine
# boundary off-by-one (`while left < right` with inclusive bounds), which
# misses a target at the last reachable index, and initial_error matches.
TIER1_BUGS = [
    {
        "id": "t1_001",
        "difficulty": 1,
        "bug_type": "off_by_one",
        "function_name": "binary_search",
        # `left < right` with inclusive bounds never examines the final
        # candidate index, so a target at the right boundary is missed.
        "buggy_code": "def binary_search(arr, target):\n    left, right = 0, len(arr) - 1\n    while left < right:\n        mid = (left + right) // 2\n        if arr[mid] == target:\n            return mid\n        elif arr[mid] < target:\n            left = mid + 1\n        else:\n            right = mid - 1\n    return -1",
        "original_code": "def binary_search(arr, target):\n    left, right = 0, len(arr) - 1\n    while left <= right:\n        mid = (left + right) // 2\n        if arr[mid] == target:\n            return mid\n        elif arr[mid] < target:\n            left = mid + 1\n        else:\n            right = mid - 1\n    return -1",
        "initial_error": "AssertionError: binary_search([1,3,5,7,9], 9) expected 4, got -1",
        "bug_location": {"function": "binary_search", "line_start": 3},
        "test_cases": [
            {"input": [[1, 3, 5, 7, 9], 5], "expected_output": 2},
            {"input": [[1, 3, 5, 7, 9], 1], "expected_output": 0},
            {"input": [[1, 3, 5, 7, 9], 9], "expected_output": 4},
            {"input": [[1, 3, 5, 7, 9], 4], "expected_output": -1},
        ],
    },
    {
        "id": "t1_002",
        "difficulty": 1,
        "bug_type": "wrong_operator",
        "function_name": "is_palindrome",
        # Extra `and len(s) > 0` wrongly rejects the empty string.
        "buggy_code": "def is_palindrome(s):\n    return s == s[::-1] and len(s) > 0",
        "original_code": "def is_palindrome(s):\n    return s == s[::-1]",
        "initial_error": "AssertionError: is_palindrome('') expected True, got False",
        "bug_location": {"function": "is_palindrome", "line_start": 2},
        "test_cases": [
            {"input": "racecar", "expected_output": True},
            {"input": "hello", "expected_output": False},
            {"input": "", "expected_output": True},
            {"input": "a", "expected_output": True},
        ],
    },
    {
        "id": "t1_003",
        "difficulty": 1,
        "bug_type": "off_by_one",
        "function_name": "find_max",
        # range upper bound of len(nums) + 1 walks one index past the end.
        "buggy_code": "def find_max(nums):\n    max_val = nums[0]\n    for i in range(1, len(nums) + 1):\n        if nums[i] > max_val:\n            max_val = nums[i]\n    return max_val",
        "original_code": "def find_max(nums):\n    max_val = nums[0]\n    for i in range(1, len(nums)):\n        if nums[i] > max_val:\n            max_val = nums[i]\n    return max_val",
        "initial_error": "IndexError: list index out of range on line 4",
        "bug_location": {"function": "find_max", "line_start": 3},
        "test_cases": [
            {"input": [3, 1, 4, 1, 5, 9], "expected_output": 9},
            {"input": [1], "expected_output": 1},
            {"input": [-5, -1, -3], "expected_output": -1},
            {"input": [7, 7, 7], "expected_output": 7},
        ],
    },
    {
        "id": "t1_004",
        "difficulty": 1,
        "bug_type": "wrong_operator",
        "function_name": "count_vowels",
        # Missing .lower() means uppercase vowels are not counted.
        "buggy_code": "def count_vowels(s):\n    count = 0\n    for ch in s:\n        if ch in 'aeiou':\n            count += 1\n    return count",
        "original_code": "def count_vowels(s):\n    count = 0\n    for ch in s.lower():\n        if ch in 'aeiou':\n            count += 1\n    return count",
        "initial_error": "AssertionError: count_vowels('Hello') expected 2, got 1",
        "bug_location": {"function": "count_vowels", "line_start": 3},
        "test_cases": [
            {"input": "hello", "expected_output": 2},
            {"input": "Hello", "expected_output": 2},
            {"input": "AEIOU", "expected_output": 5},
            {"input": "xyz", "expected_output": 0},
        ],
    },
    {
        "id": "t1_005",
        "difficulty": 1,
        "bug_type": "off_by_one",
        "function_name": "sum_list",
        # range(len(nums) - 1) drops the last element from the sum.
        "buggy_code": "def sum_list(nums):\n    total = 0\n    for i in range(len(nums) - 1):\n        total += nums[i]\n    return total",
        "original_code": "def sum_list(nums):\n    total = 0\n    for i in range(len(nums)):\n        total += nums[i]\n    return total",
        "initial_error": "AssertionError: sum_list([1,2,3]) expected 6, got 3",
        "bug_location": {"function": "sum_list", "line_start": 3},
        "test_cases": [
            {"input": [1, 2, 3], "expected_output": 6},
            {"input": [0], "expected_output": 0},
            {"input": [10, 20, 30, 40], "expected_output": 100},
            {"input": [], "expected_output": 0},
        ],
    },
    {
        "id": "t1_006",
        "difficulty": 1,
        "bug_type": "wrong_comparison",
        "function_name": "is_sorted",
        # Return values are inverted: an out-of-order pair yields True.
        "buggy_code": "def is_sorted(lst):\n    for i in range(len(lst) - 1):\n        if lst[i] > lst[i + 1]:\n            return True\n    return False",
        "original_code": "def is_sorted(lst):\n    for i in range(len(lst) - 1):\n        if lst[i] > lst[i + 1]:\n            return False\n    return True",
        "initial_error": "AssertionError: is_sorted([1,2,3]) expected True, got False",
        "bug_location": {"function": "is_sorted", "line_start": 4},
        "test_cases": [
            {"input": [1, 2, 3], "expected_output": True},
            {"input": [3, 1, 2], "expected_output": False},
            {"input": [1], "expected_output": True},
            {"input": [2, 2, 2], "expected_output": True},
        ],
    },
    {
        "id": "t1_007",
        "difficulty": 1,
        "bug_type": "wrong_operator",
        "function_name": "factorial",
        # Base case returns 0 instead of 1, so factorial(0) is wrong.
        "buggy_code": "def factorial(n):\n    if n == 0:\n        return 0\n    result = 1\n    for i in range(1, n + 1):\n        result *= i\n    return result",
        "original_code": "def factorial(n):\n    if n == 0:\n        return 1\n    result = 1\n    for i in range(1, n + 1):\n        result *= i\n    return result",
        "initial_error": "AssertionError: factorial(0) expected 1, got 0",
        "bug_location": {"function": "factorial", "line_start": 3},
        "test_cases": [
            {"input": 0, "expected_output": 1},
            {"input": 1, "expected_output": 1},
            {"input": 5, "expected_output": 120},
            {"input": 3, "expected_output": 6},
        ],
    },
    {
        "id": "t1_008",
        "difficulty": 1,
        "bug_type": "logic_inversion",
        "function_name": "is_even",
        # Comparison flipped: != 0 classifies odd numbers as even.
        "buggy_code": "def is_even(n):\n    return n % 2 != 0",
        "original_code": "def is_even(n):\n    return n % 2 == 0",
        "initial_error": "AssertionError: is_even(4) expected True, got False",
        "bug_location": {"function": "is_even", "line_start": 2},
        "test_cases": [
            {"input": 4, "expected_output": True},
            {"input": 3, "expected_output": False},
            {"input": 0, "expected_output": True},
            {"input": -2, "expected_output": True},
        ],
    },
]
|
| 255 |
+
|
| 256 |
+
TIER2_BUGS = [
|
| 257 |
+
{
|
| 258 |
+
"id": "t2_001",
|
| 259 |
+
"difficulty": 2,
|
| 260 |
+
"bug_type": "wrong_variable",
|
| 261 |
+
"function_name": "two_sum",
|
| 262 |
+
"buggy_code": (
|
| 263 |
+
"def two_sum(nums, target):\n"
|
| 264 |
+
" seen = {}\n"
|
| 265 |
+
" for i, num in enumerate(nums):\n"
|
| 266 |
+
" complement = target - num\n"
|
| 267 |
+
" if complement in seen:\n"
|
| 268 |
+
" return [seen[complement], i]\n"
|
| 269 |
+
" seen[num] = num\n"
|
| 270 |
+
" return []"
|
| 271 |
+
),
|
| 272 |
+
"original_code": (
|
| 273 |
+
"def two_sum(nums, target):\n"
|
| 274 |
+
" seen = {}\n"
|
| 275 |
+
" for i, num in enumerate(nums):\n"
|
| 276 |
+
" complement = target - num\n"
|
| 277 |
+
" if complement in seen:\n"
|
| 278 |
+
" return [seen[complement], i]\n"
|
| 279 |
+
" seen[num] = i\n"
|
| 280 |
+
" return []"
|
| 281 |
+
),
|
| 282 |
+
"initial_error": "AssertionError: two_sum([2,7,11,15], 9) expected [0,1], got [2,1]",
|
| 283 |
+
"bug_location": {"function": "two_sum", "line_start": 7},
|
| 284 |
+
"test_cases": [
|
| 285 |
+
{"input": [[2, 7, 11, 15], 9], "expected_output": [0, 1]},
|
| 286 |
+
{"input": [[3, 2, 4], 6], "expected_output": [1, 2]},
|
| 287 |
+
{"input": [[3, 3], 6], "expected_output": [0, 1]},
|
| 288 |
+
],
|
| 289 |
+
},
|
| 290 |
+
{
|
| 291 |
+
"id": "t2_002",
|
| 292 |
+
"difficulty": 2,
|
| 293 |
+
"bug_type": "missing_base_case",
|
| 294 |
+
"function_name": "fibonacci",
|
| 295 |
+
"buggy_code": (
|
| 296 |
+
"def fibonacci(n):\n"
|
| 297 |
+
" if n == 0:\n"
|
| 298 |
+
" return 0\n"
|
| 299 |
+
" return fibonacci(n - 1) + fibonacci(n - 2)"
|
| 300 |
+
),
|
| 301 |
+
"original_code": (
|
| 302 |
+
"def fibonacci(n):\n"
|
| 303 |
+
" if n == 0:\n"
|
| 304 |
+
" return 0\n"
|
| 305 |
+
" if n == 1:\n"
|
| 306 |
+
" return 1\n"
|
| 307 |
+
" return fibonacci(n - 1) + fibonacci(n - 2)"
|
| 308 |
+
),
|
| 309 |
+
"initial_error": "RecursionError: maximum recursion depth exceeded",
|
| 310 |
+
"bug_location": {"function": "fibonacci", "line_start": 4},
|
| 311 |
+
"test_cases": [
|
| 312 |
+
{"input": 0, "expected_output": 0},
|
| 313 |
+
{"input": 1, "expected_output": 1},
|
| 314 |
+
{"input": 5, "expected_output": 5},
|
| 315 |
+
{"input": 7, "expected_output": 13},
|
| 316 |
+
],
|
| 317 |
+
},
|
| 318 |
+
{
|
| 319 |
+
"id": "t2_003",
|
| 320 |
+
"difficulty": 2,
|
| 321 |
+
"bug_type": "wrong_accumulator",
|
| 322 |
+
"function_name": "flatten",
|
| 323 |
+
"buggy_code": (
|
| 324 |
+
"def flatten(lst):\n"
|
| 325 |
+
" result = []\n"
|
| 326 |
+
" for item in lst:\n"
|
| 327 |
+
" if isinstance(item, list):\n"
|
| 328 |
+
" result.append(flatten(item))\n"
|
| 329 |
+
" else:\n"
|
| 330 |
+
" result.append(item)\n"
|
| 331 |
+
" return result"
|
| 332 |
+
),
|
| 333 |
+
"original_code": (
|
| 334 |
+
"def flatten(lst):\n"
|
| 335 |
+
" result = []\n"
|
| 336 |
+
" for item in lst:\n"
|
| 337 |
+
" if isinstance(item, list):\n"
|
| 338 |
+
" result.extend(flatten(item))\n"
|
| 339 |
+
" else:\n"
|
| 340 |
+
" result.append(item)\n"
|
| 341 |
+
" return result"
|
| 342 |
+
),
|
| 343 |
+
"initial_error": "AssertionError: flatten([[1,[2]],3]) expected [1,2,3], got [1,[2],3]",
|
| 344 |
+
"bug_location": {"function": "flatten", "line_start": 5},
|
| 345 |
+
"test_cases": [
|
| 346 |
+
{"input": [[1, [2]], 3], "expected_output": [1, 2, 3]},
|
| 347 |
+
{"input": [1, 2, 3], "expected_output": [1, 2, 3]},
|
| 348 |
+
{"input": [[1, 2], [3, [4, 5]]], "expected_output": [1, 2, 3, 4, 5]},
|
| 349 |
+
],
|
| 350 |
+
},
|
| 351 |
+
]
|
| 352 |
+
|
| 353 |
+
TIER3_BUGS = [
|
| 354 |
+
{
|
| 355 |
+
"id": "t3_001",
|
| 356 |
+
"difficulty": 3,
|
| 357 |
+
"bug_type": "edge_case_only",
|
| 358 |
+
"function_name": "merge_sorted",
|
| 359 |
+
"buggy_code": (
|
| 360 |
+
"def merge_sorted(a, b):\n"
|
| 361 |
+
" result = []\n"
|
| 362 |
+
" i = j = 0\n"
|
| 363 |
+
" while i < len(a) and j < len(b):\n"
|
| 364 |
+
" if a[i] <= b[j]:\n"
|
| 365 |
+
" result.append(a[i])\n"
|
| 366 |
+
" i += 1\n"
|
| 367 |
+
" else:\n"
|
| 368 |
+
" result.append(b[j])\n"
|
| 369 |
+
" j += 1\n"
|
| 370 |
+
" return result"
|
| 371 |
+
),
|
| 372 |
+
"original_code": (
|
| 373 |
+
"def merge_sorted(a, b):\n"
|
| 374 |
+
" result = []\n"
|
| 375 |
+
" i = j = 0\n"
|
| 376 |
+
" while i < len(a) and j < len(b):\n"
|
| 377 |
+
" if a[i] <= b[j]:\n"
|
| 378 |
+
" result.append(a[i])\n"
|
| 379 |
+
" i += 1\n"
|
| 380 |
+
" else:\n"
|
| 381 |
+
" result.append(b[j])\n"
|
| 382 |
+
" j += 1\n"
|
| 383 |
+
" result.extend(a[i:])\n"
|
| 384 |
+
" result.extend(b[j:])\n"
|
| 385 |
+
" return result"
|
| 386 |
+
),
|
| 387 |
+
"initial_error": "AssertionError: merge_sorted([1,3],[2,4,5]) expected [1,2,3,4,5], got [1,2,3]",
|
| 388 |
+
"bug_location": {"function": "merge_sorted", "line_start": 11},
|
| 389 |
+
"test_cases": [
|
| 390 |
+
{"input": [[1, 3], [2, 4, 5]], "expected_output": [1, 2, 3, 4, 5]},
|
| 391 |
+
{"input": [[], [1, 2]], "expected_output": [1, 2]},
|
| 392 |
+
{"input": [[1, 2], []], "expected_output": [1, 2]},
|
| 393 |
+
{"input": [[1], [2]], "expected_output": [1, 2]},
|
| 394 |
+
],
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"id": "t3_002",
|
| 398 |
+
"difficulty": 3,
|
| 399 |
+
"bug_type": "subtle_logic",
|
| 400 |
+
"function_name": "rotate_matrix",
|
| 401 |
+
"buggy_code": (
|
| 402 |
+
"def rotate_matrix(matrix):\n"
|
| 403 |
+
" n = len(matrix)\n"
|
| 404 |
+
" for i in range(n):\n"
|
| 405 |
+
" for j in range(i, n):\n"
|
| 406 |
+
" matrix[i][j], matrix[j][i] = matrix[j][i], matrix[i][j]\n"
|
| 407 |
+
" return matrix"
|
| 408 |
+
),
|
| 409 |
+
"original_code": (
|
| 410 |
+
"def rotate_matrix(matrix):\n"
|
| 411 |
+
" n = len(matrix)\n"
|
| 412 |
+
" for i in range(n):\n"
|
| 413 |
+
" for j in range(i, n):\n"
|
| 414 |
+
" matrix[i][j], matrix[j][i] = matrix[j][i], matrix[i][j]\n"
|
| 415 |
+
" for row in matrix:\n"
|
| 416 |
+
" row.reverse()\n"
|
| 417 |
+
" return matrix"
|
| 418 |
+
),
|
| 419 |
+
"initial_error": "AssertionError: rotate_matrix([[1,2],[3,4]]) expected [[3,1],[4,2]], got [[1,3],[2,4]]",
|
| 420 |
+
"bug_location": {"function": "rotate_matrix", "line_start": 6},
|
| 421 |
+
"test_cases": [
|
| 422 |
+
{"input": [[1, 2], [3, 4]], "expected_output": [[3, 1], [4, 2]]},
|
| 423 |
+
{"input": [[1, 2, 3], [4, 5, 6], [7, 8, 9]], "expected_output": [[7, 4, 1], [8, 5, 2], [9, 6, 3]]},
|
| 424 |
+
],
|
| 425 |
+
},
|
| 426 |
+
]
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def write_jsonl(bugs: list, path: str):
    """Serialize a list of bug dicts to *path*, one JSON object per line.

    Any existing file at *path* is overwritten. Prints a one-line summary
    so the generator script gives feedback when run from the command line.
    """
    # Explicit utf-8 keeps output identical regardless of platform locale;
    # json.dumps escapes non-ASCII by default, so the files stay portable.
    with open(path, "w", encoding="utf-8") as f:
        for bug in bugs:
            f.write(json.dumps(bug) + "\n")
    print(f"Wrote {len(bugs)} bugs to {path}")
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
if __name__ == "__main__":
    # Regenerate the three curriculum datasets consumed by the training
    # environment (tier 1 = easiest, tier 3 = hardest). The output directory
    # is created on demand; existing .jsonl files are overwritten.
    os.makedirs("data", exist_ok=True)
    write_jsonl(TIER1_BUGS, "data/bugs_tier1.jsonl")
    write_jsonl(TIER2_BUGS, "data/bugs_tier2.jsonl")
    write_jsonl(TIER3_BUGS, "data/bugs_tier3.jsonl")
    print("\nDone. Run training/train_grpo.py to start training.")
|
env/__pycache__/environment.cpython-310.pyc
CHANGED
|
Binary files a/env/__pycache__/environment.cpython-310.pyc and b/env/__pycache__/environment.cpython-310.pyc differ
|
|
|
env/__pycache__/environment.cpython-313.pyc
CHANGED
|
Binary files a/env/__pycache__/environment.cpython-313.pyc and b/env/__pycache__/environment.cpython-313.pyc differ
|
|
|
env/__pycache__/models.cpython-310.pyc
CHANGED
|
Binary files a/env/__pycache__/models.cpython-310.pyc and b/env/__pycache__/models.cpython-310.pyc differ
|
|
|
env/__pycache__/models.cpython-313.pyc
CHANGED
|
Binary files a/env/__pycache__/models.cpython-313.pyc and b/env/__pycache__/models.cpython-313.pyc differ
|
|
|
env/__pycache__/sandbox.cpython-310.pyc
CHANGED
|
Binary files a/env/__pycache__/sandbox.cpython-310.pyc and b/env/__pycache__/sandbox.cpython-310.pyc differ
|
|
|
env/environment.py
CHANGED
|
@@ -6,20 +6,31 @@ debugging episode lifecycle including task initialization, action
|
|
| 6 |
processing, and reward calculation.
|
| 7 |
"""
|
| 8 |
|
|
|
|
|
|
|
| 9 |
import re
|
| 10 |
import math
|
|
|
|
| 11 |
from typing import Dict, Any, Optional, Tuple
|
| 12 |
|
| 13 |
-
from env.models import Observation, Action, Reward, FixAttempt
|
| 14 |
from env.sandbox import execute_code
|
| 15 |
from env.tasks.registry import get_task, list_tasks
|
| 16 |
from env.graders import get_grader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
class DebuggerEnvironment:
|
| 20 |
"""Core debugging environment implementing the OpenEnv interface."""
|
| 21 |
|
| 22 |
-
def __init__(self):
|
| 23 |
self._task_config: Optional[dict] = None
|
| 24 |
self._observation: Optional[Observation] = None
|
| 25 |
self._cumulative_reward: float = 0.0
|
|
@@ -32,6 +43,14 @@ class DebuggerEnvironment:
|
|
| 32 |
self._step_number: int = 0
|
| 33 |
self._prev_tests_passed: int = 0
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
def reset(self, task_id: str) -> dict:
|
| 36 |
"""
|
| 37 |
Start a fresh episode. Clears all state.
|
|
@@ -150,6 +169,228 @@ class DebuggerEnvironment:
|
|
| 150 |
"hint_used": self._observation.hint_used,
|
| 151 |
}
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
# ββ Action Handlers ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 154 |
|
| 155 |
def _handle_submit_fix(self, action: Action) -> Dict[str, Any]:
|
|
@@ -249,7 +490,7 @@ class DebuggerEnvironment:
|
|
| 249 |
|
| 250 |
def _handle_query_context(self, action: Action) -> Dict[str, Any]:
|
| 251 |
"""Handle query_context action."""
|
| 252 |
-
valid_query_types = ["function_signature", "related_code", "error_explanation", "test_details"]
|
| 253 |
|
| 254 |
if action.query_type not in valid_query_types:
|
| 255 |
return self._make_response(
|
|
@@ -511,5 +752,22 @@ class DebuggerEnvironment:
|
|
| 511 |
return f"Test details for '{query_target}':\n" + "\n".join(relevant)
|
| 512 |
|
| 513 |
return f"Full test suite:\n{test_suite}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
|
| 515 |
return "No information available for this query."
|
|
|
|
| 6 |
processing, and reward calculation.
|
| 7 |
"""
|
| 8 |
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
import re
|
| 12 |
import math
|
| 13 |
+
import random
|
| 14 |
from typing import Dict, Any, Optional, Tuple
|
| 15 |
|
| 16 |
+
from env.models import Observation, Action, Reward, FixAttempt, parse_agent_output, StructuredAgentOutput
|
| 17 |
from env.sandbox import execute_code
|
| 18 |
from env.tasks.registry import get_task, list_tasks
|
| 19 |
from env.graders import get_grader
|
| 20 |
+
from server.reward_calculator import DebugRewardCalculator
|
| 21 |
+
|
| 22 |
+
# Optional W&B β only activates if key is present
|
| 23 |
+
try:
|
| 24 |
+
import wandb
|
| 25 |
+
WANDB_AVAILABLE = os.environ.get("WANDB_API_KEY") is not None
|
| 26 |
+
except ImportError:
|
| 27 |
+
WANDB_AVAILABLE = False
|
| 28 |
|
| 29 |
|
| 30 |
class DebuggerEnvironment:
|
| 31 |
"""Core debugging environment implementing the OpenEnv interface."""
|
| 32 |
|
| 33 |
+
def __init__(self, curriculum_step: int = 0):
|
| 34 |
self._task_config: Optional[dict] = None
|
| 35 |
self._observation: Optional[Observation] = None
|
| 36 |
self._cumulative_reward: float = 0.0
|
|
|
|
| 43 |
self._step_number: int = 0
|
| 44 |
self._prev_tests_passed: int = 0
|
| 45 |
|
| 46 |
+
# Curriculum learning state
|
| 47 |
+
self.curriculum_step: int = curriculum_step
|
| 48 |
+
self.reward_calculator: DebugRewardCalculator = DebugRewardCalculator()
|
| 49 |
+
self.current_episode_trajectory: list[dict] = []
|
| 50 |
+
self.current_bug: Optional[dict] = None
|
| 51 |
+
self.turn_number: int = 0
|
| 52 |
+
self.bugs: list[dict] = self._load_bugs_for_curriculum(curriculum_step)
|
| 53 |
+
|
| 54 |
def reset(self, task_id: str) -> dict:
|
| 55 |
"""
|
| 56 |
Start a fresh episode. Clears all state.
|
|
|
|
| 169 |
"hint_used": self._observation.hint_used,
|
| 170 |
}
|
| 171 |
|
| 172 |
+
# ββ Curriculum Learning ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 173 |
+
|
| 174 |
+
def _load_bugs_for_curriculum(self, step: int) -> list[dict]:
    """
    Build the bug pool for the given training step.

    Schedule actually implemented here:
      Steps 0-299:   Tier 1 only (easy -- off-by-one, wrong operator)
      Steps 300-599: Tier 1 plus a truncated prefix (~43%) of Tier 2
      Steps 600+:    All of Tiers 1, 2 and 3 (full pools, unweighted)

    NOTE(review): earlier docs described 70/30 and 40/40/20 mixes, but the
    0.43 truncation and the unweighted 600+ branch mean the real ratio
    depends on the tier file sizes -- confirm intended ratios before
    relying on them. A missing tier file is treated as an empty pool.
    """
    def load_tier(tier: int) -> list[dict]:
        # JSONL format: one bug dict per line; blank lines are tolerated.
        path = f"data/bugs_tier{tier}.jsonl"
        if not os.path.exists(path):
            return []
        bugs = []
        with open(path) as f:
            for line in f:
                line = line.strip()
                if line:
                    bugs.append(json.loads(line))
        return bugs

    tier1 = load_tier(1)

    if step < 300:
        return tier1
    elif step < 600:
        tier2 = load_tier(2)
        n2 = int(len(tier2) * 0.43)  # prefix truncation, not sampling; see docstring
        return tier1 + tier2[:n2]
    else:
        tier2 = load_tier(2)
        tier3 = load_tier(3)
        return tier1 + tier2 + tier3
|
| 205 |
+
|
| 206 |
+
def advance_curriculum(self, step: int):
    """Advance to a new curriculum stage (invoked by the training loop
    at steps 300 and 600).

    Records the new step and rebuilds the active bug pool for it.
    """
    self.curriculum_step = step
    new_pool = self._load_bugs_for_curriculum(step)
    self.bugs = new_pool
|
| 210 |
+
|
| 211 |
+
def _active_tiers(self) -> list[int]:
|
| 212 |
+
if self.curriculum_step < 300:
|
| 213 |
+
return [1]
|
| 214 |
+
elif self.curriculum_step < 600:
|
| 215 |
+
return [1, 2]
|
| 216 |
+
return [1, 2, 3]
|
| 217 |
+
|
| 218 |
+
# ββ Curriculum Step / GRPO-Compatible Methods ββββββββββββββββββββββββββββ
|
| 219 |
+
|
| 220 |
+
def reset_curriculum(self) -> dict:
    """
    Begin a new curriculum episode on a randomly drawn bug.

    Clears per-episode state (trajectory, turn counter) and returns the
    initial observation dict for the agent. Raises ValueError when the
    bug pool is empty.
    """
    if not self.bugs:
        raise ValueError("No bugs loaded. Run data/generate_bugs.py first.")

    bug = random.choice(self.bugs)
    self.current_bug = bug
    self.current_episode_trajectory = []
    self.turn_number = 0

    initial_observation = {
        "buggy_code": bug.get("buggy_code", ""),
        "error_message": bug.get("initial_error", "Some tests are failing."),
        "test_results": {
            "passed": 0,
            "failed": 0,
            "total": len(bug.get("test_cases", [])),
        },
        "turn_number": 0,
        "history": [],
    }
    return initial_observation
|
| 239 |
+
|
| 240 |
+
def step_curriculum(self, raw_text: str) -> dict:
    """
    Process one structured agent response in the curriculum setting.
    Returns {observation, reward, done, info}.

    Parsing never raises: ``parse_agent_output`` marks malformed responses
    invalid instead, and the reward calculator is expected to penalise them.
    """
    agent_output = parse_agent_output(raw_text)

    # Run fix against test cases if agent proposes one. All other actions
    # keep an all-zero test summary for this turn.
    test_results = {"passed": 0, "failed": 0, "total": 0, "newly_broken": 0}
    if agent_output.action == "propose_fix" and self.current_bug:
        test_results = self._run_fix_safely(
            proposed_code=agent_output.detail,
            bug=self.current_bug,
        )

    # Compute reward. Ground truth is defensively defaulted so a missing
    # current_bug cannot raise mid-step.
    reward_breakdown = self.reward_calculator.compute_turn_reward(
        agent_output=agent_output,
        ground_truth={
            "bug_function": self.current_bug.get("bug_location", {}).get("function", "") if self.current_bug else "",
            "bug_line": self.current_bug.get("bug_location", {}).get("line_start", -1) if self.current_bug else -1,
            "bug_type": self.current_bug.get("bug_type", "") if self.current_bug else "",
            "canonical_fix_code": self.current_bug.get("original_code", "") if self.current_bug else "",
        },
        test_results=test_results,
        turn_number=self.turn_number,
    )

    # Record turn in episode trajectory (consumed by episode-level reward
    # aggregation and W&B logging).
    self.current_episode_trajectory.append({
        "turn": self.turn_number,
        "agent_output": agent_output,
        "test_results": test_results,
        "reward": reward_breakdown,
    })

    self.turn_number += 1

    # Determine if episode is done.
    # NOTE(review): 0.35 is the "solved" threshold on the fix_quality
    # component -- confirm it matches the reward calculator's scale.
    solved = reward_breakdown.fix_quality >= 0.35
    max_turns_reached = self.turn_number >= self.reward_calculator.MAX_TURNS
    gave_up = agent_output.action == "give_up"
    done = solved or max_turns_reached or gave_up

    # Log to W&B at episode end (no-op unless WANDB_API_KEY was present).
    if done and WANDB_AVAILABLE:
        self._log_episode_to_wandb(reward_breakdown, solved)

    return {
        "observation": {
            "buggy_code": self.current_bug.get("buggy_code", "") if self.current_bug else "",
            "error_message": self.current_bug.get("initial_error", "") if self.current_bug else "",
            "test_results": test_results,
            "turn_number": self.turn_number,
            "history": [
                # Compact per-turn summary; full records stay in
                # current_episode_trajectory.
                {
                    "turn": t["turn"],
                    "action": t["agent_output"].action,
                    "reward": t["reward"].total,
                }
                for t in self.current_episode_trajectory
            ],
        },
        "reward": reward_breakdown.total,
        "done": done,
        "info": {
            "reward_breakdown": reward_breakdown.__dict__,
            "turn_number": self.turn_number,
            "solved": solved,
            "bug_tier": self.current_bug.get("difficulty", 0) if self.current_bug else 0,
        },
    }
|
| 312 |
+
|
| 313 |
+
def _run_fix_safely(self, proposed_code: str, bug: dict) -> dict:
|
| 314 |
+
"""Run proposed fix against test cases with timeout. NEVER execute without timeout."""
|
| 315 |
+
import subprocess
|
| 316 |
+
import tempfile
|
| 317 |
+
|
| 318 |
+
if not proposed_code or not bug.get("test_cases"):
|
| 319 |
+
return {"passed": 0, "failed": 0, "total": 0, "newly_broken": 0}
|
| 320 |
+
|
| 321 |
+
test_cases = bug["test_cases"]
|
| 322 |
+
func_name = bug.get("function_name", "")
|
| 323 |
+
passed = 0
|
| 324 |
+
|
| 325 |
+
for test in test_cases:
|
| 326 |
+
inp = test["input"]
|
| 327 |
+
expected = test["expected_output"]
|
| 328 |
+
|
| 329 |
+
if isinstance(inp, (list, tuple)):
|
| 330 |
+
args_str = ", ".join(repr(x) for x in inp)
|
| 331 |
+
else:
|
| 332 |
+
args_str = repr(inp)
|
| 333 |
+
|
| 334 |
+
script = f"""
|
| 335 |
+
{proposed_code}
|
| 336 |
+
|
| 337 |
+
try:
|
| 338 |
+
result = {func_name}({args_str})
|
| 339 |
+
expected = {repr(expected)}
|
| 340 |
+
print("PASS" if result == expected else f"FAIL: got {{result}}, expected {{expected}}")
|
| 341 |
+
except Exception as e:
|
| 342 |
+
print(f"ERROR: {{type(e).__name__}}: {{e}}")
|
| 343 |
+
"""
|
| 344 |
+
try:
|
| 345 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
|
| 346 |
+
f.write(script)
|
| 347 |
+
fname = f.name
|
| 348 |
+
|
| 349 |
+
result = subprocess.run(
|
| 350 |
+
["python", fname],
|
| 351 |
+
capture_output=True, text=True, timeout=5
|
| 352 |
+
)
|
| 353 |
+
|
| 354 |
+
try:
|
| 355 |
+
os.unlink(fname)
|
| 356 |
+
except Exception:
|
| 357 |
+
pass
|
| 358 |
+
|
| 359 |
+
if "PASS" in result.stdout:
|
| 360 |
+
passed += 1
|
| 361 |
+
except subprocess.TimeoutExpired:
|
| 362 |
+
pass # timeout = failed test
|
| 363 |
+
except Exception:
|
| 364 |
+
pass
|
| 365 |
+
|
| 366 |
+
failed = len(test_cases) - passed
|
| 367 |
+
return {
|
| 368 |
+
"passed": passed,
|
| 369 |
+
"failed": failed,
|
| 370 |
+
"total": len(test_cases),
|
| 371 |
+
"newly_broken": 0,
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
def _log_episode_to_wandb(self, final_reward, solved: bool):
    """Log episode metrics to W&B. Only called if WANDB_AVAILABLE.

    Aggregates the per-turn trajectory into an episode-level reward and a
    per-component breakdown via the reward calculator, then emits a single
    wandb.log() call.

    NOTE(review): ``final_reward`` is not used in this body -- confirm
    whether it should feed into the logged metrics or be removed from the
    signature (callers would need updating in the latter case).
    """
    if not WANDB_AVAILABLE:
        return
    # Component-level metrics keyed for W&B (merged into the log dict below).
    breakdown = self.reward_calculator.get_reward_breakdown_for_logging(
        self.current_episode_trajectory
    )
    episode_reward = self.reward_calculator.compute_episode_reward(
        self.current_episode_trajectory
    )

    wandb.log({
        "episode/reward_total": episode_reward,
        "episode/solved": int(solved),
        "episode/turns_used": self.turn_number,
        "episode/bug_tier": self.current_bug.get("difficulty", 0) if self.current_bug else 0,
        "episode/curriculum_step": self.curriculum_step,
        **breakdown,
    })
|
| 393 |
+
|
| 394 |
# ββ Action Handlers ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 395 |
|
| 396 |
def _handle_submit_fix(self, action: Action) -> Dict[str, Any]:
|
|
|
|
| 490 |
|
| 491 |
def _handle_query_context(self, action: Action) -> Dict[str, Any]:
|
| 492 |
"""Handle query_context action."""
|
| 493 |
+
valid_query_types = ["function_signature", "related_code", "error_explanation", "test_details", "test_suggestion"]
|
| 494 |
|
| 495 |
if action.query_type not in valid_query_types:
|
| 496 |
return self._make_response(
|
|
|
|
| 752 |
return f"Test details for '{query_target}':\n" + "\n".join(relevant)
|
| 753 |
|
| 754 |
return f"Full test suite:\n{test_suite}"
|
| 755 |
+
|
| 756 |
+
elif query_type == "test_suggestion":
|
| 757 |
+
# Provide a specific hint for the hard task if they ask
|
| 758 |
+
if task["task_id"] == "hard":
|
| 759 |
+
return (
|
| 760 |
+
"HINT: The sequential tests pass, but have you considered testing with "
|
| 761 |
+
"concurrent threads? There might be a race condition that only appears "
|
| 762 |
+
"under load. Try writing a test that uses 'threading' to call methods "
|
| 763 |
+
"simultaneously."
|
| 764 |
+
)
|
| 765 |
+
elif task["task_id"] == "medium":
|
| 766 |
+
return (
|
| 767 |
+
"HINT: Don't trust the first error message you see. Trace the data flow "
|
| 768 |
+
"backwards to see where the invalid input was actually generated."
|
| 769 |
+
)
|
| 770 |
+
else:
|
| 771 |
+
return "HINT: Look closely at the comparison operators and loop boundaries."
|
| 772 |
|
| 773 |
return "No information available for this query."
|
env/graders/__pycache__/base_grader.cpython-310.pyc
CHANGED
|
Binary files a/env/graders/__pycache__/base_grader.cpython-310.pyc and b/env/graders/__pycache__/base_grader.cpython-310.pyc differ
|
|
|
env/graders/__pycache__/grader_hard.cpython-310.pyc
CHANGED
|
Binary files a/env/graders/__pycache__/grader_hard.cpython-310.pyc and b/env/graders/__pycache__/grader_hard.cpython-310.pyc differ
|
|
|
env/graders/grader_hard.py
CHANGED
|
@@ -1,105 +1,3 @@
|
|
| 1 |
-
# """
|
| 2 |
-
# Grader Hard β Concurrent stress test scoring.
|
| 3 |
-
# Custom weights:
|
| 4 |
-
# 0.40 β original 8 tests pass
|
| 5 |
-
# 0.30 β concurrent stress test (1000 threads)
|
| 6 |
-
# 0.20 β hypothesis accuracy
|
| 7 |
-
# 0.10 β efficiency bonus (solved within 5 attempts)
|
| 8 |
-
# """
|
| 9 |
-
|
| 10 |
-
# import threading
|
| 11 |
-
# from typing import List, Dict, Any
|
| 12 |
-
# from env.graders.base_grader import BaseGrader
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
# class HardGrader(BaseGrader):
|
| 16 |
-
|
| 17 |
-
# def _run_concurrent_stress_test(self, code: str) -> bool:
|
| 18 |
-
# """
|
| 19 |
-
# Run a 1000-thread concurrent stress test against the submitted code.
|
| 20 |
-
# Returns True if the counter ends at exactly 1000 after 1000 concurrent increments.
|
| 21 |
-
# """
|
| 22 |
-
# try:
|
| 23 |
-
# # Execute the code in an isolated namespace
|
| 24 |
-
# namespace = {}
|
| 25 |
-
# exec(code, namespace)
|
| 26 |
-
|
| 27 |
-
# CounterClass = namespace.get("ConnectionCounter")
|
| 28 |
-
# if CounterClass is None:
|
| 29 |
-
# return False
|
| 30 |
-
|
| 31 |
-
# counter = CounterClass()
|
| 32 |
-
# num_threads = 1000
|
| 33 |
-
|
| 34 |
-
# threads = [
|
| 35 |
-
# threading.Thread(target=counter.increment)
|
| 36 |
-
# for _ in range(num_threads)
|
| 37 |
-
# ]
|
| 38 |
-
# for t in threads:
|
| 39 |
-
# t.start()
|
| 40 |
-
# for t in threads:
|
| 41 |
-
# t.join(timeout=10)
|
| 42 |
-
|
| 43 |
-
# return counter.get_count() == num_threads
|
| 44 |
-
# except Exception:
|
| 45 |
-
# return False
|
| 46 |
-
|
| 47 |
-
# def score(
|
| 48 |
-
# self,
|
| 49 |
-
# task_config: dict,
|
| 50 |
-
# attempts: List[Dict[str, Any]],
|
| 51 |
-
# best_tests_passed: int,
|
| 52 |
-
# tests_total: int,
|
| 53 |
-
# attempts_used: int,
|
| 54 |
-
# max_attempts: int,
|
| 55 |
-
# hypotheses: List[str],
|
| 56 |
-
# ) -> float:
|
| 57 |
-
# ground_truth = task_config["ground_truth"]
|
| 58 |
-
# keywords = ground_truth["hypothesis_keywords"]
|
| 59 |
-
|
| 60 |
-
# # 1. Original tests pass (weight: 0.40)
|
| 61 |
-
# test_pass_ratio = (best_tests_passed / tests_total) if tests_total > 0 else 0.0
|
| 62 |
-
# original_test_score = test_pass_ratio * 0.40
|
| 63 |
-
|
| 64 |
-
# # 2. Concurrent stress test (weight: 0.30)
|
| 65 |
-
# # Use the best attempt's code (highest tests_passed, then latest)
|
| 66 |
-
# concurrent_score = 0.0
|
| 67 |
-
# if attempts:
|
| 68 |
-
# # Find the best attempt
|
| 69 |
-
# best_attempt = max(
|
| 70 |
-
# attempts,
|
| 71 |
-
# key=lambda a: (a.get("tests_passed", 0), a.get("attempt_number", 0))
|
| 72 |
-
# )
|
| 73 |
-
# best_code = best_attempt.get("code_submitted", "")
|
| 74 |
-
# if best_code:
|
| 75 |
-
# # Run the stress test 3 times β must pass all 3 for full credit
|
| 76 |
-
# passes = sum(
|
| 77 |
-
# 1 for _ in range(3)
|
| 78 |
-
# if self._run_concurrent_stress_test(best_code)
|
| 79 |
-
# )
|
| 80 |
-
# if passes == 3:
|
| 81 |
-
# concurrent_score = 0.30
|
| 82 |
-
# elif passes >= 1:
|
| 83 |
-
# concurrent_score = 0.15 # Partial β inconsistent fix
|
| 84 |
-
|
| 85 |
-
# # 3. Hypothesis accuracy (weight: 0.20)
|
| 86 |
-
# if hypotheses:
|
| 87 |
-
# matches = sum(
|
| 88 |
-
# 1 for h in hypotheses
|
| 89 |
-
# if self._check_hypothesis_keywords(h, keywords, "any")
|
| 90 |
-
# )
|
| 91 |
-
# hypothesis_ratio = matches / len(hypotheses)
|
| 92 |
-
# else:
|
| 93 |
-
# hypothesis_ratio = 0.0
|
| 94 |
-
# hypothesis_score = hypothesis_ratio * 0.20
|
| 95 |
-
|
| 96 |
-
# # 4. Efficiency bonus (weight: 0.10)
|
| 97 |
-
# efficiency_score = 0.10 if attempts_used <= 5 else 0.0
|
| 98 |
-
|
| 99 |
-
# total = original_test_score + concurrent_score + hypothesis_score + efficiency_score
|
| 100 |
-
# return self._clamp(total)
|
| 101 |
-
|
| 102 |
-
|
| 103 |
"""
|
| 104 |
Grader Hard β Concurrent stress test scoring.
|
| 105 |
|
|
@@ -141,17 +39,18 @@ result = counter.get_count()
|
|
| 141 |
assert result == num_threads, f"CONCURRENT FAIL: expected {num_threads}, got {result}"
|
| 142 |
print(f"CONCURRENT PASS: {result} == {num_threads}")
|
| 143 |
"""
|
|
|
|
| 144 |
class HardGrader(BaseGrader):
|
| 145 |
|
| 146 |
def _run_concurrent_stress_test(self, code: str) -> bool:
|
| 147 |
"""
|
| 148 |
Run the concurrent stress test against agent-submitted code.
|
| 149 |
Routes through execute_code() sandbox β never uses raw exec().
|
| 150 |
-
Returns True only if the counter reaches exactly 1000 after
|
| 151 |
1000 concurrent increments.
|
| 152 |
"""
|
| 153 |
output, timed_out, _ = execute_code(
|
| 154 |
-
code,
|
| 155 |
_CONCURRENT_STRESS_TEST,
|
| 156 |
allow_threading=True,
|
| 157 |
)
|
|
@@ -174,8 +73,8 @@ class HardGrader(BaseGrader):
|
|
| 174 |
|
| 175 |
# ββ 1. Sequential test score (weight: 0.40) ββββββββββββββββββββββββββ
|
| 176 |
# IMPORTANT: Only count agent-submitted attempts, NOT the initial buggy
|
| 177 |
-
# code. The buggy code passes all 8 sequential tests β if we used
|
| 178 |
-
# best_tests_passed from environment state, every agent would score
|
| 179 |
# 0.40 for free without fixing anything. We recalculate from attempts.
|
| 180 |
if attempts:
|
| 181 |
agent_best_sequential = max(
|
|
@@ -189,8 +88,8 @@ class HardGrader(BaseGrader):
|
|
| 189 |
|
| 190 |
# ββ 2. Concurrent stress test (weight: 0.30) ββββββββββββββββββββββββββ
|
| 191 |
# Use the best attempt by sequential test count (ties broken by recency).
|
| 192 |
-
# Run the stress test
|
| 193 |
-
# at least
|
| 194 |
concurrent_score = 0.0
|
| 195 |
if attempts:
|
| 196 |
best_attempt = max(
|
|
@@ -201,14 +100,14 @@ class HardGrader(BaseGrader):
|
|
| 201 |
|
| 202 |
if best_code:
|
| 203 |
passes = sum(
|
| 204 |
-
1 for _ in range(
|
| 205 |
if self._run_concurrent_stress_test(best_code)
|
| 206 |
)
|
| 207 |
-
if passes =
|
| 208 |
-
concurrent_score = 0.30 #
|
| 209 |
-
elif passes >=
|
| 210 |
-
concurrent_score = 0.15 # Partially
|
| 211 |
-
|
| 212 |
# ββ 3. Hypothesis accuracy (weight: 0.20) βββββββββββββββββββββββββββββ
|
| 213 |
if hypotheses:
|
| 214 |
matches = sum(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
Grader Hard β Concurrent stress test scoring.
|
| 3 |
|
|
|
|
| 39 |
assert result == num_threads, f"CONCURRENT FAIL: expected {num_threads}, got {result}"
|
| 40 |
print(f"CONCURRENT PASS: {result} == {num_threads}")
|
| 41 |
"""
|
| 42 |
+
|
| 43 |
class HardGrader(BaseGrader):
|
| 44 |
|
| 45 |
def _run_concurrent_stress_test(self, code: str) -> bool:
|
| 46 |
"""
|
| 47 |
Run the concurrent stress test against agent-submitted code.
|
| 48 |
Routes through execute_code() sandbox β never uses raw exec().
|
| 49 |
+
Returns True only if the counter reaches exactly 1000 after
|
| 50 |
1000 concurrent increments.
|
| 51 |
"""
|
| 52 |
output, timed_out, _ = execute_code(
|
| 53 |
+
code,
|
| 54 |
_CONCURRENT_STRESS_TEST,
|
| 55 |
allow_threading=True,
|
| 56 |
)
|
|
|
|
| 73 |
|
| 74 |
# ββ 1. Sequential test score (weight: 0.40) ββββββββββββββββββββββββββ
|
| 75 |
# IMPORTANT: Only count agent-submitted attempts, NOT the initial buggy
|
| 76 |
+
# code. The buggy code passes all 8 sequential tests β if we used
|
| 77 |
+
# best_tests_passed from environment state, every agent would score
|
| 78 |
# 0.40 for free without fixing anything. We recalculate from attempts.
|
| 79 |
if attempts:
|
| 80 |
agent_best_sequential = max(
|
|
|
|
| 88 |
|
| 89 |
# ββ 2. Concurrent stress test (weight: 0.30) ββββββββββββββββββββββββββ
|
| 90 |
# Use the best attempt by sequential test count (ties broken by recency).
|
| 91 |
+
# Run the stress test 5 times β must pass 4/5 for full credit,
|
| 92 |
+
# at least 2/5 for partial credit. This handles non-determinism robustly.
|
| 93 |
concurrent_score = 0.0
|
| 94 |
if attempts:
|
| 95 |
best_attempt = max(
|
|
|
|
| 100 |
|
| 101 |
if best_code:
|
| 102 |
passes = sum(
|
| 103 |
+
1 for _ in range(5)
|
| 104 |
if self._run_concurrent_stress_test(best_code)
|
| 105 |
)
|
| 106 |
+
if passes >= 4:
|
| 107 |
+
concurrent_score = 0.30 # Robustly fixed
|
| 108 |
+
elif passes >= 2:
|
| 109 |
+
concurrent_score = 0.15 # Partially fixed / Flaky
|
| 110 |
+
|
| 111 |
# ββ 3. Hypothesis accuracy (weight: 0.20) βββββββββββββββββββββββββββββ
|
| 112 |
if hypotheses:
|
| 113 |
matches = sum(
|
env/models.py
CHANGED
|
@@ -5,8 +5,9 @@ Pydantic v2 data models for structured interaction between the agent
|
|
| 5 |
and the environment, ensuring strict type safety and schema compliance.
|
| 6 |
"""
|
| 7 |
|
|
|
|
| 8 |
from pydantic import BaseModel
|
| 9 |
-
from typing import List, Dict, Optional
|
| 10 |
|
| 11 |
|
| 12 |
class FixAttempt(BaseModel):
|
|
@@ -69,3 +70,65 @@ class Reward(BaseModel):
|
|
| 69 |
cumulative_reward: float # Sum of all step_rewards this episode
|
| 70 |
grader_score: float # 0.0 during episode. Set ONLY on terminal step (done=True).
|
| 71 |
breakdown: Dict[str, float] # Itemized components
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
and the environment, ensuring strict type safety and schema compliance.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
import re
|
| 9 |
from pydantic import BaseModel
|
| 10 |
+
from typing import List, Dict, Optional, Literal
|
| 11 |
|
| 12 |
|
| 13 |
class FixAttempt(BaseModel):
|
|
|
|
| 70 |
cumulative_reward: float # Sum of all step_rewards this episode
|
| 71 |
grader_score: float # 0.0 during episode. Set ONLY on terminal step (done=True).
|
| 72 |
breakdown: Dict[str, float] # Itemized components
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ββ STRUCTURED AGENT OUTPUT ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 76 |
+
|
| 77 |
+
VALID_ACTIONS = {"inspect_lines", "run_tests", "propose_fix", "request_context", "give_up"}
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class StructuredAgentOutput(BaseModel):
|
| 81 |
+
observation: str
|
| 82 |
+
hypothesis: str
|
| 83 |
+
confidence: Literal["low", "medium", "high"]
|
| 84 |
+
action: str
|
| 85 |
+
detail: str
|
| 86 |
+
valid: bool
|
| 87 |
+
raw_text: str
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def parse_agent_output(raw_text: str) -> StructuredAgentOutput:
|
| 91 |
+
"""
|
| 92 |
+
Parse agent's structured response. Robust to minor formatting variations.
|
| 93 |
+
Sets valid=False if any required field is missing or action is not in VALID_ACTIONS.
|
| 94 |
+
|
| 95 |
+
Expected format:
|
| 96 |
+
OBSERVATION: [text]
|
| 97 |
+
HYPOTHESIS: [text]
|
| 98 |
+
CONFIDENCE: [low|medium|high]
|
| 99 |
+
ACTION: [inspect_lines|run_tests|propose_fix|request_context|give_up]
|
| 100 |
+
DETAIL: [text]
|
| 101 |
+
"""
|
| 102 |
+
def extract_field(text: str, field: str) -> Optional[str]:
|
| 103 |
+
pattern = rf"(?i){field}\s*:\s*(.*?)(?=\n(?:OBSERVATION|HYPOTHESIS|CONFIDENCE|ACTION|DETAIL)\s*:|$)"
|
| 104 |
+
match = re.search(pattern, text, re.DOTALL)
|
| 105 |
+
if match:
|
| 106 |
+
return match.group(1).strip()
|
| 107 |
+
return None
|
| 108 |
+
|
| 109 |
+
observation = extract_field(raw_text, "OBSERVATION") or ""
|
| 110 |
+
hypothesis = extract_field(raw_text, "HYPOTHESIS") or ""
|
| 111 |
+
confidence_raw = (extract_field(raw_text, "CONFIDENCE") or "").lower().strip()
|
| 112 |
+
action_raw = (extract_field(raw_text, "ACTION") or "").lower().strip()
|
| 113 |
+
detail = extract_field(raw_text, "DETAIL") or ""
|
| 114 |
+
|
| 115 |
+
confidence = confidence_raw if confidence_raw in {"low", "medium", "high"} else "low"
|
| 116 |
+
action = action_raw if action_raw in VALID_ACTIONS else "invalid"
|
| 117 |
+
|
| 118 |
+
valid = all([
|
| 119 |
+
len(observation) > 5,
|
| 120 |
+
len(hypothesis) > 10,
|
| 121 |
+
confidence in {"low", "medium", "high"},
|
| 122 |
+
action in VALID_ACTIONS,
|
| 123 |
+
len(detail) > 0,
|
| 124 |
+
])
|
| 125 |
+
|
| 126 |
+
return StructuredAgentOutput(
|
| 127 |
+
observation=observation,
|
| 128 |
+
hypothesis=hypothesis,
|
| 129 |
+
confidence=confidence,
|
| 130 |
+
action=action,
|
| 131 |
+
detail=detail,
|
| 132 |
+
valid=valid,
|
| 133 |
+
raw_text=raw_text,
|
| 134 |
+
)
|
env/sandbox.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
"""
|
| 2 |
-
AgentDebuggerEnv β Sandboxed Code Execution
|
| 3 |
-
============================================
|
| 4 |
-
Isolated execution environment for user-submitted code
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
import subprocess
|
|
@@ -21,56 +23,99 @@ BLOCKED_IMPORTS = [
|
|
| 21 |
"ctypes", "cffi", "resource", "signal", "mmap", "gc"
|
| 22 |
]
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
MEMORY_LIMIT_MB = 256
|
| 26 |
|
| 27 |
|
| 28 |
-
def
|
| 29 |
-
"""Build a Python script snippet that
|
| 30 |
-
blocked_repr = repr(
|
|
|
|
|
|
|
| 31 |
return f'''
|
| 32 |
import ast as _ast
|
| 33 |
import sys as _sys
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
_source_to_check = open(__file__).read()
|
| 37 |
-
|
| 38 |
-
# Find the marker line and only check code after it
|
| 39 |
-
_marker = "# --- USER CODE START ---"
|
| 40 |
-
_marker_pos = _source_to_check.find(_marker)
|
| 41 |
-
if _marker_pos != -1:
|
| 42 |
-
_source_to_check = _source_to_check[_marker_pos + len(_marker):]
|
| 43 |
-
|
| 44 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
_tree = _ast.parse(_source_to_check)
|
| 46 |
-
except SyntaxError:
|
| 47 |
-
pass # Let the actual execution catch syntax errors
|
| 48 |
-
else:
|
| 49 |
for _node in _ast.walk(_tree):
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
_sys.exit(1)
|
| 62 |
-
|
| 63 |
-
#
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
_top = name.split(".")[0]
|
| 69 |
-
if _top in
|
| 70 |
raise ImportError(f"BLOCKED IMPORT: '{{name}}' is not allowed in the sandbox.")
|
| 71 |
-
return
|
| 72 |
-
|
| 73 |
_builtins.__import__ = _restricted_import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
'''
|
| 75 |
|
| 76 |
|
|
@@ -80,16 +125,13 @@ def execute_code(code: str, test_code: str, allow_threading: bool = False) -> Tu
|
|
| 80 |
|
| 81 |
Returns:
|
| 82 |
(output: str, timed_out: bool, execution_time_ms: int)
|
| 83 |
-
|
| 84 |
-
The output contains both stdout and stderr merged, exactly as a developer
|
| 85 |
-
would see in their terminal.
|
| 86 |
"""
|
| 87 |
# Build the blocked imports list, optionally allowing threading
|
| 88 |
blocked = [b for b in BLOCKED_IMPORTS if not (b == "threading" and allow_threading)]
|
| 89 |
|
| 90 |
-
# Build the full script:
|
| 91 |
-
|
| 92 |
-
full_script =
|
| 93 |
|
| 94 |
tmp_path = None
|
| 95 |
try:
|
|
@@ -122,7 +164,7 @@ def execute_code(code: str, test_code: str, allow_threading: bool = False) -> Tu
|
|
| 122 |
except subprocess.TimeoutExpired:
|
| 123 |
elapsed_ms = int((time.time() - start_time) * 1000)
|
| 124 |
return (
|
| 125 |
-
f"TIMEOUT: Code execution exceeded {EXECUTION_TIMEOUT_SECONDS} second limit
|
| 126 |
True,
|
| 127 |
elapsed_ms
|
| 128 |
)
|
|
|
|
| 1 |
"""
|
| 2 |
+
AgentDebuggerEnv β Sandboxed Code Execution (Gold Standard)
|
| 3 |
+
============================================================
|
| 4 |
+
Isolated execution environment for user-submitted code.
|
| 5 |
+
Implements multi-layered security:
|
| 6 |
+
1. AST-based static analysis (blocks dangerous builtins & dunders)
|
| 7 |
+
3. Subprocess isolation with strict timeouts
|
| 8 |
+
4. Resource limits (memory/CPU)
|
| 9 |
"""
|
| 10 |
|
| 11 |
import subprocess
|
|
|
|
| 23 |
"ctypes", "cffi", "resource", "signal", "mmap", "gc"
|
| 24 |
]
|
| 25 |
|
| 26 |
+
DANGEROUS_BUILTINS = [
|
| 27 |
+
"eval", "exec", "compile", "getattr", "setattr", "delattr",
|
| 28 |
+
"input", "breakpoint", "help", "open"
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
EXECUTION_TIMEOUT_SECONDS = 10 # Hackathon spec: strictly 10s
|
| 32 |
MEMORY_LIMIT_MB = 256
|
| 33 |
|
| 34 |
|
| 35 |
+
def _build_security_prelude(blocked_imports: list[str]) -> str:
|
| 36 |
+
"""Build a Python script snippet that hardens the environment before user code runs."""
|
| 37 |
+
blocked_repr = repr(blocked_imports)
|
| 38 |
+
builtins_repr = repr(DANGEROUS_BUILTINS)
|
| 39 |
+
|
| 40 |
return f'''
|
| 41 |
import ast as _ast
|
| 42 |
import sys as _sys
|
| 43 |
+
import builtins as _builtins
|
| 44 |
|
| 45 |
+
# ββ 1. Resource Limits ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
try:
|
| 47 |
+
import resource as _resource
|
| 48 |
+
# Limit memory usage (Address Space) to 256MB
|
| 49 |
+
_mem_limit = {MEMORY_LIMIT_MB} * 1024 * 1024
|
| 50 |
+
_resource.setrlimit(_resource.RLIMIT_AS, (_mem_limit, _mem_limit))
|
| 51 |
+
except Exception:
|
| 52 |
+
pass
|
| 53 |
+
|
| 54 |
+
# ββ 2. AST Static Analysis βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 55 |
+
_BLOCKED_IMPORTS = {blocked_repr}
|
| 56 |
+
_DANGEROUS_BUILTINS = {builtins_repr}
|
| 57 |
+
|
| 58 |
+
# We use _builtins.open because it might be nullified later in the user's scope
|
| 59 |
+
try:
|
| 60 |
+
_source_to_check = _builtins.open(__file__).read()
|
| 61 |
+
# Find the marker line and only check code after it
|
| 62 |
+
_marker = "# --- USER CODE START ---"
|
| 63 |
+
_marker_pos = _source_to_check.find(_marker)
|
| 64 |
+
if _marker_pos != -1:
|
| 65 |
+
_source_to_check = _source_to_check[_marker_pos + len(_marker):]
|
| 66 |
+
|
| 67 |
_tree = _ast.parse(_source_to_check)
|
|
|
|
|
|
|
|
|
|
| 68 |
for _node in _ast.walk(_tree):
|
| 69 |
+
# Block dangerous imports
|
| 70 |
+
if isinstance(_node, (_ast.Import, _ast.ImportFrom)):
|
| 71 |
+
_names = []
|
| 72 |
+
if isinstance(_node, _ast.Import):
|
| 73 |
+
_names = [a.name.split('.')[0] for a in _node.names]
|
| 74 |
+
else:
|
| 75 |
+
if _node.module:
|
| 76 |
+
_names = [_node.module.split('.')[0]]
|
| 77 |
+
|
| 78 |
+
for _name in _names:
|
| 79 |
+
if _name in _BLOCKED_IMPORTS:
|
| 80 |
+
print(f"BLOCKED IMPORT: '{{_name}}' is not allowed in the sandbox.")
|
| 81 |
_sys.exit(1)
|
| 82 |
+
|
| 83 |
+
# Block dangerous builtins (static names)
|
| 84 |
+
if isinstance(_node, _ast.Name) and _node.id in _DANGEROUS_BUILTINS:
|
| 85 |
+
print(f"SECURITY ERROR: Use of '{{_node.id}}' is prohibited.")
|
| 86 |
+
_sys.exit(1)
|
| 87 |
+
|
| 88 |
+
# Block Dunder attribute access and leading underscores (reflection)
|
| 89 |
+
if isinstance(_node, _ast.Attribute):
|
| 90 |
+
if _node.attr.startswith('_'):
|
| 91 |
+
print(f"SECURITY ERROR: Access to internal attribute '{{_node.attr}}' is prohibited.")
|
| 92 |
+
_sys.exit(1)
|
| 93 |
+
except SyntaxError:
|
| 94 |
+
pass # Let the actual execution catch syntax errors
|
| 95 |
+
except Exception as e:
|
| 96 |
+
# Any other error during check is a sandbox failure
|
| 97 |
+
# print(f"SANDBOX INTERNALS ERROR: {{str(e)}}")
|
| 98 |
+
pass
|
| 99 |
+
|
| 100 |
+
# ββ 3. Runtime Protection ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 101 |
+
# Block __import__ to catch dynamic imports at runtime
|
| 102 |
+
_orig_import = _builtins.__import__
|
| 103 |
+
def _restricted_import(name, *args, _orig_import=_orig_import, _blocked=_BLOCKED_IMPORTS, **kwargs):
|
| 104 |
_top = name.split(".")[0]
|
| 105 |
+
if _top in _blocked:
|
| 106 |
raise ImportError(f"BLOCKED IMPORT: '{{name}}' is not allowed in the sandbox.")
|
| 107 |
+
return _orig_import(name, *args, **kwargs)
|
|
|
|
| 108 |
_builtins.__import__ = _restricted_import
|
| 109 |
+
|
| 110 |
+
# Nullify dangerous builtins
|
| 111 |
+
for _b in _DANGEROUS_BUILTINS:
|
| 112 |
+
if _b not in ('setattr', 'getattr', 'delattr'):
|
| 113 |
+
_builtins.__dict__[_b] = None
|
| 114 |
+
|
| 115 |
+
# Clean up namespace gracefully
|
| 116 |
+
for _v in ["_ast", "_sys", "_builtins", "_source_to_check", "_tree", "_node", "_marker", "_marker_pos", "_b", "_orig_import", "_restricted_import"]:
|
| 117 |
+
if _v in locals():
|
| 118 |
+
del locals()[_v]
|
| 119 |
'''
|
| 120 |
|
| 121 |
|
|
|
|
| 125 |
|
| 126 |
Returns:
|
| 127 |
(output: str, timed_out: bool, execution_time_ms: int)
|
|
|
|
|
|
|
|
|
|
| 128 |
"""
|
| 129 |
# Build the blocked imports list, optionally allowing threading
|
| 130 |
blocked = [b for b in BLOCKED_IMPORTS if not (b == "threading" and allow_threading)]
|
| 131 |
|
| 132 |
+
# Build the full script: security prelude + user code + test code
|
| 133 |
+
prelude = _build_security_prelude(blocked)
|
| 134 |
+
full_script = prelude + "\n# --- USER CODE START ---\n" + code + "\n" + test_code
|
| 135 |
|
| 136 |
tmp_path = None
|
| 137 |
try:
|
|
|
|
| 164 |
except subprocess.TimeoutExpired:
|
| 165 |
elapsed_ms = int((time.time() - start_time) * 1000)
|
| 166 |
return (
|
| 167 |
+
f"TIMEOUT: Code execution exceeded {EXECUTION_TIMEOUT_SECONDS} second limit.",
|
| 168 |
True,
|
| 169 |
elapsed_ms
|
| 170 |
)
|
inference.py
CHANGED
|
@@ -19,12 +19,12 @@ from openai import OpenAI, APIError, RateLimitError, APIConnectionError, APITime
|
|
| 19 |
import requests
|
| 20 |
|
| 21 |
# ββ Environment variables (never hardcode these) ββββββββββββββββββββββββββββββ
|
| 22 |
-
API_BASE_URL = os.environ.get("API_BASE_URL", "https://
|
| 23 |
-
MODEL_NAME = os.environ.get("MODEL_NAME", "
|
| 24 |
-
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 25 |
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
|
| 26 |
|
| 27 |
-
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
| 28 |
|
| 29 |
SYSTEM_PROMPT = """You are an expert software debugger. You will be given broken code and a
|
| 30 |
failing test suite. Your job is to:
|
|
@@ -171,7 +171,8 @@ def run_episode(task_id: str) -> dict:
|
|
| 171 |
obs = reset_resp.json()
|
| 172 |
|
| 173 |
# [START] task=NAME
|
| 174 |
-
print(f"[START] task={task_id}", flush=True)
|
|
|
|
| 175 |
|
| 176 |
messages = [
|
| 177 |
{"role": "system", "content": SYSTEM_PROMPT},
|
|
@@ -215,7 +216,7 @@ def run_episode(task_id: str) -> dict:
|
|
| 215 |
last_result = result
|
| 216 |
|
| 217 |
# [STEP] step=N reward=R
|
| 218 |
-
print(f"[STEP
|
| 219 |
|
| 220 |
# Build context for next LLM call
|
| 221 |
step_msg = build_step_message(obs, reward, info)
|
|
@@ -315,4 +316,4 @@ def main():
|
|
| 315 |
|
| 316 |
|
| 317 |
if __name__ == "__main__":
|
| 318 |
-
main()
|
|
|
|
| 19 |
import requests
|
| 20 |
|
| 21 |
# ββ Environment variables (never hardcode these) ββββββββββββββββββββββββββββββ
|
| 22 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 23 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-70B-Instruct")
|
| 24 |
+
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY", "")
|
| 25 |
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
|
| 26 |
|
| 27 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "EMPTY")
|
| 28 |
|
| 29 |
SYSTEM_PROMPT = """You are an expert software debugger. You will be given broken code and a
|
| 30 |
failing test suite. Your job is to:
|
|
|
|
| 171 |
obs = reset_resp.json()
|
| 172 |
|
| 173 |
# [START] task=NAME
|
| 174 |
+
print(f"\n[START] task={task_id}", flush=True)
|
| 175 |
+
print(f" Description: {obs['task_description'][:100]}...", flush=True)
|
| 176 |
|
| 177 |
messages = [
|
| 178 |
{"role": "system", "content": SYSTEM_PROMPT},
|
|
|
|
| 216 |
last_result = result
|
| 217 |
|
| 218 |
# [STEP] step=N reward=R
|
| 219 |
+
print(f" [STEP {obs['step_number']}] Action: {action.get('action_type')} | Tests: {obs['tests_passed']}/{obs['tests_total']} | Reward: {reward['step_reward']:+.3f}", flush=True)
|
| 220 |
|
| 221 |
# Build context for next LLM call
|
| 222 |
step_msg = build_step_message(obs, reward, info)
|
|
|
|
| 316 |
|
| 317 |
|
| 318 |
if __name__ == "__main__":
|
| 319 |
+
main()
|
openenv.yaml
CHANGED
|
@@ -1,21 +1,61 @@
|
|
| 1 |
-
name:
|
| 2 |
-
version: 1.0.0
|
| 3 |
description: >
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
domain: software_engineering
|
| 9 |
tags:
|
|
|
|
| 10 |
- debugging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
- agentic-reasoning
|
| 12 |
- code-repair
|
| 13 |
-
- openenv
|
| 14 |
- software-engineering
|
| 15 |
observation_type: structured
|
| 16 |
action_type: structured
|
| 17 |
reward_type: dense
|
| 18 |
episode_termination: action_or_step_limit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
inference_script: inference.py
|
| 20 |
tasks:
|
| 21 |
- id: easy
|
|
@@ -46,14 +86,15 @@ tasks:
|
|
| 46 |
Thread-safe counter with a race condition invisible to sequential tests.
|
| 47 |
Agent must design a concurrent test to surface the bug, then fix it.
|
| 48 |
baseline:
|
| 49 |
-
model:
|
| 50 |
script: inference.py
|
| 51 |
mean_score: 0.51
|
| 52 |
scores:
|
| 53 |
easy: 0.85
|
| 54 |
medium: 0.50
|
| 55 |
hard: 0.18
|
| 56 |
-
author:
|
|
|
|
| 57 |
license: MIT
|
| 58 |
huggingface_space: shashaank0707/AgentDebugger-env
|
| 59 |
api_base_url_env_var: API_BASE_URL
|
|
|
|
| 1 |
+
name: AgentDebuggerEnv
|
| 2 |
+
version: "1.0.0"
|
| 3 |
description: >
|
| 4 |
+
An OpenEnv-compliant RL training environment where LLM agents learn to debug
|
| 5 |
+
Python code through structured multi-turn hypothesis-driven reasoning.
|
| 6 |
+
The agent forms hypotheses, tests them, and refines iteratively over up to 5 turns.
|
| 7 |
+
Trained via GRPO on Qwen2.5-Coder-7B-Instruct with curriculum learning across
|
| 8 |
+
3 bug difficulty tiers. Reward design follows Masud et al. (2026) execution-based
|
| 9 |
+
+ process-based taxonomy and Ibrahim et al. (2024) potential-based shaping.
|
| 10 |
domain: software_engineering
|
| 11 |
tags:
|
| 12 |
+
- openenv
|
| 13 |
- debugging
|
| 14 |
+
- reinforcement-learning
|
| 15 |
+
- grpo
|
| 16 |
+
- curriculum-learning
|
| 17 |
+
- python
|
| 18 |
+
- code-reasoning
|
| 19 |
+
- hypothesis-driven
|
| 20 |
- agentic-reasoning
|
| 21 |
- code-repair
|
|
|
|
| 22 |
- software-engineering
|
| 23 |
observation_type: structured
|
| 24 |
action_type: structured
|
| 25 |
reward_type: dense
|
| 26 |
episode_termination: action_or_step_limit
|
| 27 |
+
observation_space:
|
| 28 |
+
type: object
|
| 29 |
+
properties:
|
| 30 |
+
buggy_code:
|
| 31 |
+
type: string
|
| 32 |
+
description: The Python function containing the bug
|
| 33 |
+
error_message:
|
| 34 |
+
type: string
|
| 35 |
+
description: Error output or test failure description seen at episode start
|
| 36 |
+
test_results:
|
| 37 |
+
type: object
|
| 38 |
+
description: Results of running current test suite
|
| 39 |
+
turn_number:
|
| 40 |
+
type: integer
|
| 41 |
+
description: Current turn within episode (0-indexed, max 4)
|
| 42 |
+
history:
|
| 43 |
+
type: array
|
| 44 |
+
description: Previous turns with agent outputs and rewards
|
| 45 |
+
action_space:
|
| 46 |
+
type: object
|
| 47 |
+
properties:
|
| 48 |
+
structured_response:
|
| 49 |
+
type: string
|
| 50 |
+
description: >
|
| 51 |
+
Agent response in required format:
|
| 52 |
+
OBSERVATION: [text]
|
| 53 |
+
HYPOTHESIS: [text]
|
| 54 |
+
CONFIDENCE: [low|medium|high]
|
| 55 |
+
ACTION: [inspect_lines|run_tests|propose_fix|request_context|give_up]
|
| 56 |
+
DETAIL: [text]
|
| 57 |
+
reward_range: [-0.5, 1.0]
|
| 58 |
+
max_episode_steps: 5
|
| 59 |
inference_script: inference.py
|
| 60 |
tasks:
|
| 61 |
- id: easy
|
|
|
|
| 86 |
Thread-safe counter with a race condition invisible to sequential tests.
|
| 87 |
Agent must design a concurrent test to surface the bug, then fix it.
|
| 88 |
baseline:
|
| 89 |
+
model: meta-llama/Llama-3.1-70B-Instruct
|
| 90 |
script: inference.py
|
| 91 |
mean_score: 0.51
|
| 92 |
scores:
|
| 93 |
easy: 0.85
|
| 94 |
medium: 0.50
|
| 95 |
hard: 0.18
|
| 96 |
+
author: "Shashaank (GitHub: @shasshaank, HF: @shashaank0707)"
|
| 97 |
+
# Submission Integrity: SHA 5c507c313ff2c209d7b770af6f08cf6ed6ab1568 | Verified 2026-04-09
|
| 98 |
license: MIT
|
| 99 |
huggingface_space: shashaank0707/AgentDebugger-env
|
| 100 |
api_base_url_env_var: API_BASE_URL
|
pyproject.toml
CHANGED
|
@@ -11,7 +11,7 @@ requires-python = ">=3.10"
|
|
| 11 |
dependencies = [
|
| 12 |
"fastapi==0.110.0",
|
| 13 |
"uvicorn==0.29.0",
|
| 14 |
-
"pydantic=
|
| 15 |
"openai==2.7.2",
|
| 16 |
"openenv-core>=0.2.0",
|
| 17 |
"requests==2.31.0",
|
|
@@ -21,5 +21,8 @@ dependencies = [
|
|
| 21 |
"RestrictedPython==7.0"
|
| 22 |
]
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
[project.scripts]
|
| 25 |
server = "server.app:main"
|
|
|
|
| 11 |
dependencies = [
|
| 12 |
"fastapi==0.110.0",
|
| 13 |
"uvicorn==0.29.0",
|
| 14 |
+
"pydantic>=2.9.0",
|
| 15 |
"openai==2.7.2",
|
| 16 |
"openenv-core>=0.2.0",
|
| 17 |
"requests==2.31.0",
|
|
|
|
| 21 |
"RestrictedPython==7.0"
|
| 22 |
]
|
| 23 |
|
| 24 |
+
[tool.setuptools.packages.find]
|
| 25 |
+
include = ["env*", "server*"]
|
| 26 |
+
|
| 27 |
[project.scripts]
|
| 28 |
server = "server.app:main"
|
requirements.txt
CHANGED
|
@@ -7,3 +7,4 @@ python-dotenv==1.0.1
|
|
| 7 |
pytest==8.1.0
|
| 8 |
httpx==0.27.0
|
| 9 |
RestrictedPython==7.0
|
|
|
|
|
|
| 7 |
pytest==8.1.0
|
| 8 |
httpx==0.27.0
|
| 9 |
RestrictedPython==7.0
|
| 10 |
+
openenv-core>=0.2.0
|
server/models.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/models.py β Re-exports structured agent types for training scripts.
|
| 3 |
+
All core types live in env/models.py; this module exposes them under the
|
| 4 |
+
`server` namespace so training/train_grpo.py can import without path changes.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from env.models import ( # noqa: F401
|
| 8 |
+
StructuredAgentOutput,
|
| 9 |
+
parse_agent_output,
|
| 10 |
+
VALID_ACTIONS,
|
| 11 |
+
)
|
server/reward_calculator.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
DebugRewardCalculator β Multi-component reward system for AgentDebuggerEnv.
|
| 3 |
+
|
| 4 |
+
Reward taxonomy follows:
|
| 5 |
+
- Masud et al. (2026) "Reward Engineering for RL in Software Tasks"
|
| 6 |
+
β Uses their execution-based + process-based + semantic similarity taxonomy
|
| 7 |
+
- Ibrahim et al. (2024) "Comprehensive Overview of Reward Engineering and Shaping"
|
| 8 |
+
β Uses potential-based shaping for efficiency component to preserve policy invariance
|
| 9 |
+
|
| 10 |
+
Design principle: GRPO learns by comparing completions WITHIN a group.
|
| 11 |
+
Relative reward differences matter more than absolute values.
|
| 12 |
+
Therefore: be generous with partial credit so the model gets differentiated signal
|
| 13 |
+
even when nothing fully works.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import difflib
|
| 17 |
+
import re
|
| 18 |
+
from dataclasses import dataclass
|
| 19 |
+
from typing import Optional
|
| 20 |
+
from server.models import StructuredAgentOutput
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
|
| 24 |
+
class RewardBreakdown:
|
| 25 |
+
format_compliance: float # fires every turn β gives early training signal
|
| 26 |
+
hypothesis_quality: float # process-based reward (Paper 2 taxonomy)
|
| 27 |
+
localization: float # execution-based proxy
|
| 28 |
+
fix_quality: float # execution-based reward (primary terminal signal)
|
| 29 |
+
semantic_similarity: float # semantic reward (Paper 2 taxonomy)
|
| 30 |
+
efficiency_potential: float # potential-based shaping (Paper 1)
|
| 31 |
+
penalties: float
|
| 32 |
+
total: float
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class DebugRewardCalculator:
|
| 36 |
+
"""
|
| 37 |
+
Reward weights (must sum to 1.0 excluding penalties):
|
| 38 |
+
format_compliance: 0.10 β fires every turn, drives early curve movement
|
| 39 |
+
hypothesis_quality: 0.20 β process-based, independent of fix success
|
| 40 |
+
localization: 0.15 β did agent find the right place?
|
| 41 |
+
fix_quality: 0.35 β execution-based, primary terminal signal (sparse)
|
| 42 |
+
semantic_similarity: 0.10 β how close to canonical fix?
|
| 43 |
+
efficiency_potential: 0.10 β potential-based shaping across turns
|
| 44 |
+
|
| 45 |
+
IMPORTANT NOTE ON SPARSITY vs DENSITY:
|
| 46 |
+
The fix_quality reward (0.35) is sparse β it only fires when tests pass.
|
| 47 |
+
The format, hypothesis, localization rewards are dense β they fire every turn.
|
| 48 |
+
This combination is intentional: dense rewards carry gradient signal while the
|
| 49 |
+
model is still learning to fix bugs; sparse rewards dominate once it gets good.
|
| 50 |
+
This directly implements Ibrahim et al.'s recommendation to combine reward
|
| 51 |
+
shaping with terminal rewards to solve the sparse reward problem.
|
| 52 |
+
"""
|
| 53 |
+
|
| 54 |
+
MAX_TURNS = 5
|
| 55 |
+
|
| 56 |
+
def compute_turn_reward(
|
| 57 |
+
self,
|
| 58 |
+
agent_output: StructuredAgentOutput,
|
| 59 |
+
ground_truth: dict,
|
| 60 |
+
test_results: dict,
|
| 61 |
+
turn_number: int,
|
| 62 |
+
) -> RewardBreakdown:
|
| 63 |
+
"""
|
| 64 |
+
Compute reward for a single agent turn.
|
| 65 |
+
|
| 66 |
+
Args:
|
| 67 |
+
agent_output: parsed structured output from the agent
|
| 68 |
+
ground_truth: {
|
| 69 |
+
"bug_function": str, # name of function containing the bug
|
| 70 |
+
"bug_line": int, # line number of the bug
|
| 71 |
+
"bug_type": str, # category of bug
|
| 72 |
+
"canonical_fix_code": str, # the correct minimal fix
|
| 73 |
+
}
|
| 74 |
+
test_results: {
|
| 75 |
+
"passed": int,
|
| 76 |
+
"failed": int,
|
| 77 |
+
"total": int,
|
| 78 |
+
"newly_broken": int, # tests that passed before but fail after fix
|
| 79 |
+
}
|
| 80 |
+
turn_number: 0-indexed turn number within the episode
|
| 81 |
+
|
| 82 |
+
Returns:
|
| 83 |
+
RewardBreakdown with total and all component scores
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
# ββ COMPONENT 1: FORMAT COMPLIANCE ββββββββββββββββββββββββββββββββ
|
| 87 |
+
# This fires EVERY turn. Gives the model early training signal before
|
| 88 |
+
# it learns to fix bugs. Drives curve movement in first 50-100 steps.
|
| 89 |
+
if agent_output.valid:
|
| 90 |
+
format_score = 0.10
|
| 91 |
+
else:
|
| 92 |
+
# Partial credit: how many fields were present?
|
| 93 |
+
fields_present = sum([
|
| 94 |
+
len(agent_output.observation) > 5,
|
| 95 |
+
len(agent_output.hypothesis) > 10,
|
| 96 |
+
agent_output.confidence in {"low", "medium", "high"},
|
| 97 |
+
agent_output.action in {"inspect_lines", "run_tests", "propose_fix",
|
| 98 |
+
"request_context", "give_up"},
|
| 99 |
+
len(agent_output.detail) > 0,
|
| 100 |
+
])
|
| 101 |
+
format_score = -0.25 + (fields_present * 0.04) # -0.25 to -0.05
|
| 102 |
+
|
| 103 |
+
# ββ COMPONENT 2: HYPOTHESIS QUALITY (Process-based, Paper 2) ββββββ
|
| 104 |
+
# Score reasoning quality INDEPENDENTLY from whether the fix works.
|
| 105 |
+
# A correct diagnosis that leads to a wrong fix still gets rewarded here.
|
| 106 |
+
# This trains the model to reason carefully even when uncertain.
|
| 107 |
+
hypothesis_score = 0.0
|
| 108 |
+
hypothesis = agent_output.hypothesis
|
| 109 |
+
|
| 110 |
+
if len(hypothesis.split()) >= 20:
|
| 111 |
+
hypothesis_score += 0.05 # not a one-liner
|
| 112 |
+
|
| 113 |
+
# References specific code elements (backticks, quotes, or operators)
|
| 114 |
+
if re.search(r'[`\'"<>!=+\-*/]', hypothesis):
|
| 115 |
+
hypothesis_score += 0.05
|
| 116 |
+
|
| 117 |
+
# Mentions line numbers
|
| 118 |
+
if re.search(r'\bline\s+\d+\b|\b\d+\b', hypothesis):
|
| 119 |
+
hypothesis_score += 0.05
|
| 120 |
+
|
| 121 |
+
# Logically consistent: OBSERVATION and HYPOTHESIS reference same code area
|
| 122 |
+
obs_words = set(agent_output.observation.lower().split())
|
| 123 |
+
hyp_words = set(hypothesis.lower().split())
|
| 124 |
+
overlap = len(obs_words & hyp_words) / max(len(obs_words), 1)
|
| 125 |
+
if overlap > 0.15:
|
| 126 |
+
hypothesis_score += 0.05
|
| 127 |
+
|
| 128 |
+
# Confidence calibration: rewards correct confidence, penalizes overconfidence
|
| 129 |
+
# High confidence + correct = bonus, High confidence + wrong = penalty
|
| 130 |
+
if agent_output.action == "propose_fix":
|
| 131 |
+
tests_pass = test_results.get("passed", 0) == test_results.get("total", 1)
|
| 132 |
+
if agent_output.confidence == "high" and tests_pass:
|
| 133 |
+
hypothesis_score += 0.05 # well-calibrated
|
| 134 |
+
elif agent_output.confidence == "high" and not tests_pass:
|
| 135 |
+
hypothesis_score -= 0.05 # overconfident
|
| 136 |
+
elif agent_output.confidence == "low" and tests_pass:
|
| 137 |
+
hypothesis_score += 0.02 # humble but correct
|
| 138 |
+
|
| 139 |
+
hypothesis_score = max(0.0, min(hypothesis_score, 0.20))
|
| 140 |
+
|
| 141 |
+
# ββ COMPONENT 3: LOCALIZATION (Execution-based proxy) βββββββββββββ
|
| 142 |
+
# Did the agent identify WHERE the bug is, independently of fixing it?
|
| 143 |
+
localization_score = 0.0
|
| 144 |
+
bug_function = ground_truth.get("bug_function", "").lower()
|
| 145 |
+
bug_line = str(ground_truth.get("bug_line", -1))
|
| 146 |
+
|
| 147 |
+
combined_text = (agent_output.hypothesis + " " + agent_output.detail).lower()
|
| 148 |
+
|
| 149 |
+
if bug_function and bug_function in combined_text:
|
| 150 |
+
localization_score += 0.08
|
| 151 |
+
|
| 152 |
+
if bug_line != "-1" and bug_line in agent_output.hypothesis:
|
| 153 |
+
localization_score += 0.07
|
| 154 |
+
|
| 155 |
+
localization_score = min(localization_score, 0.15)
|
| 156 |
+
|
| 157 |
+
# ββ COMPONENT 4: FIX QUALITY (Execution-based, Paper 2 primary) βββ
|
| 158 |
+
# This is the dominant signal. Sparse but high value.
|
| 159 |
+
# Paper 1: combine with shaping (components 1-3) to solve sparse problem.
|
| 160 |
+
total_tests = test_results.get("total", 0)
|
| 161 |
+
passed_tests = test_results.get("passed", 0)
|
| 162 |
+
fix_score = 0.0
|
| 163 |
+
|
| 164 |
+
if total_tests > 0 and agent_output.action == "propose_fix":
|
| 165 |
+
pass_rate = passed_tests / total_tests
|
| 166 |
+
if pass_rate == 1.0:
|
| 167 |
+
fix_score = 0.35 # full solve β this is what we're training for
|
| 168 |
+
elif pass_rate >= 0.75:
|
| 169 |
+
fix_score = 0.20 # most tests pass
|
| 170 |
+
elif pass_rate >= 0.50:
|
| 171 |
+
fix_score = 0.12 # more than half pass
|
| 172 |
+
elif pass_rate > 0.0:
|
| 173 |
+
fix_score = 0.05 # at least something works
|
| 174 |
+
# 0.0 if nothing passes β no credit for non-fix actions
|
| 175 |
+
|
| 176 |
+
# ββ COMPONENT 5: SEMANTIC SIMILARITY (Paper 2 taxonomy) βββββββββββ
|
| 177 |
+
# How structurally close is the proposed fix to the canonical fix?
|
| 178 |
+
# Uses difflib β no heavy NLP dependencies needed.
|
| 179 |
+
semantic_score = 0.0
|
| 180 |
+
proposed = agent_output.detail
|
| 181 |
+
canonical = ground_truth.get("canonical_fix_code", "")
|
| 182 |
+
|
| 183 |
+
if proposed and canonical and agent_output.action == "propose_fix":
|
| 184 |
+
similarity = difflib.SequenceMatcher(None, proposed, canonical).ratio()
|
| 185 |
+
if similarity >= 0.85:
|
| 186 |
+
semantic_score = 0.10
|
| 187 |
+
elif similarity >= 0.65:
|
| 188 |
+
semantic_score = 0.05
|
| 189 |
+
elif similarity >= 0.40:
|
| 190 |
+
semantic_score = 0.02
|
| 191 |
+
# No reward below 0.40 similarity β prevents gaming with partial matches
|
| 192 |
+
|
| 193 |
+
# ββ COMPONENT 6: EFFICIENCY POTENTIAL (Potential-based, Paper 1) ββ
|
| 194 |
+
# Implements potential-based reward shaping: F(s,a,s') = Ξ³Ξ¦(s') - Ξ¦(s)
|
| 195 |
+
# where Ξ¦(state) = value of remaining turns
|
| 196 |
+
# This is PROVEN to not change the optimal policy (Ibrahim et al. Theorem 1)
|
| 197 |
+
# while still accelerating convergence.
|
| 198 |
+
remaining_turns = self.MAX_TURNS - turn_number
|
| 199 |
+
efficiency_potential = 0.02 * remaining_turns # max 0.10 on turn 0
|
| 200 |
+
|
| 201 |
+
# ββ PENALTIES βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 202 |
+
penalties = 0.0
|
| 203 |
+
|
| 204 |
+
# Regression: fix breaks previously-passing tests β severe
|
| 205 |
+
if test_results.get("newly_broken", 0) > 0:
|
| 206 |
+
penalties -= 0.20
|
| 207 |
+
|
| 208 |
+
# Give up: agent chose to give_up
|
| 209 |
+
if agent_output.action == "give_up":
|
| 210 |
+
penalties -= 0.15
|
| 211 |
+
|
| 212 |
+
# Invalid action: not one of the 5 valid actions
|
| 213 |
+
if agent_output.action == "invalid":
|
| 214 |
+
penalties -= 0.10
|
| 215 |
+
|
| 216 |
+
# Invalid format (already captured in format_score, add extra penalty)
|
| 217 |
+
if not agent_output.valid:
|
| 218 |
+
penalties -= 0.10
|
| 219 |
+
|
| 220 |
+
# ββ TOTAL βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 221 |
+
raw_total = (
|
| 222 |
+
format_score
|
| 223 |
+
+ hypothesis_score
|
| 224 |
+
+ localization_score
|
| 225 |
+
+ fix_score
|
| 226 |
+
+ semantic_score
|
| 227 |
+
+ efficiency_potential
|
| 228 |
+
+ penalties
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
# Floor at -0.5 to prevent reward death spiral (Ibrahim et al.)
|
| 232 |
+
total = max(raw_total, -0.5)
|
| 233 |
+
|
| 234 |
+
return RewardBreakdown(
|
| 235 |
+
format_compliance=round(format_score, 4),
|
| 236 |
+
hypothesis_quality=round(hypothesis_score, 4),
|
| 237 |
+
localization=round(localization_score, 4),
|
| 238 |
+
fix_quality=round(fix_score, 4),
|
| 239 |
+
semantic_similarity=round(semantic_score, 4),
|
| 240 |
+
efficiency_potential=round(efficiency_potential, 4),
|
| 241 |
+
penalties=round(penalties, 4),
|
| 242 |
+
total=round(total, 4),
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
def compute_episode_reward(self, trajectory: list[dict]) -> float:
    """Aggregate per-turn rewards into one episode-level reward.

    Turn i's reward is weighted by 0.9 ** i (a running discount), so
    earlier turns count slightly more. A flat 0.20 solve bonus is added
    when any turn's fix_quality reaches the solve threshold (0.35).
    Returns 0.0 for an empty trajectory.
    """
    if not trajectory:
        return 0.0

    episode_total = 0.0
    weight = 1.0
    for step in trajectory:
        episode_total += weight * step["reward"].total
        weight *= 0.9  # discount the next turn

    # Solve bonus: incentivize actually fixing the bug at least once.
    if any(step["reward"].fix_quality >= 0.35 for step in trajectory):
        episode_total += 0.20

    return round(episode_total, 4)
|
| 267 |
+
|
| 268 |
+
def get_reward_breakdown_for_logging(self, trajectory: list[dict]) -> dict:
    """Return per-component reward averages across the episode (W&B logging).

    Keys are "reward/<component>"; values are the component's mean over
    all turns, rounded to 4 decimals. Empty trajectory -> empty dict.
    """
    if not trajectory:
        return {}

    component_names = (
        "format_compliance", "hypothesis_quality", "localization",
        "fix_quality", "semantic_similarity", "efficiency_potential", "penalties"
    )

    n_turns = len(trajectory)
    averages = {}
    for name in component_names:
        component_sum = sum(getattr(turn["reward"], name) for turn in trajectory)
        averages[f"reward/{name}"] = round(component_sum / n_turns, 4)
    return averages
|
tests/__pycache__/test_environment.cpython-310-pytest-8.1.0.pyc
CHANGED
|
Binary files a/tests/__pycache__/test_environment.cpython-310-pytest-8.1.0.pyc and b/tests/__pycache__/test_environment.cpython-310-pytest-8.1.0.pyc differ
|
|
|
tests/__pycache__/test_sandbox.cpython-310-pytest-8.1.0.pyc
CHANGED
|
Binary files a/tests/__pycache__/test_sandbox.cpython-310-pytest-8.1.0.pyc and b/tests/__pycache__/test_sandbox.cpython-310-pytest-8.1.0.pyc differ
|
|
|
tests/test_integration.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AgentDebuggerEnv β Integration Tests
|
| 3 |
+
====================================
|
| 4 |
+
Verifies the full episode lifecycle: reset -> step -> end.
|
| 5 |
+
Assumes the server is available via the DebuggerEnvironment class directly
|
| 6 |
+
(testing the logic, not the HTTP layer which is just a thin wrapper).
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import pytest
|
| 10 |
+
from env.environment import DebuggerEnvironment
|
| 11 |
+
from env.models import Action
|
| 12 |
+
|
| 13 |
+
def test_full_episode_easy():
    """End-to-end check: reset the 'easy' task, submit the known fix, and win."""
    env = DebuggerEnvironment()

    # Reset and sanity-check the initial observation: episode open, tests failing.
    initial_obs = env.reset("easy")
    assert initial_obs["task_id"] == "easy"
    assert initial_obs["done"] is False
    assert initial_obs["tests_passed"] < initial_obs["tests_total"]

    # The easy task is binary search with 'left < right' instead of 'left <= right';
    # submit the known-correct implementation as the fix.
    ground_truth_code = """
def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
"""
    fix_action = Action(
        action_type="submit_fix",
        fixed_code=ground_truth_code,
        hypothesis="Binary search termination condition should be left <= right to include all elements."
    )
    outcome = env.step(fix_action)

    # Episode should end with every test passing and a high grader score.
    assert outcome["done"] is True
    assert outcome["observation"]["tests_passed"] == outcome["observation"]["tests_total"]
    assert outcome["reward"]["grader_score"] > 0.80
|
| 50 |
+
|
| 51 |
+
def test_query_hint_system():
    """The hint system should answer a context query; the first query is free."""
    env = DebuggerEnvironment()
    env.reset("hard")

    query = Action(action_type="query_context", query_type="test_suggestion")
    outcome = env.step(query)

    assert "concurrent threads" in outcome["info"]["query_result"]
    # First context query should cost nothing.
    assert outcome["reward"]["step_reward"] == 0.0
|
| 64 |
+
|
| 65 |
+
def test_hard_grader_consensus():
    """
    The hard grader should re-run flaky concurrent tests and award partial
    credit from the majority outcome (execute_code is mocked to be flaky).
    """
    from unittest.mock import patch
    from env.graders.grader_hard import HardGrader

    grader = HardGrader()

    # execute_code alternates PASS/FAIL: 3 passes out of 5 runs.
    flaky_outcomes = [
        ("CONCURRENT PASS", False, 100),
        ("CONCURRENT FAIL", False, 100),
        ("CONCURRENT PASS", False, 100),
        ("CONCURRENT FAIL", False, 100),
        ("CONCURRENT PASS", False, 100),
    ]
    with patch("env.graders.grader_hard.execute_code") as mock_exec:
        mock_exec.side_effect = flaky_outcomes

        score = grader.score(
            task_config={"task_id": "hard", "ground_truth": {"hypothesis_keywords": ["race"]}},
            attempts=[{"tests_passed": 8, "attempt_number": 1, "code_submitted": "..."}],
            best_tests_passed=8,
            tests_total=8,
            attempts_used=1,
            max_attempts=10,
            hypotheses=["race condition"]
        )

    # Expected breakdown:
    #   Sequential: 1.0 * 0.40 = 0.40
    #   Concurrency (3/5 passes -> partial credit): 0.15
    #   Hypothesis: 1.0 * 0.20 = 0.20
    #   Efficiency: requires full concurrency score (0.30) -> 0.0
    assert score == 0.75
|
training/train_grpo.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AgentDebuggerEnv β GRPO Training Script
|
| 3 |
+
Model: Qwen2.5-Coder-7B-Instruct (4-bit quantized via Unsloth)
|
| 4 |
+
Algorithm: GRPO (Group Relative Policy Optimization) via HuggingFace TRL
|
| 5 |
+
GPU: HuggingFace ZeroGPU H200 (free) or paid HF Spaces A10G
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
# Test run (no GPU needed, 10 steps):
|
| 9 |
+
python training/train_grpo.py --test
|
| 10 |
+
|
| 11 |
+
# Full training run:
|
| 12 |
+
python training/train_grpo.py
|
| 13 |
+
|
| 14 |
+
# Resume from checkpoint:
|
| 15 |
+
python training/train_grpo.py --resume ./checkpoints/checkpoint-400
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import os
|
| 19 |
+
import sys
|
| 20 |
+
import json
|
| 21 |
+
import argparse
|
| 22 |
+
import random
|
| 23 |
+
import subprocess
|
| 24 |
+
import tempfile
|
| 25 |
+
import torch
|
| 26 |
+
|
| 27 |
+
# ββ Parse args ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
parser = argparse.ArgumentParser()
|
| 29 |
+
parser.add_argument("--test", action="store_true", help="Run 10 steps for testing")
|
| 30 |
+
parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint")
|
| 31 |
+
parser.add_argument("--max_steps", type=int, default=1000)
|
| 32 |
+
args = parser.parse_args()
|
| 33 |
+
|
| 34 |
+
# ββ Install dependencies (for Colab/HF Spaces) βββββββββββββββββββββββββββββββ
|
| 35 |
+
# If running locally with venv, comment these out
|
| 36 |
+
if os.environ.get("COLAB_RELEASE_TAG") or os.environ.get("SPACE_ID"):
|
| 37 |
+
os.system("pip install -q unsloth trl wandb datasets")
|
| 38 |
+
|
| 39 |
+
# ββ Imports βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
import wandb
|
| 41 |
+
from datasets import Dataset
|
| 42 |
+
from unsloth import FastLanguageModel
|
| 43 |
+
from trl import GRPOTrainer, GRPOConfig
|
| 44 |
+
from transformers import TrainerCallback
|
| 45 |
+
|
| 46 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 47 |
+
from server.reward_calculator import DebugRewardCalculator
|
| 48 |
+
from server.models import parse_agent_output
|
| 49 |
+
|
| 50 |
+
# ββ Configuration βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 51 |
+
MODEL_NAME = "Qwen/Qwen2.5-Coder-7B-Instruct"
|
| 52 |
+
HF_REPO = "shashaank0707/AgentDebugger-trained"
|
| 53 |
+
MAX_STEPS = 10 if args.test else args.max_steps
|
| 54 |
+
CHECKPOINT_DIR = "./checkpoints"
|
| 55 |
+
|
| 56 |
+
# W&B β optional but strongly recommended for judging
|
| 57 |
+
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")
|
| 58 |
+
if WANDB_API_KEY:
|
| 59 |
+
wandb.init(
|
| 60 |
+
project="AgentDebuggerEnv",
|
| 61 |
+
name=f"grpo-qwen-7b-{'test' if args.test else 'full'}",
|
| 62 |
+
config={
|
| 63 |
+
"model": MODEL_NAME,
|
| 64 |
+
"algorithm": "GRPO",
|
| 65 |
+
"curriculum": "tier1->tier2->tier3",
|
| 66 |
+
"max_steps": MAX_STEPS,
|
| 67 |
+
"reward_components": ["format", "hypothesis", "localization", "fix", "semantic", "efficiency"],
|
| 68 |
+
"paper_citations": ["Masud et al. 2026", "Ibrahim et al. 2024"],
|
| 69 |
+
}
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
# ββ System prompt βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 73 |
+
SYSTEM_PROMPT = """You are an expert Python debugger. You reason through bugs systematically.
|
| 74 |
+
|
| 75 |
+
You MUST respond in EXACTLY this format β no exceptions, no extra text:
|
| 76 |
+
|
| 77 |
+
OBSERVATION: [Specific observations about the code and error. Reference exact line numbers.]
|
| 78 |
+
HYPOTHESIS: [Your theory about the root cause. Must be at least 2 sentences. Reference specific variable names, operators, or logic.]
|
| 79 |
+
CONFIDENCE: [low | medium | high]
|
| 80 |
+
ACTION: [One of: inspect_lines | run_tests | propose_fix | request_context | give_up]
|
| 81 |
+
DETAIL: [For propose_fix: the complete corrected function code. For inspect_lines: line numbers. For others: specific details.]
|
| 82 |
+
|
| 83 |
+
Rules:
|
| 84 |
+
- Never omit any field
|
| 85 |
+
- HYPOTHESIS must explain WHY the bug causes the observed failure
|
| 86 |
+
- If proposing a fix, DETAIL must contain the complete function, not just the changed line
|
| 87 |
+
- Give up only if you have exhausted all reasonable hypotheses"""
|
| 88 |
+
|
| 89 |
+
# ββ Load bugs βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
def load_bugs(tier: int) -> list[dict]:
    """Load the bug dataset for one curriculum tier from its JSONL file.

    Returns an empty list (with a console warning) when the file is missing,
    so training degrades gracefully instead of crashing.
    """
    path = f"data/bugs_tier{tier}.jsonl"
    if not os.path.exists(path):
        print(f"WARNING: {path} not found. Run data/generate_bugs.py first.")
        return []
    bugs = []
    with open(path) as handle:
        for raw_line in handle:
            # Skip blank lines; every other line is one JSON bug record.
            if raw_line.strip():
                bugs.append(json.loads(raw_line))
    return bugs
|
| 97 |
+
|
| 98 |
+
def get_bugs_for_step(step: int) -> list[dict]:
    """Curriculum schedule: pick which bug tiers are in play at a training step.

    Phase 1 (< 300):  tier 1 only.
    Phase 2 (< 600):  tier 1 plus the first 43% of tier 2.
    Phase 3 (>= 600): all three tiers.
    """
    tier1_bugs = load_bugs(1)
    if step >= 600:
        return tier1_bugs + load_bugs(2) + load_bugs(3)
    if step >= 300:
        tier2_bugs = load_bugs(2)
        cutoff = int(len(tier2_bugs) * 0.43)
        return tier1_bugs + tier2_bugs[:cutoff]
    return tier1_bugs
|
| 106 |
+
|
| 107 |
+
def bug_to_prompt(bug: dict) -> str:
    """Render one bug record as a ChatML prompt ending at the assistant turn."""
    failure_note = bug.get('initial_error', 'Some tests are failing.')
    segments = [
        f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n",
        "<|im_start|>user\n",
        f"Debug this Python function:\n\n```python\n{bug['buggy_code']}\n```\n\n",
        f"Initial failure: {failure_note}\n",
        "<|im_end|>\n",
        "<|im_start|>assistant\n",
    ]
    return "".join(segments)
|
| 116 |
+
|
| 117 |
+
# ββ Load model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 118 |
+
print(f"Loading {MODEL_NAME}...")
|
| 119 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 120 |
+
model_name=MODEL_NAME,
|
| 121 |
+
max_seq_length=4096,
|
| 122 |
+
load_in_4bit=True,
|
| 123 |
+
dtype=None,
|
| 124 |
+
)
|
| 125 |
+
model = FastLanguageModel.get_peft_model(
|
| 126 |
+
model,
|
| 127 |
+
r=16,
|
| 128 |
+
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
|
| 129 |
+
"gate_proj", "up_proj", "down_proj"],
|
| 130 |
+
lora_alpha=16,
|
| 131 |
+
lora_dropout=0,
|
| 132 |
+
bias="none",
|
| 133 |
+
use_gradient_checkpointing="unsloth",
|
| 134 |
+
random_state=42,
|
| 135 |
+
)
|
| 136 |
+
print(f"Trainable params: {model.num_parameters(only_trainable=True):,}")
|
| 137 |
+
|
| 138 |
+
# ββ Reward function βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 139 |
+
calculator = DebugRewardCalculator()
|
| 140 |
+
|
| 141 |
+
def reward_fn(completions: list[str], prompts: list[str], **kwargs) -> list[float]:
    """
    GRPO reward function. Called on groups of completions for the same prompt.
    GRPO learns from RELATIVE differences within each group.
    """
    bug_records = kwargs.get("bug_metadata", [{}] * len(completions))
    scores: list[float] = []

    for text, bug in zip(completions, bug_records):
        try:
            parsed = parse_agent_output(text)

            # Only execute the candidate fix when the agent proposed one.
            run_results = {"passed": 0, "failed": 0, "total": 0, "newly_broken": 0}
            if parsed.action == "propose_fix" and bug:
                run_results = _run_fix(parsed.detail, bug)

            location = bug.get("bug_location", {})
            breakdown = calculator.compute_turn_reward(
                agent_output=parsed,
                ground_truth={
                    "bug_function": location.get("function", ""),
                    "bug_line": location.get("line_start", -1),
                    "bug_type": bug.get("bug_type", ""),
                    "canonical_fix_code": bug.get("original_code", ""),
                },
                test_results=run_results,
                turn_number=0,
            )

            if WANDB_API_KEY:
                wandb.log(dict(breakdown.__dict__))

            scores.append(breakdown.total)

        except Exception as e:
            # A malformed completion must not crash training; give it a
            # fixed negative reward instead.
            print(f"Reward error: {e}")
            scores.append(-0.3)

    return scores
|
| 180 |
+
|
| 181 |
+
def _run_fix(proposed_code: str, bug: dict) -> dict:
|
| 182 |
+
"""Safely run proposed fix with subprocess timeout."""
|
| 183 |
+
test_cases = bug.get("test_cases", [])
|
| 184 |
+
func_name = bug.get("function_name", "")
|
| 185 |
+
if not proposed_code or not test_cases or not func_name:
|
| 186 |
+
return {"passed": 0, "failed": 0, "total": len(test_cases), "newly_broken": 0}
|
| 187 |
+
|
| 188 |
+
passed = 0
|
| 189 |
+
for test in test_cases:
|
| 190 |
+
inp = test["input"]
|
| 191 |
+
args_str = ", ".join(repr(x) for x in inp) if isinstance(inp, (list, tuple)) else repr(inp)
|
| 192 |
+
script = (
|
| 193 |
+
f"{proposed_code}\n"
|
| 194 |
+
f"try:\n"
|
| 195 |
+
f" r={func_name}({args_str})\n"
|
| 196 |
+
f" print('PASS' if r=={repr(test['expected_output'])} else 'FAIL')\n"
|
| 197 |
+
f"except Exception as e:\n"
|
| 198 |
+
f" print(f'ERROR: {{e}}')\n"
|
| 199 |
+
)
|
| 200 |
+
try:
|
| 201 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
|
| 202 |
+
f.write(script)
|
| 203 |
+
fname = f.name
|
| 204 |
+
r = subprocess.run(["python", fname], capture_output=True, text=True, timeout=5)
|
| 205 |
+
os.unlink(fname)
|
| 206 |
+
if "PASS" in r.stdout:
|
| 207 |
+
passed += 1
|
| 208 |
+
except Exception:
|
| 209 |
+
pass
|
| 210 |
+
|
| 211 |
+
return {"passed": passed, "failed": len(test_cases) - passed, "total": len(test_cases), "newly_broken": 0}
|
| 212 |
+
|
| 213 |
+
# ββ Baseline evaluation (run BEFORE training) βββββββββββββββββββββββββββββββββ
|
| 214 |
+
def run_baseline(n: int = 20) -> dict:
    """Evaluate the UNTRAINED model on the first *n* tier-1 bugs.

    Writes the result to baseline_results.json and (optionally) W&B so the
    post-training numbers have a before/after comparison point.
    """
    print("\nRunning baseline evaluation on UNTRAINED model...")
    FastLanguageModel.for_inference(model)

    eval_bugs = load_bugs(1)[:n]
    reward_values: list[float] = []
    solve_count = 0

    for eval_bug in eval_bugs:
        prompt = bug_to_prompt(eval_bug)
        enc = tokenizer(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            generated = model.generate(**enc, max_new_tokens=400, temperature=0.1, do_sample=False)
        # Decode only the newly generated tokens (strip the prompt prefix).
        prompt_len = enc["input_ids"].shape[1]
        completion = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)

        reward = reward_fn([completion], [prompt], bug_metadata=[eval_bug])[0]
        reward_values.append(reward)
        if reward > 0.30:
            solve_count += 1

    result = {
        "solve_rate": solve_count / max(len(eval_bugs), 1),
        "avg_reward": sum(reward_values) / max(len(reward_values), 1),
        "rewards": reward_values,
    }
    with open("baseline_results.json", "w") as f:
        json.dump(result, f)
    print(f"Baseline: solve_rate={result['solve_rate']:.1%}, avg_reward={result['avg_reward']:.3f}")
    if WANDB_API_KEY:
        wandb.log({"baseline/solve_rate": result["solve_rate"], "baseline/avg_reward": result["avg_reward"]})
    return result
|
| 238 |
+
|
| 239 |
+
baseline = run_baseline()
|
| 240 |
+
FastLanguageModel.for_training(model)
|
| 241 |
+
|
| 242 |
+
# ββ Build initial dataset βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 243 |
+
def make_dataset(step: int) -> Dataset:
    """Build the HF Dataset for the curriculum stage active at *step*."""
    rows = []
    for bug in get_bugs_for_step(step):
        rows.append({"prompt": bug_to_prompt(bug), "bug_metadata": bug})
    return Dataset.from_list(rows)
|
| 246 |
+
|
| 247 |
+
# ββ Training config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 248 |
+
config = GRPOConfig(
|
| 249 |
+
output_dir=CHECKPOINT_DIR,
|
| 250 |
+
max_steps=MAX_STEPS,
|
| 251 |
+
per_device_train_batch_size=2,
|
| 252 |
+
gradient_accumulation_steps=4,
|
| 253 |
+
learning_rate=1e-5,
|
| 254 |
+
lr_scheduler_type="cosine",
|
| 255 |
+
warmup_steps=20 if args.test else 50,
|
| 256 |
+
num_generations=4,
|
| 257 |
+
max_new_tokens=400,
|
| 258 |
+
temperature=0.8,
|
| 259 |
+
logging_steps=5 if args.test else 10,
|
| 260 |
+
save_steps=50 if args.test else 100,
|
| 261 |
+
report_to="wandb" if WANDB_API_KEY else "none",
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
trainer = GRPOTrainer(
|
| 265 |
+
model=model,
|
| 266 |
+
args=config,
|
| 267 |
+
train_dataset=make_dataset(0),
|
| 268 |
+
reward_funcs=reward_fn,
|
| 269 |
+
tokenizer=tokenizer,
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
# ββ Curriculum callback βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 273 |
+
class CurriculumCallback(TrainerCallback):
    """Swap in harder bug tiers when training hits fixed step milestones."""

    def on_step_end(self, args, state, control, **kwargs):
        current_step = state.global_step
        # Milestones at which the curriculum advances (see get_bugs_for_step).
        if current_step not in (300, 600):
            return
        trainer.train_dataset = make_dataset(current_step)
        print(f"\nCurriculum advanced at step {current_step}!")
        if WANDB_API_KEY:
            wandb.log({"curriculum/step": current_step})
|
| 281 |
+
|
| 282 |
+
trainer.add_callback(CurriculumCallback())
|
| 283 |
+
|
| 284 |
+
# ββ Train βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 285 |
+
print(f"\nStarting GRPO training. Max steps: {MAX_STEPS}")
|
| 286 |
+
print(f"Baseline solve rate: {baseline['solve_rate']:.1%} β target: >60% after training")
|
| 287 |
+
trainer.train(resume_from_checkpoint=args.resume)
|
| 288 |
+
|
| 289 |
+
# ββ Post-training evaluation ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 290 |
+
FastLanguageModel.for_inference(model)
|
| 291 |
+
bugs = load_bugs(1)[:20]
|
| 292 |
+
post_rewards = []
|
| 293 |
+
post_solved = 0
|
| 294 |
+
for bug in bugs:
|
| 295 |
+
prompt = bug_to_prompt(bug)
|
| 296 |
+
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
|
| 297 |
+
with torch.no_grad():
|
| 298 |
+
out = model.generate(**inputs, max_new_tokens=400, temperature=0.1, do_sample=False)
|
| 299 |
+
completion = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
| 300 |
+
r = reward_fn([completion], [prompt], bug_metadata=[bug])
|
| 301 |
+
post_rewards.append(r[0])
|
| 302 |
+
if r[0] > 0.30:
|
| 303 |
+
post_solved += 1
|
| 304 |
+
|
| 305 |
+
post_solve_rate = post_solved / max(len(bugs), 1)
|
| 306 |
+
print(f"\n{'='*60}")
|
| 307 |
+
print(f"RESULTS:")
|
| 308 |
+
print(f"Before training: {baseline['solve_rate']:.1%} solve rate")
|
| 309 |
+
print(f"After training: {post_solve_rate:.1%} solve rate")
|
| 310 |
+
print(f"Improvement: +{post_solve_rate - baseline['solve_rate']:.1%}")
|
| 311 |
+
print(f"{'='*60}")
|
| 312 |
+
|
| 313 |
+
if WANDB_API_KEY:
|
| 314 |
+
wandb.log({"final/solve_rate": post_solve_rate, "final/improvement": post_solve_rate - baseline["solve_rate"]})
|
| 315 |
+
wandb.finish()
|
| 316 |
+
|
| 317 |
+
# ββ Save and push βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 318 |
+
model.save_pretrained("./final_model")
|
| 319 |
+
tokenizer.save_pretrained("./final_model")
|
| 320 |
+
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 321 |
+
if HF_TOKEN and not args.test:
|
| 322 |
+
model.push_to_hub(HF_REPO, token=HF_TOKEN)
|
| 323 |
+
tokenizer.push_to_hub(HF_REPO, token=HF_TOKEN)
|
| 324 |
+
print(f"Pushed to https://huggingface.co/{HF_REPO}")
|
uv.lock
CHANGED
|
@@ -25,11 +25,11 @@ requires-dist = [
|
|
| 25 |
{ name = "httpx", specifier = "==0.27.0" },
|
| 26 |
{ name = "openai", specifier = "==2.7.2" },
|
| 27 |
{ name = "openenv-core", specifier = ">=0.2.0" },
|
| 28 |
-
{ name = "pydantic", specifier = "=
|
| 29 |
{ name = "pytest", specifier = "==8.1.0" },
|
| 30 |
{ name = "python-dotenv", specifier = "==1.0.1" },
|
| 31 |
{ name = "requests", specifier = "==2.31.0" },
|
| 32 |
-
{ name = "restrictedpython", specifier = "==7.
|
| 33 |
{ name = "uvicorn", specifier = "==0.29.0" },
|
| 34 |
]
|
| 35 |
|
|
@@ -541,73 +541,133 @@ wheels = [
|
|
| 541 |
|
| 542 |
[[package]]
|
| 543 |
name = "pydantic"
|
| 544 |
-
version = "2.
|
| 545 |
source = { registry = "https://pypi.org/simple" }
|
| 546 |
dependencies = [
|
| 547 |
{ name = "annotated-types" },
|
| 548 |
{ name = "pydantic-core" },
|
| 549 |
{ name = "typing-extensions" },
|
|
|
|
| 550 |
]
|
| 551 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 552 |
wheels = [
|
| 553 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 554 |
]
|
| 555 |
|
| 556 |
[[package]]
|
| 557 |
name = "pydantic-core"
|
| 558 |
-
version = "2.
|
| 559 |
source = { registry = "https://pypi.org/simple" }
|
| 560 |
dependencies = [
|
| 561 |
{ name = "typing-extensions" },
|
| 562 |
]
|
| 563 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 564 |
-
wheels = [
|
| 565 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 566 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 567 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 568 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 569 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 570 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 571 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 572 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 573 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 574 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 575 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 576 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 577 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 578 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 579 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 580 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 581 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 582 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 583 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 584 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 585 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 586 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 587 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 588 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 589 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 590 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 591 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 592 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 593 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 594 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 595 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 596 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 597 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 598 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 599 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 600 |
-
{ url = "https://files.pythonhosted.org/packages/d5/
|
| 601 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 602 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 603 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 604 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 605 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 606 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 607 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 608 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 609 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 610 |
-
{ url = "https://files.pythonhosted.org/packages/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
]
|
| 612 |
|
| 613 |
[[package]]
|
|
@@ -726,11 +786,11 @@ wheels = [
|
|
| 726 |
|
| 727 |
[[package]]
|
| 728 |
name = "restrictedpython"
|
| 729 |
-
version = "7.
|
| 730 |
source = { registry = "https://pypi.org/simple" }
|
| 731 |
-
sdist = { url = "https://files.pythonhosted.org/packages/
|
| 732 |
wheels = [
|
| 733 |
-
{ url = "https://files.pythonhosted.org/packages/
|
| 734 |
]
|
| 735 |
|
| 736 |
[[package]]
|
|
@@ -875,6 +935,18 @@ wheels = [
|
|
| 875 |
{ url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
|
| 876 |
]
|
| 877 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 878 |
[[package]]
|
| 879 |
name = "urllib3"
|
| 880 |
version = "2.6.3"
|
|
|
|
| 25 |
{ name = "httpx", specifier = "==0.27.0" },
|
| 26 |
{ name = "openai", specifier = "==2.7.2" },
|
| 27 |
{ name = "openenv-core", specifier = ">=0.2.0" },
|
| 28 |
+
{ name = "pydantic", specifier = ">=2.9.0" },
|
| 29 |
{ name = "pytest", specifier = "==8.1.0" },
|
| 30 |
{ name = "python-dotenv", specifier = "==1.0.1" },
|
| 31 |
{ name = "requests", specifier = "==2.31.0" },
|
| 32 |
+
{ name = "restrictedpython", specifier = "==7.0" },
|
| 33 |
{ name = "uvicorn", specifier = "==0.29.0" },
|
| 34 |
]
|
| 35 |
|
|
|
|
| 541 |
|
| 542 |
[[package]]
|
| 543 |
name = "pydantic"
|
| 544 |
+
version = "2.13.3"
|
| 545 |
source = { registry = "https://pypi.org/simple" }
|
| 546 |
dependencies = [
|
| 547 |
{ name = "annotated-types" },
|
| 548 |
{ name = "pydantic-core" },
|
| 549 |
{ name = "typing-extensions" },
|
| 550 |
+
{ name = "typing-inspection" },
|
| 551 |
]
|
| 552 |
+
sdist = { url = "https://files.pythonhosted.org/packages/d9/e4/40d09941a2cebcb20609b86a559817d5b9291c49dd6f8c87e5feffbe703a/pydantic-2.13.3.tar.gz", hash = "sha256:af09e9d1d09f4e7fe37145c1f577e1d61ceb9a41924bf0094a36506285d0a84d", size = 844068, upload-time = "2026-04-20T14:46:43.632Z" }
|
| 553 |
wheels = [
|
| 554 |
+
{ url = "https://files.pythonhosted.org/packages/f3/0a/fd7d723f8f8153418fb40cf9c940e82004fce7e987026b08a68a36dd3fe7/pydantic-2.13.3-py3-none-any.whl", hash = "sha256:6db14ac8dfc9a1e57f87ea2c0de670c251240f43cb0c30a5130e9720dc612927", size = 471981, upload-time = "2026-04-20T14:46:41.402Z" },
|
| 555 |
]
|
| 556 |
|
| 557 |
[[package]]
|
| 558 |
name = "pydantic-core"
|
| 559 |
+
version = "2.46.3"
|
| 560 |
source = { registry = "https://pypi.org/simple" }
|
| 561 |
dependencies = [
|
| 562 |
{ name = "typing-extensions" },
|
| 563 |
]
|
| 564 |
+
sdist = { url = "https://files.pythonhosted.org/packages/2a/ef/f7abb56c49382a246fd2ce9c799691e3c3e7175ec74b14d99e798bcddb1a/pydantic_core-2.46.3.tar.gz", hash = "sha256:41c178f65b8c29807239d47e6050262eb6bf84eb695e41101e62e38df4a5bc2c", size = 471412, upload-time = "2026-04-20T14:40:56.672Z" }
|
| 565 |
+
wheels = [
|
| 566 |
+
{ url = "https://files.pythonhosted.org/packages/22/98/b50eb9a411e87483b5c65dba4fa430a06bac4234d3403a40e5a9905ebcd0/pydantic_core-2.46.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:1da3786b8018e60349680720158cc19161cc3b4bdd815beb0a321cd5ce1ad5b1", size = 2108971, upload-time = "2026-04-20T14:43:51.945Z" },
|
| 567 |
+
{ url = "https://files.pythonhosted.org/packages/08/4b/f364b9d161718ff2217160a4b5d41ce38de60aed91c3689ebffa1c939d23/pydantic_core-2.46.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cc0988cb29d21bf4a9d5cf2ef970b5c0e38d8d8e107a493278c05dc6c1dda69f", size = 1949588, upload-time = "2026-04-20T14:44:10.386Z" },
|
| 568 |
+
{ url = "https://files.pythonhosted.org/packages/8f/8b/30bd03ee83b2f5e29f5ba8e647ab3c456bf56f2ec72fdbcc0215484a0854/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f9067c3bfadd04c55484b89c0d267981b2f3512850f6f66e1e74204a4e4ce3", size = 1975986, upload-time = "2026-04-20T14:43:57.106Z" },
|
| 569 |
+
{ url = "https://files.pythonhosted.org/packages/3c/54/13ccf954d84ec275d5d023d5786e4aa48840bc9f161f2838dc98e1153518/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a642ac886ecf6402d9882d10c405dcf4b902abeb2972cd5fb4a48c83cd59279a", size = 2055830, upload-time = "2026-04-20T14:44:15.499Z" },
|
| 570 |
+
{ url = "https://files.pythonhosted.org/packages/be/0e/65f38125e660fdbd72aa858e7dfae893645cfa0e7b13d333e174a367cd23/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79f561438481f28681584b89e2effb22855e2179880314bcddbf5968e935e807", size = 2222340, upload-time = "2026-04-20T14:41:51.353Z" },
|
| 571 |
+
{ url = "https://files.pythonhosted.org/packages/d1/88/f3ab7739efe0e7e80777dbb84c59eb98518e3f57ea433206194c2e425272/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57a973eae4665352a47cf1a99b4ee864620f2fe663a217d7a8da68a1f3a5bfda", size = 2280727, upload-time = "2026-04-20T14:41:30.461Z" },
|
| 572 |
+
{ url = "https://files.pythonhosted.org/packages/2a/6d/c228219080817bec4982f9531cadb18da6aaa770fdeb114f49c237ac2c9f/pydantic_core-2.46.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83d002b97072a53ea150d63e0a3adfae5670cef5aa8a6e490240e482d3b22e57", size = 2092158, upload-time = "2026-04-20T14:44:07.305Z" },
|
| 573 |
+
{ url = "https://files.pythonhosted.org/packages/0f/b1/525a16711e7c6d61635fac3b0bd54600b5c5d9f60c6fc5aaab26b64a2297/pydantic_core-2.46.3-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:b40ddd51e7c44b28cfaef746c9d3c506d658885e0a46f9eeef2ee815cbf8e045", size = 2116626, upload-time = "2026-04-20T14:42:34.118Z" },
|
| 574 |
+
{ url = "https://files.pythonhosted.org/packages/ef/7c/17d30673351439a6951bf54f564cf2443ab00ae264ec9df00e2efd710eb5/pydantic_core-2.46.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ac5ec7fb9b87f04ee839af2d53bcadea57ded7d229719f56c0ed895bff987943", size = 2160691, upload-time = "2026-04-20T14:41:14.023Z" },
|
| 575 |
+
{ url = "https://files.pythonhosted.org/packages/86/66/af8adbcbc0886ead7f1a116606a534d75a307e71e6e08226000d51b880d2/pydantic_core-2.46.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a3b11c812f61b3129c4905781a2601dfdfdea5fe1e6c1cfb696b55d14e9c054f", size = 2182543, upload-time = "2026-04-20T14:40:48.886Z" },
|
| 576 |
+
{ url = "https://files.pythonhosted.org/packages/b0/37/6de71e0f54c54a4190010f57deb749e1ddf75c568ada3b1320b70067f121/pydantic_core-2.46.3-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:1108da631e602e5b3c38d6d04fe5bb3bfa54349e6918e3ca6cf570b2e2b2f9d4", size = 2324513, upload-time = "2026-04-20T14:42:36.121Z" },
|
| 577 |
+
{ url = "https://files.pythonhosted.org/packages/51/b1/9fc74ce94f603d5ef59ff258ca9c2c8fb902fb548d340a96f77f4d1c3b7f/pydantic_core-2.46.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:de885175515bcfa98ae618c1df7a072f13d179f81376c8007112af20567fd08a", size = 2361853, upload-time = "2026-04-20T14:43:24.886Z" },
|
| 578 |
+
{ url = "https://files.pythonhosted.org/packages/40/d0/4c652fc592db35f100279ee751d5a145aca1b9a7984b9684ba7c1b5b0535/pydantic_core-2.46.3-cp310-cp310-win32.whl", hash = "sha256:d11058e3201527d41bc6b545c79187c9e4bf85e15a236a6007f0e991518882b7", size = 1980465, upload-time = "2026-04-20T14:44:46.239Z" },
|
| 579 |
+
{ url = "https://files.pythonhosted.org/packages/27/b8/a920453c38afbe1f355e1ea0b0d94a0a3e0b0879d32d793108755fa171d5/pydantic_core-2.46.3-cp310-cp310-win_amd64.whl", hash = "sha256:3612edf65c8ea67ac13616c4d23af12faef1ae435a8a93e5934c2a0cbbdd1fd6", size = 2073884, upload-time = "2026-04-20T14:43:01.201Z" },
|
| 580 |
+
{ url = "https://files.pythonhosted.org/packages/22/a2/1ba90a83e85a3f94c796b184f3efde9c72f2830dcda493eea8d59ba78e6d/pydantic_core-2.46.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ab124d49d0459b2373ecf54118a45c28a1e6d4192a533fbc915e70f556feb8e5", size = 2106740, upload-time = "2026-04-20T14:41:20.932Z" },
|
| 581 |
+
{ url = "https://files.pythonhosted.org/packages/b6/f6/99ae893c89a0b9d3daec9f95487aa676709aa83f67643b3f0abaf4ab628a/pydantic_core-2.46.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cca67d52a5c7a16aed2b3999e719c4bcf644074eac304a5d3d62dd70ae7d4b2c", size = 1948293, upload-time = "2026-04-20T14:43:42.115Z" },
|
| 582 |
+
{ url = "https://files.pythonhosted.org/packages/3e/b8/2e8e636dc9e3f16c2e16bf0849e24be82c5ee82c603c65fc0326666328fc/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c024e08c0ba23e6fd68c771a521e9d6a792f2ebb0fa734296b36394dc30390e", size = 1973222, upload-time = "2026-04-20T14:41:57.841Z" },
|
| 583 |
+
{ url = "https://files.pythonhosted.org/packages/34/36/0e730beec4d83c5306f417afbd82ff237d9a21e83c5edf675f31ed84c1fe/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6645ce7eec4928e29a1e3b3d5c946621d105d3e79f0c9cddf07c2a9770949287", size = 2053852, upload-time = "2026-04-20T14:40:43.077Z" },
|
| 584 |
+
{ url = "https://files.pythonhosted.org/packages/4b/f0/3071131f47e39136a17814576e0fada9168569f7f8c0e6ac4d1ede6a4958/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a712c7118e6c5ea96562f7b488435172abb94a3c53c22c9efc1412264a45cbbe", size = 2221134, upload-time = "2026-04-20T14:43:03.349Z" },
|
| 585 |
+
{ url = "https://files.pythonhosted.org/packages/2f/a9/a2dc023eec5aa4b02a467874bad32e2446957d2adcab14e107eab502e978/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:69a868ef3ff206343579021c40faf3b1edc64b1cc508ff243a28b0a514ccb050", size = 2279785, upload-time = "2026-04-20T14:41:19.285Z" },
|
| 586 |
+
{ url = "https://files.pythonhosted.org/packages/0a/44/93f489d16fb63fbd41c670441536541f6e8cfa1e5a69f40bc9c5d30d8c90/pydantic_core-2.46.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc7e8c32db809aa0f6ea1d6869ebc8518a65d5150fdfad8bcae6a49ae32a22e2", size = 2089404, upload-time = "2026-04-20T14:43:10.108Z" },
|
| 587 |
+
{ url = "https://files.pythonhosted.org/packages/2a/78/8692e3aa72b2d004f7a5d937f1dfdc8552ba26caf0bec75f342c40f00dec/pydantic_core-2.46.3-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:3481bd1341dc85779ee506bc8e1196a277ace359d89d28588a9468c3ecbe63fa", size = 2114898, upload-time = "2026-04-20T14:44:51.475Z" },
|
| 588 |
+
{ url = "https://files.pythonhosted.org/packages/6a/62/e83133f2e7832532060175cebf1f13748f4c7e7e7165cdd1f611f174494b/pydantic_core-2.46.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8690eba565c6d68ffd3a8655525cbdd5246510b44a637ee2c6c03a7ebfe64d3c", size = 2157856, upload-time = "2026-04-20T14:43:46.64Z" },
|
| 589 |
+
{ url = "https://files.pythonhosted.org/packages/6d/ec/6a500e3ad7718ee50583fae79c8651f5d37e3abce1fa9ae177ae65842c53/pydantic_core-2.46.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4de88889d7e88d50d40ee5b39d5dac0bcaef9ba91f7e536ac064e6b2834ecccf", size = 2180168, upload-time = "2026-04-20T14:42:00.302Z" },
|
| 590 |
+
{ url = "https://files.pythonhosted.org/packages/d8/53/8267811054b1aa7fc1dc7ded93812372ef79a839f5e23558136a6afbfde1/pydantic_core-2.46.3-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:e480080975c1ef7f780b8f99ed72337e7cc5efea2e518a20a692e8e7b278eb8b", size = 2322885, upload-time = "2026-04-20T14:41:05.253Z" },
|
| 591 |
+
{ url = "https://files.pythonhosted.org/packages/c8/c1/1c0acdb3aa0856ddc4ecc55214578f896f2de16f400cf51627eb3c26c1c4/pydantic_core-2.46.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:de3a5c376f8cd94da9a1b8fd3dd1c16c7a7b216ed31dc8ce9fd7a22bf13b836e", size = 2360328, upload-time = "2026-04-20T14:41:43.991Z" },
|
| 592 |
+
{ url = "https://files.pythonhosted.org/packages/f0/d0/ef39cd0f4a926814f360e71c1adeab48ad214d9727e4deb48eedfb5bce1a/pydantic_core-2.46.3-cp311-cp311-win32.whl", hash = "sha256:fc331a5314ffddd5385b9ee9d0d2fee0b13c27e0e02dad71b1ae5d6561f51eeb", size = 1979464, upload-time = "2026-04-20T14:43:12.215Z" },
|
| 593 |
+
{ url = "https://files.pythonhosted.org/packages/18/9c/f41951b0d858e343f1cf09398b2a7b3014013799744f2c4a8ad6a3eec4f2/pydantic_core-2.46.3-cp311-cp311-win_amd64.whl", hash = "sha256:b5b9c6cf08a8a5e502698f5e153056d12c34b8fb30317e0c5fd06f45162a6346", size = 2070837, upload-time = "2026-04-20T14:41:47.707Z" },
|
| 594 |
+
{ url = "https://files.pythonhosted.org/packages/9f/1e/264a17cd582f6ed50950d4d03dd5fefd84e570e238afe1cb3e25cf238769/pydantic_core-2.46.3-cp311-cp311-win_arm64.whl", hash = "sha256:5dfd51cf457482f04ec49491811a2b8fd5b843b64b11eecd2d7a1ee596ea78a6", size = 2053647, upload-time = "2026-04-20T14:42:27.535Z" },
|
| 595 |
+
{ url = "https://files.pythonhosted.org/packages/4b/cb/5b47425556ecc1f3fe18ed2a0083188aa46e1dd812b06e406475b3a5d536/pydantic_core-2.46.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:b11b59b3eee90a80a36701ddb4576d9ae31f93f05cb9e277ceaa09e6bf074a67", size = 2101946, upload-time = "2026-04-20T14:40:52.581Z" },
|
| 596 |
+
{ url = "https://files.pythonhosted.org/packages/a1/4f/2fb62c2267cae99b815bbf4a7b9283812c88ca3153ef29f7707200f1d4e5/pydantic_core-2.46.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:af8653713055ea18a3abc1537fe2ebc42f5b0bbb768d1eb79fd74eb47c0ac089", size = 1951612, upload-time = "2026-04-20T14:42:42.996Z" },
|
| 597 |
+
{ url = "https://files.pythonhosted.org/packages/50/6e/b7348fd30d6556d132cddd5bd79f37f96f2601fe0608afac4f5fb01ec0b3/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75a519dab6d63c514f3a81053e5266c549679e4aa88f6ec57f2b7b854aceb1b0", size = 1977027, upload-time = "2026-04-20T14:42:02.001Z" },
|
| 598 |
+
{ url = "https://files.pythonhosted.org/packages/82/11/31d60ee2b45540d3fb0b29302a393dbc01cd771c473f5b5147bcd353e593/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a6cd87cb1575b1ad05ba98894c5b5c96411ef678fa2f6ed2576607095b8d9789", size = 2063008, upload-time = "2026-04-20T14:44:17.952Z" },
|
| 599 |
+
{ url = "https://files.pythonhosted.org/packages/8a/db/3a9d1957181b59258f44a2300ab0f0be9d1e12d662a4f57bb31250455c52/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f80a55484b8d843c8ada81ebf70a682f3f00a3d40e378c06cf17ecb44d280d7d", size = 2233082, upload-time = "2026-04-20T14:40:57.934Z" },
|
| 600 |
+
{ url = "https://files.pythonhosted.org/packages/9c/e1/3277c38792aeb5cfb18c2f0c5785a221d9ff4e149abbe1184d53d5f72273/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3861f1731b90c50a3266316b9044f5c9b405eecb8e299b0a7120596334e4fe9c", size = 2304615, upload-time = "2026-04-20T14:42:12.584Z" },
|
| 601 |
+
{ url = "https://files.pythonhosted.org/packages/5e/d5/e3d9717c9eba10855325650afd2a9cba8e607321697f18953af9d562da2f/pydantic_core-2.46.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb528e295ed31570ac3dcc9bfdd6e0150bc11ce6168ac87a8082055cf1a67395", size = 2094380, upload-time = "2026-04-20T14:43:05.522Z" },
|
| 602 |
+
{ url = "https://files.pythonhosted.org/packages/a1/20/abac35dedcbfd66c6f0b03e4e3564511771d6c9b7ede10a362d03e110d9b/pydantic_core-2.46.3-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:367508faa4973b992b271ba1494acaab36eb7e8739d1e47be5035fb1ea225396", size = 2135429, upload-time = "2026-04-20T14:41:55.549Z" },
|
| 603 |
+
{ url = "https://files.pythonhosted.org/packages/6c/a5/41bfd1df69afad71b5cf0535055bccc73022715ad362edbc124bc1e021d7/pydantic_core-2.46.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ad3c826fe523e4becf4fe39baa44286cff85ef137c729a2c5e269afbfd0905d", size = 2174582, upload-time = "2026-04-20T14:41:45.96Z" },
|
| 604 |
+
{ url = "https://files.pythonhosted.org/packages/79/65/38d86ea056b29b2b10734eb23329b7a7672ca604df4f2b6e9c02d4ee22fe/pydantic_core-2.46.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ec638c5d194ef8af27db69f16c954a09797c0dc25015ad6123eb2c73a4d271ca", size = 2187533, upload-time = "2026-04-20T14:40:55.367Z" },
|
| 605 |
+
{ url = "https://files.pythonhosted.org/packages/b6/55/a1129141678a2026badc539ad1dee0a71d06f54c2f06a4bd68c030ac781b/pydantic_core-2.46.3-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:28ed528c45446062ee66edb1d33df5d88828ae167de76e773a3c7f64bd14e976", size = 2332985, upload-time = "2026-04-20T14:44:13.05Z" },
|
| 606 |
+
{ url = "https://files.pythonhosted.org/packages/d7/60/cb26f4077719f709e54819f4e8e1d43f4091f94e285eb6bd21e1190a7b7c/pydantic_core-2.46.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aed19d0c783886d5bd86d80ae5030006b45e28464218747dcf83dabfdd092c7b", size = 2373670, upload-time = "2026-04-20T14:41:53.421Z" },
|
| 607 |
+
{ url = "https://files.pythonhosted.org/packages/6b/7e/c3f21882bdf1d8d086876f81b5e296206c69c6082551d776895de7801fa0/pydantic_core-2.46.3-cp312-cp312-win32.whl", hash = "sha256:06d5d8820cbbdb4147578c1fe7ffcd5b83f34508cb9f9ab76e807be7db6ff0a4", size = 1966722, upload-time = "2026-04-20T14:44:30.588Z" },
|
| 608 |
+
{ url = "https://files.pythonhosted.org/packages/57/be/6b5e757b859013ebfbd7adba02f23b428f37c86dcbf78b5bb0b4ffd36e99/pydantic_core-2.46.3-cp312-cp312-win_amd64.whl", hash = "sha256:c3212fda0ee959c1dd04c60b601ec31097aaa893573a3a1abd0a47bcac2968c1", size = 2072970, upload-time = "2026-04-20T14:42:54.248Z" },
|
| 609 |
+
{ url = "https://files.pythonhosted.org/packages/bf/f8/a989b21cc75e9a32d24192ef700eea606521221a89faa40c919ce884f2b1/pydantic_core-2.46.3-cp312-cp312-win_arm64.whl", hash = "sha256:f1f8338dd7a7f31761f1f1a3c47503a9a3b34eea3c8b01fa6ee96408affb5e72", size = 2035963, upload-time = "2026-04-20T14:44:20.4Z" },
|
| 610 |
+
{ url = "https://files.pythonhosted.org/packages/9b/3c/9b5e8eb9821936d065439c3b0fb1490ffa64163bfe7e1595985a47896073/pydantic_core-2.46.3-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:12bc98de041458b80c86c56b24df1d23832f3e166cbaff011f25d187f5c62c37", size = 2102109, upload-time = "2026-04-20T14:41:24.219Z" },
|
| 611 |
+
{ url = "https://files.pythonhosted.org/packages/91/97/1c41d1f5a19f241d8069f1e249853bcce378cdb76eec8ab636d7bc426280/pydantic_core-2.46.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:85348b8f89d2c3508b65b16c3c33a4da22b8215138d8b996912bb1532868885f", size = 1951820, upload-time = "2026-04-20T14:42:14.236Z" },
|
| 612 |
+
{ url = "https://files.pythonhosted.org/packages/30/b4/d03a7ae14571bc2b6b3c7b122441154720619afe9a336fa3a95434df5e2f/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1105677a6df914b1fb71a81b96c8cce7726857e1717d86001f29be06a25ee6f8", size = 1977785, upload-time = "2026-04-20T14:42:31.648Z" },
|
| 613 |
+
{ url = "https://files.pythonhosted.org/packages/ae/0c/4086f808834b59e3c8f1aa26df8f4b6d998cdcf354a143d18ef41529d1fe/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:87082cd65669a33adeba5470769e9704c7cf026cc30afb9cc77fd865578ebaad", size = 2062761, upload-time = "2026-04-20T14:40:37.093Z" },
|
| 614 |
+
{ url = "https://files.pythonhosted.org/packages/fa/71/a649be5a5064c2df0db06e0a512c2281134ed2fcc981f52a657936a7527c/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:60e5f66e12c4f5212d08522963380eaaeac5ebd795826cfd19b2dfb0c7a52b9c", size = 2232989, upload-time = "2026-04-20T14:42:59.254Z" },
|
| 615 |
+
{ url = "https://files.pythonhosted.org/packages/a2/84/7756e75763e810b3a710f4724441d1ecc5883b94aacb07ca71c5fb5cfb69/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b6cdf19bf84128d5e7c37e8a73a0c5c10d51103a650ac585d42dd6ae233f2b7f", size = 2303975, upload-time = "2026-04-20T14:41:32.287Z" },
|
| 616 |
+
{ url = "https://files.pythonhosted.org/packages/6c/35/68a762e0c1e31f35fa0dac733cbd9f5b118042853698de9509c8e5bf128b/pydantic_core-2.46.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:031bb17f4885a43773c8c763089499f242aee2ea85cf17154168775dccdecf35", size = 2095325, upload-time = "2026-04-20T14:42:47.685Z" },
|
| 617 |
+
{ url = "https://files.pythonhosted.org/packages/77/bf/1bf8c9a8e91836c926eae5e3e51dce009bf495a60ca56060689d3df3f340/pydantic_core-2.46.3-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:bcf2a8b2982a6673693eae7348ef3d8cf3979c1d63b54fca7c397a635cc68687", size = 2133368, upload-time = "2026-04-20T14:41:22.766Z" },
|
| 618 |
+
{ url = "https://files.pythonhosted.org/packages/e5/50/87d818d6bab915984995157ceb2380f5aac4e563dddbed6b56f0ed057aba/pydantic_core-2.46.3-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28e8cf2f52d72ced402a137145923a762cbb5081e48b34312f7a0c8f55928ec3", size = 2173908, upload-time = "2026-04-20T14:42:52.044Z" },
|
| 619 |
+
{ url = "https://files.pythonhosted.org/packages/91/88/a311fb306d0bd6185db41fa14ae888fb81d0baf648a761ae760d30819d33/pydantic_core-2.46.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:17eaface65d9fc5abb940003020309c1bf7a211f5f608d7870297c367e6f9022", size = 2186422, upload-time = "2026-04-20T14:43:29.55Z" },
|
| 620 |
+
{ url = "https://files.pythonhosted.org/packages/8f/79/28fd0d81508525ab2054fef7c77a638c8b5b0afcbbaeee493cf7c3fef7e1/pydantic_core-2.46.3-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:93fd339f23408a07e98950a89644f92c54d8729719a40b30c0a30bb9ebc55d23", size = 2332709, upload-time = "2026-04-20T14:42:16.134Z" },
|
| 621 |
+
{ url = "https://files.pythonhosted.org/packages/b3/21/795bf5fe5c0f379308b8ef19c50dedab2e7711dbc8d0c2acf08f1c7daa05/pydantic_core-2.46.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:23cbdb3aaa74dfe0837975dbf69b469753bbde8eacace524519ffdb6b6e89eb7", size = 2372428, upload-time = "2026-04-20T14:41:10.974Z" },
|
| 622 |
+
{ url = "https://files.pythonhosted.org/packages/45/b3/ed14c659cbe7605e3ef063077680a64680aec81eb1a04763a05190d49b7f/pydantic_core-2.46.3-cp313-cp313-win32.whl", hash = "sha256:610eda2e3838f401105e6326ca304f5da1e15393ae25dacae5c5c63f2c275b13", size = 1965601, upload-time = "2026-04-20T14:41:42.128Z" },
|
| 623 |
+
{ url = "https://files.pythonhosted.org/packages/ef/bb/adb70d9a762ddd002d723fbf1bd492244d37da41e3af7b74ad212609027e/pydantic_core-2.46.3-cp313-cp313-win_amd64.whl", hash = "sha256:68cc7866ed863db34351294187f9b729964c371ba33e31c26f478471c52e1ed0", size = 2071517, upload-time = "2026-04-20T14:43:36.096Z" },
|
| 624 |
+
{ url = "https://files.pythonhosted.org/packages/52/eb/66faefabebfe68bd7788339c9c9127231e680b11906368c67ce112fdb47f/pydantic_core-2.46.3-cp313-cp313-win_arm64.whl", hash = "sha256:f64b5537ac62b231572879cd08ec05600308636a5d63bcbdb15063a466977bec", size = 2035802, upload-time = "2026-04-20T14:43:38.507Z" },
|
| 625 |
+
{ url = "https://files.pythonhosted.org/packages/7f/db/a7bcb4940183fda36022cd18ba8dd12f2dff40740ec7b58ce7457befa416/pydantic_core-2.46.3-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:afa3aa644f74e290cdede48a7b0bee37d1c35e71b05105f6b340d484af536d9b", size = 2097614, upload-time = "2026-04-20T14:44:38.374Z" },
|
| 626 |
+
{ url = "https://files.pythonhosted.org/packages/24/35/e4066358a22e3e99519db370494c7528f5a2aa1367370e80e27e20283543/pydantic_core-2.46.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ced3310e51aa425f7f77da8bbbb5212616655bedbe82c70944320bc1dbe5e018", size = 1951896, upload-time = "2026-04-20T14:40:53.996Z" },
|
| 627 |
+
{ url = "https://files.pythonhosted.org/packages/87/92/37cf4049d1636996e4b888c05a501f40a43ff218983a551d57f9d5e14f0d/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e29908922ce9da1a30b4da490bd1d3d82c01dcfdf864d2a74aacee674d0bfa34", size = 1979314, upload-time = "2026-04-20T14:41:49.446Z" },
|
| 628 |
+
{ url = "https://files.pythonhosted.org/packages/d8/36/9ff4d676dfbdfb2d591cf43f3d90ded01e15b1404fd101180ed2d62a2fd3/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0c9ff69140423eea8ed2d5477df3ba037f671f5e897d206d921bc9fdc39613e7", size = 2056133, upload-time = "2026-04-20T14:42:23.574Z" },
|
| 629 |
+
{ url = "https://files.pythonhosted.org/packages/bc/f0/405b442a4d7ba855b06eec8b2bf9c617d43b8432d099dfdc7bf999293495/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b675ab0a0d5b1c8fdb81195dc5bcefea3f3c240871cdd7ff9a2de8aa50772eb2", size = 2228726, upload-time = "2026-04-20T14:44:22.816Z" },
|
| 630 |
+
{ url = "https://files.pythonhosted.org/packages/e7/f8/65cd92dd5a0bd89ba277a98ecbfaf6fc36bbd3300973c7a4b826d6ab1391/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0087084960f209a9a4af50ecd1fb063d9ad3658c07bb81a7a53f452dacbfb2ba", size = 2301214, upload-time = "2026-04-20T14:44:48.792Z" },
|
| 631 |
+
{ url = "https://files.pythonhosted.org/packages/fd/86/ef96a4c6e79e7a2d0410826a68fbc0eccc0fd44aa733be199d5fcac3bb87/pydantic_core-2.46.3-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed42e6cc8e1b0e2b9b96e2276bad70ae625d10d6d524aed0c93de974ae029f9f", size = 2099927, upload-time = "2026-04-20T14:41:40.196Z" },
|
| 632 |
+
{ url = "https://files.pythonhosted.org/packages/6d/53/269caf30e0096e0a8a8f929d1982a27b3879872cca2d917d17c2f9fdf4fe/pydantic_core-2.46.3-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:f1771ce258afb3e4201e67d154edbbae712a76a6081079fe247c2f53c6322c22", size = 2128789, upload-time = "2026-04-20T14:41:15.868Z" },
|
| 633 |
+
{ url = "https://files.pythonhosted.org/packages/00/b0/1a6d9b6a587e118482910c244a1c5acf4d192604174132efd12bf0ac486f/pydantic_core-2.46.3-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a7610b6a5242a6c736d8ad47fd5fff87fcfe8f833b281b1c409c3d6835d9227f", size = 2173815, upload-time = "2026-04-20T14:44:25.152Z" },
|
| 634 |
+
{ url = "https://files.pythonhosted.org/packages/87/56/e7e00d4041a7e62b5a40815590114db3b535bf3ca0bf4dca9f16cef25246/pydantic_core-2.46.3-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:ff5e7783bcc5476e1db448bf268f11cb257b1c276d3e89f00b5727be86dd0127", size = 2181608, upload-time = "2026-04-20T14:41:28.933Z" },
|
| 635 |
+
{ url = "https://files.pythonhosted.org/packages/e8/22/4bd23c3d41f7c185d60808a1de83c76cf5aeabf792f6c636a55c3b1ec7f9/pydantic_core-2.46.3-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:9d2e32edcc143bc01e95300671915d9ca052d4f745aa0a49c48d4803f8a85f2c", size = 2326968, upload-time = "2026-04-20T14:42:03.962Z" },
|
| 636 |
+
{ url = "https://files.pythonhosted.org/packages/24/ac/66cd45129e3915e5ade3b292cb3bc7fd537f58f8f8dbdaba6170f7cabb74/pydantic_core-2.46.3-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:6e42d83d1c6b87fa56b521479cff237e626a292f3b31b6345c15a99121b454c1", size = 2369842, upload-time = "2026-04-20T14:41:35.52Z" },
|
| 637 |
+
{ url = "https://files.pythonhosted.org/packages/a2/51/dd4248abb84113615473aa20d5545b7c4cd73c8644003b5259686f93996c/pydantic_core-2.46.3-cp314-cp314-win32.whl", hash = "sha256:07bc6d2a28c3adb4f7c6ae46aa4f2d2929af127f587ed44057af50bf1ce0f505", size = 1959661, upload-time = "2026-04-20T14:41:00.042Z" },
|
| 638 |
+
{ url = "https://files.pythonhosted.org/packages/20/eb/59980e5f1ae54a3b86372bd9f0fa373ea2d402e8cdcd3459334430f91e91/pydantic_core-2.46.3-cp314-cp314-win_amd64.whl", hash = "sha256:8940562319bc621da30714617e6a7eaa6b98c84e8c685bcdc02d7ed5e7c7c44e", size = 2071686, upload-time = "2026-04-20T14:43:16.471Z" },
|
| 639 |
+
{ url = "https://files.pythonhosted.org/packages/8c/db/1cf77e5247047dfee34bc01fa9bca134854f528c8eb053e144298893d370/pydantic_core-2.46.3-cp314-cp314-win_arm64.whl", hash = "sha256:5dcbbcf4d22210ced8f837c96db941bdb078f419543472aca5d9a0bb7cddc7df", size = 2026907, upload-time = "2026-04-20T14:43:31.732Z" },
|
| 640 |
+
{ url = "https://files.pythonhosted.org/packages/57/c0/b3df9f6a543276eadba0a48487b082ca1f201745329d97dbfa287034a230/pydantic_core-2.46.3-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:d0fe3dce1e836e418f912c1ad91c73357d03e556a4d286f441bf34fed2dbeecf", size = 2095047, upload-time = "2026-04-20T14:42:37.982Z" },
|
| 641 |
+
{ url = "https://files.pythonhosted.org/packages/66/57/886a938073b97556c168fd99e1a7305bb363cd30a6d2c76086bf0587b32a/pydantic_core-2.46.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:9ce92e58abc722dac1bf835a6798a60b294e48eb0e625ec9fd994b932ac5feee", size = 1934329, upload-time = "2026-04-20T14:43:49.655Z" },
|
| 642 |
+
{ url = "https://files.pythonhosted.org/packages/0b/7c/b42eaa5c34b13b07ecb51da21761297a9b8eb43044c864a035999998f328/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a03e6467f0f5ab796a486146d1b887b2dc5e5f9b3288898c1b1c3ad974e53e4a", size = 1974847, upload-time = "2026-04-20T14:42:10.737Z" },
|
| 643 |
+
{ url = "https://files.pythonhosted.org/packages/e6/9b/92b42db6543e7de4f99ae977101a2967b63122d4b6cf7773812da2d7d5b5/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2798b6ba041b9d70acfb9071a2ea13c8456dd1e6a5555798e41ba7b0790e329c", size = 2041742, upload-time = "2026-04-20T14:40:44.262Z" },
|
| 644 |
+
{ url = "https://files.pythonhosted.org/packages/0f/19/46fbe1efabb5aa2834b43b9454e70f9a83ad9c338c1291e48bdc4fecf167/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9be3e221bdc6d69abf294dcf7aff6af19c31a5cdcc8f0aa3b14be29df4bd03b1", size = 2236235, upload-time = "2026-04-20T14:41:27.307Z" },
|
| 645 |
+
{ url = "https://files.pythonhosted.org/packages/77/da/b3f95bc009ad60ec53120f5d16c6faa8cabdbe8a20d83849a1f2b8728148/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f13936129ce841f2a5ddf6f126fea3c43cd128807b5a59588c37cf10178c2e64", size = 2282633, upload-time = "2026-04-20T14:44:33.271Z" },
|
| 646 |
+
{ url = "https://files.pythonhosted.org/packages/cc/6e/401336117722e28f32fb8220df676769d28ebdf08f2f4469646d404c43a3/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28b5f2ef03416facccb1c6ef744c69793175fd27e44ef15669201601cf423acb", size = 2109679, upload-time = "2026-04-20T14:44:41.065Z" },
|
| 647 |
+
{ url = "https://files.pythonhosted.org/packages/fc/53/b289f9bc8756a32fe718c46f55afaeaf8d489ee18d1a1e7be1db73f42cc4/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:830d1247d77ad23852314f069e9d7ddafeec5f684baf9d7e7065ed46a049c4e6", size = 2108342, upload-time = "2026-04-20T14:42:50.144Z" },
|
| 648 |
+
{ url = "https://files.pythonhosted.org/packages/10/5b/8292fc7c1f9111f1b2b7c1b0dcf1179edcd014fc3ea4517499f50b829d71/pydantic_core-2.46.3-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0793c90c1a3c74966e7975eaef3ed30ebdff3260a0f815a62a22adc17e4c01c", size = 2157208, upload-time = "2026-04-20T14:42:08.133Z" },
|
| 649 |
+
{ url = "https://files.pythonhosted.org/packages/2b/9e/f80044e9ec07580f057a89fc131f78dda7a58751ddf52bbe05eaf31db50f/pydantic_core-2.46.3-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:d2d0aead851b66f5245ec0c4fb2612ef457f8bbafefdf65a2bf9d6bac6140f47", size = 2167237, upload-time = "2026-04-20T14:42:25.412Z" },
|
| 650 |
+
{ url = "https://files.pythonhosted.org/packages/f8/84/6781a1b037f3b96be9227edbd1101f6d3946746056231bf4ac48cdff1a8d/pydantic_core-2.46.3-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:2f40e4246676beb31c5ce77c38a55ca4e465c6b38d11ea1bd935420568e0b1ab", size = 2312540, upload-time = "2026-04-20T14:40:40.313Z" },
|
| 651 |
+
{ url = "https://files.pythonhosted.org/packages/3e/db/19c0839feeb728e7df03255581f198dfdf1c2aeb1e174a8420b63c5252e5/pydantic_core-2.46.3-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:cf489cf8986c543939aeee17a09c04d6ffb43bfef8ca16fcbcc5cfdcbed24dba", size = 2369556, upload-time = "2026-04-20T14:41:09.427Z" },
|
| 652 |
+
{ url = "https://files.pythonhosted.org/packages/e0/15/3228774cb7cd45f5f721ddf1b2242747f4eb834d0c491f0c02d606f09fed/pydantic_core-2.46.3-cp314-cp314t-win32.whl", hash = "sha256:ffe0883b56cfc05798bf994164d2b2ff03efe2d22022a2bb080f3b626176dd56", size = 1949756, upload-time = "2026-04-20T14:41:25.717Z" },
|
| 653 |
+
{ url = "https://files.pythonhosted.org/packages/b8/2a/c79cf53fd91e5a87e30d481809f52f9a60dd221e39de66455cf04deaad37/pydantic_core-2.46.3-cp314-cp314t-win_amd64.whl", hash = "sha256:706d9d0ce9cf4593d07270d8e9f53b161f90c57d315aeec4fb4fd7a8b10240d8", size = 2051305, upload-time = "2026-04-20T14:43:18.627Z" },
|
| 654 |
+
{ url = "https://files.pythonhosted.org/packages/0b/db/d8182a7f1d9343a032265aae186eb063fe26ca4c40f256b21e8da4498e89/pydantic_core-2.46.3-cp314-cp314t-win_arm64.whl", hash = "sha256:77706aeb41df6a76568434701e0917da10692da28cb69d5fb6919ce5fdb07374", size = 2026310, upload-time = "2026-04-20T14:41:01.778Z" },
|
| 655 |
+
{ url = "https://files.pythonhosted.org/packages/66/7f/03dbad45cd3aa9083fbc93c210ae8b005af67e4136a14186950a747c6874/pydantic_core-2.46.3-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:9715525891ed524a0a1eb6d053c74d4d4ad5017677fb00af0b7c2644a31bae46", size = 2105683, upload-time = "2026-04-20T14:42:19.779Z" },
|
| 656 |
+
{ url = "https://files.pythonhosted.org/packages/26/22/4dc186ac8ea6b257e9855031f51b62a9637beac4d68ac06bee02f046f836/pydantic_core-2.46.3-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:9d2f400712a99a013aff420ef1eb9be077f8189a36c1e3ef87660b4e1088a874", size = 1940052, upload-time = "2026-04-20T14:43:59.274Z" },
|
| 657 |
+
{ url = "https://files.pythonhosted.org/packages/0d/ca/d376391a5aff1f2e8188960d7873543608130a870961c2b6b5236627c116/pydantic_core-2.46.3-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd2aab0e2e9dc2daf36bd2686c982535d5e7b1d930a1344a7bb6e82baab42a76", size = 1988172, upload-time = "2026-04-20T14:41:17.469Z" },
|
| 658 |
+
{ url = "https://files.pythonhosted.org/packages/0e/6b/523b9f85c23788755d6ab949329de692a2e3a584bc6beb67fef5e035aa9d/pydantic_core-2.46.3-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e9d76736da5f362fabfeea6a69b13b7f2be405c6d6966f06b2f6bfff7e64531", size = 2128596, upload-time = "2026-04-20T14:40:41.707Z" },
|
| 659 |
+
{ url = "https://files.pythonhosted.org/packages/34/42/f426db557e8ab2791bc7562052299944a118655496fbff99914e564c0a94/pydantic_core-2.46.3-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:b12dd51f1187c2eb489af8e20f880362db98e954b54ab792fa5d92e8bcc6b803", size = 2091877, upload-time = "2026-04-20T14:43:27.091Z" },
|
| 660 |
+
{ url = "https://files.pythonhosted.org/packages/5c/4f/86a832a9d14df58e663bfdf4627dc00d3317c2bd583c4fb23390b0f04b8e/pydantic_core-2.46.3-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:f00a0961b125f1a47af7bcc17f00782e12f4cd056f83416006b30111d941dfa3", size = 1932428, upload-time = "2026-04-20T14:40:45.781Z" },
|
| 661 |
+
{ url = "https://files.pythonhosted.org/packages/11/1a/fe857968954d93fb78e0d4b6df5c988c74c4aaa67181c60be7cfe327c0ca/pydantic_core-2.46.3-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57697d7c056aca4bbb680200f96563e841a6386ac1129370a0102592f4dddff5", size = 1997550, upload-time = "2026-04-20T14:44:02.425Z" },
|
| 662 |
+
{ url = "https://files.pythonhosted.org/packages/17/eb/9d89ad2d9b0ba8cd65393d434471621b98912abb10fbe1df08e480ba57b5/pydantic_core-2.46.3-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd35aa21299def8db7ef4fe5c4ff862941a9a158ca7b63d61e66fe67d30416b4", size = 2137657, upload-time = "2026-04-20T14:42:45.149Z" },
|
| 663 |
+
{ url = "https://files.pythonhosted.org/packages/1f/da/99d40830684f81dec901cac521b5b91c095394cc1084b9433393cde1c2df/pydantic_core-2.46.3-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:13afdd885f3d71280cf286b13b310ee0f7ccfefd1dbbb661514a474b726e2f25", size = 2107973, upload-time = "2026-04-20T14:42:06.175Z" },
|
| 664 |
+
{ url = "https://files.pythonhosted.org/packages/99/a5/87024121818d75bbb2a98ddbaf638e40e7a18b5e0f5492c9ca4b1b316107/pydantic_core-2.46.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f91c0aff3e3ee0928edd1232c57f643a7a003e6edf1860bc3afcdc749cb513f3", size = 1947191, upload-time = "2026-04-20T14:43:14.319Z" },
|
| 665 |
+
{ url = "https://files.pythonhosted.org/packages/60/62/0c1acfe10945b83a6a59d19fbaa92f48825381509e5701b855c08f13db76/pydantic_core-2.46.3-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6529d1d128321a58d30afcc97b49e98836542f68dd41b33c2e972bb9e5290536", size = 2123791, upload-time = "2026-04-20T14:43:22.766Z" },
|
| 666 |
+
{ url = "https://files.pythonhosted.org/packages/75/3e/3b2393b4c8f44285561dc30b00cf307a56a2eff7c483a824db3b8221ca51/pydantic_core-2.46.3-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:975c267cff4f7e7272eacbe50f6cc03ca9a3da4c4fbd66fffd89c94c1e311aa1", size = 2153197, upload-time = "2026-04-20T14:44:27.932Z" },
|
| 667 |
+
{ url = "https://files.pythonhosted.org/packages/ba/75/5af02fb35505051eee727c061f2881c555ab4f8ddb2d42da715a42c9731b/pydantic_core-2.46.3-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:2b8e4f2bbdf71415c544b4b1138b8060db7b6611bc927e8064c769f64bed651c", size = 2181073, upload-time = "2026-04-20T14:43:20.729Z" },
|
| 668 |
+
{ url = "https://files.pythonhosted.org/packages/10/92/7e0e1bd9ca3c68305db037560ca2876f89b2647deb2f8b6319005de37505/pydantic_core-2.46.3-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:e61ea8e9fff9606d09178f577ff8ccdd7206ff73d6552bcec18e1033c4254b85", size = 2315886, upload-time = "2026-04-20T14:44:04.826Z" },
|
| 669 |
+
{ url = "https://files.pythonhosted.org/packages/b8/d8/101655f27eaf3e44558ead736b2795d12500598beed4683f279396fa186e/pydantic_core-2.46.3-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b504bda01bafc69b6d3c7a0c7f039dcf60f47fab70e06fe23f57b5c75bdc82b8", size = 2360528, upload-time = "2026-04-20T14:40:47.431Z" },
|
| 670 |
+
{ url = "https://files.pythonhosted.org/packages/07/0f/1c34a74c8d07136f0d729ffe5e1fdab04fbdaa7684f61a92f92511a84a15/pydantic_core-2.46.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b00b76f7142fc60c762ce579bd29c8fa44aaa56592dd3c54fab3928d0d4ca6ff", size = 2184144, upload-time = "2026-04-20T14:42:57Z" },
|
| 671 |
]
|
| 672 |
|
| 673 |
[[package]]
|
|
|
|
| 786 |
|
| 787 |
[[package]]
|
| 788 |
name = "restrictedpython"
|
| 789 |
+
version = "7.0"
|
| 790 |
source = { registry = "https://pypi.org/simple" }
|
| 791 |
+
sdist = { url = "https://files.pythonhosted.org/packages/ce/7c/19254deb8d2e1a0eea74fe92c3dbd250b400aa853e027de6734fce7ea143/RestrictedPython-7.0.tar.gz", hash = "sha256:53704afbbc350fdc8fb245441367be671c9f8380869201b2e8452e74fce3db14", size = 447152, upload-time = "2023-11-17T07:19:15.173Z" }
|
| 792 |
wheels = [
|
| 793 |
+
{ url = "https://files.pythonhosted.org/packages/5b/85/f40474f97f71e4b7745641635157870f232ce9b7614814d7ce8b82586cb6/RestrictedPython-7.0-py3-none-any.whl", hash = "sha256:8bb40a822090bed9c7b814d69345b0796db70cc86715d141efc937862f37c6d2", size = 26693, upload-time = "2023-11-17T07:19:12.674Z" },
|
| 794 |
]
|
| 795 |
|
| 796 |
[[package]]
|
|
|
|
| 935 |
{ url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
|
| 936 |
]
|
| 937 |
|
| 938 |
+
[[package]]
|
| 939 |
+
name = "typing-inspection"
|
| 940 |
+
version = "0.4.2"
|
| 941 |
+
source = { registry = "https://pypi.org/simple" }
|
| 942 |
+
dependencies = [
|
| 943 |
+
{ name = "typing-extensions" },
|
| 944 |
+
]
|
| 945 |
+
sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" }
|
| 946 |
+
wheels = [
|
| 947 |
+
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
|
| 948 |
+
]
|
| 949 |
+
|
| 950 |
[[package]]
|
| 951 |
name = "urllib3"
|
| 952 |
version = "2.6.3"
|
validator.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
AgentDebuggerEnv β Pre-Submission Validator
|
| 4 |
+
============================================
|
| 5 |
+
Checks for all hard requirements of the Meta + HF Hackathon:
|
| 6 |
+
- Mandatory Environment Variables
|
| 7 |
+
- OpenEnv Spec Compliance (health, reset, step, state)
|
| 8 |
+
- Inference Script Format & Logging
|
| 9 |
+
- Dockerfile Correctness
|
| 10 |
+
- openenv.yaml Presence
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
import json
|
| 16 |
+
import requests
|
| 17 |
+
import yaml
|
| 18 |
+
import re
|
| 19 |
+
|
| 20 |
+
# ── Configuration ────────────────────────────────────────────────────────────
# Base URL of the locally running OpenEnv server; override for remote servers.
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
# Mandatory per the hackathon rules — deliberately no defaults, so their
# absence is detectable by check_env_vars().
API_BASE_URL = os.environ.get("API_BASE_URL")
MODEL_NAME = os.environ.get("MODEL_NAME")
# HF_TOKEN is preferred; OPENAI_API_KEY is accepted as a fallback credential.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("OPENAI_API_KEY")
|
| 25 |
+
|
| 26 |
+
class bcolors:
    """ANSI escape codes for colored terminal output.

    Attribute values are standard SGR sequences; ENDC resets all styling.
    """
    HEADER = '\033[95m'     # bright magenta
    OKBLUE = '\033[94m'     # bright blue
    OKCYAN = '\033[96m'     # bright cyan
    OKGREEN = '\033[92m'    # bright green
    WARNING = '\033[93m'    # bright yellow
    FAIL = '\033[91m'       # bright red
    ENDC = '\033[0m'        # reset all attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
|
| 36 |
+
|
| 37 |
+
def _fmt(color, symbol, msg):
    # Shared formatter for the three logging helpers below.
    return f"{color}{symbol} {msg}{bcolors.ENDC}"


def log_success(msg):
    """Print *msg* in green with a success marker."""
    print(_fmt(bcolors.OKGREEN, "β", msg))


def log_fail(msg):
    """Print *msg* in red with a failure marker."""
    print(_fmt(bcolors.FAIL, "β", msg))


def log_info(msg):
    """Print *msg* in blue with an info marker."""
    print(_fmt(bcolors.OKBLUE, "βΉ", msg))
|
| 40 |
+
|
| 41 |
+
def check_env_vars():
    """Verify the mandatory hackathon environment variables are set.

    Checks API_BASE_URL, MODEL_NAME and HF_TOKEN (captured at import time).
    Returns True when all are present; otherwise logs the missing names
    and returns False.
    """
    log_info("Checking Mandatory Environment Variables...")
    required = {
        "API_BASE_URL": API_BASE_URL,
        "MODEL_NAME": MODEL_NAME,
        "HF_TOKEN": HF_TOKEN,
    }
    missing = [name for name, value in required.items() if not value]

    if missing:
        log_fail(f"Missing env vars: {', '.join(missing)}")
        return False
    log_success("All mandatory env vars detected.")
    return True
|
| 53 |
+
|
| 54 |
+
def check_yaml():
    """Validate that openenv.yaml exists and declares every required field.

    Returns True only when the file is present, parses as YAML, and
    contains name, version, tasks, baseline and inference_script.
    """
    log_info("Checking openenv.yaml...")
    if not os.path.exists("openenv.yaml"):
        log_fail("openenv.yaml not found in root!")
        return False

    try:
        with open("openenv.yaml", 'r') as f:
            data = yaml.safe_load(f)
        # Every spec-mandated top-level key must be present.
        for field in ("name", "version", "tasks", "baseline", "inference_script"):
            if field not in data:
                log_fail(f"openenv.yaml missing required field: {field}")
                return False
        log_success("openenv.yaml is valid.")
    except Exception as e:
        # Covers unreadable files, YAML syntax errors, and a None/非-mapping
        # document (the `in` test raises TypeError on None).
        log_fail(f"Could not parse openenv.yaml: {e}")
        return False
    return True
|
| 73 |
+
|
| 74 |
+
def check_endpoints():
    """Probe the running environment server for OpenEnv spec endpoints.

    Issues GET /health and POST /reset against ENV_BASE_URL; returns True
    only when both respond with HTTP 200. Connection failures are logged
    and reported as False rather than raised.
    """
    log_info(f"Checking Endpoints at {ENV_BASE_URL}...")

    def _probe(label, request):
        # Run one HTTP call; translate outcome (success/status/exception)
        # into a bool, logging exactly as the per-endpoint checks did.
        try:
            resp = request()
        except Exception as e:
            log_fail(f"Could not connect to {label}: {e}")
            return False
        if resp.status_code == 200:
            log_success(f"{label} -> 200 OK")
            return True
        log_fail(f"{label} -> {resp.status_code}")
        return False

    if not _probe("/health", lambda: requests.get(f"{ENV_BASE_URL}/health", timeout=5)):
        return False
    if not _probe(
        "/reset",
        lambda: requests.post(f"{ENV_BASE_URL}/reset", json={"task_id": "easy"}, timeout=5),
    ):
        return False

    return True
|
| 102 |
+
|
| 103 |
+
def check_inference_script():
    """Check inference.py for the required log format and OpenAI client usage.

    Returns True when the file exists, contains [START]/[STEP]/[END] log
    lines in the mandated format, and references the OpenAI client API.
    """
    log_info("Checking inference.py...")
    if not os.path.exists("inference.py"):
        log_fail("inference.py not found in root!")
        return False

    with open("inference.py", 'r') as f:
        content = f.read()

    # Each mandated log tag must appear with its exact format.
    required_tags = (
        ("[START]", r"\[START\] task="),
        ("[STEP]", r"\[STEP .+\] Action:"),
        ("[END]", r"\[END\] task=.* score=.* steps="),
    )
    for label, pattern in required_tags:
        if re.search(pattern, content) is None:
            log_fail(f"inference.py missing log tag/format: {label}")
            return False

    uses_client = "OpenAI" in content and "client.chat.completions.create" in content
    if not uses_client:
        log_fail("inference.py does not appear to use the OpenAI client library.")
        return False

    log_success("inference.py logging and client usage look correct.")
    return True
|
| 130 |
+
|
| 131 |
+
def main():
    """Run every compliance check and exit non-zero when any fails.

    Env-var, yaml and inference-script checks all run (no short-circuit)
    so every problem is reported in one pass. The endpoint check is
    best-effort: an unreachable local server is logged but does not fail
    validation.
    """
    print(f"{bcolors.HEADER}{bcolors.BOLD}AgentDebuggerEnv Compliance Validator{bcolors.ENDC}")
    print("=" * 45)

    # Evaluate all checks eagerly so each one logs its own result.
    results = [check_env_vars(), check_yaml(), check_inference_script()]
    success = all(results)

    # Endpoints check is optional if server isn't running locally.
    try:
        if not check_endpoints():
            log_info("Skipping further endpoint checks as server is unreachable.")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently swallowed.
        pass

    print("=" * 45)
    if success:
        print(f"{bcolors.OKGREEN}{bcolors.BOLD}VALIDATION PASSED! Ready for submission.{bcolors.ENDC}")
    else:
        print(f"{bcolors.FAIL}{bcolors.BOLD}VALIDATION FAILED. Please fix the errors above.{bcolors.ENDC}")
        sys.exit(1)
|
| 153 |
+
|
| 154 |
+
# Script entry point: run the validator only when executed directly.
if __name__ == "__main__":
    main()
|