arnavzz Claude Sonnet 4.6 committed on
Commit
c14504c
·
0 Parent(s):

feat: initial OpenEnv code debugging environment

Browse files

Implements a complete OpenEnv-compatible environment for the Meta x PyTorch
Hackathon where an AI agent debugs broken Python code across 6 tasks
(easy/medium/hard) with reward = fraction of tests passing (0.0–1.0).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ .env
5
+ .venv/
6
+ venv/
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ .pytest_cache/
README.md ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code Debug OpenEnv
2
+
3
+ An **OpenEnv-compatible environment** for the Meta x PyTorch Hackathon where an AI agent debugs broken Python code.
4
+
5
+ ## Overview
6
+
7
+ The agent receives buggy Python code and test descriptions, submits fixes, and is rewarded by the fraction of tests passing (0.0–1.0). The episode ends when all tests pass or the step limit is reached.
8
+
9
+ ## Tasks
10
+
11
+ | Task | Difficulty | Bug Type |
12
+ |------|-----------|----------|
13
+ | task_001_off_by_one | Easy | Fibonacci returns wrong variable |
14
+ | task_002_wrong_operator | Easy | `<` instead of `>` in find_max |
15
+ | task_003_mutable_default | Medium | Mutable default argument in list builder |
16
+ | task_004_scope_bug | Medium | Closure captures loop variable by reference |
17
+ | task_005_binary_search | Hard | Binary search boundary bugs |
18
+ | task_006_graph_cycle | Hard | DFS cycle detection missing recursion stack |
19
+
20
+ ## API Endpoints
21
+
22
+ | Method | Path | Description |
23
+ |--------|------|-------------|
24
+ | GET | `/health` | Health check |
25
+ | GET | `/tasks` | List all available tasks |
26
+ | POST | `/reset` | Start a new episode |
27
+ | POST | `/step/{episode_id}` | Submit fixed code |
28
+ | GET | `/state/{episode_id}` | Get episode metadata |
29
+
30
+ ## Reward
31
+
32
+ ```
33
+ reward = tests_passed / total_tests # range: 0.0 – 1.0
34
+ done = reward == 1.0 OR step_count >= max_steps
35
+ ```
36
+
37
+ ## Setup & Run
38
+
39
+ ### Local (development)
40
+
41
+ ```bash
42
+ pip install fastapi uvicorn pydantic httpx openai
43
+ cd Desktop/Meta
44
+ uvicorn code_debug_env.server.app:app --host 0.0.0.0 --port 7860 --reload
45
+ ```
46
+
47
+ ### Docker
48
+
49
+ ```bash
50
+ cd Desktop/Meta/code_debug_env
51
+ docker build -t code-debug-env -f server/Dockerfile ..
52
+ docker run -p 7860:7860 code-debug-env
53
+ ```
54
+
55
+ ## Inference Script
56
+
57
+ ```bash
58
+ export API_BASE_URL="https://router.huggingface.co/v1"
59
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
60
+ export HF_TOKEN="your_token"
61
+ export ENV_URL="http://localhost:7860" # or HF Space URL
62
+
63
+ python inference.py
64
+ ```
65
+
66
+ ### Expected output format
67
+
68
+ ```
69
+ [START] task=task_001_off_by_one env=http://localhost:7860 model=Qwen/Qwen2.5-72B-Instruct
70
+ [STEP] step=1 action='def fib...' reward=1.00 done=true error=null
71
+ [END] success=true steps=1 score=1.000 rewards=1.00
72
+ ```
73
+
74
+ ## Environment Variables
75
+
76
+ | Variable | Required | Description |
77
+ |----------|----------|-------------|
78
+ | `API_BASE_URL` | Yes | LLM API endpoint |
79
+ | `MODEL_NAME` | Yes | Model identifier |
80
+ | `HF_TOKEN` | Yes | Hugging Face / API key |
81
+ | `ENV_URL` | No | OpenEnv server URL (default: http://localhost:7860) |
82
+
83
+ ## Project Structure
84
+
85
+ ```
86
+ code_debug_env/
87
+ ├── models.py # Pydantic models (DebugAction, DebugObservation, DebugState)
88
+ ├── client.py # HTTP client wrapper
89
+ ├── openenv.yaml # Environment manifest
90
+ ├── pyproject.toml # Package metadata
91
+ ├── tasks/ # Task definitions (JSON)
92
+ │ ├── easy/
93
+ │ ├── medium/
94
+ │ └── hard/
95
+ └── server/
96
+ ├── environment.py # Core logic (reset/step/state)
97
+ ├── executor.py # Safe subprocess code runner
98
+ ├── app.py # FastAPI server
99
+ └── Dockerfile
100
+ inference.py # Root-level inference script
101
+ ```
code_debug_env/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .models import DebugAction, DebugObservation, DebugState, TestResult
2
+ from .client import CodeDebugClient
3
+
4
+ __all__ = ["DebugAction", "DebugObservation", "DebugState", "TestResult", "CodeDebugClient"]
code_debug_env/client.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple HTTP client for the Code Debug OpenEnv server.
3
+ """
4
+
5
+ from typing import Optional
6
+ import httpx
7
+
8
+
9
+ class CodeDebugClient:
10
+ def __init__(self, base_url: str = "http://localhost:7860", timeout: float = 30.0):
11
+ self.base_url = base_url.rstrip("/")
12
+ self._http = httpx.Client(base_url=self.base_url, timeout=timeout)
13
+ self.episode_id: Optional[str] = None
14
+
15
+ def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> dict:
16
+ payload: dict = {}
17
+ if task_id is not None:
18
+ payload["task_id"] = task_id
19
+ if seed is not None:
20
+ payload["seed"] = seed
21
+ resp = self._http.post("/reset", json=payload)
22
+ resp.raise_for_status()
23
+ data = resp.json()
24
+ self.episode_id = data["episode_id"]
25
+ return data
26
+
27
+ def step(self, code: str) -> dict:
28
+ if self.episode_id is None:
29
+ raise RuntimeError("No active episode. Call reset() first.")
30
+ resp = self._http.post(
31
+ f"/step/{self.episode_id}",
32
+ json={"action": {"code": code}},
33
+ )
34
+ resp.raise_for_status()
35
+ return resp.json()
36
+
37
+ def state(self) -> dict:
38
+ if self.episode_id is None:
39
+ raise RuntimeError("No active episode. Call reset() first.")
40
+ resp = self._http.get(f"/state/{self.episode_id}")
41
+ resp.raise_for_status()
42
+ return resp.json()
43
+
44
+ def list_tasks(self) -> list[dict]:
45
+ resp = self._http.get("/tasks")
46
+ resp.raise_for_status()
47
+ return resp.json()
48
+
49
+ def health(self) -> dict:
50
+ resp = self._http.get("/health")
51
+ resp.raise_for_status()
52
+ return resp.json()
53
+
54
+ def close(self):
55
+ self._http.close()
56
+
57
+ def __enter__(self):
58
+ return self
59
+
60
+ def __exit__(self, *args):
61
+ self.close()
code_debug_env/models.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from pydantic import BaseModel, Field
from typing import Optional
from enum import Enum


class Difficulty(str, Enum):
    """Task difficulty bucket; values mirror the tasks/<difficulty>/ folders."""
    easy = "easy"
    medium = "medium"
    hard = "hard"


class DebugAction(BaseModel):
    """Agent submits fixed code."""
    code: str = Field(..., description="The corrected Python code")


class TestResult(BaseModel):
    """Result of a single test case execution."""
    test_name: str
    passed: bool
    # expected/actual hold stringified values; error carries exception text.
    # All default to "" when not applicable.
    expected: str = ""
    actual: str = ""
    error: str = ""


class DebugObservation(BaseModel):
    """What the agent sees after reset or step."""
    task_id: str
    difficulty: Difficulty
    description: str
    buggy_code: str
    test_descriptions: list[str]
    # Populated only after a step; empty right after reset.
    test_results: list[TestResult] = Field(default_factory=list)
    stdout: str = ""
    stderr: str = ""
    step_count: int = 0
    max_steps: int = 5
    # Reward for the most recent submission: tests_passed / total_tests.
    reward: float = 0.0
    done: bool = False
    total_tests: int = 0
    tests_passed: int = 0


class DebugState(BaseModel):
    """Episode metadata exposed via /state."""
    episode_id: str
    task_id: str
    difficulty: Difficulty
    step_count: int = 0
    max_steps: int = 5
    last_reward: float = 0.0
    # Sum of per-step rewards over the episode so far.
    cumulative_reward: float = 0.0
    tests_passed: int = 0
    total_tests: int = 0
    done: bool = False


# --- API request / response models ---

class ResetRequest(BaseModel):
    # Both fields optional: omitting task_id selects a random task.
    task_id: Optional[str] = None
    seed: Optional[int] = None


class ResetResponse(BaseModel):
    episode_id: str
    observation: DebugObservation


class StepRequest(BaseModel):
    action: DebugAction


class StepResponse(BaseModel):
    observation: DebugObservation
    reward: float
    done: bool
    info: dict = Field(default_factory=dict)
code_debug_env/openenv.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: code-debug-env
2
+ version: "1.0.0"
3
+ description: "An OpenEnv environment where an AI agent debugs broken Python code. The agent receives buggy code and test descriptions, submits fixes, and is rewarded by the fraction of tests passing (0.0–1.0)."
4
+ type: space
5
+ runtime: fastapi
6
+ app: "code_debug_env.server.app:app"
7
+ port: 7860
8
+
9
+ tasks:
10
+ - id: task_001_off_by_one
11
+ difficulty: easy
12
+ max_steps: 5
13
+ - id: task_002_wrong_operator
14
+ difficulty: easy
15
+ max_steps: 5
16
+ - id: task_003_mutable_default
17
+ difficulty: medium
18
+ max_steps: 7
19
+ - id: task_004_scope_bug
20
+ difficulty: medium
21
+ max_steps: 7
22
+ - id: task_005_binary_search
23
+ difficulty: hard
24
+ max_steps: 10
25
+ - id: task_006_graph_cycle
26
+ difficulty: hard
27
+ max_steps: 10
28
+
29
+ reward:
30
+ type: deterministic
31
+ range: [0.0, 1.0]
32
+ formula: "tests_passed / total_tests"
33
+
34
+ constraints:
35
+ runtime_limit: "20 minutes"
36
+ vcpu: 2
37
+ memory: "8GB"
code_debug_env/pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "code-debug-env"
7
+ version = "1.0.0"
8
+ description = "OpenEnv environment for AI-driven Python code debugging"
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "fastapi>=0.109.0",
12
+ "uvicorn[standard]>=0.27.0",
13
+ "pydantic>=2.6.1",
14
+ "httpx>=0.27.0",
15
+ "openai>=1.0.0",
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ dev = ["pytest>=8.0.0"]
20
+
21
+ [project.scripts]
22
+ server = "code_debug_env.server.app:main"
23
+
24
+ [tool.setuptools.packages.find]
25
+ where = ["."]
26
+ include = ["code_debug_env*"]
code_debug_env/server/Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# curl is needed only by the HEALTHCHECK probe below.
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*

# Install dependencies in their own layer so Docker caching survives code edits.
# NOTE(review): the path assumes the build context is the package directory
# (see README: `docker build -f server/Dockerfile ..`) — confirm on change.
COPY server/requirements.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the source after dependencies are installed.
COPY . .

# Make the package importable and keep logs unbuffered for Space log streaming.
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1
ENV PORT=7860

# 7860 is the Hugging Face Spaces default port.
EXPOSE 7860

HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

CMD ["uvicorn", "code_debug_env.server.app:app", "--host", "0.0.0.0", "--port", "7860"]
code_debug_env/server/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .environment import CodeDebugEnvironment
2
+
3
+ __all__ = ["CodeDebugEnvironment"]
code_debug_env/server/app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI server exposing the OpenEnv-compatible HTTP API.
3
+ Port: 7860 (Hugging Face Spaces default)
4
+ """
5
+
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.responses import JSONResponse
8
+
9
+ from ..models import (
10
+ DebugState,
11
+ ResetRequest,
12
+ ResetResponse,
13
+ StepRequest,
14
+ StepResponse,
15
+ )
16
+ from .environment import CodeDebugEnvironment
17
+
18
+ app = FastAPI(
19
+ title="Code Debug OpenEnv",
20
+ description="An OpenEnv environment where an AI agent debugs broken Python code.",
21
+ version="1.0.0",
22
+ )
23
+
24
+ env = CodeDebugEnvironment()
25
+
26
+
27
+ # ------------------------------------------------------------------
28
+ # Health & metadata
29
+ # ------------------------------------------------------------------
30
+
31
+ @app.get("/health")
32
+ async def health():
33
+ return {"status": "healthy", "tasks_loaded": len(env.tasks)}
34
+
35
+
36
+ @app.get("/tasks")
37
+ async def list_tasks():
38
+ return env.list_tasks()
39
+
40
+
41
+ # ------------------------------------------------------------------
42
+ # OpenEnv core endpoints
43
+ # ------------------------------------------------------------------
44
+
45
+ @app.post("/reset", response_model=ResetResponse)
46
+ async def reset(req: ResetRequest = None):
47
+ if req is None:
48
+ req = ResetRequest()
49
+ try:
50
+ result = env.reset(task_id=req.task_id, seed=req.seed)
51
+ return result
52
+ except KeyError as e:
53
+ raise HTTPException(status_code=404, detail=str(e))
54
+
55
+
56
+ @app.post("/step/{episode_id}", response_model=StepResponse)
57
+ async def step(episode_id: str, req: StepRequest):
58
+ try:
59
+ result = env.step(episode_id, req.action.model_dump())
60
+ return result
61
+ except KeyError as e:
62
+ raise HTTPException(status_code=404, detail=str(e))
63
+ except ValueError as e:
64
+ raise HTTPException(status_code=400, detail=str(e))
65
+
66
+
67
+ @app.get("/state/{episode_id}", response_model=DebugState)
68
+ async def state(episode_id: str):
69
+ try:
70
+ return env.state(episode_id)
71
+ except KeyError as e:
72
+ raise HTTPException(status_code=404, detail=str(e))
73
+
74
+
75
+ # ------------------------------------------------------------------
76
+ # Entry point for local dev
77
+ # ------------------------------------------------------------------
78
+
79
+ if __name__ == "__main__":
80
+ import uvicorn
81
+ uvicorn.run("code_debug_env.server.app:app", host="0.0.0.0", port=7860, reload=True)
code_debug_env/server/environment.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core environment logic: task loading, reset, step, state.
3
+ """
4
+
5
+ import json
6
+ import random
7
+ import uuid
8
+ from pathlib import Path
9
+
10
+ from .executor import run_code_safely
11
+
12
+
13
+ class CodeDebugEnvironment:
14
+ def __init__(self):
15
+ self.tasks: dict[str, dict] = {}
16
+ self.episodes: dict[str, dict] = {}
17
+ self._load_tasks()
18
+
19
+ def _load_tasks(self):
20
+ tasks_dir = Path(__file__).parent.parent / "tasks"
21
+ for json_file in sorted(tasks_dir.rglob("*.json")):
22
+ with open(json_file, encoding="utf-8") as f:
23
+ task = json.load(f)
24
+ self.tasks[task["task_id"]] = task
25
+
26
+ # ------------------------------------------------------------------
27
+ # Public API
28
+ # ------------------------------------------------------------------
29
+
30
+ def reset(self, task_id: str | None = None, seed: int | None = None) -> dict:
31
+ if seed is not None:
32
+ random.seed(seed)
33
+
34
+ if task_id is None:
35
+ task_id = random.choice(list(self.tasks.keys()))
36
+
37
+ if task_id not in self.tasks:
38
+ raise KeyError(f"Unknown task_id: {task_id!r}")
39
+
40
+ task = self.tasks[task_id]
41
+ episode_id = str(uuid.uuid4())
42
+
43
+ self.episodes[episode_id] = {
44
+ "episode_id": episode_id,
45
+ "task": task,
46
+ "step_count": 0,
47
+ "done": False,
48
+ "rewards": [],
49
+ "last_test_results": [],
50
+ }
51
+
52
+ observation = self._initial_observation(task)
53
+ return {"episode_id": episode_id, "observation": observation}
54
+
55
+ def step(self, episode_id: str, action: dict) -> dict:
56
+ if episode_id not in self.episodes:
57
+ raise KeyError(f"Unknown episode_id: {episode_id!r}")
58
+
59
+ ep = self.episodes[episode_id]
60
+ if ep["done"]:
61
+ raise ValueError("Episode is already finished. Call reset() to start a new episode.")
62
+
63
+ task = ep["task"]
64
+ submitted_code = action.get("code", "")
65
+ ep["step_count"] += 1
66
+
67
+ test_results_raw, stdout, stderr = run_code_safely(
68
+ submitted_code,
69
+ task["test_code"],
70
+ timeout=10,
71
+ )
72
+
73
+ tests_passed = sum(1 for t in test_results_raw if t.get("passed", False))
74
+ total_tests = len(test_results_raw)
75
+ reward = round(tests_passed / total_tests, 4) if total_tests > 0 else 0.0
76
+
77
+ max_steps = task.get("max_steps", 5)
78
+ done = reward == 1.0 or ep["step_count"] >= max_steps
79
+
80
+ ep["done"] = done
81
+ ep["rewards"].append(reward)
82
+ ep["last_test_results"] = test_results_raw
83
+
84
+ observation = {
85
+ "task_id": task["task_id"],
86
+ "difficulty": task["difficulty"],
87
+ "description": task["description"],
88
+ "buggy_code": task["buggy_code"],
89
+ "test_descriptions": task["test_descriptions"],
90
+ "test_results": test_results_raw,
91
+ "stdout": stdout,
92
+ "stderr": stderr,
93
+ "step_count": ep["step_count"],
94
+ "max_steps": max_steps,
95
+ "reward": reward,
96
+ "done": done,
97
+ "total_tests": total_tests,
98
+ "tests_passed": tests_passed,
99
+ }
100
+
101
+ return {"observation": observation, "reward": reward, "done": done, "info": {}}
102
+
103
+ def state(self, episode_id: str) -> dict:
104
+ if episode_id not in self.episodes:
105
+ raise KeyError(f"Unknown episode_id: {episode_id!r}")
106
+
107
+ ep = self.episodes[episode_id]
108
+ task = ep["task"]
109
+ last_results = ep.get("last_test_results", [])
110
+
111
+ return {
112
+ "episode_id": episode_id,
113
+ "task_id": task["task_id"],
114
+ "difficulty": task["difficulty"],
115
+ "step_count": ep["step_count"],
116
+ "max_steps": task.get("max_steps", 5),
117
+ "last_reward": ep["rewards"][-1] if ep["rewards"] else 0.0,
118
+ "cumulative_reward": round(sum(ep["rewards"]), 4),
119
+ "tests_passed": sum(1 for t in last_results if t.get("passed", False)),
120
+ "total_tests": len(last_results),
121
+ "done": ep["done"],
122
+ }
123
+
124
+ def list_tasks(self) -> list[dict]:
125
+ return [
126
+ {
127
+ "task_id": t["task_id"],
128
+ "difficulty": t["difficulty"],
129
+ "description": t["description"],
130
+ "max_steps": t.get("max_steps", 5),
131
+ "total_tests": len(t["test_descriptions"]),
132
+ }
133
+ for t in self.tasks.values()
134
+ ]
135
+
136
+ # ------------------------------------------------------------------
137
+ # Internal helpers
138
+ # ------------------------------------------------------------------
139
+
140
+ def _initial_observation(self, task: dict) -> dict:
141
+ return {
142
+ "task_id": task["task_id"],
143
+ "difficulty": task["difficulty"],
144
+ "description": task["description"],
145
+ "buggy_code": task["buggy_code"],
146
+ "test_descriptions": task["test_descriptions"],
147
+ "test_results": [],
148
+ "stdout": "",
149
+ "stderr": "",
150
+ "step_count": 0,
151
+ "max_steps": task.get("max_steps", 5),
152
+ "reward": 0.0,
153
+ "done": False,
154
+ "total_tests": len(task["test_descriptions"]),
155
+ "tests_passed": 0,
156
+ }
code_debug_env/server/executor.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Safe code execution engine.
3
+
4
+ Runs submitted code in a subprocess with timeout.
5
+ Writes code to a temp directory, generates a test harness,
6
+ and parses JSON results from stdout.
7
+ """
8
+
9
+ import json
10
+ import subprocess
11
+ import tempfile
12
+ import textwrap
13
+ from pathlib import Path
14
+ from typing import Tuple
15
+
16
+
17
+ def run_code_safely(
18
+ submitted_code: str,
19
+ test_code: str,
20
+ timeout: int = 10,
21
+ ) -> Tuple[list[dict], str, str]:
22
+ """
23
+ Execute submitted code against test cases in an isolated subprocess.
24
+
25
+ Args:
26
+ submitted_code: The Python code the agent submitted as a fix.
27
+ test_code: Python snippet that populates a `results` list with test dicts.
28
+ timeout: Max seconds before killing the subprocess.
29
+
30
+ Returns:
31
+ (test_results, stdout_extra, stderr) where test_results is a list of
32
+ {"test_name", "passed", "expected", "actual", "error"} dicts.
33
+ """
34
+ with tempfile.TemporaryDirectory() as tmpdir:
35
+ solution_path = Path(tmpdir) / "solution.py"
36
+ harness_path = Path(tmpdir) / "harness.py"
37
+
38
+ # Write the submitted code as a module
39
+ solution_path.write_text(submitted_code, encoding="utf-8")
40
+
41
+ # Build the test harness
42
+ harness = textwrap.dedent(f"""\
43
+ import sys, json, traceback
44
+ sys.path.insert(0, r"{tmpdir}")
45
+
46
+ results = []
47
+
48
+ # Execute the submitted solution in this namespace
49
+ try:
50
+ exec(open(r"{solution_path}", encoding="utf-8").read())
51
+ except Exception as e:
52
+ # If the solution itself fails to load, all tests fail
53
+ print(json.dumps([{{"test_name": "load", "passed": False,
54
+ "expected": "code loads", "actual": "",
55
+ "error": traceback.format_exc()}}]))
56
+ sys.exit(0)
57
+
58
+ # Run the test code (populates `results`)
59
+ {textwrap.indent(test_code, " ").strip()}
60
+
61
+ print(json.dumps(results))
62
+ """)
63
+
64
+ harness_path.write_text(harness, encoding="utf-8")
65
+
66
+ try:
67
+ proc = subprocess.run(
68
+ ["python", str(harness_path)],
69
+ capture_output=True,
70
+ text=True,
71
+ timeout=timeout,
72
+ cwd=tmpdir,
73
+ )
74
+ stdout = proc.stdout.strip()
75
+ stderr = proc.stderr.strip()
76
+
77
+ # Parse test results from last line of stdout (the JSON array)
78
+ if stdout:
79
+ # The JSON array should be the last line
80
+ lines = stdout.split("\n")
81
+ json_line = lines[-1]
82
+ extra_output = "\n".join(lines[:-1]) if len(lines) > 1 else ""
83
+ try:
84
+ test_results = json.loads(json_line)
85
+ return test_results, extra_output, stderr
86
+ except json.JSONDecodeError:
87
+ return [
88
+ {
89
+ "test_name": "parse_error",
90
+ "passed": False,
91
+ "expected": "valid JSON output",
92
+ "actual": stdout[:200],
93
+ "error": "Could not parse test results from subprocess output",
94
+ }
95
+ ], "", stderr
96
+
97
+ # No stdout at all — likely a crash
98
+ return [
99
+ {
100
+ "test_name": "execution_error",
101
+ "passed": False,
102
+ "expected": "code runs",
103
+ "actual": "",
104
+ "error": stderr[:500] if stderr else "No output produced",
105
+ }
106
+ ], "", stderr
107
+
108
+ except subprocess.TimeoutExpired:
109
+ return [
110
+ {
111
+ "test_name": "timeout",
112
+ "passed": False,
113
+ "expected": f"completes within {timeout}s",
114
+ "actual": "timed out",
115
+ "error": f"Code execution exceeded {timeout} second timeout",
116
+ }
117
+ ], "", "Execution timed out"
code_debug_env/server/requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi>=0.109.0
2
+ uvicorn[standard]>=0.27.0
3
+ pydantic>=2.6.1
4
+ httpx>=0.27.0
5
+ openai>=1.0.0
code_debug_env/tasks/easy/task_001_off_by_one.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_id": "task_001_off_by_one",
3
+ "difficulty": "easy",
4
+ "description": "Fix the bug in the fibonacci function. It should return the nth Fibonacci number (0-indexed: fib(0)=0, fib(1)=1, fib(2)=1, fib(5)=5, fib(10)=55).",
5
+ "buggy_code": "def fib(n):\n if n <= 0:\n return 0\n if n == 1:\n return 1\n a, b = 0, 1\n for i in range(n - 1):\n a, b = b, a + b\n return a\n",
6
+ "solution": "def fib(n):\n if n <= 0:\n return 0\n if n == 1:\n return 1\n a, b = 0, 1\n for i in range(n - 1):\n a, b = b, a + b\n return b\n",
7
+ "test_descriptions": [
8
+ "fib(0) should return 0",
9
+ "fib(1) should return 1",
10
+ "fib(2) should return 1",
11
+ "fib(5) should return 5",
12
+ "fib(10) should return 55"
13
+ ],
14
+ "test_code": "test_cases = [(0, 0), (1, 1), (2, 1), (5, 5), (10, 55)]\nfor inp, exp in test_cases:\n name = f\"fib({inp})=={exp}\"\n try:\n actual = fib(inp)\n results.append({\"test_name\": name, \"passed\": actual == exp, \"expected\": str(exp), \"actual\": str(actual), \"error\": \"\"})\n except Exception as e:\n results.append({\"test_name\": name, \"passed\": False, \"expected\": str(exp), \"actual\": \"\", \"error\": str(e)})",
15
+ "max_steps": 5
16
+ }
code_debug_env/tasks/easy/task_002_wrong_operator.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_id": "task_002_wrong_operator",
3
+ "difficulty": "easy",
4
+ "description": "Fix the bug in the find_max function. It should return the maximum value in a list, or None if the list is empty.",
5
+ "buggy_code": "def find_max(lst):\n if not lst:\n return None\n max_val = lst[0]\n for item in lst[1:]:\n if item < max_val:\n max_val = item\n return max_val\n",
6
+ "solution": "def find_max(lst):\n if not lst:\n return None\n max_val = lst[0]\n for item in lst[1:]:\n if item > max_val:\n max_val = item\n return max_val\n",
7
+ "test_descriptions": [
8
+ "find_max([1, 2, 3]) should return 3",
9
+ "find_max([-1, -5, 0]) should return 0",
10
+ "find_max([42]) should return 42",
11
+ "find_max([]) should return None",
12
+ "find_max([3, 1, 4, 1, 5, 9, 2, 6]) should return 9"
13
+ ],
14
+ "test_code": "test_cases = [([1, 2, 3], 3), ([-1, -5, 0], 0), ([42], 42), ([], None), ([3, 1, 4, 1, 5, 9, 2, 6], 9)]\nfor inp, exp in test_cases:\n name = f\"find_max({inp})=={exp}\"\n try:\n actual = find_max(inp)\n results.append({\"test_name\": name, \"passed\": actual == exp, \"expected\": str(exp), \"actual\": str(actual), \"error\": \"\"})\n except Exception as e:\n results.append({\"test_name\": name, \"passed\": False, \"expected\": str(exp), \"actual\": \"\", \"error\": str(e)})",
15
+ "max_steps": 5
16
+ }
code_debug_env/tasks/hard/task_005_binary_search.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_id": "task_005_binary_search",
3
+ "difficulty": "hard",
4
+ "description": "Fix the bugs in binary_search. It should return the index of target in a sorted array, or -1 if not found. There are multiple bugs in the boundary handling and loop condition.",
5
+ "buggy_code": "def binary_search(arr, target):\n low, high = 0, len(arr)\n while low < high:\n mid = (low + high) // 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n low = mid + 1\n else:\n high = mid - 1\n return -1\n",
6
+ "solution": "def binary_search(arr, target):\n low, high = 0, len(arr) - 1\n while low <= high:\n mid = (low + high) // 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n low = mid + 1\n else:\n high = mid - 1\n return -1\n",
7
+ "test_descriptions": [
8
+ "binary_search([1, 3, 5, 7, 9], 5) should return 2",
9
+ "binary_search([1, 3, 5, 7, 9], 1) should return 0 (first element)",
10
+ "binary_search([1, 3, 5, 7, 9], 9) should return 4 (last element)",
11
+ "binary_search([1, 3, 5, 7, 9], 4) should return -1 (not found)",
12
+ "binary_search([42], 42) should return 0 (single element)",
13
+ "binary_search([], 1) should return -1 (empty array)"
14
+ ],
15
+ "test_code": "test_cases = [\n (([1,3,5,7,9], 5), 2, \"find_middle\"),\n (([1,3,5,7,9], 1), 0, \"find_first\"),\n (([1,3,5,7,9], 9), 4, \"find_last\"),\n (([1,3,5,7,9], 4), -1, \"not_found\"),\n (([42], 42), 0, \"single_element\"),\n (([], 1), -1, \"empty_array\")\n]\nfor (arr, target), exp, name in test_cases:\n try:\n actual = binary_search(arr, target)\n results.append({\"test_name\": name, \"passed\": actual == exp, \"expected\": str(exp), \"actual\": str(actual), \"error\": \"\"})\n except Exception as e:\n results.append({\"test_name\": name, \"passed\": False, \"expected\": str(exp), \"actual\": \"\", \"error\": str(e)})",
16
+ "max_steps": 10
17
+ }
code_debug_env/tasks/hard/task_006_graph_cycle.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_id": "task_006_graph_cycle",
3
+ "difficulty": "hard",
4
+ "description": "Fix the bug in has_cycle. It should detect if a directed graph (given as an adjacency dict) contains a cycle. The current implementation incorrectly reports cycles in DAGs because it doesn't distinguish between nodes in the current DFS path vs. nodes that have been fully explored.",
5
+ "buggy_code": "def has_cycle(graph):\n visited = set()\n\n def dfs(node):\n if node in visited:\n return True\n visited.add(node)\n for neighbor in graph.get(node, []):\n if dfs(neighbor):\n return True\n return False\n\n for node in graph:\n if node not in visited:\n if dfs(node):\n return True\n return False\n",
6
+ "solution": "def has_cycle(graph):\n visited = set()\n rec_stack = set()\n\n def dfs(node):\n visited.add(node)\n rec_stack.add(node)\n for neighbor in graph.get(node, []):\n if neighbor not in visited:\n if dfs(neighbor):\n return True\n elif neighbor in rec_stack:\n return True\n rec_stack.discard(node)\n return False\n\n for node in graph:\n if node not in visited:\n if dfs(node):\n return True\n return False\n",
7
+ "test_descriptions": [
8
+ "A simple cycle A->B->C->A should return True",
9
+ "A DAG A->B->C, A->C should return False",
10
+ "A self-loop A->A should return True",
11
+ "An empty graph {} should return False",
12
+ "A disconnected graph with a cycle in one component should return True",
13
+ "A diamond DAG A->B, A->C, B->D, C->D should return False"
14
+ ],
15
+ "test_code": "test_cases = [\n ({\"A\": [\"B\"], \"B\": [\"C\"], \"C\": [\"A\"]}, True, \"simple_cycle\"),\n ({\"A\": [\"B\", \"C\"], \"B\": [\"C\"], \"C\": []}, False, \"dag_no_cycle\"),\n ({\"A\": [\"A\"]}, True, \"self_loop\"),\n ({}, False, \"empty_graph\"),\n ({\"A\": [\"B\"], \"B\": [], \"C\": [\"D\"], \"D\": [\"E\"], \"E\": [\"C\"]}, True, \"disconnected_with_cycle\"),\n ({\"A\": [\"B\", \"C\"], \"B\": [\"D\"], \"C\": [\"D\"], \"D\": []}, False, \"diamond_dag\")\n]\nfor graph, exp, name in test_cases:\n try:\n actual = has_cycle(graph)\n results.append({\"test_name\": name, \"passed\": actual == exp, \"expected\": str(exp), \"actual\": str(actual), \"error\": \"\"})\n except Exception as e:\n results.append({\"test_name\": name, \"passed\": False, \"expected\": str(exp), \"actual\": \"\", \"error\": str(e)})",
16
+ "max_steps": 10
17
+ }
code_debug_env/tasks/medium/task_003_mutable_default.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_id": "task_003_mutable_default",
3
+ "difficulty": "medium",
4
+ "description": "Fix the bug in the add_item and build_shopping_lists functions. build_shopping_lists should return independent lists for each group of items. Currently all lists share the same underlying list due to a mutable default argument.",
5
+ "buggy_code": "def add_item(item, lst=[]):\n lst.append(item)\n return lst\n\n\ndef build_shopping_lists(items_per_list):\n result = []\n for items in items_per_list:\n current_list = add_item(items[0])\n for item in items[1:]:\n current_list = add_item(item)\n result.append(current_list)\n return result\n",
6
+ "solution": "def add_item(item, lst=None):\n if lst is None:\n lst = []\n lst.append(item)\n return lst\n\n\ndef build_shopping_lists(items_per_list):\n result = []\n for items in items_per_list:\n current_list = add_item(items[0])\n for item in items[1:]:\n current_list = add_item(item, current_list)\n result.append(current_list)\n return result\n",
7
+ "test_descriptions": [
8
+ "build_shopping_lists([['a', 'b']]) should return [['a', 'b']]",
9
+ "build_shopping_lists([['a', 'b'], ['c', 'd']]) should return [['a', 'b'], ['c', 'd']] (independent lists)",
10
+ "build_shopping_lists([['x']]) should return [['x']]",
11
+ "Calling build_shopping_lists twice should not carry state between calls"
12
+ ],
13
+ "test_code": "try:\n r1 = build_shopping_lists([['a', 'b']])\n results.append({\"test_name\": \"single_list\", \"passed\": r1 == [['a', 'b']], \"expected\": \"[['a', 'b']]\", \"actual\": str(r1), \"error\": \"\"})\nexcept Exception as e:\n results.append({\"test_name\": \"single_list\", \"passed\": False, \"expected\": \"[['a', 'b']]\", \"actual\": \"\", \"error\": str(e)})\n\ntry:\n r2 = build_shopping_lists([['a', 'b'], ['c', 'd']])\n results.append({\"test_name\": \"independent_lists\", \"passed\": r2 == [['a', 'b'], ['c', 'd']], \"expected\": \"[['a', 'b'], ['c', 'd']]\", \"actual\": str(r2), \"error\": \"\"})\nexcept Exception as e:\n results.append({\"test_name\": \"independent_lists\", \"passed\": False, \"expected\": \"[['a', 'b'], ['c', 'd']]\", \"actual\": \"\", \"error\": str(e)})\n\ntry:\n r3 = build_shopping_lists([['x']])\n results.append({\"test_name\": \"single_item\", \"passed\": r3 == [['x']], \"expected\": \"[['x']]\", \"actual\": str(r3), \"error\": \"\"})\nexcept Exception as e:\n results.append({\"test_name\": \"single_item\", \"passed\": False, \"expected\": \"[['x']]\", \"actual\": \"\", \"error\": str(e)})\n\ntry:\n call1 = build_shopping_lists([['a']])\n call2 = build_shopping_lists([['b']])\n results.append({\"test_name\": \"no_state_leak\", \"passed\": call1 == [['a']] and call2 == [['b']], \"expected\": \"[['a']] and [['b']]\", \"actual\": f\"{call1} and {call2}\", \"error\": \"\"})\nexcept Exception as e:\n results.append({\"test_name\": \"no_state_leak\", \"passed\": False, \"expected\": \"[['a']] and [['b']]\", \"actual\": \"\", \"error\": str(e)})",
14
+ "max_steps": 7
15
+ }
code_debug_env/tasks/medium/task_004_scope_bug.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_id": "task_004_scope_bug",
3
+ "difficulty": "medium",
4
+ "description": "Fix the bug in make_counters. It should return a list of n functions where the i-th function returns i. Currently all returned functions return the same value (n-1) because the closure captures the loop variable by reference.",
5
+ "buggy_code": "def make_counters(n):\n counters = []\n for i in range(n):\n def counter():\n return i\n counters.append(counter)\n return counters\n",
6
+ "solution": "def make_counters(n):\n counters = []\n for i in range(n):\n def counter(i=i):\n return i\n counters.append(counter)\n return counters\n",
7
+ "test_descriptions": [
8
+ "make_counters(3) should return functions that return 0, 1, 2 respectively",
9
+ "make_counters(1) should return a single function that returns 0",
10
+ "make_counters(5)[4]() should return 4",
11
+ "make_counters(0) should return an empty list"
12
+ ],
13
+ "test_code": "try:\n fns = make_counters(3)\n vals = [f() for f in fns]\n results.append({\"test_name\": \"counters_3\", \"passed\": vals == [0, 1, 2], \"expected\": \"[0, 1, 2]\", \"actual\": str(vals), \"error\": \"\"})\nexcept Exception as e:\n results.append({\"test_name\": \"counters_3\", \"passed\": False, \"expected\": \"[0, 1, 2]\", \"actual\": \"\", \"error\": str(e)})\n\ntry:\n fns = make_counters(1)\n vals = [f() for f in fns]\n results.append({\"test_name\": \"counters_1\", \"passed\": vals == [0], \"expected\": \"[0]\", \"actual\": str(vals), \"error\": \"\"})\nexcept Exception as e:\n results.append({\"test_name\": \"counters_1\", \"passed\": False, \"expected\": \"[0]\", \"actual\": \"\", \"error\": str(e)})\n\ntry:\n fns = make_counters(5)\n val = fns[4]()\n results.append({\"test_name\": \"counter_5_last\", \"passed\": val == 4, \"expected\": \"4\", \"actual\": str(val), \"error\": \"\"})\nexcept Exception as e:\n results.append({\"test_name\": \"counter_5_last\", \"passed\": False, \"expected\": \"4\", \"actual\": \"\", \"error\": str(e)})\n\ntry:\n fns = make_counters(0)\n results.append({\"test_name\": \"counters_0\", \"passed\": fns == [], \"expected\": \"[]\", \"actual\": str(fns), \"error\": \"\"})\nexcept Exception as e:\n results.append({\"test_name\": \"counters_0\", \"passed\": False, \"expected\": \"[]\", \"actual\": \"\", \"error\": str(e)})",
14
+ "max_steps": 7
15
+ }
inference.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference Script — Code Debug OpenEnv
3
+ ======================================
4
+ MANDATORY environment variables:
5
+ API_BASE_URL The API endpoint for the LLM.
6
+ MODEL_NAME The model identifier to use for inference.
7
+ HF_TOKEN Your Hugging Face / API key.
8
+ ENV_URL URL of the running OpenEnv server (default: http://localhost:7860)
9
+
10
+ STDOUT FORMAT:
11
+ [START] task=<task_id> env=<env_url> model=<model_name>
12
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
13
+ [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
14
+ """
15
+
16
+ import os
17
+ import sys
18
+ import textwrap
19
+ from typing import Optional
20
+
21
+ import httpx
22
+ from openai import OpenAI
23
+
24
# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------

# LLM endpoint, model and credentials — all overridable via environment
# variables, with Hugging Face router defaults.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Base URL of the running OpenEnv server.
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860")

# Sampling parameters for the debugging LLM.
TEMPERATURE = 0.2
MAX_TOKENS = 2048
# Episode counts as a success only when the final reward reaches this value;
# reward is the fraction of tests passing, so 1.0 means every test passed.
SUCCESS_THRESHOLD = 1.0  # require all tests passing
36
+
37
+ SYSTEM_PROMPT = textwrap.dedent("""
38
+ You are an expert Python debugger.
39
+
40
+ You will receive:
41
+ 1. A description of the task
42
+ 2. The buggy Python code
43
+ 3. Descriptions of what each test checks
44
+ 4. (From step 2 onwards) Test results showing which tests passed or failed,
45
+ with actual vs expected values and any error messages
46
+
47
+ Your job: return ONLY the corrected Python code with all bugs fixed.
48
+ Rules:
49
+ - Output raw Python code only — no markdown fences, no explanations
50
+ - Include the complete function definition(s), not just the changed lines
51
+ - Make sure all tests pass
52
+ """).strip()
53
+
54
+
55
+ # ------------------------------------------------------------------
56
+ # Logging helpers (strict format per submission spec)
57
+ # ------------------------------------------------------------------
58
+
59
def log_start(task: str, env: str, model: str) -> None:
    """Emit the [START] line announcing a new episode, per the spec format."""
    fields = f"task={task} env={env} model={model}"
    print("[START] " + fields, flush=True)
61
+
62
+
63
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one [STEP] line in the strict submission format.

    The submitted code is shown repr-escaped, newline-flattened, and
    truncated to 120 characters so the line stays single-line and bounded.
    """
    shown_action = action.replace("\n", "\\n")[:120]
    parts = [
        f"[STEP] step={step}",
        f"action={shown_action!r}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    ]
    print(" ".join(parts), flush=True)
71
+
72
+
73
def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Emit the final [END] summary line for an episode."""
    formatted_rewards = ",".join(format(r, ".2f") for r in rewards)
    summary = (
        f"[END] success={str(success).lower()} "
        f"steps={steps} score={score:.3f} rewards={formatted_rewards}"
    )
    print(summary, flush=True)
79
+
80
+
81
+ # ------------------------------------------------------------------
82
+ # Prompt builders
83
+ # ------------------------------------------------------------------
84
+
85
def build_initial_prompt(obs: dict) -> str:
    """Render the first user message: task description, buggy code, and the
    list of tests the fix must satisfy."""
    sections = [
        f"## Task\n{obs['description']}",
        f"\n## Buggy Code\n```python\n{obs['buggy_code']}\n```",
        "\n## Tests that must pass",
    ]
    sections.extend(f"- {item}" for item in obs.get("test_descriptions", []))
    return "\n".join(sections)
94
+
95
+
96
def build_feedback_prompt(obs: dict) -> str:
    """Render a follow-up user message summarizing the latest test results.

    Failing tests include expected/actual values and any error message;
    stderr from the run (truncated to 500 chars) is appended when present.
    """
    out = ["## Test Results from your last submission\n"]
    for result in obs.get("test_results", []):
        label = "PASS" if result["passed"] else "FAIL"
        out.append(f"[{label}] {result['test_name']}")
        if not result["passed"]:
            out.append(f" expected : {result['expected']}")
            out.append(f" actual : {result['actual']}")
            if result.get("error"):
                out.append(f" error : {result['error']}")
    stderr_text = obs.get("stderr")
    if stderr_text:
        out.append(f"\n## Stderr\n{stderr_text[:500]}")
    out.append("\nFix the remaining failures and return the complete corrected code.")
    return "\n".join(out)
110
+
111
+
112
def strip_fences(text: str) -> str:
    """Remove a surrounding Markdown code fence from an LLM reply.

    Fix: the original only special-cased the exact opening fence
    ``` ```python ``` and otherwise stripped just the three backticks, so a
    reply opening with ``` ```py ``` or ``` ```Python ``` leaked the language
    tag into the returned code and made it un-executable. Any purely
    alphabetic tag on the opening fence line is now dropped with the fence.

    Args:
        text: Raw completion text, possibly wrapped in a code fence.

    Returns:
        The inner code with fences and surrounding whitespace removed.
    """
    text = text.strip()
    if text.startswith("```python"):
        # Exact historical fast path (also covers one-line replies such as
        # "```python x=1```").
        text = text[len("```python"):].strip()
    elif text.startswith("```"):
        text = text[3:]
        first_line, sep, rest = text.partition("\n")
        # A purely alphabetic token on the fence line ("py", "Python", ...)
        # is fence metadata, not code — discard it along with the fence.
        if sep and first_line.strip().isalpha():
            text = rest
        text = text.strip()
    if text.endswith("```"):
        text = text[:-3].strip()
    return text
121
+
122
+
123
+ # ------------------------------------------------------------------
124
+ # Single episode runner
125
+ # ------------------------------------------------------------------
126
+
127
def run_episode(http: httpx.Client, client: OpenAI, task_id: str) -> dict:
    """Run one debugging episode for *task_id* against the environment server.

    Resets the environment, then alternates LLM completions and environment
    steps until the server reports done or the task's ``max_steps`` budget
    runs out.

    Args:
        http: httpx client pointed at the OpenEnv server base URL.
        client: OpenAI-compatible client used for LLM completions.
        task_id: Identifier of the task to debug.

    Returns:
        Summary dict with task_id, episode_id, success, score and steps.
    """
    # Reset: start a fresh episode for this task; a failing reset raises
    # out of this function (nothing has been logged yet at that point).
    reset_resp = http.post("/reset", json={"task_id": task_id})
    reset_resp.raise_for_status()
    reset_data = reset_resp.json()
    episode_id = reset_data["episode_id"]
    obs = reset_data["observation"]
    # Fall back to 5 steps when the observation does not advertise a budget.
    max_steps = obs.get("max_steps", 5)

    log_start(task=task_id, env=ENV_URL, model=MODEL_NAME)

    # Full conversation is accumulated so the model sees all prior attempts
    # and the feedback on each of them.
    messages: list[dict] = [{"role": "system", "content": SYSTEM_PROMPT}]
    rewards: list[float] = []
    steps_taken = 0
    error_msg: Optional[str] = None

    try:
        for step in range(1, max_steps + 1):
            # First step presents the task; later steps present test feedback
            # from the previous submission.
            if step == 1:
                user_content = build_initial_prompt(obs)
            else:
                user_content = build_feedback_prompt(obs)

            messages.append({"role": "user", "content": user_content})

            # LLM call: a failure here is logged as a zero-reward step and
            # aborts the episode (no environment step is attempted).
            try:
                completion = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=messages,
                    temperature=TEMPERATURE,
                    max_tokens=MAX_TOKENS,
                )
                fixed_code = strip_fences(completion.choices[0].message.content or "")
                messages.append({"role": "assistant", "content": fixed_code})
            except Exception as exc:
                error_msg = str(exc)
                log_step(step=step, action="llm_error", reward=0.0, done=False, error=error_msg)
                break

            # Step environment with the candidate fix.
            step_resp = http.post(
                f"/step/{episode_id}",
                json={"action": {"code": fixed_code}},
            )
            step_resp.raise_for_status()
            step_data = step_resp.json()
            obs = step_data["observation"]
            reward = step_data["reward"]
            done = step_data["done"]

            rewards.append(reward)
            steps_taken = step

            log_step(step=step, action=fixed_code, reward=reward, done=done, error=None)

            if done:
                break

    except Exception as exc:
        # Environment/transport errors end the episode; rewards gathered so
        # far still produce a score. NOTE(review): error_msg is recorded here
        # but never surfaced in the [END] line or the return value.
        error_msg = str(exc)

    # Score is the LAST reward seen (fraction of tests passing), not the max.
    score = rewards[-1] if rewards else 0.0
    success = score >= SUCCESS_THRESHOLD
    log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return {
        "task_id": task_id,
        "episode_id": episode_id,
        "success": success,
        "score": score,
        "steps": steps_taken,
    }
201
+
202
+
203
+ # ------------------------------------------------------------------
204
+ # Main: run all tasks
205
+ # ------------------------------------------------------------------
206
+
207
def main():
    """Discover all tasks from the server, run an episode per task, and
    print an aggregate summary line."""
    llm = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "EMPTY")
    env_http = httpx.Client(base_url=ENV_URL, timeout=60.0)

    # Ask the environment which tasks exist; abort if it is unreachable.
    try:
        response = env_http.get("/tasks")
        response.raise_for_status()
        task_ids = [entry["task_id"] for entry in response.json()]
    except Exception as exc:
        print(f"[ERROR] Could not fetch task list: {exc}", file=sys.stderr, flush=True)
        sys.exit(1)

    outcomes = [run_episode(env_http, llm, task_id) for task_id in task_ids]

    # Summary
    total = len(outcomes)
    solved = sum(1 for outcome in outcomes if outcome["success"])
    avg = sum(outcome["score"] for outcome in outcomes) / total if total else 0.0
    print(f"\n=== SUMMARY: solved={solved}/{total} avg_score={avg:.3f} ===", flush=True)
230
+
231
+
232
# Script entry point: run the full benchmark when executed directly.
if __name__ == "__main__":
    main()