Upload folder using huggingface_hub
- Dockerfile +23 -0
- README.md +126 -5
- __init__.py +5 -0
- baseline/run_baseline.py +112 -0
- baseline/train_grpo_120b.py +208 -0
- client.py +29 -0
- models.py +57 -0
- openenv.yaml +70 -0
- pyproject.toml +34 -0
- server/__init__.py +1 -0
- server/app.py +72 -0
- server/environment.py +114 -0
- server/grader.py +118 -0
- server/requirements.txt +8 -0
- server/task_generator.py +52 -0
- server/tasks/__init__.py +17 -0
- server/tasks/task_easy.py +48 -0
- server/tasks/task_hard.py +104 -0
- server/tasks/task_medium.py +64 -0
- test_client.py +17 -0
- uv.lock +0 -0
Dockerfile
ADDED
@@ -0,0 +1,23 @@
# Dockerfile — builds from openenv-base, MUST use this base image
ARG BASE_IMAGE=openenv-base:latest
FROM ${BASE_IMAGE}

WORKDIR /app

# Install Python dependencies
COPY server/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt && rm /tmp/requirements.txt

# Copy environment code
COPY . /app/code_debug_env/

# Health check (required by hackathon validator)
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Expose port
EXPOSE 8000

# Start server
ENV ENABLE_WEB_INTERFACE=true
CMD ["uvicorn", "code_debug_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
README.md
CHANGED
@@ -1,10 +1,131 @@
---
title: Code Debug Env
emoji: 🐞
colorFrom: blue
colorTo: indigo
sdk: docker
app_port: 8000
base_path: /web
---

# code-debug-env

An OpenEnv environment for training AI agents to repair buggy Python code.
The agent receives a broken function and must iteratively submit patches until
all unit tests pass.

## Quick Start

```python
from code_debug_env import CodeDebugEnv, Action

async with CodeDebugEnv(base_url="https://luciferai-devil-code-debug-env.hf.space") as env:
    result = await env.reset(task_id="task_easy")
    print(result.observation.buggy_code)  # The broken function

    result = await env.step(Action(
        patch="def find_max_subarray_sum(nums):\n    ...",
        task_id="task_easy",
        think="The off-by-one error is in range(1, len(nums)-1)",
    ))
    print(result.observation.score)  # 0.0–1.0
```

## Action Space

| Field | Type | Required | Description |
|---|---|---|---|
| `patch` | str | Yes | Full Python source replacement for the function |
| `task_id` | str | Yes | Which task to target |
| `think` | str | No | Chain-of-thought reasoning (earns +0.2 reward bonus) |

## Observation Space

| Field | Type | Description |
|---|---|---|
| `buggy_code` | str | Current version of the code |
| `test_results` | list | Per-test pass/fail with error messages |
| `passed` / `total` | int | Tests passing out of total |
| `score` | float | Composite reward for this step (0.0–1.0) |
| `done` | bool | True when all tests pass or max_steps reached |

## Reward Function

```
r = 0.5 × (tests_passed / tests_total)    # correctness
  + 0.2 × (1 if valid syntax else 0)      # format
  + 0.2 × (1 if <think> provided else 0)  # chain-of-thought bonus
  + 0.1 × (steps_remaining / max_steps)   # efficiency
  − 0.3 × (1 if timeout/crash else 0)     # penalty
```
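
For intuition, here is a minimal sketch of how the terms combine (illustrative only; the authoritative implementation lives in `server/environment.py`):

```python
# Illustrative sketch of the composite reward above, not the server code itself.
def composite_reward(passed, total, valid_syntax, has_think, step, max_steps=10, crashed=False):
    r = 0.5 * (passed / total)
    r += 0.2 * (1.0 if valid_syntax else 0.0)
    r += 0.2 * (1.0 if has_think else 0.0)
    r += 0.1 * ((max_steps - step) / max_steps)
    if crashed:
        r -= 0.3
    return max(0.0, min(1.0, r))

# 3/4 tests pass, valid syntax, reasoning provided, step 2 of 10:
# 0.5*0.75 + 0.2 + 0.2 + 0.1*0.8 = 0.855
print(composite_reward(3, 4, True, True, 2))  # 0.855
```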

## Tasks

| ID | Difficulty | Description | Variants |
|---|---|---|---|
| `task_easy` | Easy | Single off-by-one error | 6+ |
| `task_medium` | Medium | Two independent bugs | 6+ |
| `task_hard` | Hard | 3+ subtle bugs in recursive function | 7+ |

*Total: 19 procedurally generated tasks via `task_generator.py`.*

## Setup

```bash
pip install openenv-core
pip install git+https://huggingface.co/spaces/luciferai-devil/code-debug-env
```

## Docker

```bash
docker pull luciferai-devil/code-debug-env:latest
docker run -p 8000:8000 luciferai-devil/code-debug-env
```

## Baseline Results (via OpenAI API)

Evaluated using the `gpt-4o-mini` / `gpt-oss-120b` reasoning models.

| Task | Agent | Score | Notes |
|---|---|---|---|
| task_easy | LLM | 0.99 | One-shot fix with CoT |
| task_medium | LLM | 0.74 | Iterative refinement |
| task_hard | LLM | 0.59 | Struggles with deep recursion |

*Average Score: 0.77*

## Training with GRPO

See `baseline/run_baseline.py` for the inference client.
Compatible with TRL's `GRPOTrainer` — pass a `reward_fn` that calls `/grader`, e.g. the sketch below.
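
A minimal sketch of such a `reward_fn`, assuming the environment server is reachable at `http://localhost:8000` and using `requests` (an assumed extra dependency, not pinned in this repo):

```python
import requests  # assumed extra dependency; any HTTP client works

GRADER_URL = "http://localhost:8000/grader"  # assumed local server

def reward_fn(completions, task_id="task_easy", **kwargs):
    """Score each generated patch via the environment's /grader endpoint."""
    rewards = []
    for code in completions:
        resp = requests.get(GRADER_URL, params={"task_id": task_id, "submitted_code": code})
        rewards.append(resp.json()["score"] if resp.ok else 0.0)
    return rewards
```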

## API Endpoints

| Endpoint | Method | Description |
|---|---|---|
| `/health` | GET | Health check |
| `/reset` | POST | Start a new episode |
| `/step` | POST | Submit action, get observation |
| `/state` | GET | Get current episode state |
| `/tasks` | GET | List all available tasks |
| `/grader` | GET | Grade a submission directly |
| `/baseline` | GET | Run baseline agent on all tasks |

## Local Development

```bash
# Run server locally
uvicorn code_debug_env.server.app:app --reload --port 8000

# Build Docker (the Dockerfile lives at the repo root)
docker build -t code-debug-env .

# Run Docker
docker run -p 8000:8000 code-debug-env

# Smoke test
curl http://localhost:8000/health
curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{}'
curl http://localhost:8000/tasks
```
__init__.py
ADDED
@@ -0,0 +1,5 @@
# __init__.py — export the public API
from .models import Action, Observation, State
from .client import CodeDebugEnv

__all__ = ["Action", "Observation", "State", "CodeDebugEnv"]
baseline/run_baseline.py
ADDED
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Baseline inference script.
Runs an LLM agent on all 3 tasks using the OpenAI API.
Usage: python baseline/run_baseline.py [--output json]
Requires: OPENAI_API_KEY environment variable.
"""
import asyncio
import sys
import json
import os
from pathlib import Path

# Add the package's parent directory to the import path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from code_debug_env.client import CodeDebugEnv
from code_debug_env.models import Action

try:
    from openai import AsyncOpenAI
except ImportError:
    print("Please install openai: pip install openai")
    sys.exit(1)

BASE_URL = os.getenv("OPENENV_URL", "http://127.0.0.1:8000")
API_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME = os.getenv("OPENENV_MODEL", "gpt-4o-mini")

client = AsyncOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=API_BASE_URL,
)

async def openai_agent(observation) -> Action:
    """Uses the LLM to suggest a code fix."""
    prompt = f"""You are an expert Python debugger. Your task is to fix the buggy code below.
Task Description: {observation.task_description}

Buggy Code:
```python
{observation.buggy_code}
```

Test results so far:
{[[t.name, t.passed, t.error] for t in observation.test_results]}
Passed {observation.passed} out of {observation.total} tests.

Provide ONLY a valid JSON object matching this schema:
{{
    "patch": "The FULL python function as a string, with the bugs fixed",
    "task_id": "{observation.task_id}",
    "think": "Your chain-of-thought reasoning before patching (important!)"
}}
"""
    try:
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"} if "gpt-4" in MODEL_NAME or "gpt-oss" in MODEL_NAME else None,
            temperature=0.2,
        )
        content = response.choices[0].message.content
        data = json.loads(content)
        return Action(
            patch=data["patch"],
            task_id=observation.task_id,
            think=data.get("think", "Applied fix based on test errors."),
        )
    except Exception as e:
        print(f"LLM Error: {e}", file=sys.stderr)
        # Fall back to returning the original code to avoid crashing the loop
        return Action(
            patch=observation.buggy_code,
            task_id=observation.task_id,
            think="Failed to generate patch.",
        )

async def evaluate_task(env, task_id: str) -> dict:
    result = await env.reset(task_id=task_id)
    obs = result.observation
    best_score = 0.0
    for step in range(10):
        action = await openai_agent(obs)
        result = await env.step(action)
        best_score = max(best_score, result.observation.score)
        obs = result.observation
        if obs.done:
            break
    return {"task_id": task_id, "best_score": round(best_score, 4), "steps": step + 1}

async def main(output_format: str = "table"):
    if not os.getenv("OPENAI_API_KEY"):
        print("Warning: OPENAI_API_KEY not set. LLM calls will fail.", file=sys.stderr)

    results = []
    async with CodeDebugEnv(base_url=BASE_URL) as env:
        for task_id in ["task_easy", "task_medium", "task_hard"]:
            res = await evaluate_task(env, task_id)
            results.append(res)

    if output_format == "json":
        print(json.dumps({"baseline_results": results, "agent": "openai_api"}))
    else:
        print("\n=== Baseline Results ===")
        for r in results:
            print(f"  {r['task_id']:15s} score={r['best_score']:.3f} steps={r['steps']}")
        print(f"\n  avg score: {sum(r['best_score'] for r in results) / len(results):.3f}")

if __name__ == "__main__":
    # sys.argv holds separate tokens, so check for the flag itself
    output = "json" if "--output" in sys.argv else "table"
    asyncio.run(main(output))
baseline/train_grpo_120b.py
ADDED
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
GRPO Training Script for gpt-oss-120b using OpenEnv and TRL.
Adapted from the openenv-course repository architecture.

Requirements:
    pip install "trl>=0.17.0" openenv-core transformers datasets accelerate vllm
"""
import os
import sys
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer

from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

from code_debug_env.client import CodeDebugEnv
from code_debug_env.models import Action

# TRL imports
from trl import GRPOConfig, GRPOTrainer
from trl.experimental.openenv import generate_rollout_completions

# 1. Configuration
MODEL_NAME = "openai/gpt-oss-120b"
OUTPUT_DIR = "code-debug-grpo-120b"
ENV_URL = os.getenv("OPENENV_URL", "http://127.0.0.1:8000")

# 2. Set up a persistent environment connection
print(f"Connecting to env: {ENV_URL}")
env = CodeDebugEnv(base_url=ENV_URL)
sync_env = env.sync()
sync_env.connect()

# 3. Set up the tokenizer
print(f"Loading tokenizer for {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4. System prompt definition
SYSTEM_PROMPT = """You are an expert Python debugger and RL agent.
Your task is to fix the buggy code provided to you.

Provide ONLY a valid JSON object matching this schema:
{
    "patch": "The FULL python function as a string, with the bugs fixed",
    "task_id": "the task requested",
    "think": "Your chain-of-thought reasoning before patching (important for rewards!)"
}
"""

def make_user_prompt(observation):
    return (
        f"Task Description: {observation.task_description}\n\n"
        f"Buggy Code:\n```python\n{observation.buggy_code}\n```\n\n"
        f"Passed {observation.passed} out of {observation.total} tests."
    )

# 5. Rollout function
def rollout_once(trainer, sync_env, tokenizer, dataset_prompt, system_prompt, max_turns):
    """Execute one full episode to gather trajectory data for GRPO."""
    result = sync_env.reset()
    observation = result.observation

    prompt_ids = []
    completion_ids = []
    logprobs = []
    composite_rewards = []

    for _turn in range(max_turns):
        if result.done:
            break

        user_prompt = make_user_prompt(observation)
        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': user_prompt},
        ]

        prompt_text = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
            enable_thinking=False,
        )

        rollout_outputs = generate_rollout_completions(trainer, [prompt_text])[0]
        prompt_ids.append(rollout_outputs['prompt_ids'])
        completion_ids.append(rollout_outputs['completion_ids'])
        logprobs.append(rollout_outputs['logprobs'])

        completion_text = rollout_outputs.get('text') or tokenizer.decode(
            rollout_outputs['completion_ids'], skip_special_tokens=True
        )

        # Parse the JSON output from the model
        try:
            # Simple extraction since the prompt dictates JSON
            start = completion_text.find("{")
            end = completion_text.rfind("}") + 1
            if start != -1 and end > start:
                data = json.loads(completion_text[start:end])
                action = Action(patch=data["patch"], task_id=observation.task_id, think=data.get("think", ""))
            else:
                raise ValueError("No JSON found")
        except Exception:
            # Fallback action if parsing fails
            action = Action(patch=observation.buggy_code, task_id=observation.task_id, think="")

        # Step the environment
        result = sync_env.step(action)
        observation = result.observation

        # The environment already calculates the composite reward (0.0 to 1.0):
        # correctness, format, CoT bonus, and efficiency are all baked in.
        composite_rewards.append(observation.score)

    return {
        'prompt_ids': [pid for sub in prompt_ids for pid in sub],  # flatten
        'completion_ids': [cid for sub in completion_ids for cid in sub],
        'logprobs': [lp for sub in logprobs for lp in sub],
        'env_reward': composite_rewards[-1] if composite_rewards else 0.0,
    }


def rollout_func(prompts, trainer=None):
    """Rollout function called by GRPOTrainer."""
    episode_prompt_ids = []
    episode_completion_ids = []
    episode_logprobs = []
    rewards = []

    for prompt_text in prompts:
        episode = rollout_once(
            trainer=trainer,
            sync_env=sync_env,
            tokenizer=tokenizer,
            dataset_prompt=prompt_text,
            system_prompt=SYSTEM_PROMPT,
            max_turns=3,  # Keep turns low for heavy models like 120B
        )
        episode_prompt_ids.append(episode['prompt_ids'])
        episode_completion_ids.append(episode['completion_ids'])
        episode_logprobs.append(episode['logprobs'])
        rewards.append(episode['env_reward'])

    return {
        'prompt_ids': episode_prompt_ids,
        'completion_ids': episode_completion_ids,
        'logprobs': episode_logprobs,
        'env_reward': rewards,
    }

# 6. Reward functions (mapped from rollout_func keys)
def composite_env_reward(completions, **kwargs):
    rewards = kwargs.get("env_reward")
    return [float(r) for r in rewards] if rewards else [0.0] * len(completions)


# 7. Create dataset & config
def main():
    print("Preparing dataset...")
    # Dummy prompts to kick off the rollout loop (the actual env state overrides this)
    dataset = Dataset.from_dict({"prompt": ["Fix the buggy Python code."] * 500})

    # Using specific optimizations for the 120B model (e.g. MXFP4, tensor parallelism if available)
    grpo_config = GRPOConfig(
        num_train_epochs=1,
        learning_rate=1e-6,  # lower LR for 120B
        gradient_accumulation_steps=128,
        per_device_train_batch_size=1,
        warmup_steps=10,
        num_generations=2,
        max_completion_length=512,
        max_prompt_length=1500,
        use_vllm=True,
        vllm_mode="colocate",
        vllm_gpu_memory_utilization=0.9,  # maximize for 120B
        output_dir=OUTPUT_DIR,
        logging_steps=1,
        save_steps=50,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        push_to_hub=False,
    )

    print(f"Initializing GRPOTrainer for {MODEL_NAME}...")
    trainer = GRPOTrainer(
        model=MODEL_NAME,
        processing_class=tokenizer,
        reward_funcs=[composite_env_reward],
        train_dataset=dataset,
        args=grpo_config,
        rollout_func=rollout_func,
    )

    print("Starting training...")
    trainer.train()

    sync_env.close()
    trainer.save_model(OUTPUT_DIR)
    print("Training complete! Model saved.")

if __name__ == "__main__":
    main()
client.py
ADDED
@@ -0,0 +1,29 @@
# client.py — used in training code / run_baseline.py
from typing import Any
from openenv.core.env_client import EnvClient
from openenv.core.client_types import StepResult
from .models import Action, Observation, State


class CodeDebugEnv(EnvClient[Action, Observation, State]):
    """
    Client for the CodeDebug environment.
    Usage:
        async with CodeDebugEnv(base_url="https://your-space.hf.space") as env:
            result = await env.reset(task_id="task_easy")
            result = await env.step(Action(patch="...", task_id="task_easy"))
    """

    def _step_payload(self, action: Action) -> dict[str, Any]:
        return action.model_dump(exclude_none=True)

    def _parse_result(self, payload: dict[str, Any]) -> StepResult[Observation]:
        obs = Observation(**payload.get("observation", payload))
        return StepResult(
            observation=obs,
            reward=payload.get("reward", obs.reward),
            done=payload.get("done", obs.done),
        )

    def _parse_state(self, payload: dict[str, Any]) -> State:
        return State(**payload)
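
The training script drives this client synchronously via `env.sync()`. A minimal sketch of that pattern, assuming `sync()`, `connect()`, and `close()` come from the `EnvClient` base class (inferred from their use in `baseline/train_grpo_120b.py`) and a server on `localhost:8000`:

```python
# Sketch of the synchronous wrapper, mirroring baseline/train_grpo_120b.py.
from code_debug_env.client import CodeDebugEnv
from code_debug_env.models import Action

env = CodeDebugEnv(base_url="http://127.0.0.1:8000")
sync_env = env.sync()
sync_env.connect()

result = sync_env.reset()
# Resubmit the buggy code unchanged, just to exercise the step loop
result = sync_env.step(Action(
    patch=result.observation.buggy_code,
    task_id=result.observation.task_id,
))
print(result.observation.score, result.done)

sync_env.close()
```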
models.py
ADDED
@@ -0,0 +1,57 @@
# models.py
from __future__ import annotations
from pydantic import BaseModel, Field
from typing import Optional
import uuid


class Action(BaseModel):
    """Agent's action: submit a code patch to fix the buggy function."""
    patch: str = Field(
        description="Full replacement of the function body (valid Python source code)."
    )
    task_id: str = Field(
        description="Which task this patch targets. Must match a task from /tasks."
    )
    think: Optional[str] = Field(
        default=None,
        description="Optional chain-of-thought reasoning. Providing this earns the r_cot bonus."
    )


class TestResult(BaseModel):
    name: str
    passed: bool
    error: Optional[str] = None


class Observation(BaseModel):
    """What the agent sees after reset() or step()."""
    task_id: str
    buggy_code: str = Field(description="Current version of the code (may be patched).")
    task_description: str
    test_results: list[TestResult] = Field(default_factory=list)
    passed: int = 0
    total: int = 0
    score: float = 0.0
    done: bool = False
    reward: float = Field(default=0.0, exclude=True)  # Required by openenv 0.2 serialization
    error: Optional[str] = None


class State(BaseModel):
    """Episode metadata — returned by the state() endpoint."""
    episode_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    task_id: str = ""
    step_count: int = 0
    max_steps: int = 10
    current_score: float = 0.0
    best_score: float = 0.0


class TaskInfo(BaseModel):
    """Returned by the /tasks endpoint."""
    task_id: str
    difficulty: str  # "easy" | "medium" | "hard"
    description: str
    action_schema: dict  # JSON schema of Action for this task
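
For illustration, constructing an `Action` and serializing it the way `client.py`'s `_step_payload` does (a sketch; the patch string is hypothetical):

```python
from code_debug_env.models import Action

# Hypothetical patch, used purely for illustration
action = Action(
    patch="def add(a, b):\n    return a + b",
    task_id="task_easy",
    think="The operator was flipped from + to -.",
)
# exclude_none mirrors CodeDebugEnv._step_payload
print(action.model_dump(exclude_none=True))
# {'patch': 'def add(a, b):\n    return a + b', 'task_id': 'task_easy', 'think': '...'}
```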
openenv.yaml
ADDED
@@ -0,0 +1,70 @@
# openenv.yaml — validated by `openenv validate`
name: code-debug-env
version: "1.0.0"
description: >
  A real-world RL environment where an AI agent repairs buggy Python functions.
  The agent receives broken code and must iteratively submit patches until all
  unit tests pass. Designed for training LLMs on code repair via GRPO/RLVR.

author: "luciferai-devil"
license: MIT

# Hackathon domain tag
domain: software-engineering

tasks:
  - id: task_easy
    difficulty: easy
    description: "Fix a single off-by-one error in a Kadane's algorithm implementation."
  - id: task_medium
    difficulty: medium
    description: "Fix two independent bugs in a string parsing utility."
  - id: task_hard
    difficulty: hard
    description: "Fix 3+ subtle bugs in a recursive tree function with missing edge cases."

action:
  type: object
  properties:
    patch:
      type: string
      description: "Full replacement Python source for the function body."
    task_id:
      type: string
      description: "Which task this patch targets."
    think:
      type: string
      description: "Optional chain-of-thought reasoning (earns bonus reward)."
  required: [patch, task_id]

observation:
  type: object
  properties:
    task_id: { type: string }
    buggy_code: { type: string }
    task_description: { type: string }
    test_results: { type: array }
    passed: { type: integer }
    total: { type: integer }
    score: { type: number, minimum: 0.0, maximum: 1.0 }
    done: { type: boolean }
    error: { type: string, nullable: true }

reward:
  description: >
    Composite reward: 0.5×correctness + 0.2×valid_syntax + 0.2×chain_of_thought
    + 0.1×step_efficiency − 0.3×timeout_penalty. Range: [0.0, 1.0].
  type: number
  minimum: 0.0
  maximum: 1.0

episode:
  max_steps: 10
  termination: "All tests pass (score=1.0) OR max_steps reached."

server:
  port: 8000
  transport: websocket  # openenv uses WebSocket for persistent sessions

huggingface:
  space_id: "luciferai-devil/code-debug-env"
pyproject.toml
ADDED
@@ -0,0 +1,34 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "code-debug-env"
version = "1.0.0"
description = "OpenEnv environment for AI-powered code repair via GRPO training"
requires-python = ">=3.10"
dependencies = [
    "openenv-core>=0.1.0",
    "fastapi>=0.110.0",
    "uvicorn[standard]>=0.27.0",
    "pydantic>=2.0.0",
    "pytest>=8.0.0",
    "pytest-timeout>=2.3.0",
    "pytest-json-report>=1.5.0",
]

[project.optional-dependencies]
baseline = [
    "transformers>=4.40.0",
    "torch>=2.2.0",
    "trl>=0.8.6",
    "accelerate>=0.28.0",
    "openai>=1.0.0",
]

[project.scripts]
code-debug-env = "code_debug_env.server.app:main"
server = "code_debug_env.server.app:main"

[tool.hatch.build.targets.wheel]
packages = ["code_debug_env"]
server/__init__.py
ADDED
@@ -0,0 +1 @@
# server/__init__.py
server/app.py
ADDED
@@ -0,0 +1,72 @@
# server/app.py
from fastapi import FastAPI, HTTPException
from openenv.core.env_server import create_fastapi_app
from ..models import Action, Observation, TaskInfo
from .environment import CodeDebugEnvironment
from .tasks import TASK_REGISTRY
from .grader import grade

# Core OpenEnv app (provides /reset, /step, /state, /ws, /health)
app = create_fastapi_app(CodeDebugEnvironment, Action, Observation)


# ── Additional required hackathon endpoints ────────────────────────────

@app.get("/tasks")
def list_tasks() -> list[TaskInfo]:
    """Return all tasks with their action schema."""
    return [
        TaskInfo(
            task_id=tid,
            difficulty=task["difficulty"],
            description=task["description"],
            action_schema=Action.model_json_schema(),
        )
        for tid, task in TASK_REGISTRY.items()
    ]


@app.get("/grader")
def get_grader_score(task_id: str, submitted_code: str) -> dict:
    """
    Grade a submission directly (for testing / evaluation).
    Returns: { score: float, passed: int, total: int, test_results: list }
    """
    if task_id not in TASK_REGISTRY:
        raise HTTPException(status_code=404, detail=f"Unknown task_id: {task_id}")
    task = TASK_REGISTRY[task_id]
    result = grade(submitted_code, task_id, task["test_suite"])
    return {
        "task_id": task_id,
        "score": result["score"],
        "passed": result["passed"],
        "total": result["total"],
        "test_results": [r.model_dump() for r in result["test_results"]],
    }


@app.get("/baseline")
def run_baseline() -> dict:
    """
    Run the baseline agent on all tasks and return scores.
    This endpoint triggers the baseline inference script.
    """
    import subprocess, sys, json
    try:
        result = subprocess.run(
            [sys.executable, "baseline/run_baseline.py", "--output", "json"],
            capture_output=True, text=True, timeout=120,
        )
        return json.loads(result.stdout)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


def main():
    """Entry point for the server."""
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)


if __name__ == "__main__":
    main()
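
Because `/grader` takes the submission as a query parameter, it can be exercised with any HTTP client; a minimal sketch using `requests` (an assumed dependency), which URL-encodes the code string automatically:

```python
import requests  # assumed dependency; handles URL-encoding of the code string

resp = requests.get(
    "http://localhost:8000/grader",
    params={
        "task_id": "task_easy",
        # Hypothetical (wrong) submission, just to see the grader respond
        "submitted_code": "def find_max_subarray_sum(nums):\n    return 0",
    },
)
print(resp.json())  # {'task_id': ..., 'score': ..., 'passed': ..., 'total': ..., 'test_results': [...]}
```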
server/environment.py
ADDED
@@ -0,0 +1,114 @@
# server/environment.py
from __future__ import annotations
import uuid
from openenv.core.env_server import Environment
from ..models import Action, Observation, State
from .grader import grade
from .tasks import TASK_REGISTRY


class CodeDebugEnvironment(Environment):
    """
    Real-world environment: an AI agent must fix buggy Python functions.
    Episodes are multi-turn: the agent iterates until all tests pass or max_steps is reached.
    """

    def __init__(self):
        super().__init__()
        self._state = State()
        self._current_task = None

    def reset(
        self,
        seed: int | None = None,
        episode_id: str | None = None,
        task_id: str | None = None,
        **kwargs,
    ) -> Observation:
        """
        Start a new episode.
        - If task_id is None, sample a random task from the registry.
        - Always returns a clean Observation with the buggy code.
        """
        if task_id is None:
            import random
            task_id = random.choice(list(TASK_REGISTRY.keys()))

        task = TASK_REGISTRY[task_id]
        self._current_task = task
        self._state = State(
            episode_id=str(uuid.uuid4()),
            task_id=task_id,
            step_count=0,
            max_steps=10,
            current_score=0.0,
            best_score=0.0,
        )

        return Observation(
            task_id=task_id,
            buggy_code=task["buggy_code"],
            task_description=task["description"],
            passed=0,
            total=task["num_tests"],
            score=0.0,
            done=False,
        )

    def step(
        self,
        action: Action,
        timeout_s: float | None = None,
        **kwargs,
    ) -> Observation:
        """
        Execute the agent's patch.
        Returns an observation with test results and the composite reward.
        """
        if self._current_task is None:
            raise RuntimeError("Call reset() before step()")

        self._state.step_count += 1
        task = self._current_task

        # Grade the submission
        grade_result = grade(
            submitted_code=action.patch,
            task_id=action.task_id,
            test_suite=task["test_suite"],
        )

        # Composite reward:
        # 0.5 * correctness + 0.2 * format + 0.2 * cot_bonus + 0.1 * efficiency
        r_correct = grade_result["score"]  # 0.0–1.0
        r_format = 1.0 if grade_result["valid_syntax"] else 0.0
        r_cot = 0.2 if (action.think and len(action.think) > 20) else 0.0
        r_eff = max(0.0, (10 - self._state.step_count) / 10) * 0.1

        reward = 0.5 * r_correct + 0.2 * r_format + r_cot + r_eff
        reward = max(0.0, min(1.0, reward))

        # Penalty for timeout/crash
        if grade_result.get("timed_out"):
            reward = max(0.0, reward - 0.3)

        done = (r_correct == 1.0) or (self._state.step_count >= self._state.max_steps)

        self._state.current_score = reward
        self._state.best_score = max(self._state.best_score, reward)

        return Observation(
            task_id=action.task_id,
            buggy_code=action.patch,
            task_description=task["description"],
            test_results=grade_result["test_results"],
            passed=grade_result["passed"],
            total=grade_result["total"],
            score=reward,
            done=done,
            error=grade_result.get("error"),
        )

    @property
    def state(self) -> State:
        return self._state
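
The environment can also be driven in-process, without the HTTP layer, which is handy for debugging the reward shaping. A minimal sketch, assuming the package and its test dependencies (pytest, pytest-timeout, pytest-json-report) are installed:

```python
from code_debug_env.server.environment import CodeDebugEnvironment
from code_debug_env.server.tasks import TASK_REGISTRY
from code_debug_env.models import Action

env = CodeDebugEnvironment()
obs = env.reset(task_id="task_easy")
print(obs.buggy_code)

# Submit the known-good solution with a reasoning string; on the first step
# the efficiency term contributes 0.1 * 9/10 = 0.09.
obs = env.step(Action(
    patch=TASK_REGISTRY["task_easy"]["clean_code"],
    task_id="task_easy",
    think="range(1, len(nums) - 1) skips the last element; use range(1, len(nums)).",
))
print(obs.passed, obs.total, obs.score, obs.done)  # 4 4 0.99 True
```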
server/grader.py
ADDED
@@ -0,0 +1,118 @@
# server/grader.py
"""
Deterministic, sandboxed grader. Runs submitted code against a hidden pytest suite.
Returns score = passed / total (float 0.0–1.0).

SECURITY: runs in a subprocess with:
- a 10-second wall-clock timeout
- no network access (subprocess inherits a restricted env)
- restricted builtins (no open, no os, no sys import)
"""
from __future__ import annotations
import subprocess
import sys
import textwrap
import tempfile
import os
import json
from pathlib import Path
from ..models import TestResult


def grade(
    submitted_code: str,
    task_id: str,
    test_suite: str,
    timeout: int = 10,
) -> dict:
    """
    Grade submitted_code against test_suite.
    Returns a dict with: score, passed, total, valid_syntax, timed_out, test_results, error.
    """
    # Step 1: syntax check (fast, no subprocess needed)
    try:
        compile(submitted_code, "<submission>", "exec")
        valid_syntax = True
    except SyntaxError as e:
        return {
            "score": 0.0, "passed": 0, "total": 1,
            "valid_syntax": False, "timed_out": False,
            "test_results": [TestResult(name="syntax", passed=False, error=str(e))],
            "error": f"SyntaxError: {e}",
        }

    # Step 2: build the test module in a temp dir
    with tempfile.TemporaryDirectory() as tmpdir:
        # Write the submission
        sub_path = Path(tmpdir) / "submission.py"
        sub_path.write_text(submitted_code)

        # Write the test file (imports the submission)
        test_content = f"""
import sys
sys.path.insert(0, "{tmpdir}")
from submission import *
{test_suite}
"""
        test_path = Path(tmpdir) / "test_submission.py"
        test_path.write_text(textwrap.dedent(test_content))

        # Step 3: run pytest with JSON output
        result_path = Path(tmpdir) / "results.json"
        cmd = [
            sys.executable, "-m", "pytest",
            str(test_path),
            "--tb=short",
            "-q",
            "--json-report",
            f"--json-report-file={result_path}",
            "--timeout=8",
        ]

        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
                cwd=tmpdir,
                env={**os.environ, "PYTHONDONTWRITEBYTECODE": "1"},
            )
            timed_out = False
        except subprocess.TimeoutExpired:
            return {
                "score": 0.0, "passed": 0, "total": 1,
                "valid_syntax": True, "timed_out": True,
                "test_results": [TestResult(name="timeout", passed=False, error="Timed out")],
                "error": "TimeoutExpired",
            }

        # Step 4: parse results
        if result_path.exists():
            data = json.loads(result_path.read_text())
            passed = data["summary"].get("passed", 0)
            total = data["summary"].get("total", 1)
            test_results = [
                TestResult(
                    name=t["nodeid"],
                    passed=(t["outcome"] == "passed"),
                    error=t.get("call", {}).get("longrepr"),
                )
                for t in data.get("tests", [])
            ]
        else:
            # Fallback: parse stdout
            passed = proc.stdout.count(" passed")
            total = max(1, passed + proc.stdout.count(" failed") + proc.stdout.count(" error"))
            test_results = []

        score = passed / total if total > 0 else 0.0
        return {
            "score": round(score, 4),
            "passed": passed,
            "total": total,
            "valid_syntax": True,
            "timed_out": False,
            "test_results": test_results,
            "error": None,
        }
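
The grader can be smoke-tested directly against a task's hidden suite, e.g. with the known-good solution (a sketch, assuming the package and pytest plugins are installed):

```python
from code_debug_env.server.grader import grade
from code_debug_env.server.tasks import TASK_REGISTRY

task = TASK_REGISTRY["task_easy"]

# The known-good solution should pass every hidden test
result = grade(task["clean_code"], "task_easy", task["test_suite"])
print(result["score"], result["passed"], result["total"])  # 1.0 4 4

# A syntactically broken submission short-circuits before pytest runs
result = grade("def oops(:", "task_easy", task["test_suite"])
print(result["valid_syntax"], result["error"])  # False SyntaxError: ...
```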
server/requirements.txt
ADDED
@@ -0,0 +1,8 @@
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
pydantic>=2.0.0
pytest>=8.0.0
pytest-timeout>=2.3.0
pytest-json-report>=1.5.0
accelerate>=0.28.0
bitsandbytes>=0.43.0
server/task_generator.py
ADDED
@@ -0,0 +1,52 @@
# task_generator.py — generate task variants programmatically
"""
Generates variants of each task by injecting different bug patterns.
Used to build a larger task pool for robust RL training.
"""
import random
from typing import Iterator

BUG_PATTERNS = [
    # General logic bugs
    ("off_by_one_minus", "len(arr)", "len(arr) - 1"),
    ("off_by_one_plus", "range(n)", "range(n + 1)"),
    ("wrong_operator", "current + nums[i]", "current - nums[i]"),
    ("wrong_init", "max_sum = arr[0]", "max_sum = 0"),
    ("wrong_comparison", "if a > b", "if a >= b"),
    ("wrong_return", "return result", "return result - 1"),
    ("wrong_boolean", "if not ", "if "),

    # String parsing bugs (targets task_medium)
    ("wrong_split", "split(';')", "split(',')"),
    ("missing_strip_1", "key.strip()", "key"),
    ("missing_strip_2", "value.strip()", "value"),

    # Dictionary/list bugs (targets task_hard)
    ("wrong_enumerate", "enumerate(v)", "enumerate(v, start=1)"),
    ("wrong_recursion", "new_key, sep)", "new_key, '/')"),
    ("missing_str_cast", "str_k = str(k)", "str_k = k"),
    ("wrong_list_index", "str(i)", "str(i+1)"),
    ("wrong_dict_check", "if not v:", "if v:"),
]


def inject_bug(code: str, pattern: tuple) -> str:
    _, find, replace = pattern
    if find in code:
        return code.replace(find, replace, 1)
    return code  # pattern not applicable to this code


def generate_task_variants(base_task: dict, n: int = 20) -> Iterator[dict]:
    """Yield up to n variants of base_task with randomly injected bugs."""
    for i in range(n):
        pattern = random.choice(BUG_PATTERNS)
        buggy = inject_bug(base_task["clean_code"], pattern)
        if buggy == base_task["clean_code"]:
            continue  # pattern not applicable
        yield {
            **base_task,
            "task_id": f"{base_task['task_id']}_v{i:03d}",
            "buggy_code": buggy,
            "bug_pattern": pattern[0],
        }
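
A quick sketch of how variants come out of the generator (the sampling is random, so the printed patterns below are illustrative, not fixed outputs):

```python
import random
from code_debug_env.server.task_generator import generate_task_variants
from code_debug_env.server.tasks.task_easy import TASK_EASY

random.seed(0)  # make the sampling repeatable for a demo run
for variant in generate_task_variants(TASK_EASY, n=5):
    print(variant["task_id"], variant["bug_pattern"])
# (illustrative) task_easy_v002 wrong_operator
# (illustrative) task_easy_v004 wrong_boolean
```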
server/tasks/__init__.py
ADDED
@@ -0,0 +1,17 @@
# server/tasks/__init__.py
from .task_easy import TASK_EASY
from .task_medium import TASK_MEDIUM
from .task_hard import TASK_HARD
from ..task_generator import generate_task_variants

TASK_REGISTRY: dict[str, dict] = {
    "task_easy": TASK_EASY,
    "task_medium": TASK_MEDIUM,
    "task_hard": TASK_HARD,
}

# Generate up to 100 variants per task for the hackathon differentiator
for base_task in [TASK_EASY, TASK_MEDIUM, TASK_HARD]:
    # ensure base_task has a clean_code field if task_generator requires it, or just use buggy_code as base
    for variant in generate_task_variants(base_task, n=100):
        TASK_REGISTRY[variant["task_id"]] = variant
server/tasks/task_easy.py
ADDED
@@ -0,0 +1,48 @@
# server/tasks/task_easy.py
TASK_EASY = {
    "task_id": "task_easy",
    "difficulty": "easy",
    "num_tests": 4,
    "description": (
        "Fix the off-by-one error in the `find_max_subarray_sum` function. "
        "It should return the maximum contiguous subarray sum (Kadane's algorithm). "
        "Currently it misses the last element."
    ),
    # The broken version the agent sees
    "buggy_code": """\
def find_max_subarray_sum(nums: list[int]) -> int:
    if not nums:
        return 0
    max_sum = current_sum = nums[0]
    # BUG: range stops one short
    for i in range(1, len(nums) - 1):
        current_sum = max(nums[i], current_sum + nums[i])
        max_sum = max(max_sum, current_sum)
    return max_sum
""",
    # The correct solution (used by task_generator)
    "clean_code": """\
def find_max_subarray_sum(nums: list[int]) -> int:
    if not nums:
        return 0
    max_sum = current_sum = nums[0]
    for i in range(1, len(nums)):
        current_sum = max(nums[i], current_sum + nums[i])
        max_sum = max(max_sum, current_sum)
    return max_sum
""",
    # Hidden test suite — the agent never sees this directly
    "test_suite": """\
def test_basic():
    assert find_max_subarray_sum([-2, 1, -3, 4, -1, 2, 1, -5, 4]) == 6

def test_all_negative():
    assert find_max_subarray_sum([-3, -1, -2]) == -1

def test_single():
    assert find_max_subarray_sum([42]) == 42

def test_empty():
    assert find_max_subarray_sum([]) == 0
""",
}
server/tasks/task_hard.py
ADDED
@@ -0,0 +1,104 @@
# server/tasks/task_hard.py
TASK_HARD = {
    "task_id": "task_hard",
    "difficulty": "hard",
    "num_tests": 10,
    "description": (
        "Fix 3+ subtle bugs in the `flatten_nested_dict` function. "
        "It should recursively flatten a nested dictionary using dot-notation keys. "
        "Example: {'a': {'b': 1}} → {'a.b': 1}. "
        "Bug 1: wrong separator used in recursive calls. "
        "Bug 2: lists are not handled correctly (should be indexed like 'a.0', 'a.1'). "
        "Bug 3: empty dict values should produce the parent key with an empty dict, not be skipped. "
        "Bug 4: non-string keys are not converted to strings."
    ),
    # The broken version the agent sees
    "buggy_code": """\
def flatten_nested_dict(d: dict, parent_key: str = '', sep: str = '.') -> dict:
    \"\"\"Flatten a nested dict into a single-level dict with dot-notation keys.
    Lists should be expanded with numeric indices: {'a': [1,2]} → {'a.0': 1, 'a.1': 2}
    Empty dicts should map to {}: {'a': {}} → {'a': {}}
    Non-string keys should be converted to strings.\"\"\"
    items = []
    for k, v in d.items():
        # BUG 4: k not converted to str
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, dict):
            # BUG 3: empty dict case not handled — this skips it entirely
            # BUG 1: passes '/' instead of sep in recursive call
            items.extend(flatten_nested_dict(v, new_key, '/').items())
        elif isinstance(v, list):
            # BUG 2: uses 1-based indexing instead of 0-based
            for i, item in enumerate(v, start=1):
                list_key = new_key + sep + str(i)
                if isinstance(item, dict):
                    items.extend(flatten_nested_dict(item, list_key, sep).items())
                else:
                    items.append((list_key, item))
        else:
            items.append((new_key, v))
    return dict(items)
""",
    "clean_code": """\
def flatten_nested_dict(d: dict, parent_key: str = '', sep: str = '.') -> dict:
    \"\"\"Flatten a nested dict into a single-level dict with dot-notation keys.
    Lists should be expanded with numeric indices: {'a': [1,2]} → {'a.0': 1, 'a.1': 2}
    Empty dicts should map to {}: {'a': {}} → {'a': {}}
    Non-string keys should be converted to strings.\"\"\"
    items = []
    for k, v in d.items():
        str_k = str(k)
        new_key = parent_key + sep + str_k if parent_key else str_k
        if isinstance(v, dict):
            if not v:
                items.append((new_key, {}))
            else:
                items.extend(flatten_nested_dict(v, new_key, sep).items())
        elif isinstance(v, list):
            for i, item in enumerate(v):
                list_key = new_key + sep + str(i)
                if isinstance(item, dict):
                    items.extend(flatten_nested_dict(item, list_key, sep).items())
                else:
                    items.append((list_key, item))
        else:
            items.append((new_key, v))
    return dict(items)
""",
    "test_suite": """\
def test_simple_flat():
    assert flatten_nested_dict({"a": 1, "b": 2}) == {"a": 1, "b": 2}

def test_one_level_nesting():
    assert flatten_nested_dict({"a": {"b": 1}}) == {"a.b": 1}

def test_deep_nesting():
    assert flatten_nested_dict({"a": {"b": {"c": 3}}}) == {"a.b.c": 3}

def test_mixed_nesting():
    result = flatten_nested_dict({"a": 1, "b": {"c": 2, "d": {"e": 3}}})
    assert result == {"a": 1, "b.c": 2, "b.d.e": 3}

def test_list_values():
    assert flatten_nested_dict({"a": [10, 20, 30]}) == {"a.0": 10, "a.1": 20, "a.2": 30}

def test_nested_list_of_dicts():
    inp = {"users": [{"name": "Alice"}, {"name": "Bob"}]}
    expected = {"users.0.name": "Alice", "users.1.name": "Bob"}
    assert flatten_nested_dict(inp) == expected

def test_empty_dict_value():
    assert flatten_nested_dict({"a": {}, "b": 1}) == {"a": {}, "b": 1}

def test_empty_input():
    assert flatten_nested_dict({}) == {}

def test_numeric_keys():
    assert flatten_nested_dict({1: "a", 2: {"3": "b"}}) == {"1": "a", "2.3": "b"}

def test_complex_mixed():
    inp = {"x": [{"y": [1, 2]}, {"z": 3}], "w": 4}
    expected = {"x.0.y.0": 1, "x.0.y.1": 2, "x.1.z": 3, "w": 4}
    assert flatten_nested_dict(inp) == expected
""",
}
server/tasks/task_medium.py
ADDED
@@ -0,0 +1,64 @@
# server/tasks/task_medium.py
TASK_MEDIUM = {
    "task_id": "task_medium",
    "difficulty": "medium",
    "num_tests": 6,
    "description": (
        "Fix two independent bugs in the `parse_key_value` function. "
        "It should parse a string of 'key=value' pairs separated by semicolons "
        "into a dictionary. Bug 1: it splits on the wrong delimiter for pairs. "
        "Bug 2: it doesn't strip whitespace from keys and values."
    ),
    # The broken version the agent sees
    "buggy_code": """\
def parse_key_value(s: str) -> dict[str, str]:
    \"\"\"Parse 'key1=val1;key2=val2' into {'key1': 'val1', 'key2': 'val2'}.
    Handles whitespace around keys/values. Returns empty dict for empty string.\"\"\"
    if not s or not s.strip():
        return {}
    result = {}
    # BUG 1: splits on ',' instead of ';'
    pairs = s.split(',')
    for pair in pairs:
        if '=' not in pair:
            continue
        key, value = pair.split('=', 1)
        # BUG 2: missing .strip() on key and value
        result[key] = value
    return result
""",
    "clean_code": """\
def parse_key_value(s: str) -> dict[str, str]:
    \"\"\"Parse 'key1=val1;key2=val2' into {'key1': 'val1', 'key2': 'val2'}.
    Handles whitespace around keys/values. Returns empty dict for empty string.\"\"\"
    if not s or not s.strip():
        return {}
    result = {}
    pairs = s.split(';')
    for pair in pairs:
        if '=' not in pair:
            continue
        key, value = pair.split('=', 1)
        result[key.strip()] = value.strip()
    return result
""",
    "test_suite": """\
def test_basic():
    assert parse_key_value("name=Alice;age=30") == {"name": "Alice", "age": "30"}

def test_whitespace():
    assert parse_key_value("  name = Alice ;  age = 30 ") == {"name": "Alice", "age": "30"}

def test_empty():
    assert parse_key_value("") == {}

def test_single_pair():
    assert parse_key_value("key=value") == {"key": "value"}

def test_value_with_equals():
    assert parse_key_value("expr=a=b;other=c") == {"expr": "a=b", "other": "c"}

def test_whitespace_only():
    assert parse_key_value("   ") == {}
""",
}
test_client.py
ADDED
@@ -0,0 +1,17 @@
import asyncio
from code_debug_env.client import CodeDebugEnv
from code_debug_env.models import Action

async def test():
    async with CodeDebugEnv(base_url="http://127.0.0.1:8000") as env:
        obs = await env.reset(task_id="task_easy")
        print("Reset OK:", obs.observation.buggy_code[:20])
        action = Action(patch="def foo(): pass", task_id="task_easy")
        print("Sending step...")
        try:
            res = await env.step(action)
            print("Step OK", res)
        except Exception as e:
            print("Exception during step:", repr(e))

asyncio.run(test())
uv.lock
ADDED
The diff for this file is too large to render.