Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +69 -0
- README.md +118 -5
- __init__.py +21 -0
- client.py +140 -0
- models.py +69 -0
- opencode_openenv.egg-info/PKG-INFO +19 -0
- opencode_openenv.egg-info/SOURCES.txt +21 -0
- opencode_openenv.egg-info/dependency_links.txt +1 -0
- opencode_openenv.egg-info/entry_points.txt +2 -0
- opencode_openenv.egg-info/requires.txt +14 -0
- opencode_openenv.egg-info/top_level.txt +1 -0
- openenv.yaml +6 -0
- pyproject.toml +51 -0
- server/__init__.py +0 -0
- server/app.py +92 -0
- server/gradio_ui.py +295 -0
- server/opencode_environment.py +352 -0
- server/requirements.txt +9 -0
- tests/__init__.py +0 -0
- tests/test_client.py +253 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# syntax=docker/dockerfile:1
|
| 2 |
+
# Multi-stage build for opencode-openenv
|
| 3 |
+
# Mirrors the pattern used by jupyter-agent-openenv.
|
| 4 |
+
#
|
| 5 |
+
# Build:
|
| 6 |
+
# docker build -t opencode-openenv .
|
| 7 |
+
#
|
| 8 |
+
# Run:
|
| 9 |
+
# docker run -p 8000:8000 \
|
| 10 |
+
# -e E2B_API_KEY=e2b_... \
|
| 11 |
+
# -e ENABLE_WEB_INTERFACE=true \
|
| 12 |
+
# opencode-openenv
|
| 13 |
+
|
| 14 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 15 |
+
|
| 16 |
+
# ββ Stage 1: builder ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 17 |
+
FROM ${BASE_IMAGE} AS builder
|
| 18 |
+
|
| 19 |
+
WORKDIR /app
|
| 20 |
+
|
| 21 |
+
ARG BUILD_MODE=standalone
|
| 22 |
+
|
| 23 |
+
COPY . /app/env
|
| 24 |
+
WORKDIR /app/env
|
| 25 |
+
|
| 26 |
+
# Ensure uv is available
|
| 27 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 28 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 29 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 30 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 31 |
+
fi
|
| 32 |
+
|
| 33 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 34 |
+
git \
|
| 35 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 36 |
+
|
| 37 |
+
# Install dependencies (cache-friendly two-pass)
|
| 38 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 39 |
+
if [ -f uv.lock ]; then \
|
| 40 |
+
uv sync --frozen --no-install-project --no-editable; \
|
| 41 |
+
else \
|
| 42 |
+
uv sync --no-install-project --no-editable; \
|
| 43 |
+
fi
|
| 44 |
+
|
| 45 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 46 |
+
if [ -f uv.lock ]; then \
|
| 47 |
+
uv sync --frozen --no-editable; \
|
| 48 |
+
else \
|
| 49 |
+
uv sync --no-editable; \
|
| 50 |
+
fi
|
| 51 |
+
|
| 52 |
+
# ββ Stage 2: runtime ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 53 |
+
FROM ${BASE_IMAGE}
|
| 54 |
+
|
| 55 |
+
WORKDIR /app
|
| 56 |
+
|
| 57 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 58 |
+
COPY --from=builder /app/env /app/env
|
| 59 |
+
|
| 60 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 61 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 62 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 63 |
+
ENV PYTHONUNBUFFERED=1
|
| 64 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
| 65 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 66 |
+
|
| 67 |
+
EXPOSE 8000
|
| 68 |
+
|
| 69 |
+
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,123 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: OpenCode Env
|
| 3 |
+
emoji: π₯οΈ
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: pink
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
pinned: false
|
| 9 |
+
base_path: /web
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# opencode-openenv
|
| 13 |
+
|
| 14 |
+
An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible environment
|
| 15 |
+
for the [OpenCode](https://opencode.ai/) CLI coding agent. Each call runs
|
| 16 |
+
the agent inside an isolated E2B sandbox against an OpenAI-compatible LLM
|
| 17 |
+
endpoint of your choice, executes a user-supplied bash verifier, and
|
| 18 |
+
returns the scalar reward plus artifacts.
|
| 19 |
+
|
| 20 |
+
Layout mirrors [`jupyter-agent-openenv`](https://huggingface.co/spaces/AdithyaSK/jupyter-agent-openenv).
|
| 21 |
+
|
| 22 |
+
## The one tool
|
| 23 |
+
|
| 24 |
+
| Property | Value |
|
| 25 |
+
|---|---|
|
| 26 |
+
| Framework | OpenEnv `MCPEnvironment` |
|
| 27 |
+
| Execution backend | E2B sandbox |
|
| 28 |
+
| Server | FastAPI + Gradio UI at `/` |
|
| 29 |
+
| Client | `OpenCodeEnv(MCPToolClient)` |
|
| 30 |
+
|
| 31 |
+
| Tool | Description |
|
| 32 |
+
|---|---|
|
| 33 |
+
| `run_rollout` | Spawn an E2B sandbox, run `opencode run` against your LLM endpoint, run your verifier, return reward + trace + workdir files |
|
| 34 |
+
|
| 35 |
+
## Environment variables
|
| 36 |
+
|
| 37 |
+
| Variable | Required | Default | Description |
|
| 38 |
+
|---|---|---|---|
|
| 39 |
+
| `E2B_API_KEY` | Yes | - | API key from [e2b.dev](https://e2b.dev/) |
|
| 40 |
+
| `ENABLE_WEB_INTERFACE` | No | `true` | Enable Gradio UI at `/` |
|
| 41 |
+
| `MAX_CONCURRENT_ENVS` | No | `4` | Max concurrent sandbox sessions |
|
| 42 |
+
|
| 43 |
+
## Run locally
|
| 44 |
+
|
| 45 |
+
**Prerequisites:** Python 3.10+, [uv](https://docs.astral.sh/uv/)
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
cd trl-internal/environments/opencode/openenv
|
| 49 |
+
uv sync
|
| 50 |
+
E2B_API_KEY=e2b_... uv run uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
The server starts at `http://localhost:8000`; the Gradio UI is mounted at
|
| 54 |
+
the root path.
|
| 55 |
+
|
| 56 |
+
**Verify it works:**
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
curl http://localhost:8000/health
|
| 60 |
+
# {"status":"healthy"}
|
| 61 |
+
|
| 62 |
+
curl -X POST http://localhost:8000/mcp \
|
| 63 |
+
-H "Content-Type: application/json" \
|
| 64 |
+
-d '{"jsonrpc":"2.0","method":"tools/list","id":1,"params":{}}'
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
## Run with Docker
|
| 68 |
+
|
| 69 |
+
```bash
|
| 70 |
+
docker build -t opencode-openenv .
|
| 71 |
+
docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-openenv
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
## Python client usage
|
| 75 |
+
|
| 76 |
+
```python
|
| 77 |
+
from opencode_env_server import OpenCodeEnv
|
| 78 |
+
|
| 79 |
+
with OpenCodeEnv(base_url="http://localhost:8000") as env:
|
| 80 |
+
env.reset()
|
| 81 |
+
result = env.run_rollout(
|
| 82 |
+
vllm_url="https://your-llm-host/v1",
|
| 83 |
+
model="Qwen/Qwen3.5-4B",
|
| 84 |
+
instruction="Write fizzbuzz.py in the current directory.",
|
| 85 |
+
test_script=open("my_tests/fizzbuzz.sh").read(),
|
| 86 |
+
task_id="fizzbuzz_001",
|
| 87 |
+
mode="transparent_proxy",
|
| 88 |
+
disable_thinking=True,
|
| 89 |
+
)
|
| 90 |
+
print(result.reward, len(result.proxy_turns))
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
## REST API
|
| 94 |
+
|
| 95 |
+
Standard OpenEnv endpoints are available:
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
GET /health # {"status": "healthy"}
|
| 99 |
+
GET /metadata # env name, version, description
|
| 100 |
+
GET /schema # action + observation JSON schemas
|
| 101 |
+
POST /reset # start new episode
|
| 102 |
+
POST /step # execute an action
|
| 103 |
+
POST /mcp # JSON-RPC 2.0 for MCP tool calls
|
| 104 |
+
GET / # Gradio UI
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
## Project structure
|
| 108 |
+
|
| 109 |
+
```
|
| 110 |
+
opencode-openenv/
|
| 111 |
+
βββ __init__.py # Package exports
|
| 112 |
+
βββ client.py # OpenCodeEnv(MCPToolClient)
|
| 113 |
+
βββ models.py # OpenCodeState, RolloutTurn, RolloutResult
|
| 114 |
+
βββ openenv.yaml # OpenEnv manifest
|
| 115 |
+
βββ pyproject.toml # Dependencies
|
| 116 |
+
βββ .env.example # Environment variable template
|
| 117 |
+
βββ Dockerfile # Multi-stage uv build on openenv-base
|
| 118 |
+
βββ server/
|
| 119 |
+
βββ app.py # FastAPI + Gradio mount
|
| 120 |
+
βββ opencode_environment.py # MCPEnvironment implementation
|
| 121 |
+
βββ gradio_ui.py # Interactive UI
|
| 122 |
+
βββ requirements.txt # Pip fallback deps
|
| 123 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv OpenCode environment.
|
| 2 |
+
|
| 3 |
+
Exposes a single MCP tool ``run_rollout`` that spawns an E2B sandbox, runs
|
| 4 |
+
the OpenCode CLI agent against a caller-supplied LLM endpoint, runs the
|
| 5 |
+
caller-supplied verifier script, and returns reward + proxy trace +
|
| 6 |
+
workdir contents as a JSON-serialized :class:`RolloutResult`.
|
| 7 |
+
|
| 8 |
+
Import either the :class:`OpenCodeEnv` HTTP client (for training scripts
|
| 9 |
+
talking to a deployed server) or the models (for type-safe parsing of
|
| 10 |
+
rollout results).
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from .client import OpenCodeEnv
|
| 14 |
+
from .models import OpenCodeState, RolloutResult, RolloutTurn
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
"OpenCodeEnv",
|
| 18 |
+
"OpenCodeState",
|
| 19 |
+
"RolloutResult",
|
| 20 |
+
"RolloutTurn",
|
| 21 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenCode Environment Client.
|
| 2 |
+
|
| 3 |
+
Thin MCP client over the deployed ``opencode-openenv`` server. The server
|
| 4 |
+
exposes a single tool ``run_rollout`` that takes a task + LLM endpoint,
|
| 5 |
+
runs one OpenCode agent rollout in a fresh E2B sandbox, and returns a
|
| 6 |
+
JSON-serialized :class:`RolloutResult`.
|
| 7 |
+
|
| 8 |
+
Example::
|
| 9 |
+
|
| 10 |
+
from opencode_env_server import OpenCodeEnv
|
| 11 |
+
|
| 12 |
+
with OpenCodeEnv(base_url="https://adithya-s-k-opencode-openenv.hf.space") as env:
|
| 13 |
+
env.reset()
|
| 14 |
+
result = env.run_rollout(
|
| 15 |
+
vllm_url="https://your-llm-host/v1",
|
| 16 |
+
model="Qwen/Qwen3.5-4B",
|
| 17 |
+
instruction="Write fizzbuzz.py in the current directory.",
|
| 18 |
+
test_script="#!/bin/bash\\n...",
|
| 19 |
+
task_id="fizzbuzz_001",
|
| 20 |
+
mode="transparent_proxy",
|
| 21 |
+
disable_thinking=True,
|
| 22 |
+
)
|
| 23 |
+
print(result.reward, len(result.proxy_turns))
|
| 24 |
+
|
| 25 |
+
Docker convenience::
|
| 26 |
+
|
| 27 |
+
env = OpenCodeEnv.from_docker_image("opencode-openenv:latest")
|
| 28 |
+
env.reset()
|
| 29 |
+
...
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import json
|
| 35 |
+
from typing import Any
|
| 36 |
+
|
| 37 |
+
from openenv.core.mcp_client import MCPToolClient
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
from .models import RolloutResult
|
| 41 |
+
except ImportError:
|
| 42 |
+
from models import RolloutResult # type: ignore
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class OpenCodeEnv(MCPToolClient):
|
| 46 |
+
"""Client for the OpenCode OpenEnv server.
|
| 47 |
+
|
| 48 |
+
Inherits MCP plumbing (``reset``, ``call_tool``, ``list_tools``,
|
| 49 |
+
``from_docker_image``, context-manager support) from
|
| 50 |
+
:class:`MCPToolClient`. Adds :meth:`run_rollout` as a typed helper that
|
| 51 |
+
deserializes the tool result into a :class:`RolloutResult`.
|
| 52 |
+
"""
|
| 53 |
+
|
| 54 |
+
def run_rollout(
|
| 55 |
+
self,
|
| 56 |
+
*,
|
| 57 |
+
vllm_url: str,
|
| 58 |
+
model: str,
|
| 59 |
+
instruction: str,
|
| 60 |
+
test_script: str,
|
| 61 |
+
task_id: str = "",
|
| 62 |
+
setup_shell: str = "",
|
| 63 |
+
upload_files: dict[str, str] | None = None,
|
| 64 |
+
provider: str = "openai_compatible",
|
| 65 |
+
api_key: str = "intercepted",
|
| 66 |
+
mode: str = "transparent_proxy",
|
| 67 |
+
disable_thinking: bool = False,
|
| 68 |
+
max_tokens_cap: int = 4096,
|
| 69 |
+
agent_timeout_s: float = 600.0,
|
| 70 |
+
) -> RolloutResult:
|
| 71 |
+
"""Typed helper around ``call_tool("run_rollout", ...)``."""
|
| 72 |
+
|
| 73 |
+
raw = self.call_tool(
|
| 74 |
+
"run_rollout",
|
| 75 |
+
vllm_url=vllm_url,
|
| 76 |
+
model=model,
|
| 77 |
+
instruction=instruction,
|
| 78 |
+
test_script=test_script,
|
| 79 |
+
task_id=task_id,
|
| 80 |
+
setup_shell=setup_shell,
|
| 81 |
+
upload_files=upload_files or {},
|
| 82 |
+
provider=provider,
|
| 83 |
+
api_key=api_key,
|
| 84 |
+
mode=mode,
|
| 85 |
+
disable_thinking=disable_thinking,
|
| 86 |
+
max_tokens_cap=max_tokens_cap,
|
| 87 |
+
agent_timeout_s=agent_timeout_s,
|
| 88 |
+
)
|
| 89 |
+
payload = _extract_text(raw)
|
| 90 |
+
return RolloutResult.model_validate_json(payload)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _extract_text(result: Any) -> str:
|
| 94 |
+
"""Pull the text payload out of an MCP tool result shape.
|
| 95 |
+
|
| 96 |
+
Handles three shapes MCPToolClient / call_tool may return:
|
| 97 |
+
- the raw tool text (str)
|
| 98 |
+
- a CallToolObservation-like object with ``.result.content[0].text``
|
| 99 |
+
- a dict with ``content`` list containing ``{"text": ...}`` entries
|
| 100 |
+
"""
|
| 101 |
+
|
| 102 |
+
if isinstance(result, str):
|
| 103 |
+
return result
|
| 104 |
+
|
| 105 |
+
# Object with attribute chain: obs.result.content[0].text
|
| 106 |
+
inner = getattr(result, "result", None)
|
| 107 |
+
if inner is not None:
|
| 108 |
+
content = getattr(inner, "content", None)
|
| 109 |
+
if content:
|
| 110 |
+
first = content[0]
|
| 111 |
+
text = getattr(first, "text", None)
|
| 112 |
+
if isinstance(text, str):
|
| 113 |
+
return text
|
| 114 |
+
if isinstance(first, dict) and "text" in first:
|
| 115 |
+
return first["text"]
|
| 116 |
+
|
| 117 |
+
if isinstance(result, dict):
|
| 118 |
+
content = result.get("content")
|
| 119 |
+
if isinstance(content, list) and content:
|
| 120 |
+
first = content[0]
|
| 121 |
+
if isinstance(first, dict) and "text" in first:
|
| 122 |
+
return first["text"]
|
| 123 |
+
nested = result.get("result")
|
| 124 |
+
if isinstance(nested, dict):
|
| 125 |
+
content = nested.get("content")
|
| 126 |
+
if isinstance(content, list) and content:
|
| 127 |
+
first = content[0]
|
| 128 |
+
if isinstance(first, dict) and "text" in first:
|
| 129 |
+
return first["text"]
|
| 130 |
+
return json.dumps(result, default=str)
|
| 131 |
+
|
| 132 |
+
# Object with .content directly
|
| 133 |
+
content = getattr(result, "content", None)
|
| 134 |
+
if content:
|
| 135 |
+
first = content[0]
|
| 136 |
+
text = getattr(first, "text", None)
|
| 137 |
+
if isinstance(text, str):
|
| 138 |
+
return text
|
| 139 |
+
|
| 140 |
+
return str(result)
|
models.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic models for the OpenCode OpenEnv environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from openenv.core.env_server.types import State
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class RolloutTurn(BaseModel):
|
| 12 |
+
"""One intercepted LLM turn captured by the in-sandbox proxy (Mode B)."""
|
| 13 |
+
|
| 14 |
+
turn: int
|
| 15 |
+
request: dict[str, Any] = Field(default_factory=dict)
|
| 16 |
+
response: dict[str, Any] = Field(default_factory=dict)
|
| 17 |
+
completion_tokens: list[str] = Field(default_factory=list)
|
| 18 |
+
completion_token_ids: list[int] = Field(default_factory=list)
|
| 19 |
+
per_token_logps: list[float] = Field(default_factory=list)
|
| 20 |
+
finish_reason: str | None = None
|
| 21 |
+
latency_s: float = 0.0
|
| 22 |
+
timestamp: float = 0.0
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class RolloutResult(BaseModel):
|
| 26 |
+
"""Outcome of one call to the ``run_rollout`` tool.
|
| 27 |
+
|
| 28 |
+
Serialized to JSON as the tool result. The training-side client
|
| 29 |
+
deserializes and feeds ``proxy_turns`` + ``reward`` into GRPO.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
# Identifiers
|
| 33 |
+
task_id: str = ""
|
| 34 |
+
sandbox_id: str = ""
|
| 35 |
+
|
| 36 |
+
# Scalars
|
| 37 |
+
reward: float | None = None
|
| 38 |
+
exit_code: int = 0
|
| 39 |
+
wall_s: float = 0.0
|
| 40 |
+
mode: str = "transparent_proxy"
|
| 41 |
+
|
| 42 |
+
# Per-turn trajectory (empty in black_box mode)
|
| 43 |
+
proxy_turns: list[RolloutTurn] = Field(default_factory=list)
|
| 44 |
+
|
| 45 |
+
# Agent artifacts
|
| 46 |
+
workdir_files: dict[str, str] = Field(default_factory=dict)
|
| 47 |
+
agent_log_tail: str = ""
|
| 48 |
+
|
| 49 |
+
# Verifier bookkeeping
|
| 50 |
+
verifier_stdout: str = ""
|
| 51 |
+
verifier_stderr: str = ""
|
| 52 |
+
test_exit_code: int | None = None
|
| 53 |
+
|
| 54 |
+
# Errors (if any) surfacing from sandbox/proxy/verifier path
|
| 55 |
+
error: str | None = None
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class OpenCodeState(State):
|
| 59 |
+
"""Persistent env state across calls to the single environment instance.
|
| 60 |
+
|
| 61 |
+
Each HTTP session gets its own OpenCodeEnvironment (via
|
| 62 |
+
``SUPPORTS_CONCURRENT_SESSIONS = True`` on the server class), so this
|
| 63 |
+
state is per-session.
|
| 64 |
+
"""
|
| 65 |
+
|
| 66 |
+
rollouts_completed: int = 0
|
| 67 |
+
last_reward: float | None = None
|
| 68 |
+
last_task_id: str | None = None
|
| 69 |
+
last_sandbox_id: str | None = None
|
opencode_openenv.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: opencode-openenv
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: OpenEnv OpenCode environment β spawns an E2B sandbox per rollout, runs OpenCode against a caller-supplied LLM endpoint, returns reward + proxy trace.
|
| 5 |
+
Author-email: adithya-s-k <adithyaskolavi@gmail.com>
|
| 6 |
+
Requires-Python: >=3.10
|
| 7 |
+
Requires-Dist: openenv-core[core] @ git+https://github.com/adithya-s-k/OpenEnv.git@opencode-harness
|
| 8 |
+
Requires-Dist: openenv-opencode_env @ git+https://github.com/adithya-s-k/OpenEnv.git@opencode-harness#subdirectory=envs/opencode_env
|
| 9 |
+
Requires-Dist: fastmcp>=3.0.0
|
| 10 |
+
Requires-Dist: fastapi>=0.115.0
|
| 11 |
+
Requires-Dist: uvicorn>=0.24.0
|
| 12 |
+
Requires-Dist: pydantic>=2.0.0
|
| 13 |
+
Requires-Dist: gradio>=4.0.0
|
| 14 |
+
Requires-Dist: python-dotenv>=1.0.0
|
| 15 |
+
Requires-Dist: requests>=2.31.0
|
| 16 |
+
Provides-Extra: dev
|
| 17 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 18 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
| 19 |
+
Requires-Dist: httpx>=0.27.0; extra == "dev"
|
opencode_openenv.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
__init__.py
|
| 3 |
+
client.py
|
| 4 |
+
models.py
|
| 5 |
+
openenv.yaml
|
| 6 |
+
pyproject.toml
|
| 7 |
+
./__init__.py
|
| 8 |
+
./client.py
|
| 9 |
+
./models.py
|
| 10 |
+
./openenv.yaml
|
| 11 |
+
opencode_openenv.egg-info/PKG-INFO
|
| 12 |
+
opencode_openenv.egg-info/SOURCES.txt
|
| 13 |
+
opencode_openenv.egg-info/dependency_links.txt
|
| 14 |
+
opencode_openenv.egg-info/entry_points.txt
|
| 15 |
+
opencode_openenv.egg-info/requires.txt
|
| 16 |
+
opencode_openenv.egg-info/top_level.txt
|
| 17 |
+
server/__init__.py
|
| 18 |
+
server/app.py
|
| 19 |
+
server/gradio_ui.py
|
| 20 |
+
server/opencode_environment.py
|
| 21 |
+
server/requirements.txt
|
opencode_openenv.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
opencode_openenv.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = opencode_env_server.server.app:main
|
opencode_openenv.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core] @ git+https://github.com/adithya-s-k/OpenEnv.git@opencode-harness
|
| 2 |
+
openenv-opencode_env @ git+https://github.com/adithya-s-k/OpenEnv.git@opencode-harness#subdirectory=envs/opencode_env
|
| 3 |
+
fastmcp>=3.0.0
|
| 4 |
+
fastapi>=0.115.0
|
| 5 |
+
uvicorn>=0.24.0
|
| 6 |
+
pydantic>=2.0.0
|
| 7 |
+
gradio>=4.0.0
|
| 8 |
+
python-dotenv>=1.0.0
|
| 9 |
+
requests>=2.31.0
|
| 10 |
+
|
| 11 |
+
[dev]
|
| 12 |
+
pytest>=8.0.0
|
| 13 |
+
pytest-cov>=4.0.0
|
| 14 |
+
httpx>=0.27.0
|
opencode_openenv.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
opencode_env_server
|
openenv.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: opencode_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
pyproject.toml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "opencode-openenv"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "OpenEnv environment running the OpenCode coding agent inside an E2B sandbox, returning reward and per-turn trace."
|
| 9 |
+
authors = [
|
| 10 |
+
{ name = "adithya-s-k", email = "adithyaskolavi@gmail.com" }
|
| 11 |
+
]
|
| 12 |
+
requires-python = ">=3.10"
|
| 13 |
+
dependencies = [
|
| 14 |
+
# NOTE: openenv-core must come from the same branch as the primitive β
|
| 15 |
+
# the ``openenv.core.harness`` module doesn't exist on PyPI yet (it lives
|
| 16 |
+
# on PR #471 and our opencode-harness branch stacked on top of it).
|
| 17 |
+
"openenv-core[core] @ git+https://github.com/adithya-s-k/OpenEnv.git@opencode-harness",
|
| 18 |
+
"openenv-opencode_env @ git+https://github.com/adithya-s-k/OpenEnv.git@opencode-harness#subdirectory=envs/opencode_env",
|
| 19 |
+
"fastmcp>=3.0.0",
|
| 20 |
+
"fastapi>=0.115.0",
|
| 21 |
+
"uvicorn>=0.24.0",
|
| 22 |
+
"pydantic>=2.0.0",
|
| 23 |
+
"gradio>=4.0.0",
|
| 24 |
+
"python-dotenv>=1.0.0",
|
| 25 |
+
"requests>=2.31.0",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
[project.optional-dependencies]
|
| 29 |
+
dev = [
|
| 30 |
+
"pytest>=8.0.0",
|
| 31 |
+
"pytest-cov>=4.0.0",
|
| 32 |
+
"httpx>=0.27.0",
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
[project.scripts]
|
| 36 |
+
server = "opencode_env_server.server.app:main"
|
| 37 |
+
|
| 38 |
+
[tool.setuptools]
|
| 39 |
+
include-package-data = true
|
| 40 |
+
packages = ["opencode_env_server", "opencode_env_server.server"]
|
| 41 |
+
package-dir = { "opencode_env_server" = ".", "opencode_env_server.server" = "server" }
|
| 42 |
+
|
| 43 |
+
[tool.setuptools.package-data]
|
| 44 |
+
"*" = ["*.txt", "*.yaml"]
|
| 45 |
+
|
| 46 |
+
[dependency-groups]
|
| 47 |
+
dev = [
|
| 48 |
+
"httpx>=0.27.0",
|
| 49 |
+
"pytest>=8.0.0",
|
| 50 |
+
"pytest-cov>=4.0.0",
|
| 51 |
+
]
|
server/__init__.py
ADDED
|
File without changes
|
server/app.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application for the OpenCode Environment.
|
| 2 |
+
|
| 3 |
+
The Gradio web UI is mounted at root (``/``). OpenEnv's standard API
|
| 4 |
+
endpoints (``/health``, ``/reset``, ``/step``, ``/mcp``) are registered
|
| 5 |
+
first and take precedence over Gradio routes.
|
| 6 |
+
|
| 7 |
+
Usage::
|
| 8 |
+
|
| 9 |
+
# Development:
|
| 10 |
+
E2B_API_KEY=... uv run uvicorn server.app:app --reload
|
| 11 |
+
|
| 12 |
+
# Via uv project script:
|
| 13 |
+
E2B_API_KEY=... uv run --project . server
|
| 14 |
+
|
| 15 |
+
# Docker:
|
| 16 |
+
docker run -p 8000:8000 -e E2B_API_KEY=... opencode-openenv
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
|
| 23 |
+
import gradio as gr
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
from openenv.core.env_server.gradio_theme import (
|
| 27 |
+
OPENENV_GRADIO_CSS,
|
| 28 |
+
OPENENV_GRADIO_THEME,
|
| 29 |
+
)
|
| 30 |
+
from openenv.core.env_server.http_server import create_app
|
| 31 |
+
from openenv.core.env_server.mcp_types import (
|
| 32 |
+
CallToolAction,
|
| 33 |
+
CallToolObservation,
|
| 34 |
+
)
|
| 35 |
+
from openenv.core.env_server.web_interface import WebInterfaceManager
|
| 36 |
+
|
| 37 |
+
from .opencode_environment import OpenCodeEnvironment
|
| 38 |
+
from .gradio_ui import opencode_ui_builder
|
| 39 |
+
except ImportError:
|
| 40 |
+
from openenv.core.env_server.gradio_theme import (
|
| 41 |
+
OPENENV_GRADIO_CSS,
|
| 42 |
+
OPENENV_GRADIO_THEME,
|
| 43 |
+
)
|
| 44 |
+
from openenv.core.env_server.http_server import create_app
|
| 45 |
+
from openenv.core.env_server.mcp_types import (
|
| 46 |
+
CallToolAction,
|
| 47 |
+
CallToolObservation,
|
| 48 |
+
)
|
| 49 |
+
from openenv.core.env_server.web_interface import WebInterfaceManager
|
| 50 |
+
|
| 51 |
+
from server.opencode_environment import OpenCodeEnvironment
|
| 52 |
+
from server.gradio_ui import opencode_ui_builder
|
| 53 |
+
|
| 54 |
+
# Build the HTTP API server with MCP routing. We mount our own Gradio UI
|
| 55 |
+
# below, so disable the built-in web interface from create_app.
|
| 56 |
+
os.environ["ENABLE_WEB_INTERFACE"] = "false"
|
| 57 |
+
|
| 58 |
+
app = create_app(
|
| 59 |
+
OpenCodeEnvironment,
|
| 60 |
+
CallToolAction,
|
| 61 |
+
CallToolObservation,
|
| 62 |
+
env_name="opencode_env",
|
| 63 |
+
max_concurrent_envs=int(os.getenv("MAX_CONCURRENT_ENVS", "4")),
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
_web_manager = WebInterfaceManager(
|
| 67 |
+
OpenCodeEnvironment,
|
| 68 |
+
CallToolAction,
|
| 69 |
+
CallToolObservation,
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
_demo = opencode_ui_builder(
|
| 73 |
+
web_manager=_web_manager,
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
app = gr.mount_gradio_app(
|
| 77 |
+
app,
|
| 78 |
+
_demo,
|
| 79 |
+
path="/",
|
| 80 |
+
theme=OPENENV_GRADIO_THEME,
|
| 81 |
+
css=OPENENV_GRADIO_CSS,
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
|
| 86 |
+
import uvicorn
|
| 87 |
+
|
| 88 |
+
uvicorn.run(app, host=host, port=port)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
if __name__ == "__main__":
|
| 92 |
+
main()
|
server/gradio_ui.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gradio UI for the OpenCode OpenEnv server.
|
| 2 |
+
|
| 3 |
+
One page. Top half: LLM config + task inputs. Bottom half: rollout
|
| 4 |
+
summary, proxy trace, workdir files, verifier output.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
import gradio as gr
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# -- Defaults pre-filled in the form (can be overridden per run) -------------

# Demo instruction shown in the "Instruction" textbox on first load.
_DEFAULT_INSTRUCTION = (
    "Write a Python script `fizzbuzz.py` in the current directory that "
    "prints FizzBuzz for numbers 1..15, one per line. Print 'Fizz' for "
    "multiples of 3, 'Buzz' for multiples of 5, 'FizzBuzz' for both."
)

# Demo verifier: graded FizzBuzz checker. It always exits 0 and writes a
# fractional reward (correct lines / expected lines) to the reward file the
# environment reads back (/home/user/logs/verifier/reward.txt).
# NOTE(review): bash indentation inside the if/for bodies was flattened by the
# diff rendering; bash is indentation-insensitive so behavior is unchanged.
_DEFAULT_TEST_SCRIPT = r"""#!/usr/bin/env bash
set -u
mkdir -p /home/user/logs/verifier
REWARD_PATH=/home/user/logs/verifier/reward.txt

cd /home/user/workdir || { echo 0 > "$REWARD_PATH"; exit 0; }
if [ ! -f fizzbuzz.py ]; then
echo 0 > "$REWARD_PATH"
exit 0
fi

OUTPUT=$(python fizzbuzz.py 2>&1 | head -20 || true)
EXPECTED=(1 2 Fizz 4 Buzz Fizz 7 8 Fizz Buzz 11 Fizz 13 14 FizzBuzz)

HITS=0
for line in "${EXPECTED[@]}"; do
if echo "$OUTPUT" | grep -qxF "$line"; then HITS=$((HITS + 1)); fi
done

python -c "print(${HITS} / ${#EXPECTED[@]})" > "$REWARD_PATH"
echo "fizzbuzz: ${HITS}/${#EXPECTED[@]} lines correct"
"""

# Model ids offered as examples; the first entry seeds the "Model id" textbox.
_EXAMPLE_MODELS = [
    "Qwen/Qwen3.5-4B",
    "Qwen/Qwen3-Coder-Next",
    "openai/gpt-4o-mini",
    "openai/gpt-5.3-chat-latest",
]

# Example LLM base URLs; the first entry seeds the "vLLM / LLM base URL" box.
_EXAMPLE_VLLM_URLS = [
    "https://<your-public-llm-host>/v1",
    "https://api.openai.com/v1",
]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def opencode_ui_builder(
    *,
    web_manager: Any,
    title: str = "OpenCode Env",
    **_: Any,
) -> gr.Blocks:
    """Build the Gradio Blocks UI bound to ``web_manager``.

    The web manager is a thin wrapper around ``OpenCodeEnvironment``:
    calling ``call_tool("run_rollout", ...)`` on it drives one rollout.

    Args:
        web_manager: Object exposing ``get_environment()``; the returned
            environment must support ``reset()`` and
            ``call_tool("run_rollout", ...)``.
        title: Page and browser-tab title.
        **_: Extra keyword arguments are accepted and ignored so the host
            server can call every UI builder with a uniform signature.

    Returns:
        A ``gr.Blocks`` app: config + task inputs on top, rollout summary,
        proxy trace, workdir files and verifier output below.
    """

    with gr.Blocks(title=title, analytics_enabled=False) as demo:
        gr.Markdown(f"# {title}\nRun one OpenCode rollout against any OpenAI-compatible endpoint.")

        # -- LLM / provider configuration ----------------------------------
        with gr.Row():
            with gr.Column(scale=1):
                vllm_url = gr.Textbox(
                    label="vLLM / LLM base URL",
                    value=_EXAMPLE_VLLM_URLS[0],
                    placeholder="https://.../v1",
                )
                model = gr.Textbox(
                    label="Model id",
                    value=_EXAMPLE_MODELS[0],
                    placeholder="Qwen/Qwen3.5-4B",
                )
                provider = gr.Dropdown(
                    label="Provider",
                    choices=["openai_compatible", "openai", "anthropic"],
                    value="openai_compatible",
                )
                api_key = gr.Textbox(
                    label="API key (ignored by vLLM)",
                    value="intercepted",
                    type="password",
                )
            with gr.Column(scale=1):
                mode = gr.Dropdown(
                    label="Mode",
                    choices=["transparent_proxy", "black_box"],
                    value="transparent_proxy",
                )
                disable_thinking = gr.Checkbox(
                    label="Disable Qwen3 thinking mode",
                    value=True,
                )
                max_tokens_cap = gr.Slider(
                    label="max_tokens cap",
                    minimum=512, maximum=32768, value=4096, step=256,
                )
                agent_timeout_s = gr.Slider(
                    label="Agent timeout (s)",
                    minimum=60, maximum=1200, value=300, step=30,
                )

        # -- Task inputs ---------------------------------------------------
        with gr.Row():
            task_id = gr.Textbox(label="Task id (optional)", value="fizzbuzz_demo")
        instruction = gr.Textbox(
            label="Instruction",
            value=_DEFAULT_INSTRUCTION,
            lines=4,
        )
        test_script = gr.Code(
            label="test.sh (bash verifier β writes reward to /home/user/logs/verifier/reward.txt)",
            value=_DEFAULT_TEST_SCRIPT,
            language="shell",
        )
        setup_shell = gr.Textbox(
            label="Setup shell (optional, runs before opencode)",
            value="",
            placeholder="e.g. pip install polars",
        )

        run_btn = gr.Button("βΆ Run rollout", variant="primary")

        # -- Output panels -------------------------------------------------
        status = gr.Markdown()
        with gr.Row():
            reward = gr.Number(label="reward", value=None, interactive=False)
            wall_s = gr.Number(label="wall_s", value=None, interactive=False)
            exit_code = gr.Number(label="exit_code", value=None, interactive=False)
            n_turns = gr.Number(label="proxy_turns", value=None, interactive=False)

        with gr.Accordion("Workdir files", open=True):
            workdir_md = gr.Markdown()
        with gr.Accordion("Proxy trace (per turn)", open=False):
            proxy_trace_json = gr.JSON(label=None)
        with gr.Accordion("Verifier stdout / stderr", open=False):
            verifier_out = gr.Textbox(label="stdout", lines=8)
            verifier_err = gr.Textbox(label="stderr", lines=4)
        with gr.Accordion("Raw result JSON", open=False):
            raw_json = gr.JSON(label=None)

        # -- Run handler ----------------------------------------------------
        # Synchronous: one click = one full rollout; the 10-tuple returned
        # here maps 1:1 onto the ``outputs`` list of run_btn.click below.
        def _run(
            vllm_url_v: str,
            model_v: str,
            provider_v: str,
            api_key_v: str,
            mode_v: str,
            disable_thinking_v: bool,
            max_tokens_cap_v: int,
            agent_timeout_s_v: float,
            task_id_v: str,
            instruction_v: str,
            test_script_v: str,
            setup_shell_v: str,
        ):
            try:
                env = web_manager.get_environment()
                env.reset()
                result_raw = env.call_tool(
                    "run_rollout",
                    vllm_url=vllm_url_v,
                    model=model_v,
                    instruction=instruction_v,
                    test_script=test_script_v,
                    task_id=task_id_v,
                    setup_shell=setup_shell_v,
                    upload_files={},
                    provider=provider_v,
                    api_key=api_key_v,
                    mode=mode_v,
                    disable_thinking=bool(disable_thinking_v),
                    max_tokens_cap=int(max_tokens_cap_v),
                    agent_timeout_s=float(agent_timeout_s_v),
                )
                result = _parse_result(result_raw)
            except Exception as exc:
                # Surface the failure in the status panel and blank the rest.
                msg = f"**Error:** `{type(exc).__name__}: {exc}`"
                return (msg, None, None, None, None, "", [], "", "", {"error": str(exc)})

            status_md = _summarize_status(result)
            wd_md = _render_workdir(result.get("workdir_files") or {})
            turns = result.get("proxy_turns") or []
            # Truncate verifier output so the UI payload stays small.
            verifier_stdout = (result.get("verifier_stdout") or "")[:4000]
            verifier_stderr = (result.get("verifier_stderr") or "")[:2000]
            return (
                status_md,
                result.get("reward"),
                result.get("wall_s"),
                result.get("exit_code"),
                len(turns),
                wd_md,
                turns,
                verifier_stdout,
                verifier_stderr,
                result,
            )

        run_btn.click(
            _run,
            inputs=[
                vllm_url, model, provider, api_key, mode, disable_thinking,
                max_tokens_cap, agent_timeout_s,
                task_id, instruction, test_script, setup_shell,
            ],
            outputs=[
                status, reward, wall_s, exit_code, n_turns,
                workdir_md, proxy_trace_json,
                verifier_out, verifier_err, raw_json,
            ],
        )

    return demo
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def _parse_result(raw: Any) -> dict[str, Any]:
|
| 233 |
+
"""Unwrap the server's JSON tool result into a plain dict."""
|
| 234 |
+
# Object with attribute chain: obs.result.content[0].text
|
| 235 |
+
inner = getattr(raw, "result", None)
|
| 236 |
+
if inner is not None:
|
| 237 |
+
content = getattr(inner, "content", None)
|
| 238 |
+
if content:
|
| 239 |
+
first = content[0]
|
| 240 |
+
text = getattr(first, "text", None)
|
| 241 |
+
if isinstance(text, str):
|
| 242 |
+
try:
|
| 243 |
+
return json.loads(text)
|
| 244 |
+
except Exception:
|
| 245 |
+
return {"raw": text}
|
| 246 |
+
|
| 247 |
+
if isinstance(raw, dict):
|
| 248 |
+
content = raw.get("content")
|
| 249 |
+
if isinstance(content, list) and content:
|
| 250 |
+
first = content[0]
|
| 251 |
+
text = first.get("text") if isinstance(first, dict) else None
|
| 252 |
+
if isinstance(text, str):
|
| 253 |
+
try:
|
| 254 |
+
return json.loads(text)
|
| 255 |
+
except Exception:
|
| 256 |
+
return {"raw": text}
|
| 257 |
+
return raw
|
| 258 |
+
if isinstance(raw, str):
|
| 259 |
+
try:
|
| 260 |
+
return json.loads(raw)
|
| 261 |
+
except Exception:
|
| 262 |
+
return {"raw": raw}
|
| 263 |
+
return {"raw": str(raw)}
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
def _summarize_status(result: dict[str, Any]) -> str:
|
| 267 |
+
if result.get("error"):
|
| 268 |
+
return f"β **Error:** `{result['error']}`"
|
| 269 |
+
reward = result.get("reward")
|
| 270 |
+
turns = result.get("proxy_turns") or []
|
| 271 |
+
wall = result.get("wall_s", 0.0)
|
| 272 |
+
sb = result.get("sandbox_id", "")
|
| 273 |
+
exit_code = result.get("exit_code")
|
| 274 |
+
parts = [
|
| 275 |
+
f"**reward** = `{reward}`",
|
| 276 |
+
f"**wall** = `{wall}s`",
|
| 277 |
+
f"**turns** = `{len(turns)}`",
|
| 278 |
+
f"**exit** = `{exit_code}`",
|
| 279 |
+
]
|
| 280 |
+
if sb:
|
| 281 |
+
parts.append(f"**sandbox** = `{sb}`")
|
| 282 |
+
return " Β· ".join(parts)
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _render_workdir(files: dict[str, str]) -> str:
|
| 286 |
+
if not files:
|
| 287 |
+
return "_(no files produced)_"
|
| 288 |
+
lines = []
|
| 289 |
+
for path, contents in files.items():
|
| 290 |
+
lines.append(f"### `{path}`")
|
| 291 |
+
lines.append("")
|
| 292 |
+
lines.append("```")
|
| 293 |
+
lines.append((contents or "").rstrip()[:2000])
|
| 294 |
+
lines.append("```")
|
| 295 |
+
return "\n".join(lines)
|
server/opencode_environment.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenCode MCP Environment.
|
| 2 |
+
|
| 3 |
+
Exposes a single tool ``run_rollout`` that runs one OpenCode agent rollout
|
| 4 |
+
end-to-end against a caller-supplied LLM endpoint:
|
| 5 |
+
|
| 6 |
+
1. Spawn a fresh E2B sandbox (via the primitive's ``E2BSandboxBackend``).
|
| 7 |
+
2. Install opencode in the sandbox, write its config pointing at an
|
| 8 |
+
in-sandbox proxy (Mode B) or the caller's LLM URL directly (Mode A).
|
| 9 |
+
3. Stage the caller-supplied task: instruction + test.sh + any extra files.
|
| 10 |
+
4. Run ``opencode run`` to completion.
|
| 11 |
+
5. Execute the verifier script; read the scalar reward from
|
| 12 |
+
``/home/user/logs/verifier/reward.txt``.
|
| 13 |
+
6. Collect proxy trace + workdir contents.
|
| 14 |
+
7. Return a JSON-serialized :class:`RolloutResult`.
|
| 15 |
+
|
| 16 |
+
The env is deliberately task-agnostic β the training script passes the
|
| 17 |
+
full task (instruction + verifier) through the tool arguments.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
import os
|
| 24 |
+
import time
|
| 25 |
+
from typing import Any, Optional
|
| 26 |
+
from uuid import uuid4
|
| 27 |
+
|
| 28 |
+
from dotenv import load_dotenv
|
| 29 |
+
from fastmcp import FastMCP
|
| 30 |
+
from openenv.core.env_server.mcp_environment import MCPEnvironment
|
| 31 |
+
from openenv.core.env_server.types import Action, Observation
|
| 32 |
+
|
| 33 |
+
load_dotenv()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Default test-script and reward paths inside the sandbox. The server writes
# the caller-supplied ``test_script`` text to this path; the verifier reads
# the reward file back out after it finishes.
REMOTE_TEST_PATH = "/home/user/tests/test.sh"          # where test_script is staged
REMOTE_REWARD_PATH = "/home/user/logs/verifier/reward.txt"  # verifier writes the scalar reward here
WORKDIR_PATH = "/home/user/workdir"                    # cwd for the agent and the verifier
VERIFIER_TIMEOUT_S = 120                               # hard cap on verifier runtime (seconds)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class OpenCodeEnvironment(MCPEnvironment):
    """One-tool MCP environment.

    The single tool ``run_rollout`` is synchronous and returns a JSON string:
    one call = one complete agent rollout. Each call creates and destroys
    its own E2B sandbox, so the environment is stateless across calls.
    """

    # Each rollout owns a fresh sandbox, so concurrent sessions don't share
    # mutable state.
    SUPPORTS_CONCURRENT_SESSIONS = True

    def __init__(self) -> None:
        # Import inside __init__ to keep module import cheap and to allow
        # patching for tests. Dual-import pattern: package when installed,
        # flat when run directly out of the repo via ``server.app:app``.
        try:
            from ..models import OpenCodeState, RolloutResult, RolloutTurn
        except ImportError:
            from models import OpenCodeState, RolloutResult, RolloutTurn  # type: ignore
        from opencode_env import (
            E2BSandboxBackend,
            OpenCodeConfig,
            OpenCodeSessionFactory,
            collect_rollout_summary,
        )
        from openenv.core.harness import VerifyResult

        # Stash lazily-imported classes/helpers on the instance so the
        # rollout implementation (and tests) can reach them without
        # re-importing.
        self._state_cls = OpenCodeState
        self._result_cls = RolloutResult
        self._turn_cls = RolloutTurn
        self._OpenCodeConfig = OpenCodeConfig
        self._OpenCodeSessionFactory = OpenCodeSessionFactory
        self._E2BSandboxBackend = E2BSandboxBackend
        self._collect_rollout_summary = collect_rollout_summary
        # NOTE(review): only stored here; not used elsewhere in this block.
        self._VerifyResult = VerifyResult

        # Require E2B credentials up front β fail loudly if unset.
        if not os.environ.get("E2B_API_KEY"):
            raise RuntimeError(
                "E2B_API_KEY environment variable is required for OpenCodeEnvironment"
            )

        self._state = self._state_cls(episode_id=str(uuid4()))

        mcp = FastMCP("opencode_env")

        # The tool is registered on the FastMCP app BEFORE calling
        # super().__init__(mcp) below, so the base class sees it.
        @mcp.tool
        def run_rollout(
            vllm_url: str,
            model: str,
            instruction: str,
            test_script: str,
            task_id: str = "",
            setup_shell: str = "",
            upload_files: Optional[dict[str, str]] = None,
            provider: str = "openai_compatible",
            api_key: str = "intercepted",
            mode: str = "transparent_proxy",
            disable_thinking: bool = False,
            max_tokens_cap: int = 4096,
            agent_timeout_s: float = 600.0,
        ) -> str:
            """Run one OpenCode rollout end-to-end.

            Args:
                vllm_url: LLM endpoint (``https://host/v1``).
                model: Model id the provider recognizes.
                instruction: Prompt passed to ``opencode run``.
                test_script: Bash verifier. Must write a float reward to
                    ``/home/user/logs/verifier/reward.txt``.
                task_id: Optional identifier echoed back for traceability.
                setup_shell: Optional shell run before opencode starts.
                upload_files: Optional {remote_path: content} staged into the
                    sandbox.
                provider: OpenCodeConfig provider id. For vLLM use
                    ``"openai_compatible"``; for real OpenAI ``"openai"``.
                api_key: Provider API key. vLLM ignores this.
                mode: ``"transparent_proxy"`` (captures per-turn logprobs) or
                    ``"black_box"`` (direct connection, no logprobs).
                disable_thinking: Qwen3/Qwen3.5 proxy-side thinking disable.
                max_tokens_cap: Clamp forwarded ``max_tokens``.
                agent_timeout_s: Max opencode runtime in seconds.

            Returns:
                JSON-serialized :class:`RolloutResult`.
            """
            return self._run_rollout_impl(
                vllm_url=vllm_url,
                model=model,
                instruction=instruction,
                test_script=test_script,
                task_id=task_id,
                setup_shell=setup_shell,
                upload_files=upload_files or {},
                provider=provider,
                api_key=api_key,
                mode=mode,
                disable_thinking=disable_thinking,
                max_tokens_cap=max_tokens_cap,
                agent_timeout_s=agent_timeout_s,
            )

        super().__init__(mcp)

    # -- OpenEnv lifecycle ---------------------------------------------------

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **_: Any,
    ) -> Observation:
        # Start a fresh episode; ``seed`` is accepted for API compatibility
        # but unused (each rollout is already isolated in its own sandbox).
        self._state = self._state_cls(episode_id=episode_id or str(uuid4()))
        return Observation(
            done=False,
            reward=None,
            metadata={
                "status": "ready",
                "message": "OpenCode env ready. Call run_rollout(...) with a task.",
            },
        )

    def _step_impl(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **_: Any,
    ) -> Observation:
        # Non-tool actions are not supported; steer callers to the MCP tool.
        return Observation(
            done=False,
            reward=None,
            metadata={
                "error": (
                    f"Unknown action type: {type(action).__name__}. "
                    "Use CallToolAction(name='run_rollout', ...)."
                ),
            },
        )

    @property
    def state(self) -> Any:
        # Lightweight bookkeeping state (counters + last-rollout info).
        return self._state

    # -- Rollout implementation ----------------------------------------------

    def _run_rollout_impl(
        self,
        *,
        vllm_url: str,
        model: str,
        instruction: str,
        test_script: str,
        task_id: str,
        setup_shell: str,
        upload_files: dict[str, str],
        provider: str,
        api_key: str,
        mode: str,
        disable_thinking: bool,
        max_tokens_cap: int,
        agent_timeout_s: float,
    ) -> str:
        """Create a sandbox, run the agent, verify, collect, and tear down.

        Always returns a JSON-serialized RolloutResult; failures are captured
        in ``result.error`` rather than raised, so the MCP call never leaks
        an exception to the client.
        """
        from opencode_env import OpenCodeTask

        result = self._result_cls(task_id=task_id, mode=mode)
        t0 = time.time()

        provider_model = _qualify_model(provider, model)

        config = self._OpenCodeConfig(
            provider=provider,
            base_url=vllm_url.rstrip("/"),
            api_key=api_key,
            model=provider_model,
            agent_timeout_s=agent_timeout_s,
            proxy_disable_thinking=disable_thinking,
            # A non-positive cap means "no clamping".
            proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
        )

        factory = self._OpenCodeSessionFactory(
            config=config,
            sandbox_backend=self._E2BSandboxBackend(),
            mode=mode,  # "transparent_proxy" or "black_box"
            verifier=None,  # we run the caller's test_script ourselves below
        )

        # Stage caller files plus the verifier script into the sandbox.
        merged_uploads = dict(upload_files)
        merged_uploads[REMOTE_TEST_PATH] = test_script
        task = OpenCodeTask(
            instruction=instruction,
            setup_shell=setup_shell or None,
            upload_files=merged_uploads,
            metadata={"task_id": task_id},
        )

        session = None
        try:
            session = factory.create(task=task)
            result.sandbox_id = session.sandbox.sandbox_id

            exit_code = session.wait_for_completion(timeout_s=agent_timeout_s)
            result.exit_code = int(exit_code)

            # Run the verifier. Exit code is ignored; the reward file is the
            # source of truth.
            session.sandbox.exec(
                f"mkdir -p /home/user/logs/verifier /home/user/tests && "
                f"chmod +x {REMOTE_TEST_PATH}",
                timeout=15,
            )
            verifier_run = session.sandbox.exec(
                f"bash {REMOTE_TEST_PATH}",
                cwd=WORKDIR_PATH,
                timeout=VERIFIER_TIMEOUT_S,
            )
            result.test_exit_code = int(verifier_run.exit_code)
            # Truncate captured output so the JSON payload stays bounded.
            result.verifier_stdout = (verifier_run.stdout or "")[:4000]
            result.verifier_stderr = (verifier_run.stderr or "")[:2000]
            result.reward = _read_reward(session.sandbox, REMOTE_REWARD_PATH)

            # Collect artifacts via the primitive's summary helper.
            summary = self._collect_rollout_summary(session)
            result.agent_log_tail = _tail(summary.opencode_events, 20)
            result.workdir_files = {
                path: (contents or "")[:8000]
                for path, contents in (summary.workdir_contents or {}).items()
            }
            for raw in summary.proxy_turns:
                result.proxy_turns.append(self._turn_cls(**_clamp_turn(raw)))
        except Exception as exc:
            # Never raise out of the tool: record the failure on the result.
            result.error = f"{type(exc).__name__}: {exc}"
        finally:
            if session is not None:
                try:
                    session.close()
                except Exception:
                    pass  # best-effort teardown; sandbox may already be gone

        result.wall_s = round(time.time() - t0, 3)

        # Persist lightweight state for bookkeeping.
        self._state.rollouts_completed += 1
        self._state.last_reward = result.reward
        self._state.last_task_id = task_id or None
        self._state.last_sandbox_id = result.sandbox_id or None

        return result.model_dump_json()
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def _qualify_model(provider: str, model: str) -> str:
|
| 297 |
+
"""Return a ``<provider>/<model>`` string acceptable to the primitive.
|
| 298 |
+
|
| 299 |
+
If the caller already prefixed the model, leave it alone; otherwise
|
| 300 |
+
prepend the provider so OpenCode's config file is well-formed.
|
| 301 |
+
"""
|
| 302 |
+
if "/" in model:
|
| 303 |
+
return model
|
| 304 |
+
return f"{provider}/{model}"
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def _read_reward(sandbox: Any, reward_path: str) -> Optional[float]:
|
| 308 |
+
try:
|
| 309 |
+
raw = sandbox.read_text(reward_path).strip()
|
| 310 |
+
except Exception:
|
| 311 |
+
return None
|
| 312 |
+
if not raw:
|
| 313 |
+
return None
|
| 314 |
+
try:
|
| 315 |
+
return float(raw)
|
| 316 |
+
except ValueError:
|
| 317 |
+
return None
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def _clamp_turn(turn: dict[str, Any]) -> dict[str, Any]:
|
| 321 |
+
"""Clamp per-turn payload sizes to keep responses under a reasonable cap."""
|
| 322 |
+
out = dict(turn)
|
| 323 |
+
# Compact ``response`` β we already captured tokens/logps explicitly.
|
| 324 |
+
out["response"] = {
|
| 325 |
+
"finish_reason": (out.get("response") or {}).get("choices", [{}])[0].get(
|
| 326 |
+
"finish_reason"
|
| 327 |
+
),
|
| 328 |
+
"usage": (out.get("response") or {}).get("usage"),
|
| 329 |
+
}
|
| 330 |
+
req = out.get("request") or {}
|
| 331 |
+
messages = req.get("messages") or []
|
| 332 |
+
# Keep request messages (trainer needs them) but drop very long tool schemas.
|
| 333 |
+
req = {
|
| 334 |
+
"model": req.get("model"),
|
| 335 |
+
"messages": messages,
|
| 336 |
+
"temperature": req.get("temperature"),
|
| 337 |
+
"top_p": req.get("top_p"),
|
| 338 |
+
"max_tokens": req.get("max_tokens"),
|
| 339 |
+
"max_completion_tokens": req.get("max_completion_tokens"),
|
| 340 |
+
"logprobs": req.get("logprobs"),
|
| 341 |
+
"top_logprobs": req.get("top_logprobs"),
|
| 342 |
+
"stream": req.get("stream"),
|
| 343 |
+
}
|
| 344 |
+
out["request"] = req
|
| 345 |
+
return out
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def _tail(events: list[dict[str, Any]], n: int) -> str:
|
| 349 |
+
"""Return the last ``n`` opencode event lines as a newline-joined string."""
|
| 350 |
+
if not events:
|
| 351 |
+
return ""
|
| 352 |
+
return "\n".join(json.dumps(e) for e in events[-n:])
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core] @ git+https://github.com/adithya-s-k/OpenEnv.git@opencode-harness
|
| 2 |
+
openenv-opencode_env @ git+https://github.com/adithya-s-k/OpenEnv.git@opencode-harness#subdirectory=envs/opencode_env
|
| 3 |
+
fastmcp>=3.0.0
|
| 4 |
+
fastapi>=0.115.0
|
| 5 |
+
uvicorn>=0.24.0
|
| 6 |
+
pydantic>=2.0.0
|
| 7 |
+
gradio>=4.0.0
|
| 8 |
+
python-dotenv>=1.0.0
|
| 9 |
+
requests>=2.31.0
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_client.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""End-to-end HTTP tests for the deployed OpenCode OpenEnv server.

By default the tests hit the HF Space deployment. Override
``OPENCODE_ENV_URL`` to point at a local ``uvicorn server.app:app``
or a ``docker run``-backed container. Every test also needs a reachable
vLLM endpoint — set ``VLLM_BASE_URL`` to the public URL of a running
``vllm serve Qwen/Qwen3.5-4B`` (see the slurm scripts under dev/slurm/
for one way to stand one up).

Run::

    export VLLM_BASE_URL=https://your-llm-host/v1
    uv run pytest tests/ -v -s

    # against a local server:
    OPENCODE_ENV_URL=http://localhost:8000 uv run pytest tests/ -v -s
"""

from __future__ import annotations

import json
import os
from typing import Any

import pytest


# Target environment server; defaults to the public HF Space deployment.
ENV_URL = os.getenv(
    "OPENCODE_ENV_URL", "https://AdithyaSK-opencode-openenv.hf.space"
)
# vLLM endpoint that drives rollouts. The trailing slash is stripped so a
# "/v1" suffix can be appended uniformly later without double slashes.
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "").rstrip("/")
VLLM_MODEL = os.getenv("VLLM_MODEL", "Qwen/Qwen3.5-4B")


# Skip the whole module when no live vLLM endpoint is configured — every
# test here ultimately needs a model to talk to.
pytestmark = pytest.mark.skipif(
    not VLLM_BASE_URL,
    reason=(
        "VLLM_BASE_URL not set; point it at a live public-endpointed "
        "vLLM endpoint (see dev/slurm/vllm_endpoint_qwen35_4b.slurm)."
    ),
)
| 44 |
+
# ββ Inline task bundles βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 45 |
+
# Tasks live in the training script, not the env β these are test fixtures
|
| 46 |
+
# mirroring what a trainer would send through ``run_rollout``.
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
_FIZZBUZZ_INSTRUCTION = (
|
| 50 |
+
"Write a Python script `fizzbuzz.py` in the current working directory "
|
| 51 |
+
"that prints FizzBuzz for numbers 1..15, one per line. Print 'Fizz' "
|
| 52 |
+
"for multiples of 3, 'Buzz' for multiples of 5, 'FizzBuzz' for both."
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
_FIZZBUZZ_TEST = r"""#!/usr/bin/env bash
|
| 56 |
+
set -u
|
| 57 |
+
mkdir -p /home/user/logs/verifier
|
| 58 |
+
REWARD=/home/user/logs/verifier/reward.txt
|
| 59 |
+
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
|
| 60 |
+
[ -f fizzbuzz.py ] || { echo 0 > "$REWARD"; exit 0; }
|
| 61 |
+
OUT=$(python fizzbuzz.py 2>&1 | head -20 || true)
|
| 62 |
+
EXPECTED=(1 2 Fizz 4 Buzz Fizz 7 8 Fizz Buzz 11 Fizz 13 14 FizzBuzz)
|
| 63 |
+
HITS=0
|
| 64 |
+
for line in "${EXPECTED[@]}"; do
|
| 65 |
+
echo "$OUT" | grep -qxF "$line" && HITS=$((HITS + 1))
|
| 66 |
+
done
|
| 67 |
+
python -c "print(${HITS}/${#EXPECTED[@]})" > "$REWARD"
|
| 68 |
+
echo "fizzbuzz: ${HITS}/${#EXPECTED[@]}"
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
_SORT_LIST_INSTRUCTION = (
|
| 73 |
+
"Write a Python script `sort_list.py` in the current working directory "
|
| 74 |
+
"that sorts [42, 7, 13, 1, 99, 5, 23, 8, 31, 11] ascending and prints "
|
| 75 |
+
"the result as one comma-separated line with no spaces. Expected "
|
| 76 |
+
"output (exactly): 1,5,7,8,11,13,23,31,42,99 β do not print anything else."
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
_SORT_LIST_TEST = r"""#!/usr/bin/env bash
|
| 80 |
+
set -u
|
| 81 |
+
mkdir -p /home/user/logs/verifier
|
| 82 |
+
REWARD=/home/user/logs/verifier/reward.txt
|
| 83 |
+
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
|
| 84 |
+
[ -f sort_list.py ] || { echo 0 > "$REWARD"; exit 0; }
|
| 85 |
+
EXPECTED="1,5,7,8,11,13,23,31,42,99"
|
| 86 |
+
OUT=$(python sort_list.py 2>/dev/null | head -1 || true)
|
| 87 |
+
if [ "$OUT" = "$EXPECTED" ]; then
|
| 88 |
+
echo 1.0 > "$REWARD"
|
| 89 |
+
echo "sort_list: PASS"
|
| 90 |
+
else
|
| 91 |
+
echo 0.0 > "$REWARD"
|
| 92 |
+
echo "sort_list: FAIL got='${OUT}' want='${EXPECTED}'"
|
| 93 |
+
fi
|
| 94 |
+
"""
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
_SIMPLE_IO_INSTRUCTION = (
|
| 98 |
+
"Create a file `greeting.txt` in the current working directory "
|
| 99 |
+
"containing exactly the line `hello, world` (followed by a newline). "
|
| 100 |
+
"Then write a Python script `read_and_echo.py` that opens "
|
| 101 |
+
"`greeting.txt` and prints its contents to stdout. Run the script "
|
| 102 |
+
"to verify it prints `hello, world` before you stop."
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
_SIMPLE_IO_TEST = r"""#!/usr/bin/env bash
|
| 106 |
+
set -u
|
| 107 |
+
mkdir -p /home/user/logs/verifier
|
| 108 |
+
REWARD=/home/user/logs/verifier/reward.txt
|
| 109 |
+
cd /home/user/workdir || { echo 0 > "$REWARD"; exit 0; }
|
| 110 |
+
SCORE=0.0
|
| 111 |
+
if [ -f greeting.txt ]; then
|
| 112 |
+
if [ "$(cat greeting.txt)" = "hello, world" ]; then
|
| 113 |
+
SCORE=$(python -c "print(${SCORE} + 0.5)")
|
| 114 |
+
fi
|
| 115 |
+
fi
|
| 116 |
+
if [ -f read_and_echo.py ]; then
|
| 117 |
+
OUT=$(python read_and_echo.py 2>/dev/null | head -1 || true)
|
| 118 |
+
if [ "$OUT" = "hello, world" ]; then
|
| 119 |
+
SCORE=$(python -c "print(${SCORE} + 0.5)")
|
| 120 |
+
fi
|
| 121 |
+
fi
|
| 122 |
+
echo "$SCORE" > "$REWARD"
|
| 123 |
+
echo "simple_io: score=$SCORE"
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
_TASKS = {
|
| 128 |
+
"fizzbuzz": (_FIZZBUZZ_INSTRUCTION, _FIZZBUZZ_TEST),
|
| 129 |
+
"sort_list": (_SORT_LIST_INSTRUCTION, _SORT_LIST_TEST),
|
| 130 |
+
"simple_io": (_SIMPLE_IO_INSTRUCTION, _SIMPLE_IO_TEST),
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# ββ Fixtures ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
@pytest.fixture(scope="module")
def client():
    """Yield a sync MCP client connected to the env server.

    Module-scoped so all tests in this file share one connection.
    Falls back to importing ``client`` straight from the source tree
    when the package has not been pip-installed.
    """
    try:
        from opencode_env_server import OpenCodeEnv
    except ImportError:
        # Running from the source tree before the package is pip-installed.
        import sys
        from pathlib import Path

        sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
        from client import OpenCodeEnv  # type: ignore

    # Use ``with`` rather than calling __enter__/__exit__ by hand: teardown
    # is guaranteed even if an exception propagates through the yield, and
    # __exit__ then receives the real exception info instead of the
    # unconditional (None, None, None) the manual form passed.
    with OpenCodeEnv(base_url=ENV_URL).sync() as env:
        yield env
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ββ Server-liveness tests βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class TestOpenEnvServer:
    """Basic OpenEnv MCP contract checks."""

    def test_reset(self, client):
        # A bare reset must succeed against a healthy deployment.
        client.reset()

    def test_list_tools(self, client):
        client.reset()
        # The environment advertises exactly one tool: run_rollout.
        names = sorted(tool.name for tool in client.list_tools())
        assert names == ["run_rollout"], f"unexpected tool set: {names}"
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# ββ Rollout tests (require VLLM_BASE_URL) βββββββββββββββββββββββββββββββββ
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
class TestRunRollout:
    """Drive one rollout per bundled task via the server and verify the result."""

    @pytest.mark.parametrize("task_id", ["fizzbuzz", "sort_list", "simple_io"])
    def test_run_rollout(self, client, task_id: str):
        instruction, test_script = _TASKS[task_id]
        client.reset()

        # Normalise the endpoint so it always ends in "/v1".
        if VLLM_BASE_URL.endswith("/v1"):
            base_url = VLLM_BASE_URL
        else:
            base_url = f"{VLLM_BASE_URL}/v1"

        result = _parse_json(
            client.call_tool(
                "run_rollout",
                vllm_url=base_url,
                model=VLLM_MODEL,
                instruction=instruction,
                test_script=test_script,
                task_id=task_id,
                provider="openai_compatible",
                api_key="intercepted",
                mode="transparent_proxy",
                disable_thinking=True,
                max_tokens_cap=4096,
                agent_timeout_s=360.0,
            )
        )

        workdir_names = list((result["workdir_files"] or {}).keys())
        print(
            f"\n[{task_id}] reward={result['reward']} wall={result['wall_s']}s "
            f"turns={len(result['proxy_turns'])} files={workdir_names}"
        )

        # Contract assertions
        assert result["error"] is None, f"rollout errored: {result['error']}"
        assert result["exit_code"] == 0, "opencode did not exit cleanly"
        assert len(result["proxy_turns"]) >= 1, (
            "proxy captured zero turns — logprob path is broken"
        )

        # At least one turn must carry logprobs (Mode B contract).
        scored_turns = [t for t in result["proxy_turns"] if t["completion_tokens"]]
        assert len(scored_turns) >= 1, (
            "no productive turns — streaming / logprob capture is broken"
        )
        head = scored_turns[0]
        assert head["request"].get("logprobs") is True
        assert len(head["per_token_logps"]) == len(head["completion_tokens"])

        # Task quality
        assert result["reward"] is not None, "verifier did not write reward.txt"
        assert result["reward"] >= 0.5, (
            f"task={task_id} reward={result['reward']} too low; "
            f"workdir={workdir_names} "
            f"verifier_stdout={(result['verifier_stdout'] or '').strip()[:200]}"
        )
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
# ββ helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def _parse_json(raw: Any) -> dict[str, Any]:
|
| 235 |
+
"""Unwrap a CallTool result shape into a plain dict."""
|
| 236 |
+
if isinstance(raw, str):
|
| 237 |
+
return json.loads(raw)
|
| 238 |
+
if isinstance(raw, dict):
|
| 239 |
+
content = raw.get("content")
|
| 240 |
+
if isinstance(content, list) and content:
|
| 241 |
+
first = content[0]
|
| 242 |
+
if isinstance(first, dict) and isinstance(first.get("text"), str):
|
| 243 |
+
return json.loads(first["text"])
|
| 244 |
+
return raw
|
| 245 |
+
# Handle MCP object shapes (.result.content[0].text or .content[0].text)
|
| 246 |
+
inner = getattr(raw, "result", None) or raw
|
| 247 |
+
content = getattr(inner, "content", None)
|
| 248 |
+
if content:
|
| 249 |
+
first = content[0]
|
| 250 |
+
text = getattr(first, "text", None)
|
| 251 |
+
if isinstance(text, str):
|
| 252 |
+
return json.loads(text)
|
| 253 |
+
raise TypeError(f"Cannot parse tool result of type {type(raw).__name__}: {raw!r}")
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|