Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +69 -0
- README.md +258 -4
- __init__.py +56 -0
- client.py +168 -0
- config.py +79 -0
- harness.py +525 -0
- models.py +93 -0
- opencode_runtime.py +150 -0
- openenv.yaml +6 -0
- pyproject.toml +55 -0
- sandbox/__init__.py +25 -0
- sandbox/base.py +100 -0
- sandbox/build_template.py +142 -0
- sandbox/e2b.py +192 -0
- sandbox/interception.py +642 -0
- server/__init__.py +7 -0
- server/app.py +118 -0
- server/catalog.py +149 -0
- server/gradio_ui.py +453 -0
- server/opencode_environment.py +472 -0
- task.py +43 -0
- tests/__init__.py +0 -0
- tests/test_config.py +48 -0
- tests/test_five_sorts_e2e.py +1045 -0
- tests/test_harness.py +221 -0
- tests/test_inference_endpoints.py +430 -0
- tests/test_interception.py +198 -0
- tests/test_opencode_runtime.py +107 -0
- tests/test_sandbox_base.py +44 -0
- tests/test_task.py +42 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
#
|
| 7 |
+
# opencode_env Dockerfile — mirrors the standard OpenEnv multi-stage uv
|
| 8 |
+
# build used by echo_env / repl_env / jupyter_agent.
|
| 9 |
+
#
|
| 10 |
+
# Build:
|
| 11 |
+
# docker build -t opencode-env .
|
| 12 |
+
#
|
| 13 |
+
# Run:
|
| 14 |
+
# docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env
|
| 15 |
+
|
| 16 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 17 |
+
FROM ${BASE_IMAGE} AS builder
|
| 18 |
+
|
| 19 |
+
WORKDIR /app
|
| 20 |
+
|
| 21 |
+
ARG BUILD_MODE=in-repo
|
| 22 |
+
|
| 23 |
+
COPY . /app/env
|
| 24 |
+
WORKDIR /app/env
|
| 25 |
+
|
| 26 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 27 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 28 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 29 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 30 |
+
fi
|
| 31 |
+
|
| 32 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 33 |
+
git \
|
| 34 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 35 |
+
|
| 36 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 37 |
+
if [ -f uv.lock ]; then \
|
| 38 |
+
uv sync --frozen --no-install-project --no-editable; \
|
| 39 |
+
else \
|
| 40 |
+
uv sync --no-install-project --no-editable; \
|
| 41 |
+
fi
|
| 42 |
+
|
| 43 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 44 |
+
if [ -f uv.lock ]; then \
|
| 45 |
+
uv sync --frozen --no-editable; \
|
| 46 |
+
else \
|
| 47 |
+
uv sync --no-editable; \
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
# ── runtime stage ────────────────────────────────────────────────────────────
|
| 51 |
+
FROM ${BASE_IMAGE}
|
| 52 |
+
|
| 53 |
+
WORKDIR /app
|
| 54 |
+
|
| 55 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 56 |
+
COPY --from=builder /app/env /app/env
|
| 57 |
+
|
| 58 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 59 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 60 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 61 |
+
ENV PYTHONUNBUFFERED=1
|
| 62 |
+
|
| 63 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
|
| 64 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
|
| 65 |
+
|
| 66 |
+
EXPOSE 8000
|
| 67 |
+
|
| 68 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 69 |
+
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,264 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: OpenCode Environment Server
|
| 3 |
+
emoji: 🛠️
|
| 4 |
+
colorFrom: indigo
|
| 5 |
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
short_description: OpenCode coding agent in an E2B sandbox with logprob capture
|
| 13 |
---
|
| 14 |
|
| 15 |
+
# OpenCode Environment for OpenEnv
|
| 16 |
+
|
| 17 |
+
`opencode_env` runs the [OpenCode](https://opencode.ai) coding agent inside
|
| 18 |
+
an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible
|
| 19 |
+
LLM endpoint, optionally capturing per-token logprobs for GRPO training.
|
| 20 |
+
|
| 21 |
+
The env is **task-agnostic** — every rollout is configured at call-time
|
| 22 |
+
with a uniform Task shape:
|
| 23 |
+
|
| 24 |
+
- **`instruction`** — prompt for the agent
|
| 25 |
+
- **`setup`** — list of bash commands run *before* the agent (pip
|
| 26 |
+
install, git clone, file downloads — anything you need staged in the
|
| 27 |
+
sandbox)
|
| 28 |
+
- **`verify`** — list of bash commands run *after* the agent (asserts,
|
| 29 |
+
pytest invocations, score-file writes)
|
| 30 |
+
|
| 31 |
+
Reward = `passed_verify / total_verify` unless any `verify` command writes
|
| 32 |
+
a float to `/home/user/logs/verifier/reward.txt` (override).
|
| 33 |
+
|
| 34 |
+
## Quick Start
|
| 35 |
+
|
| 36 |
+
### As a deployed env (HTTP / MCP)
|
| 37 |
+
|
| 38 |
+
```python
|
| 39 |
+
import os
|
| 40 |
+
from opencode_env import OpenCodeEnv
|
| 41 |
+
|
| 42 |
+
with OpenCodeEnv(base_url="http://localhost:8000") as env:
|
| 43 |
+
env.reset()
|
| 44 |
+
result = env.run_rollout(
|
| 45 |
+
endpoint="openai", # shorthand → server resolves
|
| 46 |
+
instruction=(
|
| 47 |
+
"Create binary_search.py exposing def binary_search(arr, target) -> int "
|
| 48 |
+
"that returns the index of target in arr, or -1 if absent. Use a "
|
| 49 |
+
"relative path."
|
| 50 |
+
),
|
| 51 |
+
setup=[],
|
| 52 |
+
verify=[
|
| 53 |
+
"test -f /home/user/workdir/binary_search.py",
|
| 54 |
+
"python -c \"import sys; sys.path.insert(0, '/home/user/workdir'); "
|
| 55 |
+
"import binary_search; "
|
| 56 |
+
"assert binary_search.binary_search([1,2,3], 2) == 1\"",
|
| 57 |
+
],
|
| 58 |
+
task_id="binary_search_v1",
|
| 59 |
+
template="opencode-rl", # prebaked E2B template
|
| 60 |
+
)
|
| 61 |
+
print("reward:", result.reward)
|
| 62 |
+
print("turns:", len(result.proxy_turns))
|
| 63 |
+
print("files:", list(result.files.keys()))
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
The Space-deployed variant works the same — point `base_url` at
|
| 67 |
+
`https://<user>-opencode-env.hf.space` and set the relevant secrets in
|
| 68 |
+
the Space settings.
|
| 69 |
+
|
| 70 |
+
### As an in-process primitive
|
| 71 |
+
|
| 72 |
+
```python
|
| 73 |
+
import os
|
| 74 |
+
from opencode_env import (
|
| 75 |
+
OpenCodeConfig, OpenCodeSessionFactory, OpenCodeTask, E2BSandboxBackend,
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
factory = OpenCodeSessionFactory(
|
| 79 |
+
config=OpenCodeConfig(
|
| 80 |
+
provider="openai_compatible",
|
| 81 |
+
base_url="https://api.openai.com/v1",
|
| 82 |
+
api_key=os.environ["OPENAI_API_KEY"],
|
| 83 |
+
model="gpt-4o-mini",
|
| 84 |
+
),
|
| 85 |
+
sandbox_backend=E2BSandboxBackend(),
|
| 86 |
+
mode="transparent_proxy", # or "black_box"
|
| 87 |
+
)
|
| 88 |
+
session = factory.create(task=OpenCodeTask(instruction="..."))
|
| 89 |
+
session.wait_for_completion()
|
| 90 |
+
turns = session.fetch_proxy_trace() # per-turn (tokens, logprobs)
|
| 91 |
+
session.close()
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
## Building the Docker Image
|
| 95 |
+
|
| 96 |
+
The Dockerfile lives at `server/Dockerfile`. Use the `openenv` CLI from
|
| 97 |
+
the env root:
|
| 98 |
+
|
| 99 |
+
```bash
|
| 100 |
+
cd envs/opencode_env
|
| 101 |
+
|
| 102 |
+
openenv validate # check pyproject.toml + openenv.yaml + server/app.py + uv.lock
|
| 103 |
+
openenv build -t opencode-env # builds the image (uses server/Dockerfile)
|
| 104 |
+
|
| 105 |
+
# run locally with E2B credentials
|
| 106 |
+
docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env
|
| 107 |
+
|
| 108 |
+
# push to HF Spaces (Docker variant)
|
| 109 |
+
openenv push --repo-id <user>/opencode-env
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
Or build directly without the CLI:
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
docker build -t opencode-env -f envs/opencode_env/server/Dockerfile envs/opencode_env
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
The image:
|
| 119 |
+
|
| 120 |
+
- Runs `uvicorn server.app:app --host 0.0.0.0 --port 8000`
|
| 121 |
+
- Exposes the MCP API at `/mcp` and `/step`, the Gradio UI at `/web`,
|
| 122 |
+
health at `/health`, and OpenAPI docs at `/docs`.
|
| 123 |
+
- Reads `E2B_API_KEY` and (optionally) endpoint-specific env vars at
|
| 124 |
+
runtime (see [Environment Variables](#environment-variables)).
|
| 125 |
+
|
| 126 |
+
## The MCP Tool: `run_rollout`
|
| 127 |
+
|
| 128 |
+
Single tool, two ways to specify the LLM endpoint:
|
| 129 |
+
|
| 130 |
+
**Option A — endpoint shorthand (recommended)**: pass
|
| 131 |
+
`endpoint="vllm"` (or `"openai"` / `"hf_router"`). The server resolves
|
| 132 |
+
`base_url`, `api_key`, and `model` from env vars + catalog defaults.
|
| 133 |
+
Any explicit field overrides the catalog.
|
| 134 |
+
|
| 135 |
+
**Option B — fully explicit**: pass `base_url` + `api_key` + `model`
|
| 136 |
+
directly.
|
| 137 |
+
|
| 138 |
+
| Arg | Type | Default | Notes |
|
| 139 |
+
|---|---|---|---|
|
| 140 |
+
| `endpoint` | `str` | `""` | One of `"vllm"` / `"openai"` / `"hf_router"`. |
|
| 141 |
+
| `base_url` / `api_key` / `model` | `str` | `""` | Override / supply explicitly. |
|
| 142 |
+
| `instruction` | `str` | required | Prompt passed to `opencode run`. |
|
| 143 |
+
| `setup` | `list[str]` | `[]` | Bash commands run **before** the agent. |
|
| 144 |
+
| `verify` | `list[str]` | `[]` | Bash commands run **after** the agent. |
|
| 145 |
+
| `task_id` | `str` | `""` | Echoed back in result. |
|
| 146 |
+
| `mode` | `str` | `"transparent_proxy"` | Or `"black_box"` (no logprobs). |
|
| 147 |
+
| `disable_thinking` | `bool \| None` | `None` (catalog default) | Inject `chat_template_kwargs.enable_thinking=false`. |
|
| 148 |
+
| `max_tokens_cap` | `int` | `4096` | Per-turn `max_tokens` clamp. |
|
| 149 |
+
| `top_logprobs` | `int` | `5` | HF Router cap is 5; OpenAI 0–20; vLLM unbounded. |
|
| 150 |
+
| `agent_timeout_s` | `float` | `600.0` | Hard wall budget for opencode. |
|
| 151 |
+
| `template` | `str` | `""` | E2B template name; `"opencode-rl"` skips ~2 min of install per rollout. |
|
| 152 |
+
|
| 153 |
+
Returns `RolloutResult` JSON with: `reward`, `setup_results[]`,
|
| 154 |
+
`verify_results[]`, `proxy_turns[]`, `files{}`, `agent_log_tail`,
|
| 155 |
+
`proxy_log_tail`, `wall_s`, `agent_exit_code`, `sandbox_id`, `error`.
|
| 156 |
+
|
| 157 |
+
## Two Operating Modes
|
| 158 |
+
|
| 159 |
+
| Mode | What it does | Best for |
|
| 160 |
+
|---|---|---|
|
| 161 |
+
| **`transparent_proxy`** (default) | In-sandbox proxy at `localhost:7000` forwards opencode's LLM calls to `base_url`, injects `logprobs=true`, captures per-turn `(messages, completion_tokens, logprobs)` to `proxy_trace.jsonl`. | GRPO / RL training, observability, top-k distillation. |
|
| 162 |
+
| **`black_box`** | No proxy. opencode talks straight to `base_url`. | Smoke tests, eval, SFT data collection. |
|
| 163 |
+
|
| 164 |
+
## Environment Variables
|
| 165 |
+
|
| 166 |
+
The server reads these at runtime. Local dev auto-loads them from a
|
| 167 |
+
sibling `.env` file; on HF Spaces, set them as **Space secrets**.
|
| 168 |
+
|
| 169 |
+
| Variable | Required | Purpose |
|
| 170 |
+
|---|---|---|
|
| 171 |
+
| `E2B_API_KEY` | **yes** for any rollout | E2B sandbox credentials. |
|
| 172 |
+
| `MAX_CONCURRENT_ENVS` | no | Env-instance pool size. Default `4`. |
|
| 173 |
+
| `ENABLE_WEB_INTERFACE` | no | Set `false` to disable the `/web` Gradio mount. Default `true`. |
|
| 174 |
+
| **vLLM endpoint** | | |
|
| 175 |
+
| `VLLM_URL` | required for `endpoint="vllm"` | OAI-compatible base URL. |
|
| 176 |
+
| `VLLM_API_KEY` | no | Defaults to `intercepted`. |
|
| 177 |
+
| `VLLM_MODEL` | no | Defaults to `Qwen/Qwen3.5-4B`. |
|
| 178 |
+
| **OpenAI endpoint** | | |
|
| 179 |
+
| `OPENAI_API_KEY` | required for `endpoint="openai"` | Standard OpenAI key. |
|
| 180 |
+
| `OPENAI_BASE_URL` | no | Defaults to `https://api.openai.com/v1`. |
|
| 181 |
+
| `OPENAI_MODEL` | no | Defaults to `gpt-4o-mini` (gpt-5.x and o-series refuse logprobs). |
|
| 182 |
+
| **HF Router endpoint** | | |
|
| 183 |
+
| `HF_ROUTER_API_KEY` | required for `endpoint="hf_router"` | HF user token. |
|
| 184 |
+
| `HF_ROUTER_BASE_URL` | no | Defaults to `https://router.huggingface.co/v1`. |
|
| 185 |
+
| `HF_ROUTER_MODEL` | no | Defaults to `Qwen/Qwen3-4B-Instruct-2507:nscale`. |
|
| 186 |
+
|
| 187 |
+
Pick `provider:` suffixes that actually return logprobs:
|
| 188 |
+
**Together / Nscale / Scaleway / SambaNova / Cerebras**. Avoid Novita /
|
| 189 |
+
Hyperbolic / Featherless (silent drop) and Groq (HTTP 400).
|
| 190 |
+
|
| 191 |
+
## Pre-baked E2B Template
|
| 192 |
+
|
| 193 |
+
The first rollout in a fresh E2B sandbox spends ~2 min installing
|
| 194 |
+
opencode and the proxy's Python deps. Build a one-time template that
|
| 195 |
+
ships those pre-installed:
|
| 196 |
+
|
| 197 |
+
```bash
|
| 198 |
+
.venv/bin/python envs/opencode_env/sandbox/build_template.py
|
| 199 |
+
# → builds `opencode-rl` template in your E2B account (~1m20s, one-time)
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
After this, pass `template="opencode-rl"` on every `run_rollout` call —
|
| 203 |
+
each rollout drops to ~20–30s end-to-end.
|
| 204 |
+
|
| 205 |
+
## Tests
|
| 206 |
+
|
| 207 |
+
A cheap pre-flight (no E2B, no opencode — just hits each LLM endpoint
|
| 208 |
+
once with a tiny request to confirm it returns logprobs):
|
| 209 |
+
|
| 210 |
+
```bash
|
| 211 |
+
.venv/bin/python envs/opencode_env/tests/test_inference_endpoints.py
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
Multi-endpoint end-to-end (spawns one E2B sandbox per endpoint, runs
|
| 215 |
+
opencode on a sorting task, prints a comparison table):
|
| 216 |
+
|
| 217 |
+
```bash
|
| 218 |
+
.venv/bin/python envs/opencode_env/tests/test_five_sorts_e2e.py \
|
| 219 |
+
--endpoint all --template opencode-rl
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
## Project Structure
|
| 223 |
+
|
| 224 |
+
```
|
| 225 |
+
opencode_env/
|
| 226 |
+
├── README.md # this file
|
| 227 |
+
├── openenv.yaml # OpenEnv space spec
|
| 228 |
+
├── pyproject.toml # deps + ``server`` entrypoint
|
| 229 |
+
├── uv.lock # frozen deps (required for openenv validate)
|
| 230 |
+
├── .gitignore / .dockerignore # excludes .env / __pycache__ / artifacts
|
| 231 |
+
├── __init__.py # re-exports primitive + client + models
|
| 232 |
+
│
|
| 233 |
+
├── client.py # OpenCodeEnv(MCPToolClient)
|
| 234 |
+
├── models.py # RolloutResult / RolloutTurn / OpenCodeState
|
| 235 |
+
│
|
| 236 |
+
├── config.py # OpenCodeConfig (primitive)
|
| 237 |
+
├── harness.py # OpenCodeSession / OpenCodeSessionFactory (CLI-only)
|
| 238 |
+
├── opencode_runtime.py # opencode.json builder + cmds
|
| 239 |
+
├── task.py # OpenCodeTask
|
| 240 |
+
│
|
| 241 |
+
├── server/
|
| 242 |
+
│ ├── __init__.py
|
| 243 |
+
│ ├── app.py # FastAPI factory; mounts Gradio at /web
|
| 244 |
+
│ ├── opencode_environment.py # MCPEnvironment with single ``run_rollout`` tool
|
| 245 |
+
│ ├── gradio_ui.py # the /web Gradio Blocks UI
|
| 246 |
+
│ ├── catalog.py # endpoint shorthand resolver
|
| 247 |
+
│ └── Dockerfile # multi-stage uv build (used by ``openenv build``)
|
| 248 |
+
│
|
| 249 |
+
├── sandbox/
|
| 250 |
+
│ ├── __init__.py
|
| 251 |
+
│ ├── base.py # SandboxBackend / SandboxHandle Protocols
|
| 252 |
+
│ ├── e2b.py # E2B implementation
|
| 253 |
+
│ ├── interception.py # in-sandbox FastAPI proxy (logprob capture)
|
| 254 |
+
│ └── build_template.py # one-time E2B template builder
|
| 255 |
+
│
|
| 256 |
+
└── tests/ # pre-flight + e2e + unit tests
|
| 257 |
+
```
|
| 258 |
+
|
| 259 |
+
## References
|
| 260 |
+
|
| 261 |
+
- [OpenEnv docs](https://meta-pytorch.org/OpenEnv/)
|
| 262 |
+
- [OpenCode CLI](https://opencode.ai/docs/cli/)
|
| 263 |
+
- [E2B Python SDK](https://e2b.dev/docs)
|
| 264 |
+
- [HF Inference Providers logprob matrix](../../../DOCS/HF/hf_inference_providers_logprobs.md)
|
__init__.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""OpenCode environment for OpenEnv.
|
| 8 |
+
|
| 9 |
+
Two layers in this package:
|
| 10 |
+
|
| 11 |
+
1. **Harness primitive** — :class:`OpenCodeSessionFactory` /
|
| 12 |
+
:class:`OpenCodeSession` / :class:`OpenCodeConfig` /
|
| 13 |
+
:class:`E2BSandboxBackend`. Used in-process to drive one rollout
|
| 14 |
+
inside an E2B sandbox. See ``harness.py``.
|
| 15 |
+
|
| 16 |
+
2. **Deployable env** — :class:`OpenCodeEnv` (MCP client) talks to the
|
| 17 |
+
FastAPI server at ``server/app.py`` over HTTP. Use this when the
|
| 18 |
+
sandbox + agent live behind an HTTP boundary (e.g. an HF Space).
|
| 19 |
+
See ``client.py`` and ``server/``.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction
|
| 23 |
+
|
| 24 |
+
from .client import OpenCodeEnv
|
| 25 |
+
from .config import OpenCodeConfig, Provider
|
| 26 |
+
from .harness import OpenCodeSession, OpenCodeSessionFactory
|
| 27 |
+
from .models import (
|
| 28 |
+
CommandResult,
|
| 29 |
+
OpenCodeState,
|
| 30 |
+
RolloutResult,
|
| 31 |
+
RolloutTurn,
|
| 32 |
+
)
|
| 33 |
+
from .sandbox import E2BSandboxBackend, SandboxBackend, SandboxHandle
|
| 34 |
+
from .task import OpenCodeTask
|
| 35 |
+
|
| 36 |
+
__all__ = [
|
| 37 |
+
# Deployed-env client
|
| 38 |
+
"OpenCodeEnv",
|
| 39 |
+
"CallToolAction",
|
| 40 |
+
"ListToolsAction",
|
| 41 |
+
# HTTP API models
|
| 42 |
+
"CommandResult",
|
| 43 |
+
"OpenCodeState",
|
| 44 |
+
"RolloutResult",
|
| 45 |
+
"RolloutTurn",
|
| 46 |
+
# Harness primitive
|
| 47 |
+
"OpenCodeConfig",
|
| 48 |
+
"OpenCodeSession",
|
| 49 |
+
"OpenCodeSessionFactory",
|
| 50 |
+
"OpenCodeTask",
|
| 51 |
+
"Provider",
|
| 52 |
+
# Sandbox backend
|
| 53 |
+
"E2BSandboxBackend",
|
| 54 |
+
"SandboxBackend",
|
| 55 |
+
"SandboxHandle",
|
| 56 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Client for the deployed opencode_env server.
|
| 8 |
+
|
| 9 |
+
The server exposes a single MCP tool ``run_rollout`` that runs one OpenCode
|
| 10 |
+
rollout in an E2B sandbox and returns a JSON-serialized :class:`RolloutResult`.
|
| 11 |
+
|
| 12 |
+
Example::
|
| 13 |
+
|
| 14 |
+
from opencode_env import OpenCodeEnv
|
| 15 |
+
|
| 16 |
+
with OpenCodeEnv(base_url="https://adithya-sk-opencode-env.hf.space") as env:
|
| 17 |
+
env.reset()
|
| 18 |
+
result = env.run_rollout(
|
| 19 |
+
base_url="https://api.openai.com/v1",
|
| 20 |
+
api_key=os.environ["OPENAI_API_KEY"],
|
| 21 |
+
model="gpt-4o-mini",
|
| 22 |
+
instruction="Create binary_search.py exposing def binary_search(arr, target) -> int...",
|
| 23 |
+
setup=[],
|
| 24 |
+
verify=["python /home/user/test.py"],
|
| 25 |
+
task_id="binary_search_v1",
|
| 26 |
+
)
|
| 27 |
+
print(result.reward, len(result.proxy_turns))
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
from __future__ import annotations
|
| 31 |
+
|
| 32 |
+
import json
|
| 33 |
+
from typing import Any
|
| 34 |
+
|
| 35 |
+
from openenv.core.mcp_client import MCPToolClient
|
| 36 |
+
|
| 37 |
+
try:
|
| 38 |
+
from .models import RolloutResult
|
| 39 |
+
except ImportError: # pragma: no cover
|
| 40 |
+
from models import RolloutResult # type: ignore
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class OpenCodeEnv(MCPToolClient):
|
| 44 |
+
"""Typed client for the opencode_env MCP server.
|
| 45 |
+
|
| 46 |
+
Inherits ``reset`` / ``call_tool`` / ``list_tools`` / ``from_docker_image``
|
| 47 |
+
/ context-manager semantics from :class:`MCPToolClient`.
|
| 48 |
+
"""
|
| 49 |
+
|
| 50 |
+
def run_rollout(
|
| 51 |
+
self,
|
| 52 |
+
*,
|
| 53 |
+
# Endpoint — pass either the shorthand selector OR explicit fields.
|
| 54 |
+
endpoint: str = "", # "vllm" | "openai" | "hf_router"
|
| 55 |
+
base_url: str = "",
|
| 56 |
+
api_key: str = "",
|
| 57 |
+
model: str = "",
|
| 58 |
+
# Task — the "list of bash commands" shape
|
| 59 |
+
instruction: str,
|
| 60 |
+
setup: list[str] | None = None,
|
| 61 |
+
verify: list[str] | None = None,
|
| 62 |
+
# Bookkeeping / tunables
|
| 63 |
+
task_id: str = "",
|
| 64 |
+
mode: str = "transparent_proxy",
|
| 65 |
+
disable_thinking: bool | None = None,
|
| 66 |
+
max_tokens_cap: int = 4096,
|
| 67 |
+
top_logprobs: int = 5,
|
| 68 |
+
agent_timeout_s: float = 600.0,
|
| 69 |
+
template: str = "",
|
| 70 |
+
) -> RolloutResult:
|
| 71 |
+
"""Run one OpenCode rollout and return the typed result.
|
| 72 |
+
|
| 73 |
+
Args:
|
| 74 |
+
base_url: OpenAI-compatible LLM endpoint (with trailing /v1).
|
| 75 |
+
api_key: Bearer token for the LLM. Use ``"intercepted"`` for vLLM
|
| 76 |
+
if it doesn't enforce auth.
|
| 77 |
+
model: Model id understood by the LLM endpoint
|
| 78 |
+
(e.g. ``"gpt-4o-mini"``, ``"Qwen/Qwen3.5-4B"``,
|
| 79 |
+
``"Qwen/Qwen3-4B-Instruct-2507:nscale"``).
|
| 80 |
+
instruction: Prompt passed to ``opencode run``.
|
| 81 |
+
setup: Bash commands run sequentially **before** the agent starts.
|
| 82 |
+
Each command runs in the sandbox; non-zero exit aborts setup.
|
| 83 |
+
verify: Bash commands run sequentially **after** the agent exits.
|
| 84 |
+
Reward = ``passed_count / total`` unless any command writes a
|
| 85 |
+
float to ``/home/user/logs/verifier/reward.txt`` (override).
|
| 86 |
+
task_id: Echoed back in the result for traceability.
|
| 87 |
+
mode: ``"transparent_proxy"`` (captures per-token logprobs via
|
| 88 |
+
an in-sandbox FastAPI proxy) or ``"black_box"`` (no proxy).
|
| 89 |
+
disable_thinking: Inject
|
| 90 |
+
``chat_template_kwargs.enable_thinking=false`` on forwarded
|
| 91 |
+
requests. Needed for Qwen3.5 vLLM; harmless on Instruct
|
| 92 |
+
variants; rejected by OpenAI direct.
|
| 93 |
+
max_tokens_cap: Clamp on per-turn ``max_tokens``. OpenCode asks
|
| 94 |
+
for ~32k by default; gpt-4o-mini caps at 16k.
|
| 95 |
+
top_logprobs: Top-k logprobs requested upstream. HF Router caps
|
| 96 |
+
at 5; OpenAI accepts up to 20; vLLM is unbounded.
|
| 97 |
+
agent_timeout_s: Hard wall-clock budget for one ``opencode run``.
|
| 98 |
+
template: E2B template name (e.g. ``"opencode-rl"``). Empty
|
| 99 |
+
string uses the default (slow) base image.
|
| 100 |
+
|
| 101 |
+
Returns:
|
| 102 |
+
A :class:`RolloutResult` with reward, per-turn logprobs, file
|
| 103 |
+
outputs, setup/verify results, and diagnostic tails.
|
| 104 |
+
"""
|
| 105 |
+
raw = self.call_tool(
|
| 106 |
+
"run_rollout",
|
| 107 |
+
endpoint=endpoint,
|
| 108 |
+
base_url=base_url,
|
| 109 |
+
api_key=api_key,
|
| 110 |
+
model=model,
|
| 111 |
+
instruction=instruction,
|
| 112 |
+
setup=list(setup or []),
|
| 113 |
+
verify=list(verify or []),
|
| 114 |
+
task_id=task_id,
|
| 115 |
+
mode=mode,
|
| 116 |
+
disable_thinking=disable_thinking,
|
| 117 |
+
max_tokens_cap=max_tokens_cap,
|
| 118 |
+
top_logprobs=top_logprobs,
|
| 119 |
+
agent_timeout_s=agent_timeout_s,
|
| 120 |
+
template=template,
|
| 121 |
+
)
|
| 122 |
+
return RolloutResult.model_validate_json(_extract_text(raw))
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _extract_text(result: Any) -> str:
|
| 126 |
+
"""Pull the JSON text out of whatever shape the MCP layer returns.
|
| 127 |
+
|
| 128 |
+
Handles the three shapes :meth:`MCPToolClient.call_tool` may surface:
|
| 129 |
+
a raw string, a ``CallToolObservation``-like object with
|
| 130 |
+
``.result.content[0].text``, or a dict with ``content[0]["text"]``.
|
| 131 |
+
"""
|
| 132 |
+
if isinstance(result, str):
|
| 133 |
+
return result
|
| 134 |
+
|
| 135 |
+
inner = getattr(result, "result", None)
|
| 136 |
+
if inner is not None:
|
| 137 |
+
content = getattr(inner, "content", None)
|
| 138 |
+
if content:
|
| 139 |
+
first = content[0]
|
| 140 |
+
text = getattr(first, "text", None)
|
| 141 |
+
if isinstance(text, str):
|
| 142 |
+
return text
|
| 143 |
+
if isinstance(first, dict) and "text" in first:
|
| 144 |
+
return first["text"]
|
| 145 |
+
|
| 146 |
+
if isinstance(result, dict):
|
| 147 |
+
content = result.get("content")
|
| 148 |
+
if isinstance(content, list) and content:
|
| 149 |
+
first = content[0]
|
| 150 |
+
if isinstance(first, dict) and "text" in first:
|
| 151 |
+
return first["text"]
|
| 152 |
+
nested = result.get("result")
|
| 153 |
+
if isinstance(nested, dict):
|
| 154 |
+
content = nested.get("content")
|
| 155 |
+
if isinstance(content, list) and content:
|
| 156 |
+
first = content[0]
|
| 157 |
+
if isinstance(first, dict) and "text" in first:
|
| 158 |
+
return first["text"]
|
| 159 |
+
return json.dumps(result, default=str)
|
| 160 |
+
|
| 161 |
+
content = getattr(result, "content", None)
|
| 162 |
+
if content:
|
| 163 |
+
first = content[0]
|
| 164 |
+
text = getattr(first, "text", None)
|
| 165 |
+
if isinstance(text, str):
|
| 166 |
+
return text
|
| 167 |
+
|
| 168 |
+
return str(result)
|
config.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Configuration model for the OpenCode harness primitive."""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from typing import Any, Literal
|
| 12 |
+
|
| 13 |
+
from pydantic import BaseModel, Field
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
Provider = Literal["openai_compatible", "openai", "anthropic"]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class OpenCodeConfig(BaseModel):
|
| 20 |
+
"""All configuration required to launch one OpenCode rollout in a sandbox.
|
| 21 |
+
|
| 22 |
+
Field names are provider-agnostic. The primitive maps ``provider`` onto the
|
| 23 |
+
correct ``opencode.json`` provider block (``@ai-sdk/openai-compatible``,
|
| 24 |
+
``@ai-sdk/openai``, or ``@ai-sdk/anthropic``) and injects ``base_url`` /
|
| 25 |
+
``api_key`` into it.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
# --- LLM endpoint ---------------------------------------------------------
|
| 29 |
+
provider: Provider = "openai_compatible"
|
| 30 |
+
base_url: str
|
| 31 |
+
api_key: str = "intercepted"
|
| 32 |
+
model: str = "intercepted/model"
|
| 33 |
+
request_timeout_ms: int = 600_000
|
| 34 |
+
|
| 35 |
+
# --- OpenCode CLI ---------------------------------------------------------
|
| 36 |
+
opencode_version: str = "latest"
|
| 37 |
+
disabled_tools: list[str] = Field(
|
| 38 |
+
default_factory=lambda: ["webfetch", "question"]
|
| 39 |
+
)
|
| 40 |
+
enabled_tools: list[str] | None = None
|
| 41 |
+
system_prompt: str | None = None
|
| 42 |
+
extra_opencode_json: dict[str, Any] = Field(default_factory=dict)
|
| 43 |
+
|
| 44 |
+
# --- CLI invocation -------------------------------------------------------
|
| 45 |
+
run_format: Literal["default", "json"] = "json"
|
| 46 |
+
agent_timeout_s: float = 900.0
|
| 47 |
+
extra_env: dict[str, str] = Field(default_factory=dict)
|
| 48 |
+
extra_setup_shell: str | None = None
|
| 49 |
+
|
| 50 |
+
# --- Sandbox paths --------------------------------------------------------
|
| 51 |
+
# Root directory inside the sandbox where the primitive writes config,
|
| 52 |
+
# task files, and logs. E2B's default user is ``user`` with home
|
| 53 |
+
# ``/home/user``. Override when using a root-privileged backend (Docker).
|
| 54 |
+
sandbox_home: str = "/home/user"
|
| 55 |
+
|
| 56 |
+
# --- Transparent-proxy tuning --------------------------------------------
|
| 57 |
+
# Cap ``max_tokens`` / ``max_completion_tokens`` on forwarded requests.
|
| 58 |
+
# OpenCode defaults to a very large number (~32000) which exceeds some
|
| 59 |
+
# provider limits (e.g. gpt-4o-mini = 16384). Only used in
|
| 60 |
+
# ``mode="transparent_proxy"``. ``None`` disables the cap.
|
| 61 |
+
proxy_max_tokens_cap: int | None = 16384
|
| 62 |
+
# Per-turn top-k logprobs the proxy requests from the upstream.
|
| 63 |
+
proxy_top_logprobs: int = 5
|
| 64 |
+
# Disable reasoning/thinking mode for Qwen3 / Qwen3.5 models. Proxy sets
|
| 65 |
+
# ``extra_body.chat_template_kwargs.enable_thinking=false`` on forwarded
|
| 66 |
+
# requests. Ignored by providers that don't support the field.
|
| 67 |
+
proxy_disable_thinking: bool = False
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Map of supported provider names to the Vercel AI SDK npm package opencode
# loads for that provider. Keep in sync with the ``Provider`` literal.
_PROVIDER_NPM = {
    "openai_compatible": "@ai-sdk/openai-compatible",
    "openai": "@ai-sdk/openai",
    "anthropic": "@ai-sdk/anthropic",
}


def provider_npm_package(provider: Provider) -> str:
    """Return the AI SDK npm package opencode should use for a provider.

    Raises:
        KeyError: if *provider* is not one of the supported provider names.
            The message lists the valid options (previously a bare KeyError).
    """
    try:
        return _PROVIDER_NPM[provider]
    except KeyError:
        raise KeyError(
            f"unknown provider {provider!r}; expected one of {sorted(_PROVIDER_NPM)}"
        ) from None
|
harness.py
ADDED
|
@@ -0,0 +1,525 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""OpenCode session factory + session implementation.
|
| 8 |
+
|
| 9 |
+
Implements the :class:`ResourceSessionFactory` / :class:`ResourceSession`
|
| 10 |
+
contracts from ``openenv.core.harness`` (PR #471). The session wraps one
|
| 11 |
+
sandbox running the ``opencode`` CLI agent.
|
| 12 |
+
|
| 13 |
+
Two operating modes:
|
| 14 |
+
|
| 15 |
+
- ``mode="black_box"`` — opencode talks directly to ``config.base_url``.
|
| 16 |
+
No proxy, no logprob capture. Use for smoke tests / SFT / eval.
|
| 17 |
+
- ``mode="transparent_proxy"`` (default) — an in-sandbox FastAPI proxy
|
| 18 |
+
sits between opencode and the upstream LLM. It injects ``logprobs=true``
|
| 19 |
+
on every request and writes per-turn ``(messages, completion_tokens,
|
| 20 |
+
per_token_logps)`` to ``proxy_trace.jsonl`` for GRPO consumption.
|
| 21 |
+
|
| 22 |
+
Single driver path: opencode is started as a background subprocess via
|
| 23 |
+
``opencode run --format json --dangerously-skip-permissions ...`` and we
|
| 24 |
+
poll its exit code. The previous ``opencode serve`` driver was removed —
|
| 25 |
+
opencode CLI is the only path now.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from __future__ import annotations
|
| 29 |
+
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
from typing import Any, Callable, Literal
|
| 32 |
+
|
| 33 |
+
from openenv.core.env_server.mcp_types import Tool
|
| 34 |
+
from openenv.core.harness import (
|
| 35 |
+
Message,
|
| 36 |
+
ResourceSession,
|
| 37 |
+
ResourceSessionFactory,
|
| 38 |
+
ToolResult,
|
| 39 |
+
VerifyResult,
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
from .config import OpenCodeConfig
|
| 43 |
+
from .opencode_runtime import (
|
| 44 |
+
agent_log_path,
|
| 45 |
+
build_env_vars,
|
| 46 |
+
build_install_cmd,
|
| 47 |
+
build_opencode_json,
|
| 48 |
+
build_run_cmd,
|
| 49 |
+
instruction_path,
|
| 50 |
+
opencode_config_path,
|
| 51 |
+
system_prompt_path,
|
| 52 |
+
)
|
| 53 |
+
from .sandbox.base import BgJob, SandboxBackend, SandboxHandle
|
| 54 |
+
from .task import OpenCodeTask
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# Inside-sandbox proxy paths (Mode B).
_PROXY_PORT = 7000
_PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl"
_PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log"

# Where the proxy source lives on disk (in this repo). Uploaded into the
# sandbox at /home/user/proxy/interception.py before each rollout, unless
# the sandbox was created from a template that already has it baked in.
_PROXY_SOURCE_PATH = Path(__file__).parent / "sandbox" / "interception.py"


# Signature of a custom reward function: given the live sandbox and the task,
# return a VerifyResult. Invoked by OpenCodeSession.verify().
Verifier = Callable[[SandboxHandle, OpenCodeTask], VerifyResult]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class OpenCodeSession(ResourceSession):
    """One live OpenCode rollout inside a sandbox.

    The session is created already-running: :meth:`OpenCodeSessionFactory.create`
    calls :meth:`start_agent` before returning. Typical usage::

        session = factory.create(task)
        session.wait_for_completion()
        result = session.verify([])
        session.close()
    """

    def __init__(
        self,
        *,
        sandbox: SandboxHandle,
        config: OpenCodeConfig,
        task: OpenCodeTask,
        verifier: Verifier | None = None,
        base_url_override: str | None = None,
        proxy_trace_path: str | None = None,
        proxy_bg_job: BgJob | None = None,
    ) -> None:
        self.sandbox = sandbox
        self.config = config
        self.task = task
        self._verifier = verifier
        self._base_url_override = base_url_override
        # The agent bg job is started lazily by start_agent(), never here.
        self._bg_job: BgJob | None = None
        self._proxy_trace_path = proxy_trace_path
        self._proxy_bg_job = proxy_bg_job

    # ------------------------------------------------------------------
    # ResourceSession contract (PR #471)
    # ------------------------------------------------------------------
    def initial_messages(self) -> list[Message]:
        """Seed conversation: the task instruction as a single user message."""
        return [{"role": "user", "content": self.task.instruction}]

    def list_tools(self) -> list[Tool]:
        # OpenCode owns its own tool loop — none are exposed to the harness.
        return []

    def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult:
        """Always returns an error: the CLI agent drives its tools internally."""
        return ToolResult(
            error=(
                "OpenCodeSession does not expose external tool calls; the "
                "CLI agent owns its own tool loop."
            )
        )

    def verify(
        self,
        transcript: list[Message],
        final_state: Any | None = None,
    ) -> VerifyResult:
        """Run the configured verifier (if any) against the sandbox state.

        ``transcript`` / ``final_state`` are accepted for the contract but
        ignored — the verifier inspects the sandbox filesystem instead.
        """
        if self._verifier is None:
            return VerifyResult(env_reward=None, done=True)
        return self._verifier(self.sandbox, self.task)

    @staticmethod
    def _kill_quietly(job: BgJob | None) -> None:
        # Best-effort kill — the job may already have exited or the sandbox
        # may be gone; teardown must continue either way.
        if job is None:
            return
        try:
            job.kill()
        except Exception:
            pass

    def close(self) -> None:
        """Tear down the agent job, the proxy job (Mode B), then the sandbox."""
        self._kill_quietly(self._bg_job)
        self._bg_job = None
        self._kill_quietly(self._proxy_bg_job)
        self._proxy_bg_job = None
        self.sandbox.kill()

    # ------------------------------------------------------------------
    # OpenCode-specific session API
    # ------------------------------------------------------------------
    def start_agent(self) -> None:
        """Launch ``opencode run`` as a background subprocess in the sandbox."""
        if self._bg_job is not None:
            # Idempotent: a repeat call is a no-op rather than a double-launch.
            return
        cmd = build_run_cmd(self.config)
        envs = build_env_vars(self.config, base_url_override=self._base_url_override)
        self._bg_job = self.sandbox.start_bg(cmd, envs=envs)

    def wait_for_completion(self, timeout_s: float | None = None) -> int:
        """Block until the agent exits, returning its exit code.

        Args:
            timeout_s: optional override of ``config.agent_timeout_s``.

        Raises:
            RuntimeError: if :meth:`start_agent` was never called.
        """
        budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s
        if self._bg_job is None:
            raise RuntimeError("Agent not started; call start_agent() first.")
        return self._bg_job.wait(timeout=budget)

    def fetch_trace(self) -> str:
        """Return the raw ``opencode run`` log (JSON-lines when ``run_format=json``)."""
        return self.sandbox.read_text(agent_log_path(self.config))

    def fetch_proxy_trace(self) -> list[dict[str, Any]]:
        """Return per-turn proxy-captured records (Mode B only).

        Each entry has ``request``, ``response``, ``completion_tokens``,
        ``completion_token_ids``, ``per_token_logps``, ``finish_reason``,
        and ``latency_s``. Returns ``[]`` in Mode A.
        """
        if self._proxy_trace_path is None:
            return []
        try:
            content = self.sandbox.read_text(self._proxy_trace_path)
        except Exception:
            # Best effort: the trace file may not exist yet (zero turns) or
            # the sandbox may already be gone.
            return []
        # Hoisted out of the per-line loop (was re-imported every iteration);
        # kept as a local import because this module has no top-level json.
        import json

        return [json.loads(line) for line in content.splitlines() if line.strip()]
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
class OpenCodeSessionFactory(ResourceSessionFactory):
    """Produce isolated per-rollout :class:`OpenCodeSession` instances.

    The factory owns sandbox provisioning, opencode install, config injection,
    and (Mode B) proxy startup. Each :meth:`create` call returns a fresh
    sandbox with a running agent.
    """

    def __init__(
        self,
        *,
        config: OpenCodeConfig,
        sandbox_backend: SandboxBackend,
        mode: Literal["black_box", "transparent_proxy"] = "black_box",
        verifier: Verifier | None = None,
        install_timeout_s: int = 240,
        setup_timeout_s: int = 300,
    ) -> None:
        # Runtime guard in addition to the Literal annotation — callers may
        # pass an arbitrary string at runtime.
        if mode not in {"black_box", "transparent_proxy"}:
            raise ValueError(f"Unknown mode: {mode!r}")
        self._config = config
        self._backend = sandbox_backend
        self._mode = mode
        self._verifier = verifier
        self._install_timeout_s = install_timeout_s
        self._setup_timeout_s = setup_timeout_s

    def create(
        self,
        task: Any,
        seed: int | None = None,
        episode_id: str | None = None,
    ) -> OpenCodeSession:
        """Provision a sandbox, bootstrap it, (Mode B) start the proxy,
        and return a session whose agent is already running.

        ``seed`` is accepted for the factory contract but currently unused.
        """
        import logging
        _log = logging.getLogger(__name__)

        oc_task = OpenCodeTask.coerce(task)
        # Sandbox must outlive the agent budget; +300s covers bootstrap,
        # proxy startup, and verification.
        sandbox_timeout = int(self._config.agent_timeout_s) + 300

        _log.info(
            "factory.create: creating sandbox timeout=%ds mode=%s",
            sandbox_timeout, self._mode,
        )
        sandbox = self._backend.create(
            timeout_s=sandbox_timeout,
            metadata={"episode_id": episode_id} if episode_id else None,
        )
        # Backends differ: some expose sandbox_id directly, some via .raw.
        sid = (
            getattr(sandbox, "sandbox_id", None)
            or getattr(getattr(sandbox, "raw", None), "sandbox_id", "?")
        )
        _log.info("factory.create: sandbox=%s — bootstrapping…", sid)
        try:
            self._bootstrap_sandbox(sandbox, oc_task)
        except Exception as exc:
            # Never leak a half-bootstrapped sandbox.
            _log.error("factory.create: bootstrap failed: %r", exc)
            sandbox.kill()
            raise

        base_url_override: str | None = None
        proxy_trace_path: str | None = None
        proxy_bg_job: BgJob | None = None
        if self._mode == "transparent_proxy":
            _log.info(
                "factory.create: starting interception proxy on :%d → %s",
                _PROXY_PORT, self._config.base_url,
            )
            proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy(
                sandbox
            )
            _log.info("factory.create: proxy up at %s", base_url_override)
            # Rewrite opencode.json so opencode points at the proxy. Force
            # ``openai_compatible`` so opencode hits ``/v1/chat/completions``
            # (which the proxy serves) rather than provider-specific paths.
            from .config import OpenCodeConfig as _OCC

            proxy_cfg = _OCC(
                **{
                    **self._config.model_dump(),
                    "provider": "openai_compatible",
                    "base_url": base_url_override,
                }
            )
            sandbox.write_text(
                opencode_config_path(self._config),
                build_opencode_json(proxy_cfg),
            )

        session = OpenCodeSession(
            sandbox=sandbox,
            config=self._config,
            task=oc_task,
            verifier=self._verifier,
            base_url_override=base_url_override,
            proxy_trace_path=proxy_trace_path,
            proxy_bg_job=proxy_bg_job,
        )
        session.start_agent()
        return session

    # ------------------------------------------------------------------
    def _wait_for_sandbox_ready(
        self,
        sandbox: SandboxHandle,
        *,
        attempts: int = 15,
        delay_s: float = 1.0,
    ) -> None:
        """Probe the sandbox until ``echo ok`` succeeds.

        E2B (and other backends) sometimes return the handle before the
        guest is fully ready. Issue ``echo ok`` with short timeouts until
        it succeeds. Returns silently on success; raises ``RuntimeError``
        on prolonged failure.
        """
        import time

        last_err = ""
        for _ in range(attempts):
            try:
                r = sandbox.exec("echo ok", timeout=5)
                if r.exit_code == 0 and "ok" in (r.stdout or ""):
                    return
                last_err = (r.stderr or r.stdout or "").strip() or f"exit={r.exit_code}"
            except Exception as exc:  # noqa: BLE001
                last_err = f"{type(exc).__name__}: {exc}"
            time.sleep(delay_s)
        raise RuntimeError(
            f"sandbox did not become ready within {attempts * delay_s:.0f}s "
            f"(last error: {last_err})"
        )

    def _exec_with_retry(
        self,
        sandbox: SandboxHandle,
        cmd: str,
        *,
        timeout: float,
        attempts: int = 3,
        backoff_s: float = 3.0,
        label: str = "cmd",
    ):
        """Run ``sandbox.exec`` with exponential backoff on transient failure.

        Transient = ``exit_code != 0`` AND empty stderr (SIGKILL / network
        blip signature) OR an exception during exec. Final failure is raised
        as ``RuntimeError`` carrying the last exit code + stderr.
        """
        import time

        last_stdout = ""
        last_stderr = ""
        last_exit = 0
        for i in range(attempts):
            try:
                r = sandbox.exec(cmd, timeout=timeout)
                if r.exit_code == 0:
                    return r
                last_stdout = r.stdout or ""
                last_stderr = r.stderr or ""
                last_exit = r.exit_code
                # Non-empty stderr ⇒ deterministic failure; stop retrying.
                if last_stderr.strip():
                    break
            except Exception as exc:  # noqa: BLE001
                last_stderr = f"{type(exc).__name__}: {exc}"
                last_exit = -1
            if i + 1 < attempts:
                # Backoff: backoff_s, 2*backoff_s, 4*backoff_s, ...
                time.sleep(backoff_s * (2**i))
        raise RuntimeError(
            f"{label} failed after {attempts} attempts "
            f"(exit={last_exit}, stderr={last_stderr!r}, stdout_tail={last_stdout[-400:]!r})"
        )

    def _opencode_already_installed(self, sandbox: SandboxHandle) -> bool:
        """Cheap probe — returns True if opencode is on disk in the sandbox.

        Used to skip the slow ``curl install`` step when running against a
        prebaked template that already ships opencode.
        """
        try:
            r = sandbox.exec(
                "/home/user/.opencode/bin/opencode --version",
                timeout=10,
            )
            return r.exit_code == 0
        except Exception:
            # Probe failure (timeout, dead sandbox) ⇒ assume not installed.
            return False

    def _bootstrap_sandbox(
        self,
        sandbox: SandboxHandle,
        task: OpenCodeTask,
    ) -> None:
        """Install opencode, write config + task files, run optional setup."""

        # Stage 1: wait for the sandbox to be responsive.
        self._wait_for_sandbox_ready(sandbox)

        # Stage 2: install opencode (skipped if a prebaked template already
        # has it). curl|bash is flaky — retry with backoff.
        if not self._opencode_already_installed(sandbox):
            self._exec_with_retry(
                sandbox,
                build_install_cmd(self._config),
                timeout=self._install_timeout_s,
                attempts=3,
                backoff_s=3.0,
                label="opencode install",
            )

        # Stage 3: config + task instruction files.
        sandbox.write_text(
            opencode_config_path(self._config),
            build_opencode_json(self._config),
        )
        sandbox.write_text(instruction_path(self._config), task.instruction)

        if self._config.system_prompt:
            sandbox.write_text(
                system_prompt_path(self._config),
                self._config.system_prompt,
            )

        # Stage 4: task-provided files (remote path -> text contents).
        for remote_path, content in task.upload_files.items():
            sandbox.write_text(remote_path, content)

        # Stage 5: config-level setup (retried), then task-level setup
        # (single attempt — a failing task setup is a hard error).
        if self._config.extra_setup_shell:
            self._exec_with_retry(
                sandbox,
                self._config.extra_setup_shell,
                timeout=self._setup_timeout_s,
                attempts=2,
                backoff_s=2.0,
                label="extra_setup_shell",
            )

        if task.setup_shell:
            r = sandbox.exec(task.setup_shell, timeout=self._setup_timeout_s)
            if r.exit_code != 0:
                raise RuntimeError(
                    f"task.setup_shell failed ({r.exit_code}): {r.stderr}"
                )

    def _start_proxy(
        self,
        sandbox: SandboxHandle,
    ) -> tuple[BgJob, str, str]:
        """Install proxy deps + start the proxy as a bg job inside the sandbox.

        Returns ``(proxy_bg_job, base_url_override, proxy_trace_path)``.
        Skips the pip install + source-upload steps when the prebaked
        template already has them in place.
        """
        proxy_already_present = sandbox.exists(
            "/home/user/proxy/interception.py"
        )

        if not proxy_already_present:
            # Install proxy deps (idempotent on retries).
            self._exec_with_retry(
                sandbox,
                "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' "
                "'httpx>=0.27' 2>&1 | tail -20",
                timeout=180,
                attempts=3,
                backoff_s=2.0,
                label="proxy deps install",
            )
            # Upload the proxy module into the sandbox.
            sandbox.write_text(
                "/home/user/proxy/interception.py",
                _PROXY_SOURCE_PATH.read_text(),
            )
            sandbox.write_text("/home/user/proxy/__init__.py", "")

        # Optional CLI flags, each rendered as "--flag value " (note the
        # trailing space — the command below concatenates them verbatim).
        cap_flag = ""
        if self._config.proxy_max_tokens_cap is not None:
            cap_flag = f"--max-tokens-cap {self._config.proxy_max_tokens_cap} "
        thinking_flag = ""
        if self._config.proxy_disable_thinking:
            thinking_flag = "--disable-thinking "
        # Force the upstream model id on every forwarded request — opencode's
        # internal title-gen call sometimes strips the provider prefix.
        model_override_flag = ""
        if self._config.model:
            model_override_flag = f"--model-override '{self._config.model}' "
        proxy_cmd = (
            "cd /home/user/proxy && "
            "python interception.py "
            f"--upstream-url {self._config.base_url} "
            f"--upstream-api-key {self._config.api_key} "
            f"--trace {_PROXY_TRACE_PATH} "
            f"--port {_PROXY_PORT} "
            f"--top-logprobs {self._config.proxy_top_logprobs} "
            f"{cap_flag}"
            f"{thinking_flag}"
            f"{model_override_flag}"
            f"> {_PROXY_LOG_PATH} 2>&1"
        )
        proxy_job = sandbox.start_bg(proxy_cmd)

        # Wait for the proxy to start listening. Cold uvicorn boot inside
        # E2B can take anywhere from <1s to ~30s depending on cache state.
        import time

        attempts = 120
        interval_s = 0.5
        for _ in range(attempts):
            r = sandbox.exec(
                f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz",
                timeout=5,
            )
            if r.exit_code == 0:
                break
            time.sleep(interval_s)
        else:
            # Never came up — surface the proxy log tail for diagnosis.
            log = ""
            try:
                log = sandbox.read_text(_PROXY_LOG_PATH)
            except Exception:
                pass
            proxy_job.kill()
            raise RuntimeError(
                f"proxy did not start within {attempts * interval_s:.0f}s. "
                f"log:\n{log[-2000:]}"
            )

        base_url_override = f"http://127.0.0.1:{_PROXY_PORT}/v1"
        return proxy_job, base_url_override, _PROXY_TRACE_PATH
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
# Public API of this module; OpenCodeTask is re-exported for convenience.
__all__ = [
    "OpenCodeSession",
    "OpenCodeSessionFactory",
    "OpenCodeTask",
    "Verifier",
]
|
models.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Pydantic models for the deployed opencode_env HTTP server.
|
| 8 |
+
|
| 9 |
+
The server exposes a single MCP tool ``run_rollout`` that takes a Task
|
| 10 |
+
(instruction + setup commands + verify commands) plus an LLM endpoint
|
| 11 |
+
config, runs one OpenCode rollout end-to-end inside an E2B sandbox, and
|
| 12 |
+
returns a :class:`RolloutResult` JSON.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
from typing import Any
|
| 18 |
+
|
| 19 |
+
from openenv.core.env_server.types import State
|
| 20 |
+
from pydantic import BaseModel, Field
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class RolloutTurn(BaseModel):
    """One intercepted LLM turn captured by the in-sandbox proxy (Mode B)."""

    # Index of this turn within the rollout.
    turn: int
    # OpenAI-style finish reason ("stop", "length", ...) or None if absent.
    finish_reason: str | None = None
    # Decoded completion token strings (presumably aligned with
    # per_token_logps — confirm against the proxy's trace writer).
    completion_tokens: list[str] = Field(default_factory=list)
    # Tokenizer ids for the completion.
    completion_token_ids: list[int] = Field(default_factory=list)
    # Log-probability of each generated token.
    per_token_logps: list[float] = Field(default_factory=list)
    # Wall-clock seconds the upstream request took.
    latency_s: float = 0.0
    # Capture time (presumably epoch seconds — confirm against the proxy).
    timestamp: float = 0.0
    # HTTP status returned by the upstream, if the call got that far.
    upstream_status: int | None = None
    # Parsed upstream error payload, when the upstream returned one.
    upstream_error: dict[str, Any] | None = None
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class CommandResult(BaseModel):
    """Outcome of one bash command in setup/verify."""

    # The exact shell string that was executed.
    cmd: str
    # Process exit status (0 = success).
    exit_code: int
    stdout: str = ""
    stderr: str = ""
    # Wall-clock seconds the command ran.
    duration_s: float = 0.0
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class RolloutResult(BaseModel):
    """Full payload returned from one ``run_rollout`` invocation.

    The trainer (or any client) decodes this from the MCP tool result JSON
    and feeds ``proxy_turns`` + ``reward`` into GRPO.
    """

    # Identifiers
    task_id: str = ""
    sandbox_id: str = ""

    # Scalars
    # Verifier-assigned scalar reward; None when no verifier ran.
    reward: float | None = None
    # Exit code of the ``opencode run`` process; None if it never exited.
    agent_exit_code: int | None = None
    # Total wall-clock seconds for the rollout.
    wall_s: float = 0.0
    mode: str = "transparent_proxy"

    # Per-step results
    setup_results: list[CommandResult] = Field(default_factory=list)
    verify_results: list[CommandResult] = Field(default_factory=list)

    # Per-turn LLM trajectory (empty in black_box mode)
    proxy_turns: list[RolloutTurn] = Field(default_factory=list)

    # Filesystem the agent produced (path -> contents, truncated)
    files: dict[str, str] = Field(default_factory=dict)
    # Paths present in the sandbox but not included in ``files``
    # (presumably overflow beyond a size/count limit — confirm in server).
    files_extra: list[str] = Field(default_factory=list)

    # Diagnostic tails
    agent_log_tail: str = ""
    proxy_log_tail: str = ""

    # Error surfacing
    error: str | None = None
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class OpenCodeState(State):
    """Per-session env state across calls to one OpenCodeEnvironment instance.

    Each HTTP session gets its own env (``SUPPORTS_CONCURRENT_SESSIONS=True``
    on the server class), so this state is per-session.
    """

    # Count of rollouts finished in this session.
    rollouts_completed: int = 0
    # Reward / task id / sandbox id of the most recent rollout, for status UIs.
    last_reward: float | None = None
    last_task_id: str | None = None
    last_sandbox_id: str | None = None
|
opencode_runtime.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Pure builders for OpenCode sandbox bootstrap artifacts.
|
| 8 |
+
|
| 9 |
+
These functions produce the exact files and shell commands the sandbox needs to
|
| 10 |
+
run OpenCode against a configured LLM endpoint. No IO, no sandbox coupling —
|
| 11 |
+
the sandbox backend is responsible for writing files and running commands.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
from typing import Any
|
| 18 |
+
|
| 19 |
+
from .config import OpenCodeConfig, provider_npm_package
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _under_home(config: OpenCodeConfig, relative: str) -> str:
    """Join *relative* onto the sandbox home directory."""
    return f"{config.sandbox_home}/{relative}"


def opencode_config_path(config: OpenCodeConfig) -> str:
    """Sandbox path of the generated ``opencode.json``."""
    return _under_home(config, ".config/opencode/opencode.json")


def instruction_path(config: OpenCodeConfig) -> str:
    """Sandbox path of the task instruction markdown file."""
    return _under_home(config, "task/instruction.md")


def agent_log_path(config: OpenCodeConfig) -> str:
    """Sandbox path of the ``opencode run`` output log."""
    return _under_home(config, "logs/agent/opencode.jsonl")


def system_prompt_path(config: OpenCodeConfig) -> str:
    """Sandbox path of the optional system prompt file."""
    return _under_home(config, "task/system.md")


def verifier_reward_path(config: OpenCodeConfig) -> str:
    """Sandbox path where the verifier writes its scalar reward."""
    return _under_home(config, "logs/verifier/reward.txt")


def workdir_path(config: OpenCodeConfig) -> str:
    """Sandbox directory the agent runs in."""
    return _under_home(config, "workdir")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def build_opencode_json(config: OpenCodeConfig) -> str:
    """Return the serialized ``opencode.json`` the sandbox should install.

    Provider block is keyed by a stable internal name (``intercepted``) so the
    same ``model`` string works across providers. Deep-merges
    ``config.extra_opencode_json`` last so callers can override anything.
    """

    provider_name = "intercepted"
    provider_block: dict[str, Any] = {
        "npm": provider_npm_package(config.provider),
        "name": "Intercepted",
        "options": {
            "baseURL": config.base_url,
            "apiKey": config.api_key,
            "timeout": config.request_timeout_ms,
        },
        # Model key strips a "provider/" prefix if present, so e.g.
        # "openai/gpt-4o" and "gpt-4o" both register as "gpt-4o".
        "models": {
            config.model.split("/", 1)[-1]: {"name": "Intercepted Model"},
        },
    }

    doc: dict[str, Any] = {
        "$schema": "https://opencode.ai/config.json",
        # Fully-qualified model reference: "<provider_name>/<bare model id>".
        "model": f"{provider_name}/{config.model.split('/', 1)[-1]}",
        "provider": {provider_name: provider_block},
    }

    # Optional tools section (omitted entirely when empty).
    tools = _build_tools_block(config)
    if tools:
        doc["tools"] = tools

    # Caller overrides win: merged last, in place.
    _deep_merge(doc, config.extra_opencode_json)
    return json.dumps(doc, indent=2)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def build_install_cmd(config: OpenCodeConfig) -> str:
    """Return the shell command that installs OpenCode + ensures PATH.

    The upstream installer honors ``OPENCODE_VERSION=x.y.z`` for pinning;
    leaving it unset tracks ``latest``.
    """
    home = config.sandbox_home
    pin = config.opencode_version
    version_env = f"OPENCODE_VERSION={pin} " if pin and pin != "latest" else ""
    steps = [
        "set -e",
        f"mkdir -p {home}/.config/opencode {home}/logs/agent {home}/logs/verifier {home}/task {home}/workdir",
        f"{version_env}curl -fsSL https://opencode.ai/install | bash",
        'export PATH="$HOME/.opencode/bin:$PATH"',
        # Fail fast (and loudly) if the installer did not put the CLI on PATH.
        "opencode --version",
    ]
    return " && ".join(steps)
def build_run_cmd(config: OpenCodeConfig) -> str:
    """Return the shell command that launches OpenCode against a task."""
    fmt = ""
    if config.run_format == "json":
        fmt = "--format json"
    command = (
        'export PATH="$HOME/.opencode/bin:$PATH" && '
        f"cd {workdir_path(config)} && "
        f'opencode run {fmt} "$(cat {instruction_path(config)})" '
        f"2>&1 | tee {agent_log_path(config)}"
    )
    # Trailing strip keeps the command tidy when the format flag is empty.
    return command.strip()
def build_env_vars(config: OpenCodeConfig, *, base_url_override: str | None = None) -> dict[str, str]:
    """Return env vars to set on the OpenCode process.

    When a proxy is wrapping ``config.base_url`` the factory passes the proxy's
    local URL via ``base_url_override`` so the sandbox process points at the
    proxy and the opencode.json on disk stays consistent with what the proxy
    forwards to.
    """
    merged: dict[str, str] = {}
    merged.update(config.extra_env)
    merged["OPENAI_BASE_URL"] = base_url_override if base_url_override else config.base_url
    merged["OPENAI_API_KEY"] = config.api_key
    merged["OPENCODE_CONFIG"] = opencode_config_path(config)
    return merged
def _build_tools_block(config: OpenCodeConfig) -> dict[str, bool]:
    """Translate enabled/disabled lists into opencode's ``tools`` map."""
    whitelist = config.enabled_tools
    if whitelist is None:
        # Blacklist mode: mark each listed tool as explicitly disabled.
        return {name: False for name in config.disabled_tools}
    # Whitelist: everything not listed is disabled. OpenCode treats missing
    # keys as "default enabled", so we only need to explicitly disable the
    # ones we want off. Without a full known-tool list we can't do a true
    # whitelist; document this as a known limitation and require the caller
    # to rely on ``disabled_tools`` for full control.
    return dict.fromkeys(whitelist, True)
def _deep_merge(dst: dict[str, Any], src: dict[str, Any]) -> None:
    """Recursively merge ``src`` into ``dst`` in place.

    Dict-valued keys present on both sides are merged recursively; any other
    value in ``src`` overwrites the one in ``dst``.
    """
    for k in src:
        incoming = src[k]
        current = dst.get(k)
        if isinstance(incoming, dict) and isinstance(current, dict):
            _deep_merge(current, incoming)
        else:
            dst[k] = incoming
openenv.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: opencode_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
pyproject.toml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-opencode-env"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "OpenCode coding-agent environment for OpenEnv — runs the OpenCode CLI in an E2B sandbox against any OpenAI-compatible LLM, optionally capturing per-token logprobs."
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv (server + MCP)
|
| 18 |
+
"openenv-core[core]>=0.2.2",
|
| 19 |
+
"fastapi>=0.115.0",
|
| 20 |
+
"uvicorn[standard]>=0.24.0",
|
| 21 |
+
"pydantic>=2.0.0",
|
| 22 |
+
"fastmcp>=2.0.0",
|
| 23 |
+
"requests>=2.31.0",
|
| 24 |
+
|
| 25 |
+
# Web UI
|
| 26 |
+
"gradio>=4.0.0",
|
| 27 |
+
|
| 28 |
+
# OpenCode harness primitive — sandbox + proxy + agent driver
|
| 29 |
+
"httpx>=0.27.0",
|
| 30 |
+
"e2b>=1.0.0",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
[project.optional-dependencies]
|
| 34 |
+
dev = [
|
| 35 |
+
"pytest>=8.0.0",
|
| 36 |
+
"pytest-asyncio>=0.23.0",
|
| 37 |
+
"pytest-cov>=4.0.0",
|
| 38 |
+
]
|
| 39 |
+
|
| 40 |
+
[project.scripts]
|
| 41 |
+
# Server entrypoint — enables ``uv run --project . server``.
|
| 42 |
+
server = "opencode_env.server.app:main"
|
| 43 |
+
|
| 44 |
+
[tool.setuptools]
|
| 45 |
+
include-package-data = true
|
| 46 |
+
packages = [
|
| 47 |
+
"opencode_env",
|
| 48 |
+
"opencode_env.sandbox",
|
| 49 |
+
"opencode_env.server",
|
| 50 |
+
"opencode_env.tests",
|
| 51 |
+
]
|
| 52 |
+
package-dir = { "opencode_env" = ".", "opencode_env.sandbox" = "sandbox", "opencode_env.server" = "server", "opencode_env.tests" = "tests" }
|
| 53 |
+
|
| 54 |
+
[tool.setuptools.package-data]
|
| 55 |
+
opencode_env = ["**/*.md"]
|
sandbox/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Sandbox backends for the OpenCode harness.
|
| 8 |
+
|
| 9 |
+
The primitive ships with :class:`E2BSandboxBackend` as the default; any backend
|
| 10 |
+
that satisfies the :class:`SandboxBackend` / :class:`SandboxHandle` protocols
|
| 11 |
+
can be swapped in.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle
|
| 15 |
+
from .e2b import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle
|
| 16 |
+
|
| 17 |
+
__all__ = [
|
| 18 |
+
"BgJob",
|
| 19 |
+
"ExecResult",
|
| 20 |
+
"SandboxBackend",
|
| 21 |
+
"SandboxHandle",
|
| 22 |
+
"E2BBgJob",
|
| 23 |
+
"E2BSandboxBackend",
|
| 24 |
+
"E2BSandboxHandle",
|
| 25 |
+
]
|
sandbox/base.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Sandbox backend protocol.
|
| 8 |
+
|
| 9 |
+
A ``SandboxBackend`` produces ``SandboxHandle`` instances that the harness uses
|
| 10 |
+
to stage files, run the OpenCode install, launch the agent as a background
|
| 11 |
+
process, and later tear the sandbox down.
|
| 12 |
+
|
| 13 |
+
Backends can be implemented against any provider (E2B, Docker, Modal, Prime)
|
| 14 |
+
as long as they satisfy the Protocols defined here.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
from dataclasses import dataclass
|
| 20 |
+
from typing import Any, Protocol, runtime_checkable
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class ExecResult:
    """Result of a synchronous command inside a sandbox."""

    # Process exit status; 0 conventionally means success.
    exit_code: int
    # Full captured standard output of the command.
    stdout: str
    # Full captured standard error of the command.
    stderr: str
@runtime_checkable
class BgJob(Protocol):
    """Handle to a background process running inside a sandbox.

    ``runtime_checkable`` allows ``isinstance`` checks, but only on method
    presence — not signatures.
    """

    @property
    def pid(self) -> int: ...

    def wait(self, timeout: float | None = None) -> int:
        """Block until the process exits, returning its exit code.

        Implementations must raise ``TimeoutError`` if ``timeout`` elapses
        before the process exits.
        """

    def kill(self) -> None:
        """Terminate the process."""
@runtime_checkable
class SandboxHandle(Protocol):
    """Opaque handle to one live sandbox.

    Implementations provide synchronous command execution, background job
    launch, and simple text-file I/O against the sandbox filesystem.
    """

    @property
    def sandbox_id(self) -> str: ...

    def exec(
        self,
        cmd: str,
        *,
        envs: dict[str, str] | None = None,
        cwd: str | None = None,
        timeout: float | None = 60,
    ) -> ExecResult:
        """Run a shell command synchronously and return its result."""

    def start_bg(
        self,
        cmd: str,
        *,
        envs: dict[str, str] | None = None,
        cwd: str | None = None,
    ) -> BgJob:
        """Launch a background process and return a handle."""

    def write_text(self, path: str, content: str) -> None:
        """Write text to ``path`` inside the sandbox (parent dirs auto-created)."""

    def read_text(self, path: str) -> str:
        """Read ``path`` as text from the sandbox."""

    def exists(self, path: str) -> bool:
        """Return whether ``path`` exists in the sandbox."""

    def kill(self) -> None:
        """Terminate the sandbox and release resources."""
@runtime_checkable
class SandboxBackend(Protocol):
    """Factory for fresh sandbox instances."""

    def create(
        self,
        *,
        timeout_s: int = 900,
        envs: dict[str, str] | None = None,
        metadata: dict[str, str] | None = None,
    ) -> SandboxHandle:
        """Create and return a new, ready-to-use sandbox.

        ``timeout_s`` bounds the sandbox lifetime; ``envs`` and ``metadata``
        are forwarded to the provider.
        """
|
sandbox/build_template.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Build a pre-baked E2B template with opencode + proxy deps already installed.
|
| 8 |
+
|
| 9 |
+
Run-time per rollout drops from ~3 min (cold install) to ~30s once the
|
| 10 |
+
template is built, because we skip:
|
| 11 |
+
|
| 12 |
+
- ``curl https://opencode.ai/install | bash`` (~30-90s)
|
| 13 |
+
- ``pip install fastapi uvicorn httpx`` (~30-60s)
|
| 14 |
+
- directory layout setup
|
| 15 |
+
- copying the proxy source
|
| 16 |
+
|
| 17 |
+
The template ships:
|
| 18 |
+
|
| 19 |
+
- opencode CLI at ``/home/user/.opencode/bin/opencode``
|
| 20 |
+
- Python deps for the in-sandbox proxy
|
| 21 |
+
- The proxy source at ``/home/user/proxy/interception.py``
|
| 22 |
+
- Pre-created dirs: ``~/.config/opencode``, ``~/logs/{agent,verifier}``,
|
| 23 |
+
``~/task``, ``~/workdir``, ``~/proxy``
|
| 24 |
+
- Default workdir: ``/home/user/workdir``
|
| 25 |
+
|
| 26 |
+
Usage::
|
| 27 |
+
|
| 28 |
+
.venv/bin/python envs/opencode_env/tests/build_e2b_template.py
|
| 29 |
+
# → builds (or rebuilds) ``opencode-rl`` template, prints template id
|
| 30 |
+
|
| 31 |
+
Then ``test_five_sorts_e2e.py`` will use it via ``--template opencode-rl``.
|
| 32 |
+
|
| 33 |
+
Requires ``E2B_API_KEY`` in the environment. First build is ~3-8 min;
|
| 34 |
+
subsequent builds reuse the cache and can finish in <60s.
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
from __future__ import annotations
|
| 38 |
+
|
| 39 |
+
import argparse
|
| 40 |
+
import os
|
| 41 |
+
import sys
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
|
| 44 |
+
from e2b import Template, default_build_logger
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
_ENV_DIR = Path(__file__).resolve().parent
|
| 48 |
+
_PROXY_SOURCE = _ENV_DIR / "interception.py"
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _load_env(path: Path) -> None:
    """Populate ``os.environ`` from a dotenv-style file.

    Blank lines, ``#`` comments, and lines without ``=`` are skipped.
    Existing environment variables are never overridden.
    """
    if not path.exists():
        return
    for raw_line in path.read_text().splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        name, _, value = entry.partition("=")
        name = name.strip()
        # Strip a single layer of surrounding quotes, if any.
        value = value.strip().strip('"').strip("'")
        if name and name not in os.environ:
            os.environ[name] = value
def build_template(name: str, *, skip_cache: bool = False) -> str:
    """Build (or rebuild) the named E2B template and return its id.

    Raises ``RuntimeError`` if the proxy source file is missing, since the
    template bakes it in via ``Template.copy``.
    """
    if not _PROXY_SOURCE.exists():
        raise RuntimeError(f"proxy source missing at {_PROXY_SOURCE}")

    # Template.copy() resolves relative paths against the caller's source
    # file directory. This script lives next to ``interception.py`` so the
    # bare filename works.

    # Stage 1 (root): system-wide pip deps for the proxy.
    # Stage 2 (user): opencode install + dir layout + proxy copy.
    template = (
        Template()
        .from_python_image("3.12")
        .pip_install(
            [
                "fastapi>=0.104",
                "uvicorn[standard]>=0.24",
                "httpx>=0.27",
            ]
        )
        .set_user("user")
        .run_cmd("curl -fsSL https://opencode.ai/install | bash")
        # Sanity check — fails the build early if the installer didn't work.
        .run_cmd("/home/user/.opencode/bin/opencode --version")
        .make_dir("/home/user/.config/opencode")
        .make_dir("/home/user/logs/agent")
        .make_dir("/home/user/logs/verifier")
        .make_dir("/home/user/task")
        .make_dir("/home/user/workdir")
        .make_dir("/home/user/proxy")
        .copy("interception.py", "/home/user/proxy/interception.py")
        .set_workdir("/home/user/workdir")
    )
    if skip_cache:
        template = template.skip_cache()

    info = Template.build(
        template,
        name,
        cpu_count=2,
        memory_mb=2048,
        on_build_logs=default_build_logger(),
    )
    # E2B SDK versions differ in the shape of the build result; fall back to
    # its string form when no ``template_id`` attribute is present.
    return info.template_id if hasattr(info, "template_id") else str(info)
def main(argv: list[str] | None = None) -> int:
    """CLI entrypoint: parse args, check credentials, build the template.

    Returns a process exit code: 0 on success, 2 when ``E2B_API_KEY`` is
    missing from both the environment and the local ``.env`` file.
    """
    p = argparse.ArgumentParser(prog="build_e2b_template")
    p.add_argument(
        "--name",
        default="opencode-rl",
        help="Template name (default: opencode-rl).",
    )
    p.add_argument(
        "--skip-cache",
        action="store_true",
        help="Force a clean rebuild, ignoring cache.",
    )
    args = p.parse_args(argv)

    # Pick up E2B_API_KEY (and friends) from a sibling .env if not already set.
    _load_env(_ENV_DIR / ".env")
    if not os.environ.get("E2B_API_KEY"):
        print("ERROR: E2B_API_KEY required.", file=sys.stderr)
        return 2

    print(f"Building template '{args.name}' "
          f"(proxy source: {_PROXY_SOURCE})")
    print(f"Skip cache: {args.skip_cache}")
    print()

    template_id = build_template(args.name, skip_cache=args.skip_cache)
    print()
    print(f"Built. Template id/name: {template_id}")
    print(f"Use in code: Sandbox.create(template='{args.name}')")
    return 0
if __name__ == "__main__":
|
| 142 |
+
sys.exit(main())
|
sandbox/e2b.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""E2B implementation of :class:`SandboxBackend`."""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import threading
|
| 13 |
+
from pathlib import PurePosixPath
|
| 14 |
+
|
| 15 |
+
from e2b import Sandbox
|
| 16 |
+
from e2b.sandbox_sync.commands.command_handle import CommandHandle
|
| 17 |
+
|
| 18 |
+
from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class E2BBgJob:
    """Wraps an E2B ``CommandHandle`` to satisfy :class:`BgJob`.

    The E2B SDK's ``CommandHandle.wait()`` blocks indefinitely with no native
    timeout. We poll in a worker thread and raise ``TimeoutError`` if the
    process does not exit within the caller-supplied budget.
    """

    def __init__(self, handle: CommandHandle) -> None:
        self._handle = handle
        # Outcome slots filled by the worker thread: the SDK result object on
        # success, or the exception ``wait()`` raised.
        self._result: "object | None" = None
        self._error: BaseException | None = None
        # Daemon thread so an abandoned job never blocks interpreter exit.
        self._thread = threading.Thread(target=self._run, daemon=True)
        self._thread.start()

    def _run(self) -> None:
        # Worker body: block on the SDK's wait() and stash the outcome for
        # wait() below to interpret.
        try:
            self._result = self._handle.wait()
        except BaseException as exc:  # noqa: BLE001
            self._error = exc

    @property
    def pid(self) -> int:
        # PID of the process inside the sandbox, as reported by the SDK.
        return self._handle.pid

    def wait(self, timeout: float | None = None) -> int:
        """Block up to ``timeout`` seconds and return the exit code.

        Raises ``TimeoutError`` if the worker thread (and thus the remote
        process) is still alive when the budget elapses; ``None`` waits
        forever.
        """
        self._thread.join(timeout)
        if self._thread.is_alive():
            raise TimeoutError(
                f"Background command did not exit within {timeout}s"
            )
        if self._error is not None:
            # E2B raises CommandExitException on non-zero; treat as exit code.
            code = getattr(self._error, "exit_code", None)
            if code is None:
                raise self._error
            return int(code)
        return int(self._result.exit_code) if self._result is not None else 0

    def kill(self) -> None:
        """Terminate the process; failures (e.g. already dead) are ignored."""
        try:
            self._handle.kill()
        except Exception:
            pass
class E2BSandboxHandle:
    """Wraps a live ``e2b.Sandbox`` to satisfy :class:`SandboxHandle`."""

    def __init__(self, sandbox: Sandbox) -> None:
        self._sbx = sandbox

    @property
    def sandbox_id(self) -> str:
        return self._sbx.sandbox_id

    @property
    def raw(self) -> Sandbox:
        """Escape hatch for callers that need the underlying SDK object."""
        return self._sbx

    def exec(
        self,
        cmd: str,
        *,
        envs: dict[str, str] | None = None,
        cwd: str | None = None,
        timeout: float | None = 60,
    ) -> ExecResult:
        """Run ``cmd`` synchronously; non-zero exits become ``ExecResult``s."""
        # Imported lazily so the module can be imported without the extra
        # submodule path being resolvable at import time.
        from e2b.sandbox.commands.command_handle import CommandExitException

        try:
            result = self._sbx.commands.run(
                cmd,
                envs=envs,
                cwd=cwd,
                timeout=timeout,
                background=False,
            )
            return ExecResult(
                exit_code=result.exit_code,
                stdout=result.stdout,
                stderr=result.stderr,
            )
        except CommandExitException as exc:
            # Non-zero exit codes are expected in many contexts (e.g. polling
            # healthz before the server is up). Surface them as a proper
            # ExecResult instead of an exception.
            return ExecResult(
                exit_code=int(getattr(exc, "exit_code", 1)),
                stdout=str(getattr(exc, "stdout", "") or ""),
                stderr=str(getattr(exc, "stderr", "") or str(exc)),
            )

    def start_bg(
        self,
        cmd: str,
        *,
        envs: dict[str, str] | None = None,
        cwd: str | None = None,
        timeout: float = 0,
    ) -> BgJob:
        """Start a background command.

        ``timeout=0`` disables E2B's server-side command deadline (the default
        is 60s, which would otherwise kill long-running agent processes).
        Sandbox lifetime still bounds the job.
        """
        handle = self._sbx.commands.run(
            cmd,
            envs=envs,
            cwd=cwd,
            background=True,
            timeout=timeout,
        )
        return E2BBgJob(handle)

    def write_text(self, path: str, content: str) -> None:
        # Ensure the parent directory exists before writing; make_dir is
        # skipped for root-level paths.
        parent = str(PurePosixPath(path).parent)
        if parent not in ("", "/"):
            self._sbx.files.make_dir(parent)
        self._sbx.files.write(path, content)

    def read_text(self, path: str) -> str:
        return self._sbx.files.read(path)

    def exists(self, path: str) -> bool:
        return self._sbx.files.exists(path)

    def kill(self) -> None:
        self._sbx.kill()
class E2BSandboxBackend:
    """Creates E2B sandboxes for OpenCode rollouts.

    The backend uses the E2B default base template unless ``template`` is
    provided. Resource sizing and other E2B-specific options can be forwarded
    via ``sandbox_kwargs``.
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        template: str | None = None,
        sandbox_kwargs: dict | None = None,
    ) -> None:
        # Explicit key wins; otherwise fall back to the E2B_API_KEY env var.
        self._api_key = api_key or os.environ.get("E2B_API_KEY")
        if not self._api_key:
            raise RuntimeError(
                "E2BSandboxBackend requires an api_key or E2B_API_KEY env var."
            )
        self._template = template
        self._sandbox_kwargs = sandbox_kwargs or {}

    def create(
        self,
        *,
        timeout_s: int = 900,
        envs: dict[str, str] | None = None,
        metadata: dict[str, str] | None = None,
    ) -> SandboxHandle:
        """Create a fresh E2B sandbox and wrap it in an :class:`E2BSandboxHandle`.

        ``timeout_s`` becomes the sandbox lifetime; ``envs``/``metadata`` are
        forwarded to the SDK unchanged.
        """
        sbx = Sandbox.create(
            template=self._template,
            timeout=timeout_s,
            envs=envs,
            metadata=metadata,
            api_key=self._api_key,
            **self._sandbox_kwargs,
        )
        return E2BSandboxHandle(sbx)
|
sandbox/interception.py
ADDED
|
@@ -0,0 +1,642 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Transparent OpenAI-compatible forwarding proxy with logprob capture.
|
| 8 |
+
|
| 9 |
+
The proxy is a small FastAPI app that OpenCode talks to instead of the upstream
|
| 10 |
+
LLM endpoint. It:
|
| 11 |
+
|
| 12 |
+
1. Forwards every ``POST /v1/chat/completions`` request to the real upstream
|
| 13 |
+
URL, injecting ``logprobs=true`` and ``top_logprobs=N`` so the upstream
|
| 14 |
+
returns per-token logprobs.
|
| 15 |
+
2. Captures each ``(request, response, logprobs)`` triple to a JSON-lines
|
| 16 |
+
trace file.
|
| 17 |
+
3. Returns the upstream response to OpenCode verbatim (minus the ``logprobs``
|
| 18 |
+
field, which we strip so the CLI never sees anything unexpected).
|
| 19 |
+
|
| 20 |
+
The proxy is stateless beyond the trace file. One proxy instance runs per
|
| 21 |
+
session, normally inside the sandbox on ``localhost:7000``.
|
| 22 |
+
|
| 23 |
+
Run standalone::
|
| 24 |
+
|
| 25 |
+
python -m opencode_env.interception \\
|
| 26 |
+
--upstream-url https://vllm.example/v1 \\
|
| 27 |
+
--upstream-api-key intercepted \\
|
| 28 |
+
--trace /tmp/trace.jsonl \\
|
| 29 |
+
--port 7000
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import argparse
|
| 35 |
+
import asyncio
|
| 36 |
+
import copy
|
| 37 |
+
import json
|
| 38 |
+
import os
|
| 39 |
+
import socket
|
| 40 |
+
import threading
|
| 41 |
+
import time
|
| 42 |
+
from contextlib import closing
|
| 43 |
+
from dataclasses import dataclass, field
|
| 44 |
+
from pathlib import Path
|
| 45 |
+
from typing import Any
|
| 46 |
+
|
| 47 |
+
import httpx
|
| 48 |
+
import uvicorn
|
| 49 |
+
from fastapi import FastAPI, Request, Response
|
| 50 |
+
from fastapi.responses import JSONResponse, StreamingResponse
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@dataclass
class ProxyConfig:
    """Runtime configuration for one :class:`InterceptionProxy`."""

    # OpenAI-compatible base URL the proxy forwards requests to.
    upstream_url: str
    # API key sent upstream; "intercepted" serves as a placeholder default.
    upstream_api_key: str = "intercepted"
    # JSON-lines file where each intercepted turn is recorded.
    trace_path: str = "/tmp/opencode-proxy-trace.jsonl"
    # Bind address/port for the proxy's own HTTP server.
    host: str = "127.0.0.1"
    port: int = 7000
    # ``top_logprobs`` value injected into forwarded requests.
    top_logprobs: int = 5
    # Per-request timeout (seconds) for upstream calls.
    request_timeout_s: float = 600.0
    # Cap ``max_tokens`` before forwarding. OpenCode historically asks for very
    # large values (e.g. 32000) that exceed gpt-4o-mini's 16384 cap; capping
    # here avoids spurious upstream 400s without requiring the caller to know
    # per-model limits.
    max_tokens_cap: int | None = 16384
    # Disable Qwen-style reasoning/thinking by injecting
    # ``chat_template_kwargs.enable_thinking=false`` into forwarded requests.
    disable_thinking: bool = False
    # Override the ``model`` field on every forwarded request. Some opencode
    # builds emit a stripped model id (e.g. ``Qwen3.5-4B`` instead of the
    # ``Qwen/Qwen3.5-4B`` the upstream serves) for their internal
    # title-generation call. Setting this to the exact upstream model id
    # bypasses that mismatch.
    model_override: str | None = None
@dataclass
|
| 84 |
+
class TurnRecord:
|
| 85 |
+
"""One intercepted turn, written to the trace file as JSON-lines."""
|
| 86 |
+
|
| 87 |
+
turn: int
|
| 88 |
+
request: dict[str, Any]
|
| 89 |
+
response: dict[str, Any]
|
| 90 |
+
logprobs: list[dict[str, Any]] | None
|
| 91 |
+
completion_tokens: list[str]
|
| 92 |
+
completion_token_ids: list[int]
|
| 93 |
+
per_token_logps: list[float]
|
| 94 |
+
finish_reason: str | None
|
| 95 |
+
latency_s: float
|
| 96 |
+
timestamp: float = field(default_factory=time.time)
|
| 97 |
+
|
| 98 |
+
def to_json(self) -> str:
|
| 99 |
+
return json.dumps(self.__dict__, default=str)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _build_app(cfg: ProxyConfig) -> FastAPI:
    """Construct the FastAPI app that serves one proxy session.

    The returned app owns one shared ``httpx.AsyncClient`` and one
    line-buffered trace file handle; both are released on app shutdown.
    """

    app = FastAPI(title="opencode-interception-proxy")
    # Mutable per-session state: a monotonically increasing turn counter,
    # guarded by an asyncio lock because requests may interleave.
    state: dict[str, Any] = {"turn": 0, "lock": asyncio.Lock()}

    # HTTP client reused across requests. ``None`` auth header — we let each
    # request carry its own ``Authorization`` populated from ``upstream_api_key``.
    client = httpx.AsyncClient(timeout=cfg.request_timeout_s)
    # buffering=1 => line-buffered, so each JSON-lines record hits disk as
    # soon as it is written.
    trace_file = open(cfg.trace_path, "a", buffering=1)

    @app.get("/healthz")
    def healthz() -> dict[str, str]:
        # Liveness probe used to confirm the proxy is up.
        return {"status": "ok"}

    @app.post(CHAT_COMPLETIONS_PATH)
    async def chat_completions(request: Request) -> Response:
        raw_body = await request.body()
        try:
            body = json.loads(raw_body)
        except json.JSONDecodeError:
            return JSONResponse(
                status_code=400, content={"error": "invalid json body"}
            )

        forwarded_body = _prepare_forwarded_body(body, cfg)
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {cfg.upstream_api_key}",
        }
        upstream_url = _resolve_upstream_url(cfg.upstream_url)

        # Allocate the turn index under the lock so concurrent requests get
        # distinct, ordered indices in the trace file.
        async with state["lock"]:
            state["turn"] += 1
            turn_idx = state["turn"]

        # Streaming and unary requests take different forwarding paths, but
        # both write exactly one TurnRecord to the trace.
        if forwarded_body.get("stream"):
            return await _proxy_streaming(
                client=client,
                upstream_url=upstream_url,
                headers=headers,
                forwarded_body=forwarded_body,
                original_body=body,
                trace_file=trace_file,
                turn_idx=turn_idx,
            )
        return await _proxy_unary(
            client=client,
            upstream_url=upstream_url,
            headers=headers,
            forwarded_body=forwarded_body,
            original_body=body,
            trace_file=trace_file,
            turn_idx=turn_idx,
        )

    @app.on_event("shutdown")
    async def _shutdown() -> None:
        # Release the shared client and flush/close the trace file.
        await client.aclose()
        trace_file.close()

    return app
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _prepare_forwarded_body(body: dict[str, Any], cfg: ProxyConfig) -> dict[str, Any]:
    """Return the body we actually send upstream.

    - Injects ``logprobs=true`` + ``top_logprobs`` so the upstream emits
      per-token logprobs.
    - Caps ``max_tokens`` / ``max_completion_tokens`` to ``max_tokens_cap``.
    - For models that reject ``max_tokens`` (e.g. gpt-5.x), translates to
      ``max_completion_tokens``.
    """
    out = copy.deepcopy(body)
    out.setdefault("logprobs", True)
    out.setdefault("top_logprobs", cfg.top_logprobs)

    # Newer OpenAI models (gpt-5.x, o-series) reject ``max_tokens``; detect
    # via the model id so gpt-4.x / vLLM-hosted models stay untouched.
    if _model_uses_max_completion_tokens(str(out.get("model", ""))) and "max_tokens" in out:
        out.setdefault("max_completion_tokens", out.pop("max_tokens"))

    cap = cfg.max_tokens_cap
    if cap is not None:
        for key in ("max_tokens", "max_completion_tokens"):
            current = out.get(key)
            if isinstance(current, int) and current > cap:
                out[key] = cap

    if cfg.disable_thinking:
        # vLLM applies chat_template_kwargs to the tokenizer's chat template
        # for Qwen3/Qwen3.5 models, turning off <think>...</think> generation.
        out.setdefault("chat_template_kwargs", {}).setdefault("enable_thinking", False)

    if cfg.model_override:
        out["model"] = cfg.model_override

    return out
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def _model_uses_max_completion_tokens(model: str) -> bool:
|
| 207 |
+
"""Heuristic: ``True`` for models that reject ``max_tokens``."""
|
| 208 |
+
# Strip a provider prefix opencode may have prepended (e.g. "intercepted/").
|
| 209 |
+
bare = model.split("/", 1)[-1].lower()
|
| 210 |
+
return bare.startswith(("gpt-5", "o1", "o3", "o4"))
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def _resolve_upstream_url(upstream: str) -> str:
|
| 214 |
+
"""Build the fully qualified chat-completions URL from a base URL."""
|
| 215 |
+
base = upstream.rstrip("/")
|
| 216 |
+
if base.endswith("/v1"):
|
| 217 |
+
return f"{base}/chat/completions"
|
| 218 |
+
return f"{base}{CHAT_COMPLETIONS_PATH}"
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
async def _proxy_unary(
    *,
    client: httpx.AsyncClient,
    upstream_url: str,
    headers: dict[str, str],
    forwarded_body: dict[str, Any],
    original_body: dict[str, Any],
    trace_file: Any,
    turn_idx: int,
) -> Response:
    """Forward one non-streaming chat completion, trace it, relay the reply.

    Returns the upstream JSON with ``logprobs`` stripped. If the upstream
    body is not JSON, it is relayed verbatim with the upstream status code.
    Note: ``original_body`` is accepted for signature parity with
    :func:`_proxy_streaming` but is not used here.
    """
    start = time.time()
    upstream_response = await client.post(
        upstream_url, content=json.dumps(forwarded_body), headers=headers
    )
    latency = time.time() - start
    try:
        response_json = upstream_response.json()
    except Exception:
        # Non-JSON upstream payload (e.g. an HTML error page): pass it through
        # untouched, preserving status and content type, without tracing.
        return Response(
            content=upstream_response.content,
            status_code=upstream_response.status_code,
            media_type=upstream_response.headers.get(
                "content-type", "application/json"
            ),
        )

    # Persist the full (request, response, logprobs) triple before sanitizing.
    record = _build_turn_record(
        turn_idx=turn_idx,
        request_body=forwarded_body,
        response_json=response_json,
        latency_s=latency,
    )
    trace_file.write(record.to_json() + "\n")
    # OpenCode never sees the logprobs field we injected upstream.
    sanitized = _strip_logprobs(response_json)
    return JSONResponse(content=sanitized, status_code=upstream_response.status_code)
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
async def _proxy_streaming(
    *,
    client: httpx.AsyncClient,
    upstream_url: str,
    headers: dict[str, str],
    forwarded_body: dict[str, Any],
    original_body: dict[str, Any],
    trace_file: Any,
    turn_idx: int,
) -> Response:
    """Forward an SSE stream while accumulating the full response.

    Opens the upstream stream and inspects the status. On non-2xx, reads the
    full body (an error JSON, not SSE) and returns it to the caller as a
    regular JSON response — previously we silently emitted an empty
    ``text/event-stream`` which opencode interpreted as an empty assistant
    turn. Both the error body and the latency are written to the trace file
    so debugging a broken rollout doesn't require another round-trip.

    Note: ``original_body`` is accepted for signature parity with
    :func:`_proxy_unary` but is not used here.
    """

    start = time.time()

    # Open the stream outside the generator so we can branch on status before
    # committing to a streaming response shape.
    upstream_cm = client.stream(
        "POST",
        upstream_url,
        content=json.dumps(forwarded_body),
        headers=headers,
    )
    upstream = await upstream_cm.__aenter__()

    if upstream.status_code >= 400:
        # Upstream responded with an error body (not SSE). Read it fully and
        # return as a non-streaming JSON payload.
        error_bytes = await upstream.aread()
        await upstream_cm.__aexit__(None, None, None)
        latency = time.time() - start
        try:
            error_json = json.loads(error_bytes.decode() or "{}")
        except Exception:
            # Not JSON (or not UTF-8): wrap a truncated textual form instead.
            error_json = {"error": error_bytes.decode(errors="replace")[:4000]}
        # Trace the failed turn with empty choices so downstream consumers
        # see the error without re-querying the upstream.
        record = _build_turn_record(
            turn_idx=turn_idx,
            request_body=forwarded_body,
            response_json={
                "choices": [],
                "usage": None,
                "upstream_status": upstream.status_code,
                "upstream_error": error_json,
            },
            latency_s=latency,
        )
        trace_file.write(record.to_json() + "\n")
        print(
            f"[proxy] turn {turn_idx}: upstream {upstream.status_code}: "
            f"{str(error_json)[:400]}",
            flush=True,
        )
        return JSONResponse(content=error_json, status_code=upstream.status_code)

    async def _stream() -> Any:
        # Relay each SSE line verbatim while folding parsed chunks into
        # ``accumulated`` so the full completion can be reconstructed after
        # the stream ends.
        accumulated: dict[str, Any] = {
            "content_by_idx": {},
            "tool_calls_by_idx": {},
            "finish_by_idx": {},
            "logprobs_by_idx": {},
        }
        last_chunk: dict[str, Any] = {}
        try:
            async for line in upstream.aiter_lines():
                if not line:
                    # Preserve SSE event separators for the client.
                    yield "\n"
                    continue
                yield line + "\n"
                if not line.startswith("data:"):
                    continue
                data = line[len("data:"):].strip()
                if data == "[DONE]":
                    continue
                try:
                    chunk = json.loads(data)
                except json.JSONDecodeError:
                    # Malformed chunk: already forwarded above; skip tracing it.
                    continue
                last_chunk = chunk
                _accumulate_stream_chunk(chunk, accumulated)
        finally:
            # Always close the upstream stream, even if the client disconnects
            # mid-stream and this generator is abandoned.
            await upstream_cm.__aexit__(None, None, None)

        # Stream finished: assemble the non-streaming equivalent and trace it.
        latency = time.time() - start
        response_json = _assemble_streamed_response(last_chunk, accumulated)
        record = _build_turn_record(
            turn_idx=turn_idx,
            request_body=forwarded_body,
            response_json=response_json,
            latency_s=latency,
        )
        trace_file.write(record.to_json() + "\n")

    return StreamingResponse(_stream(), media_type="text/event-stream")
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def _accumulate_stream_chunk(chunk: dict[str, Any], acc: dict[str, Any]) -> None:
|
| 361 |
+
for choice in chunk.get("choices", []) or []:
|
| 362 |
+
idx = choice.get("index", 0)
|
| 363 |
+
delta = choice.get("delta") or {}
|
| 364 |
+
content = delta.get("content")
|
| 365 |
+
if content:
|
| 366 |
+
acc["content_by_idx"].setdefault(idx, []).append(content)
|
| 367 |
+
# HF-Router's Qwen thinking mode streams the chain-of-thought under a
|
| 368 |
+
# separate ``reasoning`` field (per Together/Scaleway). Accumulate it
|
| 369 |
+
# so the assembled response surfaces it — otherwise it's dropped and
|
| 370 |
+
# proxy_turn observability is lost for thinking-mode rollouts.
|
| 371 |
+
reasoning = delta.get("reasoning")
|
| 372 |
+
if reasoning:
|
| 373 |
+
acc.setdefault("reasoning_by_idx", {}).setdefault(idx, []).append(reasoning)
|
| 374 |
+
for tc in delta.get("tool_calls") or []:
|
| 375 |
+
tc_idx = tc.get("index", 0)
|
| 376 |
+
bucket = acc["tool_calls_by_idx"].setdefault(
|
| 377 |
+
(idx, tc_idx),
|
| 378 |
+
{"id": None, "type": "function", "function": {"name": "", "arguments": ""}},
|
| 379 |
+
)
|
| 380 |
+
if tc.get("id"):
|
| 381 |
+
bucket["id"] = tc["id"]
|
| 382 |
+
fn = tc.get("function") or {}
|
| 383 |
+
if fn.get("name"):
|
| 384 |
+
bucket["function"]["name"] += fn["name"]
|
| 385 |
+
if fn.get("arguments"):
|
| 386 |
+
bucket["function"]["arguments"] += fn["arguments"]
|
| 387 |
+
if choice.get("finish_reason"):
|
| 388 |
+
acc["finish_by_idx"][idx] = choice["finish_reason"]
|
| 389 |
+
lp = choice.get("logprobs") or {}
|
| 390 |
+
content_lp = lp.get("content")
|
| 391 |
+
if content_lp:
|
| 392 |
+
acc["logprobs_by_idx"].setdefault(idx, []).extend(content_lp)
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
def _assemble_streamed_response(
|
| 396 |
+
last_chunk: dict[str, Any], acc: dict[str, Any]
|
| 397 |
+
) -> dict[str, Any]:
|
| 398 |
+
indices = sorted(
|
| 399 |
+
set(acc["content_by_idx"])
|
| 400 |
+
| set(acc["finish_by_idx"])
|
| 401 |
+
| {k[0] for k in acc["tool_calls_by_idx"]}
|
| 402 |
+
| set(acc["logprobs_by_idx"])
|
| 403 |
+
| {0}
|
| 404 |
+
)
|
| 405 |
+
choices: list[dict[str, Any]] = []
|
| 406 |
+
for idx in indices:
|
| 407 |
+
tool_calls = [
|
| 408 |
+
acc["tool_calls_by_idx"][k]
|
| 409 |
+
for k in sorted(acc["tool_calls_by_idx"])
|
| 410 |
+
if k[0] == idx
|
| 411 |
+
]
|
| 412 |
+
message: dict[str, Any] = {"role": "assistant"}
|
| 413 |
+
content = "".join(acc["content_by_idx"].get(idx, []))
|
| 414 |
+
if content:
|
| 415 |
+
message["content"] = content
|
| 416 |
+
reasoning = "".join((acc.get("reasoning_by_idx") or {}).get(idx, []))
|
| 417 |
+
if reasoning:
|
| 418 |
+
message["reasoning"] = reasoning
|
| 419 |
+
if tool_calls:
|
| 420 |
+
message["tool_calls"] = tool_calls
|
| 421 |
+
choice: dict[str, Any] = {
|
| 422 |
+
"index": idx,
|
| 423 |
+
"message": message,
|
| 424 |
+
"finish_reason": acc["finish_by_idx"].get(idx),
|
| 425 |
+
}
|
| 426 |
+
if acc["logprobs_by_idx"].get(idx):
|
| 427 |
+
choice["logprobs"] = {"content": acc["logprobs_by_idx"][idx]}
|
| 428 |
+
choices.append(choice)
|
| 429 |
+
return {
|
| 430 |
+
"id": last_chunk.get("id", ""),
|
| 431 |
+
"object": "chat.completion",
|
| 432 |
+
"model": last_chunk.get("model", ""),
|
| 433 |
+
"choices": choices,
|
| 434 |
+
"usage": last_chunk.get("usage"),
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
def _build_turn_record(
    *,
    turn_idx: int,
    request_body: dict[str, Any],
    response_json: dict[str, Any],
    latency_s: float,
) -> TurnRecord:
    """Extract per-token logprobs into a normalized :class:`TurnRecord`."""

    first_choice = (response_json.get("choices") or [{}])[0]
    entries = (first_choice.get("logprobs") or {}).get("content") or []

    tokens = [entry.get("token", "") for entry in entries]
    # OpenAI returns no raw token ids; vLLM returns them as ``token_id``.
    token_ids = [
        int(entry["token_id"]) for entry in entries if entry.get("token_id") is not None
    ]
    logps = [
        float(entry["logprob"]) for entry in entries if entry.get("logprob") is not None
    ]

    return TurnRecord(
        turn=turn_idx,
        request=request_body,
        response=response_json,
        logprobs=entries,
        completion_tokens=tokens,
        completion_token_ids=token_ids,
        per_token_logps=logps,
        finish_reason=first_choice.get("finish_reason"),
        latency_s=latency_s,
    )
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
def _strip_logprobs(response_json: dict[str, Any]) -> dict[str, Any]:
|
| 478 |
+
"""Return a copy of the response with ``choices[*].logprobs`` removed."""
|
| 479 |
+
|
| 480 |
+
out = dict(response_json)
|
| 481 |
+
choices = out.get("choices")
|
| 482 |
+
if isinstance(choices, list):
|
| 483 |
+
out["choices"] = [
|
| 484 |
+
{k: v for k, v in (ch or {}).items() if k != "logprobs"}
|
| 485 |
+
for ch in choices
|
| 486 |
+
]
|
| 487 |
+
return out
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
# ---------------------------------------------------------------------------
|
| 491 |
+
# Standalone runner (used inside the sandbox)
|
| 492 |
+
# ---------------------------------------------------------------------------
|
| 493 |
+
|
| 494 |
+
|
| 495 |
+
def serve(cfg: ProxyConfig) -> None:
    """Start the proxy and block (for use as the sandbox-side entry point)."""
    uvicorn.run(
        _build_app(cfg),
        host=cfg.host,
        port=cfg.port,
        log_level="warning",
    )
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
class InterceptionProxy:
    """Thread-backed controller for running the proxy locally.

    Used by unit tests and by any in-process driver that wants a short-lived
    proxy on the local machine. Inside a sandbox we invoke :func:`serve`
    directly via ``python -m opencode_env.interception``.
    """

    def __init__(self, cfg: ProxyConfig) -> None:
        self._cfg = cfg
        # Populated by start(); reset to None by stop().
        self._server: uvicorn.Server | None = None
        self._thread: threading.Thread | None = None
        # Set once the port accepts connections.
        self._ready = threading.Event()

    @property
    def url(self) -> str:
        """OpenAI-compatible base URL clients should point at."""
        return f"http://{self._cfg.host}:{self._cfg.port}/v1"

    @property
    def config(self) -> ProxyConfig:
        """The configuration this proxy was constructed with."""
        return self._cfg

    def start(self) -> None:
        """Launch uvicorn on a daemon thread and wait until the port accepts.

        Raises:
            RuntimeError: if the port does not accept connections within 10s.
        """
        app = _build_app(self._cfg)
        config = uvicorn.Config(
            app,
            host=self._cfg.host,
            port=self._cfg.port,
            log_level="warning",
            lifespan="on",
        )
        self._server = uvicorn.Server(config)
        self._thread = threading.Thread(
            target=self._run_server, daemon=True
        )
        self._thread.start()
        # Wait for the server to accept connections.
        deadline = time.time() + 10
        while time.time() < deadline:
            if _port_open(self._cfg.host, self._cfg.port):
                self._ready.set()
                return
            time.sleep(0.05)
        raise RuntimeError("InterceptionProxy failed to start within 10s")

    def _run_server(self) -> None:
        # Runs on the daemon thread; blocks until ``should_exit`` is set.
        assert self._server is not None
        self._server.run()

    def stop(self) -> None:
        """Signal uvicorn to exit and join the server thread (idempotent)."""
        if self._server is None:
            return
        self._server.should_exit = True
        if self._thread is not None:
            self._thread.join(timeout=5)
        self._server = None
        self._thread = None

    def __enter__(self) -> "InterceptionProxy":
        self.start()
        return self

    def __exit__(self, *exc) -> None:
        self.stop()
|
| 566 |
+
|
| 567 |
+
|
| 568 |
+
def _port_open(host: str, port: int) -> bool:
|
| 569 |
+
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
|
| 570 |
+
s.settimeout(0.2)
|
| 571 |
+
return s.connect_ex((host, port)) == 0
|
| 572 |
+
|
| 573 |
+
|
| 574 |
+
# ---------------------------------------------------------------------------
|
| 575 |
+
# Trace reader (used by the session to pull captured turns back)
|
| 576 |
+
# ---------------------------------------------------------------------------
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
def read_trace(path: str | os.PathLike) -> list[dict[str, Any]]:
|
| 580 |
+
"""Read a proxy trace file into a list of dicts."""
|
| 581 |
+
|
| 582 |
+
trace: list[dict[str, Any]] = []
|
| 583 |
+
p = Path(path)
|
| 584 |
+
if not p.exists():
|
| 585 |
+
return trace
|
| 586 |
+
for line in p.read_text().splitlines():
|
| 587 |
+
line = line.strip()
|
| 588 |
+
if not line:
|
| 589 |
+
continue
|
| 590 |
+
trace.append(json.loads(line))
|
| 591 |
+
return trace
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
# ---------------------------------------------------------------------------
|
| 595 |
+
# CLI entry point
|
| 596 |
+
# ---------------------------------------------------------------------------
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
def main() -> None:
    """CLI entry point: parse flags, build a :class:`ProxyConfig`, and serve.

    Blocks until the process is killed; intended to be run inside the
    sandbox via ``python -m opencode_env.interception``.
    """
    parser = argparse.ArgumentParser(prog="opencode_env.interception")
    parser.add_argument("--upstream-url", required=True)
    parser.add_argument("--upstream-api-key", default="intercepted")
    parser.add_argument("--trace", default="/tmp/opencode-proxy-trace.jsonl")
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=7000)
    parser.add_argument("--top-logprobs", type=int, default=5)
    parser.add_argument("--request-timeout", type=float, default=600.0)
    parser.add_argument(
        "--max-tokens-cap",
        type=int,
        default=None,
        help="Clamp max_tokens/max_completion_tokens on forwarded requests.",
    )
    parser.add_argument(
        "--disable-thinking",
        action="store_true",
        help="Inject chat_template_kwargs.enable_thinking=false (Qwen3/Qwen3.5).",
    )
    parser.add_argument(
        "--model-override",
        default=None,
        help="Rewrite the `model` field on every forwarded request.",
    )
    args = parser.parse_args()

    # NOTE(review): the CLI default for --max-tokens-cap is None (no clamping),
    # while ProxyConfig's field default is 16384 — running via this CLI
    # therefore disables the cap unless the flag is passed explicitly.
    # Confirm this asymmetry is intended.
    cfg = ProxyConfig(
        upstream_url=args.upstream_url,
        upstream_api_key=args.upstream_api_key,
        trace_path=args.trace,
        host=args.host,
        port=args.port,
        top_logprobs=args.top_logprobs,
        request_timeout_s=args.request_timeout,
        max_tokens_cap=args.max_tokens_cap,
        disable_thinking=args.disable_thinking,
        model_override=args.model_override,
    )
    serve(cfg)
|
| 639 |
+
|
| 640 |
+
|
| 641 |
+
if __name__ == "__main__":
|
| 642 |
+
main()
|
server/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Server-side for the deployed opencode_env."""
|
server/app.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""FastAPI app for the opencode_env MCP server.
|
| 8 |
+
|
| 9 |
+
Mirrors the standard OpenEnv pattern (echo_env / repl_env / jupyter_agent)
|
| 10 |
+
plus the custom Gradio UI mounted at ``/web`` per the
|
| 11 |
+
``customizing-web-ui`` doc.
|
| 12 |
+
|
| 13 |
+
Usage::
|
| 14 |
+
|
| 15 |
+
# Local dev:
|
| 16 |
+
E2B_API_KEY=... uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 17 |
+
|
| 18 |
+
# Docker:
|
| 19 |
+
docker run -p 8000:8000 -e E2B_API_KEY=... opencode-env
|
| 20 |
+
|
| 21 |
+
# HF Space: deploys via the root ``Dockerfile``.
|
| 22 |
+
|
| 23 |
+
The ``ENABLE_WEB_INTERFACE`` env var is set to ``true`` automatically so
|
| 24 |
+
the UI is always reachable at ``/web``. Set it to ``false`` to disable.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
|
| 29 |
+
import os
|
| 30 |
+
from pathlib import Path
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _load_env_file() -> None:
|
| 34 |
+
"""Lightweight ``.env`` loader (no python-dotenv dep).
|
| 35 |
+
|
| 36 |
+
Loads ``../.env`` (env dir's ``.env``) into ``os.environ`` for local
|
| 37 |
+
development convenience. Existing process env vars take precedence so
|
| 38 |
+
HF Space secrets always win.
|
| 39 |
+
"""
|
| 40 |
+
candidate = Path(__file__).resolve().parents[1] / ".env"
|
| 41 |
+
if not candidate.exists():
|
| 42 |
+
return
|
| 43 |
+
for raw in candidate.read_text().splitlines():
|
| 44 |
+
line = raw.strip()
|
| 45 |
+
if not line or line.startswith("#") or "=" not in line:
|
| 46 |
+
continue
|
| 47 |
+
k, _, v = line.partition("=")
|
| 48 |
+
k = k.strip()
|
| 49 |
+
v = v.strip().strip('"').strip("'")
|
| 50 |
+
if k and k not in os.environ:
|
| 51 |
+
os.environ[k] = v
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
_load_env_file()
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
try:
|
| 58 |
+
from openenv.core.env_server.http_server import create_app
|
| 59 |
+
from openenv.core.env_server.mcp_types import (
|
| 60 |
+
CallToolAction,
|
| 61 |
+
CallToolObservation,
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
from .gradio_ui import opencode_gradio_builder
|
| 65 |
+
from .opencode_environment import OpenCodeEnvironment
|
| 66 |
+
except ImportError: # pragma: no cover
|
| 67 |
+
from openenv.core.env_server.http_server import create_app
|
| 68 |
+
from openenv.core.env_server.mcp_types import (
|
| 69 |
+
CallToolAction,
|
| 70 |
+
CallToolObservation,
|
| 71 |
+
)
|
| 72 |
+
from server.gradio_ui import opencode_gradio_builder # type: ignore
|
| 73 |
+
from server.opencode_environment import OpenCodeEnvironment # type: ignore
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# Always expose the Gradio UI at /web. Set ENABLE_WEB_INTERFACE=false to
|
| 77 |
+
# disable (e.g., on HF Spaces where you want the API only).
|
| 78 |
+
os.environ.setdefault("ENABLE_WEB_INTERFACE", "true")
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _custom_gradio_builder(
    web_manager,
    action_fields,
    metadata,
    is_chat_env,
    title,
    quick_start_md,
):
    """Hand off to ``server.gradio_ui.opencode_gradio_builder``.

    The parameter list matches the ``gradio_builder`` hook passed to
    ``create_app``; the only local behavior is defaulting a falsy
    ``title`` to ``"opencode_env"``.
    """
    return opencode_gradio_builder(
        web_manager,
        action_fields,
        metadata,
        is_chat_env,
        title or "opencode_env",
        quick_start_md,
    )
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
app = create_app(
|
| 101 |
+
OpenCodeEnvironment,
|
| 102 |
+
CallToolAction,
|
| 103 |
+
CallToolObservation,
|
| 104 |
+
env_name="opencode_env",
|
| 105 |
+
max_concurrent_envs=int(os.getenv("MAX_CONCURRENT_ENVS", "4")),
|
| 106 |
+
gradio_builder=_custom_gradio_builder,
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def main() -> None:
    """Entrypoint for ``uv run --project . server`` and direct invocation."""
    # Imported lazily so merely importing this module does not require
    # uvicorn at import time.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
if __name__ == "__main__":
|
| 118 |
+
main()
|
server/catalog.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Endpoint shorthand catalog.
|
| 8 |
+
|
| 9 |
+
Lets the MCP tool ``run_rollout`` and the HTMX UI accept a short endpoint
|
| 10 |
+
label (``vllm`` / ``openai`` / ``hf_router``) and resolve the actual
|
| 11 |
+
``base_url`` / ``api_key`` / ``model`` from environment variables (with
|
| 12 |
+
sane defaults). Explicit overrides on the call always win.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
from dataclasses import dataclass
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
ENDPOINT_KINDS = ("vllm", "openai", "hf_router")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass(frozen=True)
class _EndpointSpec:
    """Catalog entry: env-var names plus fallbacks for one endpoint kind."""

    base_url_env: str  # env var consulted for the endpoint base URL
    api_key_env: str  # env var consulted for the API key
    model_env: str  # env var consulted for the model id
    default_base_url: str | None  # fallback when the base-URL env var is unset
    default_api_key: str | None  # fallback when the api-key env var is unset
    default_model: str | None  # fallback when the model env var is unset
    disable_thinking_default: bool  # default for thinking-mode suppression
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
_CATALOG: dict[str, _EndpointSpec] = {
|
| 36 |
+
"vllm": _EndpointSpec(
|
| 37 |
+
base_url_env="VLLM_URL",
|
| 38 |
+
api_key_env="VLLM_API_KEY",
|
| 39 |
+
model_env="VLLM_MODEL",
|
| 40 |
+
default_base_url=None, # cluster URL must be set
|
| 41 |
+
default_api_key="intercepted", # vLLM rarely enforces auth
|
| 42 |
+
default_model="Qwen/Qwen3.5-4B",
|
| 43 |
+
disable_thinking_default=True, # Qwen3.5 thinks by default
|
| 44 |
+
),
|
| 45 |
+
"openai": _EndpointSpec(
|
| 46 |
+
base_url_env="OPENAI_BASE_URL",
|
| 47 |
+
api_key_env="OPENAI_API_KEY",
|
| 48 |
+
model_env="OPENAI_MODEL",
|
| 49 |
+
default_base_url="https://api.openai.com/v1",
|
| 50 |
+
default_api_key=None,
|
| 51 |
+
default_model="gpt-4o-mini",
|
| 52 |
+
disable_thinking_default=False, # OpenAI rejects unknown kwargs
|
| 53 |
+
),
|
| 54 |
+
"hf_router": _EndpointSpec(
|
| 55 |
+
base_url_env="HF_ROUTER_BASE_URL",
|
| 56 |
+
api_key_env="HF_ROUTER_API_KEY",
|
| 57 |
+
model_env="HF_ROUTER_MODEL",
|
| 58 |
+
default_base_url="https://router.huggingface.co/v1",
|
| 59 |
+
default_api_key=None,
|
| 60 |
+
default_model="Qwen/Qwen3-4B-Instruct-2507:nscale",
|
| 61 |
+
disable_thinking_default=False, # Instruct variant doesn't think
|
| 62 |
+
),
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@dataclass(frozen=True)
|
| 67 |
+
class ResolvedEndpoint:
|
| 68 |
+
kind: str
|
| 69 |
+
base_url: str
|
| 70 |
+
api_key: str
|
| 71 |
+
model: str
|
| 72 |
+
disable_thinking_default: bool
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def resolve_endpoint(
|
| 76 |
+
kind: str,
|
| 77 |
+
*,
|
| 78 |
+
base_url: str = "",
|
| 79 |
+
api_key: str = "",
|
| 80 |
+
model: str = "",
|
| 81 |
+
) -> ResolvedEndpoint:
|
| 82 |
+
"""Resolve an endpoint shorthand into concrete (base_url, api_key, model).
|
| 83 |
+
|
| 84 |
+
Precedence per field: **explicit arg > env var > catalog default**.
|
| 85 |
+
Always normalizes to a ``/v1`` base URL.
|
| 86 |
+
|
| 87 |
+
Raises ``ValueError`` for unknown kinds, missing creds, or missing model.
|
| 88 |
+
"""
|
| 89 |
+
spec = _CATALOG.get(kind)
|
| 90 |
+
if spec is None:
|
| 91 |
+
raise ValueError(
|
| 92 |
+
f"unknown endpoint kind: {kind!r}; expected one of {ENDPOINT_KINDS}"
|
| 93 |
+
)
|
| 94 |
+
base = (
|
| 95 |
+
base_url or os.environ.get(spec.base_url_env) or spec.default_base_url or ""
|
| 96 |
+
).rstrip("/")
|
| 97 |
+
if not base:
|
| 98 |
+
raise ValueError(
|
| 99 |
+
f"{kind}: no base_url (set {spec.base_url_env} env var or pass "
|
| 100 |
+
"base_url=...)"
|
| 101 |
+
)
|
| 102 |
+
if not base.endswith("/v1"):
|
| 103 |
+
base = f"{base}/v1"
|
| 104 |
+
|
| 105 |
+
key = api_key or os.environ.get(spec.api_key_env) or spec.default_api_key or ""
|
| 106 |
+
if not key:
|
| 107 |
+
raise ValueError(
|
| 108 |
+
f"{kind}: no api_key (set {spec.api_key_env} env var or pass api_key=...)"
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
mdl = model or os.environ.get(spec.model_env) or spec.default_model or ""
|
| 112 |
+
if not mdl:
|
| 113 |
+
raise ValueError(
|
| 114 |
+
f"{kind}: no model (set {spec.model_env} env var or pass model=...)"
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
return ResolvedEndpoint(
|
| 118 |
+
kind=kind,
|
| 119 |
+
base_url=base,
|
| 120 |
+
api_key=key,
|
| 121 |
+
model=mdl,
|
| 122 |
+
disable_thinking_default=spec.disable_thinking_default,
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def catalog_summary() -> list[dict[str, object]]:
|
| 127 |
+
"""Return a JSON-friendly view of the catalog (for the UI dropdown)."""
|
| 128 |
+
out: list[dict[str, object]] = []
|
| 129 |
+
for kind, spec in _CATALOG.items():
|
| 130 |
+
out.append(
|
| 131 |
+
{
|
| 132 |
+
"kind": kind,
|
| 133 |
+
"base_url_env": spec.base_url_env,
|
| 134 |
+
"api_key_env": spec.api_key_env,
|
| 135 |
+
"model_env": spec.model_env,
|
| 136 |
+
"default_base_url": spec.default_base_url,
|
| 137 |
+
"default_model": spec.default_model,
|
| 138 |
+
"disable_thinking_default": spec.disable_thinking_default,
|
| 139 |
+
"configured": _is_configured(spec),
|
| 140 |
+
}
|
| 141 |
+
)
|
| 142 |
+
return out
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _is_configured(spec: _EndpointSpec) -> bool:
|
| 146 |
+
base = os.environ.get(spec.base_url_env) or spec.default_base_url or ""
|
| 147 |
+
key = os.environ.get(spec.api_key_env) or spec.default_api_key or ""
|
| 148 |
+
model = os.environ.get(spec.model_env) or spec.default_model or ""
|
| 149 |
+
return bool(base and key and model)
|
server/gradio_ui.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Minimal Gradio UI for opencode_env.
|
| 8 |
+
|
| 9 |
+
Mounts under the standard OpenEnv ``/web`` path via the
|
| 10 |
+
``gradio_builder=`` callback documented at
|
| 11 |
+
https://meta-pytorch.org/OpenEnv/customizing-web-ui.html.
|
| 12 |
+
|
| 13 |
+
One page with:
|
| 14 |
+
- endpoint selector (``vllm`` / ``openai`` / ``hf_router``) — the catalog
|
| 15 |
+
resolves the actual base_url / api_key / model from env vars.
|
| 16 |
+
- instruction + setup (bash, one cmd per line) + verify (bash, one cmd
|
| 17 |
+
per line) textareas — the same Task shape the MCP tool accepts.
|
| 18 |
+
- Tunables (mode, disable_thinking, max_tokens_cap, top_logprobs,
|
| 19 |
+
agent_timeout_s, template).
|
| 20 |
+
- Preset buttons for the ready-made example tasks.
|
| 21 |
+
- Run button → result panel with reward, setup/verify per-command
|
| 22 |
+
results, file outputs, logprob stats, agent + proxy log tails,
|
| 23 |
+
and the raw RolloutResult JSON.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
from __future__ import annotations
|
| 27 |
+
|
| 28 |
+
import json
|
| 29 |
+
from typing import Any
|
| 30 |
+
|
| 31 |
+
import gradio as gr
|
| 32 |
+
|
| 33 |
+
try:
|
| 34 |
+
from .catalog import ENDPOINT_KINDS, catalog_summary, resolve_endpoint
|
| 35 |
+
from .opencode_environment import OpenCodeEnvironment
|
| 36 |
+
except ImportError: # pragma: no cover
|
| 37 |
+
from server.catalog import ENDPOINT_KINDS, catalog_summary, resolve_endpoint # type: ignore
|
| 38 |
+
from server.opencode_environment import OpenCodeEnvironment # type: ignore
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ────────────────────────────────────────────────────────────────────────────
|
| 42 |
+
# Preset task examples — each fills (instruction, setup, verify).
|
| 43 |
+
# ────────────────────────────────────────────────────────────────────────────
|
| 44 |
+
|
| 45 |
+
PRESETS: dict[str, dict[str, str]] = {
|
| 46 |
+
"binary_search": {
|
| 47 |
+
"instruction": (
|
| 48 |
+
"Create a single Python file named `binary_search.py` in the "
|
| 49 |
+
"current working directory. Use the relative path `binary_search.py`. "
|
| 50 |
+
"Expose exactly one function:\n"
|
| 51 |
+
" def binary_search(arr: list[int], target: int) -> int\n"
|
| 52 |
+
"Return the index of `target` in the sorted list `arr`, or -1 if "
|
| 53 |
+
"absent. Use the binary-search algorithm; do not call list.index."
|
| 54 |
+
),
|
| 55 |
+
"setup": "",
|
| 56 |
+
"verify": (
|
| 57 |
+
"test -f /home/user/workdir/binary_search.py\n"
|
| 58 |
+
"python -c \"import sys; sys.path.insert(0, '/home/user/workdir'); "
|
| 59 |
+
"import binary_search; "
|
| 60 |
+
"assert binary_search.binary_search([1,2,3,4,5], 3) == 2; "
|
| 61 |
+
"assert binary_search.binary_search([1,2,3], 99) == -1; "
|
| 62 |
+
"assert binary_search.binary_search([], 1) == -1; "
|
| 63 |
+
"print('OK')\""
|
| 64 |
+
),
|
| 65 |
+
},
|
| 66 |
+
"fizzbuzz": {
|
| 67 |
+
"instruction": (
|
| 68 |
+
"Create `fizzbuzz.py` in the current directory exposing "
|
| 69 |
+
"`def fizzbuzz(n: int) -> list[str]` that returns the FizzBuzz "
|
| 70 |
+
"sequence for the integers 1..n. 'Fizz' for multiples of 3, 'Buzz' "
|
| 71 |
+
"for 5, 'FizzBuzz' for both, otherwise the number as a string."
|
| 72 |
+
),
|
| 73 |
+
"setup": "",
|
| 74 |
+
"verify": (
|
| 75 |
+
"test -f /home/user/workdir/fizzbuzz.py\n"
|
| 76 |
+
"python -c \"import sys; sys.path.insert(0, '/home/user/workdir'); "
|
| 77 |
+
"import fizzbuzz; "
|
| 78 |
+
"assert fizzbuzz.fizzbuzz(5) == ['1','2','Fizz','4','Buzz']; "
|
| 79 |
+
"assert fizzbuzz.fizzbuzz(15)[-1] == 'FizzBuzz'; "
|
| 80 |
+
"print('OK')\""
|
| 81 |
+
),
|
| 82 |
+
},
|
| 83 |
+
"pandas_csv": {
|
| 84 |
+
"instruction": (
|
| 85 |
+
"Read `/home/user/data/numbers.csv` (a CSV with a single column "
|
| 86 |
+
"`x` of integers) using pandas. Compute the mean of the `x` "
|
| 87 |
+
"column and write it as a single float to `/home/user/workdir/mean.txt` "
|
| 88 |
+
"(no extra characters, no newline)."
|
| 89 |
+
),
|
| 90 |
+
"setup": (
|
| 91 |
+
"pip install --quiet pandas\n"
|
| 92 |
+
"mkdir -p /home/user/data\n"
|
| 93 |
+
"printf 'x\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n10\\n' > /home/user/data/numbers.csv"
|
| 94 |
+
),
|
| 95 |
+
"verify": (
|
| 96 |
+
"test -f /home/user/workdir/mean.txt\n"
|
| 97 |
+
"python -c \"v=float(open('/home/user/workdir/mean.txt').read().strip()); "
|
| 98 |
+
"assert abs(v-5.5) < 1e-6, v; print('mean=', v)\""
|
| 99 |
+
),
|
| 100 |
+
},
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ────────────────────────────────────────────────────────────────────────────
|
| 105 |
+
# Result rendering helpers
|
| 106 |
+
# ────────────────────────────────────────────────────��───────────────────────
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _split_commands(text: str) -> list[str]:
|
| 110 |
+
return [line for line in (text or "").splitlines() if line.strip()]
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _badge_for_reward(reward: float | None) -> str:
|
| 114 |
+
if reward is None:
|
| 115 |
+
return "**reward**: _n/a_"
|
| 116 |
+
if reward >= 0.999:
|
| 117 |
+
emoji = "[PASS]"
|
| 118 |
+
elif reward > 0.0:
|
| 119 |
+
emoji = "[PARTIAL]"
|
| 120 |
+
else:
|
| 121 |
+
emoji = "[FAIL]"
|
| 122 |
+
return f"### {emoji} reward = `{reward:.2f}`"
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _summary_md(result: dict[str, Any]) -> str:
|
| 126 |
+
parts = [_badge_for_reward(result.get("reward"))]
|
| 127 |
+
parts.append(
|
| 128 |
+
f"**sandbox**: `{result.get('sandbox_id') or 'n/a'}` · "
|
| 129 |
+
f"**wall**: `{result.get('wall_s', 0):.1f}s` · "
|
| 130 |
+
f"**agent_exit**: `{result.get('agent_exit_code')}` · "
|
| 131 |
+
f"**mode**: `{result.get('mode', 'n/a')}`"
|
| 132 |
+
)
|
| 133 |
+
if result.get("error"):
|
| 134 |
+
parts.append(f"**error**: `{result['error']}`")
|
| 135 |
+
return "\n\n".join(parts)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]:
|
| 139 |
+
rows: list[list[str]] = []
|
| 140 |
+
for it in items or []:
|
| 141 |
+
cmd = it.get("cmd", "")
|
| 142 |
+
rows.append(
|
| 143 |
+
[
|
| 144 |
+
cmd if len(cmd) <= 80 else cmd[:77] + "...",
|
| 145 |
+
str(it.get("exit_code", "")),
|
| 146 |
+
f"{it.get('duration_s', 0):.2f}s",
|
| 147 |
+
(it.get("stderr") or "").splitlines()[-1][:80] if it.get("exit_code") else "",
|
| 148 |
+
]
|
| 149 |
+
)
|
| 150 |
+
return rows
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _logprobs_md(turns: list[dict[str, Any]]) -> str:
|
| 154 |
+
if not turns:
|
| 155 |
+
return "_No proxy turns captured._\n\nThis is normal in `black_box` mode. In `transparent_proxy` mode, an empty list usually means the agent never made an LLM call (check the agent log)."
|
| 156 |
+
n = len(turns)
|
| 157 |
+
productive = sum(1 for t in turns if t.get("completion_tokens"))
|
| 158 |
+
total_toks = sum(len(t.get("completion_tokens") or []) for t in turns)
|
| 159 |
+
all_lps = [
|
| 160 |
+
float(x)
|
| 161 |
+
for t in turns
|
| 162 |
+
for x in (t.get("per_token_logps") or [])
|
| 163 |
+
if x is not None
|
| 164 |
+
]
|
| 165 |
+
mean_lp = (sum(all_lps) / len(all_lps)) if all_lps else None
|
| 166 |
+
lines = [
|
| 167 |
+
f"**turns**: `{n}` · **productive**: `{productive}` · "
|
| 168 |
+
f"**total_completion_tokens**: `{total_toks}`",
|
| 169 |
+
]
|
| 170 |
+
if mean_lp is not None:
|
| 171 |
+
lines.append(f"**mean_logprob**: `{mean_lp:+.4f}`")
|
| 172 |
+
finishes: dict[str, int] = {}
|
| 173 |
+
for t in turns:
|
| 174 |
+
f = t.get("finish_reason") or "unknown"
|
| 175 |
+
finishes[f] = finishes.get(f, 0) + 1
|
| 176 |
+
if finishes:
|
| 177 |
+
lines.append(
|
| 178 |
+
"**finish_reasons**: " + " ".join(f"`{k}={v}`" for k, v in finishes.items())
|
| 179 |
+
)
|
| 180 |
+
productive_rows = [t for t in turns if t.get("completion_tokens")]
|
| 181 |
+
if productive_rows:
|
| 182 |
+
first = productive_rows[0]
|
| 183 |
+
toks = first["completion_tokens"][:10]
|
| 184 |
+
lps = first.get("per_token_logps") or []
|
| 185 |
+
lines.append(
|
| 186 |
+
f"\n**first productive turn (first 10 tokens)**\n\n"
|
| 187 |
+
f"```\n"
|
| 188 |
+
+ "\n".join(
|
| 189 |
+
f" {tok!r:<14} {lp:+.3f}" if i < len(lps) else f" {tok!r:<14} -"
|
| 190 |
+
for i, (tok, lp) in enumerate(zip(toks, lps + [None] * len(toks)))
|
| 191 |
+
)
|
| 192 |
+
+ "\n```"
|
| 193 |
+
)
|
| 194 |
+
return "\n\n".join(lines)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _files_md(files: dict[str, str]) -> str:
|
| 198 |
+
if not files:
|
| 199 |
+
return "_No files in the workdir._"
|
| 200 |
+
chunks = []
|
| 201 |
+
for path, content in files.items():
|
| 202 |
+
chunks.append(f"**`{path}`**\n```python\n{content[:4000]}\n```")
|
| 203 |
+
return "\n\n".join(chunks)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def _catalog_banner() -> str:
    """Render the endpoint catalog as a markdown table for the UI header."""
    lines = [
        "**Endpoint catalog (env vars + defaults)**",
        "",
        "| kind | base_url | model | env vars | configured |",
        "|---|---|---|---|---|",
    ]
    for entry in catalog_summary():
        env_cell = (
            f"`{entry['base_url_env']}`<br/>`{entry['api_key_env']}`<br/>"
            f"`{entry['model_env']}`"
        )
        status = "yes" if entry["configured"] else "**no**"
        lines.append(
            f"| `{entry['kind']}` | `{entry['default_base_url'] or '-'}` | "
            f"`{entry['default_model'] or '-'}` | {env_cell} | {status} |"
        )
    return "\n".join(lines)
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
# ────────────────────────────────────────────────────────────────────────────
|
| 224 |
+
# Builder
|
| 225 |
+
# ────────────────────────────────────────────────────────────────────────────
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def opencode_gradio_builder(
    web_manager,  # noqa: ARG001 (unused: we instantiate the env directly)
    action_fields,  # noqa: ARG001
    metadata,  # noqa: ARG001
    is_chat_env,  # noqa: ARG001
    title,
    quick_start_md,  # noqa: ARG001
) -> gr.Blocks:
    """Build the opencode_env console.

    Compatible with ``create_app(..., gradio_builder=...)``. We ignore
    ``web_manager`` and instantiate :class:`OpenCodeEnvironment` ourselves
    inside the run handler — opencode_env's run_rollout doesn't need any
    per-session state beyond the env's own bookkeeping, and instantiating
    is cheap (no sandbox is created until the tool fires).
    """

    def run(
        endpoint: str,
        model: str,
        base_url: str,
        api_key: str,
        instruction: str,
        setup_text: str,
        verify_text: str,
        mode: str,
        disable_thinking: str,
        template: str,
        max_tokens_cap: int,
        top_logprobs: int,
        agent_timeout_s: float,
        # Gradio injects progress updates through this default instance;
        # this is the documented gr.Progress idiom, not a mutable-default bug.
        progress: gr.Progress = gr.Progress(),
    ):
        # Submit handler: resolve endpoint → run one rollout → render panels.
        # Returns a 7-tuple matching the ``outputs=`` wiring below:
        # (summary, setup rows, verify rows, files md, logprobs md, logs md, raw).
        progress(0.05, desc="resolving endpoint…")
        try:
            resolved = resolve_endpoint(
                endpoint, base_url=base_url, api_key=api_key, model=model
            )
        except ValueError as exc:
            # Surface config errors in the summary panel instead of raising,
            # so the UI stays usable when creds/env vars are missing.
            err = f"endpoint resolution failed: {exc}"
            return (err, [], [], "", "", "", {"error": err})

        # Translate "auto" / "on" / "off" into bool / None.
        if disable_thinking == "on":
            dt: bool | None = True
        elif disable_thinking == "off":
            dt = False
        else:
            dt = None  # let the catalog default win

        progress(0.10, desc=f"{resolved.kind}: {resolved.model}")
        env = OpenCodeEnvironment()

        progress(0.15, desc="creating sandbox + running agent…")
        try:
            # NOTE(review): calls the private impl directly rather than the
            # MCP tool, so the UI shares the rollout path without MCP framing
            # — confirm this stays in sync with run_rollout's signature.
            payload = env._run_rollout_impl(
                base_url=resolved.base_url,
                api_key=resolved.api_key,
                model=resolved.model,
                instruction=instruction,
                setup=_split_commands(setup_text),
                verify=_split_commands(verify_text),
                task_id="ui",
                mode=mode,
                disable_thinking=(
                    dt if dt is not None else resolved.disable_thinking_default
                ),
                max_tokens_cap=int(max_tokens_cap),
                top_logprobs=int(top_logprobs),
                agent_timeout_s=float(agent_timeout_s),
                template=template,
            )
        except Exception as exc:  # noqa: BLE001
            # Broad on purpose: any sandbox/agent failure becomes a visible
            # error panel rather than an opaque Gradio traceback.
            err = f"{type(exc).__name__}: {exc}"
            return (err, [], [], "", "", "", {"error": err})

        progress(0.95, desc="rendering result…")
        result = json.loads(payload)

        return (
            _summary_md(result),
            _command_rows(result.get("setup_results") or []),
            _command_rows(result.get("verify_results") or []),
            _files_md(result.get("files") or {}),
            _logprobs_md(result.get("proxy_turns") or []),
            (
                f"### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```\n\n"
                f"### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:4000]}\n```"
            ),
            result,
        )

    def apply_preset(name: str) -> tuple[str, str, str]:
        """Return (instruction, setup, verify) for a preset; blanks if unknown."""
        p = PRESETS.get(name) or {"instruction": "", "setup": "", "verify": ""}
        return p["instruction"], p["setup"], p["verify"]

    with gr.Blocks(title=title or "opencode_env") as app:
        gr.Markdown(f"# {title or 'opencode_env'}")
        gr.Markdown(
            "Run one OpenCode rollout in an E2B sandbox against your chosen "
            "LLM endpoint. Pick an endpoint, write the task as `(instruction, "
            "setup, verify)`, and inspect the reward + per-token logprobs."
        )

        gr.Markdown(_catalog_banner())

        # Endpoint selection row: shorthand + optional overrides.
        with gr.Row():
            endpoint = gr.Dropdown(
                choices=list(ENDPOINT_KINDS),
                value="openai",
                label="Endpoint",
                scale=1,
            )
            model = gr.Textbox(
                label="Model (blank → catalog default)", placeholder="gpt-4o-mini",
                scale=2,
            )
        with gr.Row():
            base_url = gr.Textbox(
                label="Base URL (blank → env / catalog default)",
                placeholder="https://api.openai.com/v1", scale=2,
            )
            api_key = gr.Textbox(
                label="API key (blank → server env var)",
                placeholder="(server env)", type="password", scale=1,
            )

        # Task shape (same fields the MCP tool accepts).
        instruction = gr.Textbox(
            label="Instruction (the prompt opencode runs)",
            lines=4,
            value=PRESETS["binary_search"]["instruction"],
        )

        with gr.Row():
            setup_text = gr.Textbox(
                label="Setup (one bash command per line — runs BEFORE the agent)",
                lines=5,
                value=PRESETS["binary_search"]["setup"],
            )
            verify_text = gr.Textbox(
                label="Verify (one bash command per line — runs AFTER the agent)",
                lines=5,
                value=PRESETS["binary_search"]["verify"],
            )

        with gr.Row():
            preset_bs = gr.Button("preset · binary_search", size="sm")
            preset_fb = gr.Button("preset · fizzbuzz", size="sm")
            preset_pd = gr.Button("preset · pandas_csv", size="sm")

        with gr.Accordion("Tunables", open=False):
            with gr.Row():
                mode = gr.Dropdown(
                    choices=["transparent_proxy", "black_box"],
                    value="transparent_proxy",
                    label="mode",
                )
                disable_thinking = gr.Dropdown(
                    choices=["auto", "on", "off"],
                    value="auto",
                    label="disable_thinking",
                )
                template = gr.Textbox(
                    label="E2B template (e.g. opencode-rl)",
                    placeholder="(blank → cold install per rollout)",
                )
            with gr.Row():
                max_tokens_cap = gr.Number(value=4096, label="max_tokens_cap", precision=0)
                top_logprobs = gr.Number(value=5, label="top_logprobs", precision=0)
                agent_timeout_s = gr.Number(value=600, label="agent_timeout_s", precision=0)

        run_btn = gr.Button("Run rollout", variant="primary")

        gr.Markdown("---")
        summary_md = gr.Markdown("_Submit a rollout above to see results._")

        # Result panels — one tab per RolloutResult facet.
        with gr.Tabs():
            with gr.Tab("Setup"):
                setup_table = gr.Dataframe(
                    headers=["cmd", "exit", "duration", "stderr"],
                    datatype=["str", "str", "str", "str"],
                    interactive=False,
                    wrap=True,
                )
            with gr.Tab("Verify"):
                verify_table = gr.Dataframe(
                    headers=["cmd", "exit", "duration", "stderr"],
                    datatype=["str", "str", "str", "str"],
                    interactive=False,
                    wrap=True,
                )
            with gr.Tab("Files"):
                files_md = gr.Markdown("")
            with gr.Tab("Logprobs"):
                logprobs_md = gr.Markdown("")
            with gr.Tab("Logs"):
                logs_md = gr.Markdown("")
            with gr.Tab("Raw JSON"):
                raw_json = gr.JSON(value={})

        # Wire it up.
        for btn, name in [
            (preset_bs, "binary_search"),
            (preset_fb, "fizzbuzz"),
            (preset_pd, "pandas_csv"),
        ]:
            # ``n=name`` binds the loop variable at definition time —
            # without it every button would apply the last preset.
            btn.click(
                fn=lambda n=name: apply_preset(n),
                outputs=[instruction, setup_text, verify_text],
            )

        run_btn.click(
            fn=run,
            inputs=[
                endpoint, model, base_url, api_key,
                instruction, setup_text, verify_text,
                mode, disable_thinking, template,
                max_tokens_cap, top_logprobs, agent_timeout_s,
            ],
            outputs=[
                summary_md, setup_table, verify_table,
                files_md, logprobs_md, logs_md, raw_json,
            ],
        )

    return app
|
server/opencode_environment.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""OpenCode MCP environment.
|
| 8 |
+
|
| 9 |
+
Single MCP tool ``run_rollout`` that takes a uniform Task shape:
|
| 10 |
+
|
| 11 |
+
- ``instruction`` — prompt for the agent
|
| 12 |
+
- ``setup`` — bash commands run BEFORE the agent (in the sandbox)
|
| 13 |
+
- ``verify`` — bash commands run AFTER the agent
|
| 14 |
+
|
| 15 |
+
Reward = ``passed_verify_commands / total`` unless a verify command writes
|
| 16 |
+
a float to ``/home/user/logs/verifier/reward.txt`` (override).
|
| 17 |
+
|
| 18 |
+
Returns a JSON-serialized :class:`RolloutResult` with reward + per-turn
|
| 19 |
+
logprobs (Mode B) + setup/verify command results + file outputs.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import os
|
| 25 |
+
import time
|
| 26 |
+
from typing import Any, Optional
|
| 27 |
+
from uuid import uuid4
|
| 28 |
+
|
| 29 |
+
from fastmcp import FastMCP
|
| 30 |
+
|
| 31 |
+
try:
|
| 32 |
+
from openenv.core.env_server.mcp_environment import MCPEnvironment
|
| 33 |
+
from openenv.core.env_server.types import Action, Observation
|
| 34 |
+
|
| 35 |
+
from .catalog import ENDPOINT_KINDS, resolve_endpoint
|
| 36 |
+
except ImportError: # pragma: no cover
|
| 37 |
+
from openenv.core.env_server.mcp_environment import MCPEnvironment
|
| 38 |
+
from openenv.core.env_server.types import Action, Observation
|
| 39 |
+
|
| 40 |
+
from server.catalog import ENDPOINT_KINDS, resolve_endpoint # type: ignore
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# One rollout (sandbox cold start + opencode install + opencode run +
# verifier) typically takes 30-180s; can spike to ~600s under load. Override
# OpenEnv's 30s MCP-tool default so the server doesn't cut us off.
_RUN_ROLLOUT_TIMEOUT_S = 900.0

# Inside-sandbox paths the server writes/reads.
HOME = "/home/user"
WORKDIR = f"{HOME}/workdir"  # agent working directory; files here are returned
INSTRUCTION_PATH = f"{HOME}/task/instruction.md"  # where the prompt is written
REWARD_FILE = f"{HOME}/logs/verifier/reward.txt"  # optional float reward override
PROXY_LOG = f"{HOME}/logs/agent/proxy.log"  # interception-proxy log tail
AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl"  # opencode agent log tail
# Per-command timeout for verify commands, seconds.
VERIFY_TIMEOUT_S = 120
|
| 57 |
+
|
| 58 |
+
class OpenCodeEnvironment(MCPEnvironment):
    """Per-session environment exposing a single ``run_rollout`` MCP tool.

    One instance serves one OpenEnv session (``SUPPORTS_CONCURRENT_SESSIONS``
    allows the server to host several side by side). All heavy lifting
    happens in ``_run_rollout_impl``, which drives an E2B sandbox through
    setup -> agent -> verify and returns a JSON-serialized ``RolloutResult``.
    """

    SUPPORTS_CONCURRENT_SESSIONS = True

    def __init__(self) -> None:
        # Lazy imports so module import stays cheap and so tests can patch.
        try:
            from ..models import (
                CommandResult,
                OpenCodeState,
                RolloutResult,
                RolloutTurn,
            )
        except ImportError:  # pragma: no cover
            from models import (  # type: ignore
                CommandResult,
                OpenCodeState,
                RolloutResult,
                RolloutTurn,
            )

        from opencode_env import (
            E2BSandboxBackend,
            OpenCodeConfig,
            OpenCodeSessionFactory,
            OpenCodeTask,
        )

        # Stash the lazily-imported classes on the instance so helper methods
        # can construct them without re-importing.
        self._CommandResult = CommandResult
        self._RolloutResult = RolloutResult
        self._RolloutTurn = RolloutTurn
        self._OpenCodeState = OpenCodeState
        self._OpenCodeConfig = OpenCodeConfig
        self._OpenCodeSessionFactory = OpenCodeSessionFactory
        self._OpenCodeTask = OpenCodeTask
        self._E2BSandboxBackend = E2BSandboxBackend

        # Don't raise on missing E2B_API_KEY here — OpenEnv's web-interface
        # layer instantiates the env at import time for schema introspection,
        # and we want the docs / Gradio UI to load even when the operator is
        # just exploring. The real check happens lazily in
        # ``_run_rollout_impl`` (any rollout without creds fails fast there
        # with a clear error in the result payload).
        self._state = self._OpenCodeState(episode_id=str(uuid4()))

        mcp = FastMCP("opencode_env")

        @mcp.tool
        def run_rollout(
            # Endpoint — either a shorthand (resolved from env vars + catalog
            # defaults) OR explicit base_url+api_key+model. Explicit fields
            # always win over the catalog.
            endpoint: str = "",
            base_url: str = "",
            api_key: str = "",
            model: str = "",
            # Task
            instruction: str = "",
            setup: Optional[list[str]] = None,
            verify: Optional[list[str]] = None,
            # Bookkeeping / tunables
            task_id: str = "",
            mode: str = "transparent_proxy",
            disable_thinking: Optional[bool] = None,
            max_tokens_cap: int = 4096,
            top_logprobs: int = 5,
            agent_timeout_s: float = 600.0,
            template: str = "",
        ) -> str:
            """Run one OpenCode rollout end-to-end.

            ``endpoint`` is the shorthand selector (one of
            ``"vllm"`` / ``"openai"`` / ``"hf_router"``) — the server
            resolves base_url / api_key / model from env vars + catalog
            defaults. Pass any of those explicitly to override.

            See ``opencode_env.client.OpenCodeEnv.run_rollout`` for full
            arg docs. Returns a JSON-serialized ``RolloutResult``.
            """
            # Resolve via catalog when shorthand is provided.
            disable_thinking_resolved = disable_thinking
            if endpoint:
                resolved = resolve_endpoint(
                    endpoint, base_url=base_url, api_key=api_key, model=model
                )
                base_url = resolved.base_url
                api_key = resolved.api_key
                model = resolved.model
                if disable_thinking_resolved is None:
                    disable_thinking_resolved = resolved.disable_thinking_default
            if disable_thinking_resolved is None:
                disable_thinking_resolved = False

            if not (base_url and api_key and model):
                raise ValueError(
                    "must provide either ``endpoint`` (one of "
                    f"{ENDPOINT_KINDS}) or all of base_url + api_key + model"
                )
            if not instruction:
                raise ValueError("instruction is required")

            return self._run_rollout_impl(
                base_url=base_url,
                api_key=api_key,
                model=model,
                instruction=instruction,
                setup=list(setup or []),
                verify=list(verify or []),
                task_id=task_id,
                mode=mode,
                disable_thinking=disable_thinking_resolved,
                max_tokens_cap=max_tokens_cap,
                top_logprobs=top_logprobs,
                agent_timeout_s=agent_timeout_s,
                template=template,
            )

        super().__init__(mcp)

    # ── OpenEnv lifecycle ──────────────────────────────────────────────────

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **_: Any,
    ) -> Observation:
        """Start a fresh episode; returns a ready Observation (no reward)."""
        self._state = self._OpenCodeState(episode_id=episode_id or str(uuid4()))
        return Observation(
            done=False,
            reward=None,
            metadata={
                "status": "ready",
                "message": (
                    "opencode_env ready. Call run_rollout(...) with a task."
                ),
            },
        )

    def _step_impl(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **_: Any,
    ) -> Observation:
        """Fallback for non-tool actions: report the usage error in metadata."""
        return Observation(
            done=False,
            reward=None,
            metadata={
                "error": (
                    f"Unknown action type: {type(action).__name__}. "
                    "Use CallToolAction(name='run_rollout', ...)."
                ),
            },
        )

    def step(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """Delegate to the base class with the rollout timeout as the default."""
        if timeout_s is None:
            timeout_s = _RUN_ROLLOUT_TIMEOUT_S
        return super().step(action, timeout_s=timeout_s, **kwargs)

    async def step_async(
        self,
        action: Action,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> Observation:
        """Async twin of ``step`` with the same default timeout."""
        if timeout_s is None:
            timeout_s = _RUN_ROLLOUT_TIMEOUT_S
        return await super().step_async(action, timeout_s=timeout_s, **kwargs)

    @property
    def state(self) -> Any:
        """Per-session bookkeeping (episode id, rollout counters, last reward)."""
        return self._state

    # ── Rollout orchestration ──────────────────────────────────────────────

    def _run_rollout_impl(
        self,
        *,
        base_url: str,
        api_key: str,
        model: str,
        instruction: str,
        setup: list[str],
        verify: list[str],
        task_id: str,
        mode: str,
        disable_thinking: bool,
        max_tokens_cap: int,
        top_logprobs: int,
        agent_timeout_s: float,
        template: str,
    ) -> str:
        """Drive one rollout: sandbox create -> setup -> agent -> verify -> collect.

        Never raises: any failure is recorded in ``result.error`` and the
        (possibly partial) ``RolloutResult`` is still serialized and returned.
        """
        result = self._RolloutResult(task_id=task_id, mode=mode)
        t0 = time.time()

        # Late credential check — keeps the server importable in dev /
        # docs-only contexts.
        if not os.environ.get("E2B_API_KEY"):
            result.error = (
                "E2B_API_KEY is not set on the server. Configure it in the "
                "Space's secrets / your .env / your shell before calling "
                "run_rollout."
            )
            result.wall_s = round(time.time() - t0, 3)
            return result.model_dump_json()

        # Build OpenCodeConfig + factory. We keep the proxy in charge of
        # ``model_override`` / ``logprobs`` / ``max_tokens``-cap injection.
        config = self._OpenCodeConfig(
            provider="openai_compatible",
            base_url=base_url.rstrip("/"),
            api_key=api_key,
            model=model,
            agent_timeout_s=agent_timeout_s,
            proxy_disable_thinking=disable_thinking,
            proxy_top_logprobs=top_logprobs,
            proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
        )

        instruction_payload = instruction
        opencode_task = self._OpenCodeTask(
            instruction=instruction_payload,
            metadata={"task_id": task_id},
        )

        backend_kwargs: dict[str, Any] = {}
        if template:
            backend_kwargs["template"] = template

        factory = self._OpenCodeSessionFactory(
            config=config,
            sandbox_backend=self._E2BSandboxBackend(**backend_kwargs),
            mode=mode,
            verifier=None,
        )

        session = None
        try:
            session = factory.create(task=opencode_task)
            result.sandbox_id = session.sandbox.sandbox_id

            # Run setup commands one at a time, *before* the agent starts.
            # The factory has already started the agent in start_agent()
            # during create(); to keep the order "setup → agent → verify"
            # we'd need to restructure. As a pragmatic compromise we run
            # setup IMMEDIATELY after create(), which races with the agent
            # for ~1-2s but is fine for typical pip/git/download work
            # because opencode itself takes >=20s to make its first model
            # call.
            for cmd in setup:
                cr = self._exec_command(session.sandbox, cmd)
                result.setup_results.append(cr)
                if cr.exit_code != 0:
                    result.error = (
                        f"setup command failed (exit {cr.exit_code}): {cmd[:120]}"
                    )
                    break

            # Block until the agent is done (or setup already failed).
            if result.error is None:
                try:
                    result.agent_exit_code = session.wait_for_completion(
                        timeout_s=agent_timeout_s
                    )
                except TimeoutError as exc:
                    result.error = f"agent timeout: {exc}"

            # Run verify commands one at a time, capture each.
            verify_passed = 0
            for cmd in verify:
                cr = self._exec_command(session.sandbox, cmd)
                result.verify_results.append(cr)
                if cr.exit_code == 0:
                    verify_passed += 1

            # Reward: explicit reward.txt wins; else passed/total of verify.
            override = self._read_reward(session.sandbox)
            if override is not None:
                result.reward = override
            elif verify:
                result.reward = verify_passed / len(verify)
            else:
                result.reward = None

            # Collect filesystem + proxy trace.
            result.files, result.files_extra = self._collect_files(session.sandbox)
            result.proxy_turns = self._collect_proxy_turns(session)
            self._capture_log_tails(result, session.sandbox)
        except Exception as exc:  # noqa: BLE001
            result.error = f"{type(exc).__name__}: {exc}"
            if session is not None:
                self._capture_log_tails(result, session.sandbox)
        finally:
            if session is not None:
                try:
                    session.close()
                except Exception:
                    # Best-effort cleanup; the sandbox will time out on its own.
                    pass

        result.wall_s = round(time.time() - t0, 3)

        # Bookkeeping on the per-session state.
        self._state.rollouts_completed += 1
        self._state.last_reward = result.reward
        self._state.last_task_id = task_id or None
        self._state.last_sandbox_id = result.sandbox_id or None

        return result.model_dump_json()

    # ── Helpers ────────────────────────────────────────────────────────────

    def _capture_log_tails(self, result: Any, sandbox: Any) -> None:
        """Attach the last 2000 chars of the proxy and agent logs to *result*."""
        result.proxy_log_tail = self._safe_read(sandbox, PROXY_LOG)[-2000:]
        result.agent_log_tail = self._safe_read(sandbox, AGENT_LOG)[-2000:]

    def _exec_command(self, sandbox: Any, cmd: str) -> Any:
        """Run *cmd* in the sandbox, returning a CommandResult (never raises)."""
        t = time.time()
        try:
            r = sandbox.exec(cmd, timeout=VERIFY_TIMEOUT_S)
            return self._CommandResult(
                cmd=cmd,
                exit_code=int(r.exit_code),
                stdout=(r.stdout or "")[-2000:],
                stderr=(r.stderr or "")[-2000:],
                duration_s=round(time.time() - t, 3),
            )
        except Exception as exc:  # noqa: BLE001
            return self._CommandResult(
                cmd=cmd,
                exit_code=-1,
                stderr=f"{type(exc).__name__}: {exc}",
                duration_s=round(time.time() - t, 3),
            )

    def _read_reward(self, sandbox: Any) -> float | None:
        """Parse an explicit reward override from REWARD_FILE, if present."""
        raw = self._safe_read(sandbox, REWARD_FILE).strip()
        if not raw:
            return None
        try:
            return float(raw)
        except ValueError:
            return None

    def _collect_files(
        self, sandbox: Any
    ) -> tuple[dict[str, str], list[str]]:
        """Read back small workdir files: (path -> content, unreadable paths)."""
        listing = sandbox.exec(
            f"find {WORKDIR} -maxdepth 2 -type f -size -64k 2>/dev/null | head -32",
            timeout=10,
        )
        files: dict[str, str] = {}
        extras: list[str] = []
        for line in (listing.stdout or "").splitlines():
            path = line.strip()
            if not path:
                continue
            try:
                files[path] = sandbox.read_text(path)[:8000]
            except Exception:
                extras.append(path)
        return files, extras

    def _collect_proxy_turns(self, session: Any) -> list[Any]:
        """Parse the proxy's JSONL trace into RolloutTurn records (best effort)."""
        # Hoisted out of the per-line loop below — the original re-executed
        # ``import json`` once per trace line.
        import json as _json

        turns: list[Any] = []
        proxy_trace_path = getattr(session, "_proxy_trace_path", None)
        if not proxy_trace_path:
            return turns
        raw = self._safe_read(session.sandbox, proxy_trace_path)
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                rec = _json.loads(line)
            except Exception:
                # Skip truncated / malformed trace lines.
                continue
            response = rec.get("response") or {}
            choice = (response.get("choices") or [{}])[0] if response.get("choices") else {}
            turns.append(
                self._RolloutTurn(
                    turn=int(rec.get("turn") or 0),
                    finish_reason=rec.get("finish_reason"),
                    completion_tokens=list(rec.get("completion_tokens") or []),
                    completion_token_ids=list(rec.get("completion_token_ids") or []),
                    per_token_logps=[
                        float(x) for x in (rec.get("per_token_logps") or [])
                        if x is not None
                    ],
                    latency_s=float(rec.get("latency_s") or 0.0),
                    timestamp=float(rec.get("timestamp") or 0.0),
                    upstream_status=response.get("upstream_status"),
                    upstream_error=response.get("upstream_error"),
                )
            )
        return turns

    @staticmethod
    def _safe_read(sandbox: Any, path: str) -> str:
        """Read a sandbox file as text, returning "" on any failure."""
        try:
            return sandbox.read_text(path) or ""
        except Exception:
            return ""
|
task.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Task payload accepted by :class:`OpenCodeSessionFactory`."""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from typing import Any
|
| 12 |
+
|
| 13 |
+
from pydantic import BaseModel, Field
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class OpenCodeTask(BaseModel):
    """One task for an OpenCode rollout.

    Only ``instruction`` (the prompt handed to ``opencode run``) is
    mandatory. ``setup_shell`` is executed once inside the sandbox before
    the agent starts, and ``upload_files`` maps absolute sandbox paths to
    file contents written before launch. Anything else belongs in
    ``metadata``, which is passed through to the verifier untouched.
    """

    instruction: str
    setup_shell: str | None = None
    upload_files: dict[str, str] = Field(default_factory=dict)
    metadata: dict[str, Any] = Field(default_factory=dict)

    @classmethod
    def coerce(cls, value: Any) -> "OpenCodeTask":
        """Accept a bare string, a dict, or an existing ``OpenCodeTask``."""
        if isinstance(value, cls):
            return value
        if isinstance(value, str):
            # A bare string is shorthand for {"instruction": <string>}.
            value = {"instruction": value}
        if isinstance(value, dict):
            return cls(**value)
        raise TypeError(
            f"Cannot coerce {type(value).__name__} to OpenCodeTask; "
            "pass a str, dict, or OpenCodeTask."
        )
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_config.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
from opencode_env.config import OpenCodeConfig, provider_npm_package
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_defaults_require_only_base_url():
    """A bare base_url yields the documented interception defaults."""
    cfg = OpenCodeConfig(base_url="http://localhost:8000/v1")
    expected = {
        "provider": "openai_compatible",
        "api_key": "intercepted",
        "model": "intercepted/model",
        "opencode_version": "latest",
        "run_format": "json",
    }
    for attr, want in expected.items():
        assert getattr(cfg, attr) == want
    assert "webfetch" in cfg.disabled_tools
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_provider_npm_mapping():
    """Each provider id maps to its @ai-sdk npm package."""
    mapping = {
        "openai_compatible": "@ai-sdk/openai-compatible",
        "openai": "@ai-sdk/openai",
        "anthropic": "@ai-sdk/anthropic",
    }
    for provider, package in mapping.items():
        assert provider_npm_package(provider) == package
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_rejects_unknown_provider():
    """An unrecognized provider id must be rejected at construction time."""
    with pytest.raises(ValueError):
        # Deliberately invalid provider string.
        OpenCodeConfig(provider="bogus", base_url="x")  # type: ignore[arg-type]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_custom_fields_override_defaults():
    """Explicitly-passed fields win over every default."""
    overrides = dict(
        provider="openai",
        base_url="https://api.openai.com/v1",
        api_key="sk-test",
        model="openai/gpt-5.3-codex",
        opencode_version="0.5.3",
        disabled_tools=["webfetch"],
        system_prompt="be brief",
        extra_env={"FOO": "bar"},
    )
    cfg = OpenCodeConfig(**overrides)
    assert cfg.model == "openai/gpt-5.3-codex"
    assert cfg.opencode_version == "0.5.3"
    assert cfg.extra_env == {"FOO": "bar"}
|
tests/test_five_sorts_e2e.py
ADDED
|
@@ -0,0 +1,1045 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""End-to-end: spawn E2B, install opencode, write 5 sorting algorithms, verify.
|
| 8 |
+
|
| 9 |
+
Talks to E2B and the LLM endpoints directly via the e2b SDK and httpx — no
|
| 10 |
+
imports from the ``opencode_env`` package at runtime. The proxy that captures
|
| 11 |
+
per-token logprobs (Mode B) is uploaded into the sandbox as a standalone
|
| 12 |
+
source file from ``../interception.py``.
|
| 13 |
+
|
| 14 |
+
For each endpoint configured in ``envs/opencode_env/.env`` (vLLM / OpenAI /
|
| 15 |
+
HF Router) this script:
|
| 16 |
+
|
| 17 |
+
1. Creates a fresh E2B sandbox
|
| 18 |
+
2. Installs opencode (``curl https://opencode.ai/install | bash``)
|
| 19 |
+
3. (Mode B only) uploads + starts the in-sandbox logprob-capture proxy
|
| 20 |
+
4. Writes ``opencode.json`` pointing at the proxy (or the LLM directly)
|
| 21 |
+
5. Runs ``opencode run --format json "<instruction>"`` to completion
|
| 22 |
+
6. Runs an in-sandbox verifier that imports each sort module and tests it
|
| 23 |
+
7. Reads back: per-file pass/fail, file contents, proxy logprob stats,
|
| 24 |
+
wall time, sandbox id
|
| 25 |
+
|
| 26 |
+
Default usage (runs every endpoint that has the required vars in .env)::
|
| 27 |
+
|
| 28 |
+
.venv/bin/python envs/opencode_env/tests/test_five_sorts_e2e.py
|
| 29 |
+
|
| 30 |
+
Common flags::
|
| 31 |
+
|
| 32 |
+
--endpoint vllm|openai|hf_router|all (default: all)
|
| 33 |
+
--mode transparent_proxy|black_box (default: transparent_proxy)
|
| 34 |
+
--agent-timeout 600 (seconds before opencode is killed)
|
| 35 |
+
--max-tokens-cap 4096 (per-turn max_tokens clamp)
|
| 36 |
+
--save-artifacts (dump JSON per run to tests/_artifacts/)
|
| 37 |
+
--instruction-override "..." (custom instruction)
|
| 38 |
+
|
| 39 |
+
Requires ``E2B_API_KEY`` in the environment plus per-endpoint creds in .env.
|
| 40 |
+
Each rollout takes 1–7 minutes of wall plus ~10s sandbox cold start.
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
from __future__ import annotations
|
| 44 |
+
|
| 45 |
+
import argparse
|
| 46 |
+
import json
|
| 47 |
+
import os
|
| 48 |
+
import secrets
|
| 49 |
+
import sys
|
| 50 |
+
import time
|
| 51 |
+
from dataclasses import asdict, dataclass, field
|
| 52 |
+
from pathlib import Path
|
| 53 |
+
from statistics import mean
|
| 54 |
+
from typing import Any
|
| 55 |
+
|
| 56 |
+
import httpx
|
| 57 |
+
from e2b import Sandbox
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
# .env loader — minimal, no python-dotenv dep.
|
| 62 |
+
# ---------------------------------------------------------------------------
|
| 63 |
+
|
| 64 |
+
# Paths resolved relative to this test file so the script works from any CWD.
_THIS_DIR = Path(__file__).resolve().parent
_ENV_DIR = _THIS_DIR.parent
# Per-endpoint credentials live next to the package, not the tests.
_DOTENV_PATH = _ENV_DIR / ".env"
# Standalone proxy source uploaded into the sandbox (Mode B logprob capture).
_PROXY_SOURCE_PATH = _ENV_DIR / "sandbox" / "interception.py"
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _load_env(path: Path) -> None:
    """Populate ``os.environ`` from a dotenv-style file.

    Silently does nothing when *path* is missing. Blank lines, ``#``
    comments, and lines without ``=`` are skipped; surrounding quotes are
    stripped from values. Variables already present in the environment are
    never overridden.
    """
    if not path.exists():
        return
    for entry in path.read_text().splitlines():
        stripped = entry.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        key, _, value = stripped.partition("=")
        key = key.strip()
        value = value.strip().strip('"').strip("'")
        if key and key not in os.environ:
            os.environ[key] = value
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# Populate os.environ from .env once at import time (existing vars win).
_load_env(_DOTENV_PATH)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ---------------------------------------------------------------------------
|
| 88 |
+
# Endpoint specs — three flavors, all OAI-compatible.
|
| 89 |
+
# ---------------------------------------------------------------------------
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@dataclass
class Endpoint:
    """One OpenAI-compatible chat endpoint the E2E script can target."""

    # Short identifier ("vllm" / "openai" / "hf_router") used in skip messages.
    label: str
    # Normalized by _resolve_endpoints to always end in "/v1".
    base_url: str
    model: str
    api_key: str
    # Inject ``chat_template_kwargs.enable_thinking=false`` on forwarded
    # requests. Needed for Qwen3.5 served via vLLM (otherwise the model
    # spends its budget on reasoning). OpenAI rejects this field with HTTP
    # 400 ("Unrecognized request argument"); HF Router's Instruct variant
    # doesn't need it. Default per-endpoint, overridable via CLI.
    disable_thinking_default: bool = False
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _resolve_endpoints() -> tuple[list[Endpoint], list[str]]:
    """Return (configured, skipped_reasons) from current process env."""
    # Each spec: (label, base_url_env, default_base_url, model_env,
    # default_model, api_key_env, default_api_key).
    specs = [
        (
            "vllm",
            "VLLM_URL",
            "",
            "VLLM_MODEL",
            "Qwen/Qwen3.5-4B",
            "VLLM_API_KEY",
            "intercepted",
        ),
        (
            "openai",
            "OPENAI_BASE_URL",
            "https://api.openai.com/v1",
            "OPENAI_MODEL",
            "gpt-4o-mini",
            "OPENAI_API_KEY",
            "",
        ),
        (
            "hf_router",
            "HF_ROUTER_BASE_URL",
            "https://router.huggingface.co/v1",
            "HF_ROUTER_MODEL",
            "Qwen/Qwen3-4B-Instruct-2507:nscale",
            "HF_ROUTER_API_KEY",
            "",
        ),
    ]
    configured: list[Endpoint] = []
    missing: list[str] = []
    env = os.environ
    for label, url_var, url_dflt, model_var, model_dflt, key_var, key_dflt in specs:
        url = env.get(url_var) or url_dflt
        model_name = env.get(model_var) or model_dflt
        key = env.get(key_var) or key_dflt
        if not (url and model_name and key):
            missing.append(
                f"{label} (need {url_var} / {model_var} / {key_var} in .env)"
            )
            continue
        # Always normalize to a /v1 base URL — opencode + the proxy expect it.
        url = url.rstrip("/")
        if not url.endswith("/v1"):
            url = f"{url}/v1"
        configured.append(
            Endpoint(
                label=label,
                base_url=url,
                model=model_name,
                api_key=key,
                disable_thinking_default=(label == "vllm"),
            )
        )
    return configured, missing
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
# ---------------------------------------------------------------------------
|
| 167 |
+
# The locked task: instruction + verifier source. Identical for all endpoints.
|
| 168 |
+
# ---------------------------------------------------------------------------
|
| 169 |
+
|
| 170 |
+
# Module names the agent must create; the in-sandbox verifier imports each.
MODULES = ["bubble_sort", "merge_sort", "quick_sort"]

# The locked task prompt — identical for every endpoint so runs are comparable.
INSTRUCTION = (
    "Create THREE Python files in the current working directory, one per "
    "sorting algorithm. Use RELATIVE paths — do NOT write to absolute paths "
    "like `/bubble_sort.py`. Files (one algorithm each):\n"
    " - bubble_sort.py -> bubble sort\n"
    " - merge_sort.py -> merge sort\n"
    " - quick_sort.py -> quicksort\n\n"
    "Each file MUST expose exactly one function with this signature:\n"
    " def sort(arr: list[int]) -> list[int]\n\n"
    "It must return a NEW list sorted in non-decreasing order (do not mutate "
    "the input). Each file must implement the algorithm named for it — do "
    "NOT call `sorted()` or `list.sort()`, and do NOT import third-party "
    "libraries. Handle edge cases: empty list, single element, duplicates, "
    "already-sorted, reverse-sorted, negative numbers. Do not write tests, "
    "a main block, README, or any other files."
)
|
| 188 |
+
|
| 189 |
+
VERIFIER_SOURCE = '''\
|
| 190 |
+
"""Verifier for the three-sorts E2E test. Runs inside the sandbox."""
|
| 191 |
+
import importlib
|
| 192 |
+
import json
|
| 193 |
+
import re
|
| 194 |
+
import shutil
|
| 195 |
+
import sys
|
| 196 |
+
import traceback
|
| 197 |
+
from pathlib import Path
|
| 198 |
+
|
| 199 |
+
WORKDIR = Path("/home/user/workdir")
|
| 200 |
+
LOG_DIR = Path("/home/user/logs/verifier")
|
| 201 |
+
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
| 202 |
+
WORKDIR.mkdir(parents=True, exist_ok=True)
|
| 203 |
+
sys.path.insert(0, str(WORKDIR))
|
| 204 |
+
|
| 205 |
+
MODULES = ["bubble_sort", "merge_sort", "quick_sort"]
|
| 206 |
+
|
| 207 |
+
# Some models (notably Qwen3 served via vLLM) ignore "use relative paths"
|
| 208 |
+
# and write files to ``/<name>.py``. With ``--dangerously-skip-permissions``
|
| 209 |
+
# opencode allows it, so we relocate any stray files into WORKDIR so the
|
| 210 |
+
# import side below is path-uniform.
|
| 211 |
+
for name in MODULES:
|
| 212 |
+
stray = Path("/") / f"{name}.py"
|
| 213 |
+
target = WORKDIR / f"{name}.py"
|
| 214 |
+
if stray.exists() and not target.exists():
|
| 215 |
+
shutil.move(str(stray), str(target))
|
| 216 |
+
|
| 217 |
+
# Each test case: (input, expected_sorted_output)
|
| 218 |
+
CASES = [
|
| 219 |
+
([3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5], [1, 1, 2, 3, 3, 4, 5, 5, 5, 6, 9]),
|
| 220 |
+
([], []),
|
| 221 |
+
([42], [42]),
|
| 222 |
+
([2, 1], [1, 2]),
|
| 223 |
+
([10, 9, 8, 7, 6, 5, 4, 3, 2, 1], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
|
| 224 |
+
([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]),
|
| 225 |
+
([-3, -1, -2, 0, 5, 4], [-3, -2, -1, 0, 4, 5]),
|
| 226 |
+
([7, 7, 7, 7, 7], [7, 7, 7, 7, 7]),
|
| 227 |
+
]
|
| 228 |
+
|
| 229 |
+
# Catch a model that calls ``sorted()`` / ``.sort()`` while pretending to
|
| 230 |
+
# "implement" the named algorithm.
|
| 231 |
+
SOURCE_FORBIDDEN = re.compile(r"\\b(sorted\\s*\\(|\\.sort\\s*\\()")
|
| 232 |
+
|
| 233 |
+
results = {}
|
| 234 |
+
for name in MODULES:
|
| 235 |
+
fpath = WORKDIR / f"{name}.py"
|
| 236 |
+
if not fpath.exists():
|
| 237 |
+
results[name] = "missing"
|
| 238 |
+
continue
|
| 239 |
+
try:
|
| 240 |
+
src = fpath.read_text()
|
| 241 |
+
if SOURCE_FORBIDDEN.search(src):
|
| 242 |
+
results[name] = "cheat: uses sorted()/list.sort()"
|
| 243 |
+
continue
|
| 244 |
+
sys.modules.pop(name, None)
|
| 245 |
+
mod = importlib.import_module(name)
|
| 246 |
+
# Accept either ``sort`` (per spec) or the algorithm-named function
|
| 247 |
+
# (a common drift — e.g. gpt-4o-mini emits ``def bubble_sort(...)``).
|
| 248 |
+
fn = getattr(mod, "sort", None) or getattr(mod, name, None)
|
| 249 |
+
if fn is None:
|
| 250 |
+
results[name] = "no_sort_or_named_function"
|
| 251 |
+
continue
|
| 252 |
+
all_pass = True
|
| 253 |
+
for inp, expected in CASES:
|
| 254 |
+
inp_copy = list(inp)
|
| 255 |
+
actual = fn(list(inp))
|
| 256 |
+
if actual != expected:
|
| 257 |
+
all_pass = False
|
| 258 |
+
results[name] = (
|
| 259 |
+
f"fail: {fn.__name__}({inp!r}) -> {actual!r}, "
|
| 260 |
+
f"expected {expected!r}"
|
| 261 |
+
)
|
| 262 |
+
break
|
| 263 |
+
# The callee should not mutate the caller's list.
|
| 264 |
+
if list(inp) != inp_copy:
|
| 265 |
+
all_pass = False
|
| 266 |
+
results[name] = (
|
| 267 |
+
f"fail: {fn.__name__} mutated input {inp!r} -> {inp_copy!r}"
|
| 268 |
+
)
|
| 269 |
+
break
|
| 270 |
+
if all_pass:
|
| 271 |
+
results[name] = "pass"
|
| 272 |
+
except Exception:
|
| 273 |
+
tb = traceback.format_exc()
|
| 274 |
+
results[name] = f"error: {tb.splitlines()[-1]}"
|
| 275 |
+
|
| 276 |
+
passed = sum(1 for v in results.values() if v == "pass")
|
| 277 |
+
reward = passed / len(MODULES)
|
| 278 |
+
(LOG_DIR / "reward.txt").write_text(f"{reward:.4f}")
|
| 279 |
+
(LOG_DIR / "results.json").write_text(json.dumps(results, indent=2))
|
| 280 |
+
print(f"REWARD={reward:.4f} PASSED={passed}/{len(MODULES)}")
|
| 281 |
+
print(f"RESULTS={json.dumps(results)}")
|
| 282 |
+
'''
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
# ---------------------------------------------------------------------------
|
| 286 |
+
# Sandbox paths.
|
| 287 |
+
# ---------------------------------------------------------------------------
|
| 288 |
+
|
| 289 |
+
HOME = "/home/user"
|
| 290 |
+
WORKDIR = f"{HOME}/workdir"
|
| 291 |
+
OPENCODE_BIN = f"{HOME}/.opencode/bin/opencode"
|
| 292 |
+
OPENCODE_CONFIG = f"{HOME}/.config/opencode/opencode.json"
|
| 293 |
+
INSTRUCTION_PATH = f"{HOME}/task/instruction.md"
|
| 294 |
+
VERIFIER_PATH = f"{HOME}/test.py"
|
| 295 |
+
AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl"
|
| 296 |
+
PROXY_LOG = f"{HOME}/logs/agent/proxy.log"
|
| 297 |
+
PROXY_TRACE = f"{HOME}/logs/agent/proxy_trace.jsonl"
|
| 298 |
+
PROXY_SCRIPT_PATH = f"{HOME}/proxy/interception.py"
|
| 299 |
+
REWARD_FILE = f"{HOME}/logs/verifier/reward.txt"
|
| 300 |
+
RESULTS_FILE = f"{HOME}/logs/verifier/results.json"
|
| 301 |
+
PROXY_PORT = 7000
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# ---------------------------------------------------------------------------
|
| 305 |
+
# Result types.
|
| 306 |
+
# ---------------------------------------------------------------------------
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
@dataclass
|
| 310 |
+
class LogprobStats:
|
| 311 |
+
n_turns: int = 0
|
| 312 |
+
productive_turns: int = 0
|
| 313 |
+
total_completion_tokens: int = 0
|
| 314 |
+
tokens_per_turn: list[int] = field(default_factory=list)
|
| 315 |
+
mean_logprob: float | None = None
|
| 316 |
+
first_token: str = ""
|
| 317 |
+
first_logprob: float | None = None
|
| 318 |
+
last_token: str = ""
|
| 319 |
+
last_logprob: float | None = None
|
| 320 |
+
finish_reasons: dict[str, int] = field(default_factory=dict)
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
@dataclass
|
| 324 |
+
class RunResult:
|
| 325 |
+
endpoint: str
|
| 326 |
+
model: str
|
| 327 |
+
base_url: str
|
| 328 |
+
sandbox_id: str = ""
|
| 329 |
+
reward: float | None = None
|
| 330 |
+
tests: dict[str, str] = field(default_factory=dict)
|
| 331 |
+
files: dict[str, str] = field(default_factory=dict)
|
| 332 |
+
files_extra: list[str] = field(default_factory=list)
|
| 333 |
+
logprobs: LogprobStats = field(default_factory=LogprobStats)
|
| 334 |
+
wall_s: float = 0.0
|
| 335 |
+
agent_exit_code: int | None = None
|
| 336 |
+
error: str = ""
|
| 337 |
+
proxy_log_tail: str = ""
|
| 338 |
+
agent_log_tail: str = ""
|
| 339 |
+
verifier_stdout: str = ""
|
| 340 |
+
# Raw per-turn dump (request body + response body, truncated). Saved
|
| 341 |
+
# into artifacts so failures can be debugged without re-running.
|
| 342 |
+
raw_turns: list[dict[str, Any]] = field(default_factory=list)
|
| 343 |
+
|
| 344 |
+
@property
|
| 345 |
+
def passed(self) -> int:
|
| 346 |
+
return sum(1 for v in self.tests.values() if v == "pass")
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
# ---------------------------------------------------------------------------
|
| 350 |
+
# Sandbox helpers — thin wrappers around e2b SDK.
|
| 351 |
+
# ---------------------------------------------------------------------------
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def _exec(
|
| 355 |
+
sbx: Sandbox,
|
| 356 |
+
cmd: str,
|
| 357 |
+
*,
|
| 358 |
+
envs: dict[str, str] | None = None,
|
| 359 |
+
cwd: str | None = None,
|
| 360 |
+
timeout: float = 60,
|
| 361 |
+
) -> tuple[int, str, str]:
|
| 362 |
+
"""Synchronous shell exec. Returns (exit_code, stdout, stderr)."""
|
| 363 |
+
from e2b.sandbox.commands.command_handle import CommandExitException
|
| 364 |
+
|
| 365 |
+
try:
|
| 366 |
+
r = sbx.commands.run(
|
| 367 |
+
cmd, envs=envs, cwd=cwd, timeout=timeout, background=False
|
| 368 |
+
)
|
| 369 |
+
return r.exit_code, r.stdout or "", r.stderr or ""
|
| 370 |
+
except CommandExitException as exc:
|
| 371 |
+
return (
|
| 372 |
+
int(getattr(exc, "exit_code", 1)),
|
| 373 |
+
str(getattr(exc, "stdout", "") or ""),
|
| 374 |
+
str(getattr(exc, "stderr", "") or str(exc)),
|
| 375 |
+
)
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
def _exec_bg_with_timeout(
|
| 379 |
+
sbx: Sandbox,
|
| 380 |
+
cmd: str,
|
| 381 |
+
*,
|
| 382 |
+
envs: dict[str, str] | None = None,
|
| 383 |
+
cwd: str | None = None,
|
| 384 |
+
timeout_s: float = 600,
|
| 385 |
+
poll_interval_s: float = 1.0,
|
| 386 |
+
) -> int:
|
| 387 |
+
"""Run ``cmd`` in the background and poll until it writes a marker file.
|
| 388 |
+
|
| 389 |
+
Returns the command's exit code. Raises ``TimeoutError`` if the marker
|
| 390 |
+
does not appear within ``timeout_s``. ``timeout=0`` is passed to E2B so
|
| 391 |
+
the server-side 60s deadline does not kill the process.
|
| 392 |
+
"""
|
| 393 |
+
marker = f"/tmp/cmd_done_{secrets.token_hex(4)}"
|
| 394 |
+
wrapped = f"({cmd}); echo $? > {marker}"
|
| 395 |
+
sbx.commands.run(
|
| 396 |
+
wrapped, envs=envs, cwd=cwd, background=True, timeout=0
|
| 397 |
+
)
|
| 398 |
+
deadline = time.time() + timeout_s
|
| 399 |
+
while time.time() < deadline:
|
| 400 |
+
try:
|
| 401 |
+
if sbx.files.exists(marker):
|
| 402 |
+
code_str = sbx.files.read(marker).strip()
|
| 403 |
+
return int(code_str) if code_str else -1
|
| 404 |
+
except Exception:
|
| 405 |
+
pass
|
| 406 |
+
time.sleep(poll_interval_s)
|
| 407 |
+
raise TimeoutError(f"command did not finish within {timeout_s}s")
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def _safe_read(sbx: Sandbox, path: str) -> str:
|
| 411 |
+
try:
|
| 412 |
+
return sbx.files.read(path)
|
| 413 |
+
except Exception:
|
| 414 |
+
return ""
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
def _write_text(sbx: Sandbox, path: str, content: str) -> None:
|
| 418 |
+
parent = str(Path(path).parent)
|
| 419 |
+
if parent not in ("", "/"):
|
| 420 |
+
sbx.files.make_dir(parent)
|
| 421 |
+
sbx.files.write(path, content)
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
# ---------------------------------------------------------------------------
|
| 425 |
+
# Bootstrap: install opencode, write config, optionally start proxy.
|
| 426 |
+
# ---------------------------------------------------------------------------
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def _wait_for_sandbox_ready(sbx: Sandbox, *, attempts: int = 15) -> None:
|
| 430 |
+
for _ in range(attempts):
|
| 431 |
+
code, out, _ = _exec(sbx, "echo ok", timeout=5)
|
| 432 |
+
if code == 0 and "ok" in out:
|
| 433 |
+
return
|
| 434 |
+
time.sleep(1)
|
| 435 |
+
raise RuntimeError("sandbox did not become ready within ~15s")
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
def _install_opencode(sbx: Sandbox) -> None:
|
| 439 |
+
cmd = (
|
| 440 |
+
"set -e && "
|
| 441 |
+
f"mkdir -p {HOME}/.config/opencode {HOME}/logs/agent "
|
| 442 |
+
f"{HOME}/logs/verifier {HOME}/task {WORKDIR} {HOME}/proxy && "
|
| 443 |
+
"curl -fsSL https://opencode.ai/install | bash && "
|
| 444 |
+
f'export PATH="{HOME}/.opencode/bin:$PATH" && '
|
| 445 |
+
"opencode --version"
|
| 446 |
+
)
|
| 447 |
+
last_stderr = ""
|
| 448 |
+
for attempt in range(3):
|
| 449 |
+
code, _, err = _exec(sbx, cmd, timeout=240)
|
| 450 |
+
if code == 0:
|
| 451 |
+
return
|
| 452 |
+
last_stderr = err
|
| 453 |
+
time.sleep(3 * (attempt + 1))
|
| 454 |
+
raise RuntimeError(f"opencode install failed: {last_stderr[-1000:]}")
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def _ensure_dirs_exist(sbx: Sandbox) -> None:
|
| 458 |
+
"""When using a pre-baked template, dirs already exist. This is a no-op
|
| 459 |
+
safety net that ensures the layout is present (cheap mkdir -p)."""
|
| 460 |
+
_exec(
|
| 461 |
+
sbx,
|
| 462 |
+
f"mkdir -p {HOME}/.config/opencode {HOME}/logs/agent "
|
| 463 |
+
f"{HOME}/logs/verifier {HOME}/task {WORKDIR} {HOME}/proxy",
|
| 464 |
+
timeout=30,
|
| 465 |
+
)
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
def _start_proxy(
|
| 469 |
+
sbx: Sandbox,
|
| 470 |
+
upstream_url: str,
|
| 471 |
+
upstream_api_key: str,
|
| 472 |
+
upstream_model: str,
|
| 473 |
+
*,
|
| 474 |
+
top_logprobs: int,
|
| 475 |
+
max_tokens_cap: int,
|
| 476 |
+
disable_thinking: bool,
|
| 477 |
+
skip_install: bool = False,
|
| 478 |
+
) -> str:
|
| 479 |
+
"""Upload + start the logprob-capture proxy, return its baseURL.
|
| 480 |
+
|
| 481 |
+
Returns the URL opencode should hit (``http://127.0.0.1:7000/v1``).
|
| 482 |
+
When ``skip_install`` is True (pre-baked template), the proxy source
|
| 483 |
+
and pip deps are assumed to already be present.
|
| 484 |
+
"""
|
| 485 |
+
if not skip_install:
|
| 486 |
+
if not _PROXY_SOURCE_PATH.exists():
|
| 487 |
+
raise RuntimeError(
|
| 488 |
+
f"proxy source not found at {_PROXY_SOURCE_PATH} — needed "
|
| 489 |
+
"for transparent_proxy mode"
|
| 490 |
+
)
|
| 491 |
+
_write_text(sbx, PROXY_SCRIPT_PATH, _PROXY_SOURCE_PATH.read_text())
|
| 492 |
+
|
| 493 |
+
code, _, err = _exec(
|
| 494 |
+
sbx,
|
| 495 |
+
"pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' "
|
| 496 |
+
"'httpx>=0.27' 2>&1 | tail -20",
|
| 497 |
+
timeout=180,
|
| 498 |
+
)
|
| 499 |
+
if code != 0:
|
| 500 |
+
raise RuntimeError(f"proxy deps install failed: {err[-800:]}")
|
| 501 |
+
|
| 502 |
+
flags = (
|
| 503 |
+
f"--upstream-url {upstream_url} "
|
| 504 |
+
f"--upstream-api-key {upstream_api_key} "
|
| 505 |
+
f"--trace {PROXY_TRACE} "
|
| 506 |
+
f"--port {PROXY_PORT} "
|
| 507 |
+
f"--top-logprobs {top_logprobs} "
|
| 508 |
+
f"--max-tokens-cap {max_tokens_cap} "
|
| 509 |
+
f"--model-override '{upstream_model}' "
|
| 510 |
+
)
|
| 511 |
+
if disable_thinking:
|
| 512 |
+
flags += "--disable-thinking "
|
| 513 |
+
cmd = (
|
| 514 |
+
f"cd {HOME}/proxy && "
|
| 515 |
+
f"python interception.py {flags}> {PROXY_LOG} 2>&1"
|
| 516 |
+
)
|
| 517 |
+
sbx.commands.run(cmd, background=True, timeout=0)
|
| 518 |
+
|
| 519 |
+
# Wait for healthz.
|
| 520 |
+
for _ in range(120):
|
| 521 |
+
code, _, _ = _exec(
|
| 522 |
+
sbx, f"curl -sf http://127.0.0.1:{PROXY_PORT}/healthz", timeout=5
|
| 523 |
+
)
|
| 524 |
+
if code == 0:
|
| 525 |
+
return f"http://127.0.0.1:{PROXY_PORT}/v1"
|
| 526 |
+
time.sleep(0.5)
|
| 527 |
+
log = _safe_read(sbx, PROXY_LOG)
|
| 528 |
+
raise RuntimeError(f"proxy did not start within 60s. log:\n{log[-2000:]}")
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
def _write_opencode_json(
|
| 532 |
+
sbx: Sandbox,
|
| 533 |
+
base_url: str,
|
| 534 |
+
api_key: str,
|
| 535 |
+
model: str,
|
| 536 |
+
request_timeout_ms: int = 600_000,
|
| 537 |
+
) -> None:
|
| 538 |
+
"""Stage opencode.json for ``@ai-sdk/openai-compatible``.
|
| 539 |
+
|
| 540 |
+
All three endpoints route through the OAI-compatible adapter — the proxy
|
| 541 |
+
serves ``/v1/chat/completions`` and so does each upstream we target.
|
| 542 |
+
"""
|
| 543 |
+
inner_model = model.split("/", 1)[-1]
|
| 544 |
+
doc = {
|
| 545 |
+
"$schema": "https://opencode.ai/config.json",
|
| 546 |
+
"model": f"intercepted/{inner_model}",
|
| 547 |
+
"provider": {
|
| 548 |
+
"intercepted": {
|
| 549 |
+
"npm": "@ai-sdk/openai-compatible",
|
| 550 |
+
"name": "Intercepted",
|
| 551 |
+
"options": {
|
| 552 |
+
"baseURL": base_url,
|
| 553 |
+
"apiKey": api_key,
|
| 554 |
+
"timeout": request_timeout_ms,
|
| 555 |
+
},
|
| 556 |
+
"models": {inner_model: {"name": "Intercepted Model"}},
|
| 557 |
+
}
|
| 558 |
+
},
|
| 559 |
+
"tools": {"webfetch": False, "question": False},
|
| 560 |
+
}
|
| 561 |
+
_write_text(sbx, OPENCODE_CONFIG, json.dumps(doc, indent=2))
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
# ---------------------------------------------------------------------------
|
| 565 |
+
# Run + verify + collect.
|
| 566 |
+
# ---------------------------------------------------------------------------
|
| 567 |
+
|
| 568 |
+
|
| 569 |
+
def _run_agent(
|
| 570 |
+
sbx: Sandbox,
|
| 571 |
+
*,
|
| 572 |
+
instruction_path: str,
|
| 573 |
+
base_url: str,
|
| 574 |
+
api_key: str,
|
| 575 |
+
timeout_s: float,
|
| 576 |
+
) -> int:
|
| 577 |
+
"""Invoke ``opencode run`` synchronously, return its exit code."""
|
| 578 |
+
envs = {
|
| 579 |
+
"OPENAI_BASE_URL": base_url,
|
| 580 |
+
"OPENAI_API_KEY": api_key,
|
| 581 |
+
"OPENCODE_CONFIG": OPENCODE_CONFIG,
|
| 582 |
+
"PATH": f"{HOME}/.opencode/bin:/usr/local/bin:/usr/bin:/bin",
|
| 583 |
+
}
|
| 584 |
+
cmd = (
|
| 585 |
+
f'export PATH="{HOME}/.opencode/bin:$PATH" && '
|
| 586 |
+
f"cd {WORKDIR} && "
|
| 587 |
+
f"opencode run --format json --dangerously-skip-permissions "
|
| 588 |
+
f'"$(cat {instruction_path})" 2>&1 | tee {AGENT_LOG}'
|
| 589 |
+
)
|
| 590 |
+
return _exec_bg_with_timeout(
|
| 591 |
+
sbx, cmd, envs=envs, timeout_s=timeout_s
|
| 592 |
+
)
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
def _run_verifier(sbx: Sandbox) -> tuple[float | None, dict[str, str], str]:
|
| 596 |
+
cmd = f"mkdir -p {HOME}/logs/verifier && python {VERIFIER_PATH}"
|
| 597 |
+
code, out, err = _exec(sbx, cmd, timeout=120)
|
| 598 |
+
reward_str = _safe_read(sbx, REWARD_FILE).strip()
|
| 599 |
+
results_str = _safe_read(sbx, RESULTS_FILE)
|
| 600 |
+
try:
|
| 601 |
+
reward = float(reward_str) if reward_str else None
|
| 602 |
+
except ValueError:
|
| 603 |
+
reward = None
|
| 604 |
+
try:
|
| 605 |
+
tests = json.loads(results_str) if results_str.strip() else {}
|
| 606 |
+
except json.JSONDecodeError:
|
| 607 |
+
tests = {}
|
| 608 |
+
combined = (out + ("\n" + err if err else "")).strip()
|
| 609 |
+
return reward, tests, combined[-3000:]
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
def _collect_files(sbx: Sandbox) -> tuple[dict[str, str], list[str]]:
|
| 613 |
+
files: dict[str, str] = {}
|
| 614 |
+
for name in MODULES:
|
| 615 |
+
path = f"{WORKDIR}/{name}.py"
|
| 616 |
+
try:
|
| 617 |
+
if sbx.files.exists(path):
|
| 618 |
+
files[f"{name}.py"] = sbx.files.read(path)[:8000]
|
| 619 |
+
except Exception:
|
| 620 |
+
pass
|
| 621 |
+
code, out, _ = _exec(
|
| 622 |
+
sbx,
|
| 623 |
+
f"find {WORKDIR} -maxdepth 1 -type f -printf '%f\\n' 2>/dev/null",
|
| 624 |
+
timeout=10,
|
| 625 |
+
)
|
| 626 |
+
extras: list[str] = []
|
| 627 |
+
expected = {f"{m}.py" for m in MODULES}
|
| 628 |
+
for line in (out or "").splitlines():
|
| 629 |
+
n = line.strip()
|
| 630 |
+
if n and n not in expected and not n.startswith("."):
|
| 631 |
+
extras.append(n)
|
| 632 |
+
return files, extras
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
def _read_proxy_trace(sbx: Sandbox) -> list[dict[str, Any]]:
|
| 636 |
+
raw = _safe_read(sbx, PROXY_TRACE)
|
| 637 |
+
out: list[dict[str, Any]] = []
|
| 638 |
+
for line in raw.splitlines():
|
| 639 |
+
line = line.strip()
|
| 640 |
+
if not line:
|
| 641 |
+
continue
|
| 642 |
+
try:
|
| 643 |
+
out.append(json.loads(line))
|
| 644 |
+
except Exception:
|
| 645 |
+
pass
|
| 646 |
+
return out
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
def _logprob_stats(turns: list[dict[str, Any]]) -> LogprobStats:
|
| 650 |
+
s = LogprobStats(n_turns=len(turns))
|
| 651 |
+
if not turns:
|
| 652 |
+
return s
|
| 653 |
+
all_lps: list[float] = []
|
| 654 |
+
finish: dict[str, int] = {}
|
| 655 |
+
for t in turns:
|
| 656 |
+
toks = t.get("completion_tokens") or []
|
| 657 |
+
lps = t.get("per_token_logps") or []
|
| 658 |
+
s.tokens_per_turn.append(len(toks))
|
| 659 |
+
s.total_completion_tokens += len(toks)
|
| 660 |
+
if toks:
|
| 661 |
+
s.productive_turns += 1
|
| 662 |
+
all_lps.extend(float(x) for x in lps if x is not None)
|
| 663 |
+
fr = t.get("finish_reason") or "unknown"
|
| 664 |
+
finish[fr] = finish.get(fr, 0) + 1
|
| 665 |
+
s.finish_reasons = finish
|
| 666 |
+
if all_lps:
|
| 667 |
+
s.mean_logprob = mean(all_lps)
|
| 668 |
+
first = next((t for t in turns if t.get("completion_tokens")), None)
|
| 669 |
+
last = next(
|
| 670 |
+
(t for t in reversed(turns) if t.get("completion_tokens")), None
|
| 671 |
+
)
|
| 672 |
+
if first:
|
| 673 |
+
s.first_token = str(first["completion_tokens"][0])
|
| 674 |
+
lp = (first.get("per_token_logps") or [None])[0]
|
| 675 |
+
if lp is not None:
|
| 676 |
+
s.first_logprob = float(lp)
|
| 677 |
+
if last:
|
| 678 |
+
s.last_token = str(last["completion_tokens"][-1])
|
| 679 |
+
lp = (last.get("per_token_logps") or [None])[-1]
|
| 680 |
+
if lp is not None:
|
| 681 |
+
s.last_logprob = float(lp)
|
| 682 |
+
return s
|
| 683 |
+
|
| 684 |
+
|
| 685 |
+
# ---------------------------------------------------------------------------
|
| 686 |
+
# One full rollout.
|
| 687 |
+
# ---------------------------------------------------------------------------
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
def run_one(
|
| 691 |
+
ep: Endpoint,
|
| 692 |
+
*,
|
| 693 |
+
mode: str,
|
| 694 |
+
agent_timeout_s: float,
|
| 695 |
+
max_tokens_cap: int,
|
| 696 |
+
top_logprobs: int,
|
| 697 |
+
disable_thinking: bool,
|
| 698 |
+
instruction: str,
|
| 699 |
+
e2b_api_key: str,
|
| 700 |
+
template: str | None = None,
|
| 701 |
+
) -> RunResult:
|
| 702 |
+
print(
|
| 703 |
+
f"[{ep.label}] launching base_url={ep.base_url} model={ep.model} "
|
| 704 |
+
f"mode={mode} template={template or '(default)'}",
|
| 705 |
+
flush=True,
|
| 706 |
+
)
|
| 707 |
+
res = RunResult(endpoint=ep.label, model=ep.model, base_url=ep.base_url)
|
| 708 |
+
started = time.time()
|
| 709 |
+
|
| 710 |
+
sbx = Sandbox.create(
|
| 711 |
+
template=template,
|
| 712 |
+
timeout=int(agent_timeout_s) + 300,
|
| 713 |
+
api_key=e2b_api_key,
|
| 714 |
+
)
|
| 715 |
+
res.sandbox_id = sbx.sandbox_id
|
| 716 |
+
print(f"[{ep.label}] sandbox={sbx.sandbox_id}", flush=True)
|
| 717 |
+
try:
|
| 718 |
+
_wait_for_sandbox_ready(sbx)
|
| 719 |
+
if template:
|
| 720 |
+
_ensure_dirs_exist(sbx)
|
| 721 |
+
else:
|
| 722 |
+
_install_opencode(sbx)
|
| 723 |
+
_write_text(sbx, INSTRUCTION_PATH, instruction)
|
| 724 |
+
_write_text(sbx, VERIFIER_PATH, VERIFIER_SOURCE)
|
| 725 |
+
|
| 726 |
+
if mode == "transparent_proxy":
|
| 727 |
+
base_url = _start_proxy(
|
| 728 |
+
sbx,
|
| 729 |
+
upstream_url=ep.base_url,
|
| 730 |
+
upstream_api_key=ep.api_key,
|
| 731 |
+
upstream_model=ep.model,
|
| 732 |
+
top_logprobs=top_logprobs,
|
| 733 |
+
max_tokens_cap=max_tokens_cap,
|
| 734 |
+
disable_thinking=disable_thinking,
|
| 735 |
+
skip_install=bool(template),
|
| 736 |
+
)
|
| 737 |
+
else:
|
| 738 |
+
base_url = ep.base_url
|
| 739 |
+
|
| 740 |
+
_write_opencode_json(
|
| 741 |
+
sbx,
|
| 742 |
+
base_url=base_url,
|
| 743 |
+
api_key=ep.api_key if mode == "black_box" else "intercepted",
|
| 744 |
+
model=ep.model,
|
| 745 |
+
)
|
| 746 |
+
|
| 747 |
+
try:
|
| 748 |
+
res.agent_exit_code = _run_agent(
|
| 749 |
+
sbx,
|
| 750 |
+
instruction_path=INSTRUCTION_PATH,
|
| 751 |
+
base_url=base_url,
|
| 752 |
+
api_key=ep.api_key if mode == "black_box" else "intercepted",
|
| 753 |
+
timeout_s=agent_timeout_s,
|
| 754 |
+
)
|
| 755 |
+
print(
|
| 756 |
+
f"[{ep.label}] agent exit_code={res.agent_exit_code}",
|
| 757 |
+
flush=True,
|
| 758 |
+
)
|
| 759 |
+
except TimeoutError as exc:
|
| 760 |
+
res.error = f"agent timeout: {exc}"
|
| 761 |
+
print(f"[{ep.label}] {res.error}", flush=True)
|
| 762 |
+
|
| 763 |
+
reward, tests, vstdout = _run_verifier(sbx)
|
| 764 |
+
res.reward, res.tests, res.verifier_stdout = reward, tests, vstdout
|
| 765 |
+
res.files, res.files_extra = _collect_files(sbx)
|
| 766 |
+
turns = _read_proxy_trace(sbx)
|
| 767 |
+
res.logprobs = _logprob_stats(turns)
|
| 768 |
+
# Capture truncated request/response per turn for debugging. Strip
|
| 769 |
+
# large/noisy fields (full token logprobs, raw bytes) to keep the
|
| 770 |
+
# artifact readable.
|
| 771 |
+
for t in turns:
|
| 772 |
+
req = t.get("request") or {}
|
| 773 |
+
resp = t.get("response") or {}
|
| 774 |
+
res.raw_turns.append(
|
| 775 |
+
{
|
| 776 |
+
"turn": t.get("turn"),
|
| 777 |
+
"finish_reason": t.get("finish_reason"),
|
| 778 |
+
"latency_s": t.get("latency_s"),
|
| 779 |
+
"request_messages": req.get("messages", [])[-6:],
|
| 780 |
+
"request_tools": [
|
| 781 |
+
(tool.get("function") or {}).get("name", "?")
|
| 782 |
+
for tool in (req.get("tools") or [])
|
| 783 |
+
],
|
| 784 |
+
"request_temperature": req.get("temperature"),
|
| 785 |
+
"request_max_tokens": req.get("max_tokens")
|
| 786 |
+
or req.get("max_completion_tokens"),
|
| 787 |
+
"response_choices": [
|
| 788 |
+
{
|
| 789 |
+
"finish_reason": ch.get("finish_reason"),
|
| 790 |
+
"message_content": (ch.get("message") or {}).get(
|
| 791 |
+
"content"
|
| 792 |
+
),
|
| 793 |
+
"tool_calls": [
|
| 794 |
+
{
|
| 795 |
+
"name": (tc.get("function") or {}).get(
|
| 796 |
+
"name"
|
| 797 |
+
),
|
| 798 |
+
"arguments": str(
|
| 799 |
+
(tc.get("function") or {}).get(
|
| 800 |
+
"arguments", ""
|
| 801 |
+
)
|
| 802 |
+
)[:500],
|
| 803 |
+
}
|
| 804 |
+
for tc in (
|
| 805 |
+
(ch.get("message") or {}).get("tool_calls")
|
| 806 |
+
or []
|
| 807 |
+
)
|
| 808 |
+
],
|
| 809 |
+
}
|
| 810 |
+
for ch in (resp.get("choices") or [])
|
| 811 |
+
],
|
| 812 |
+
"upstream_status": resp.get("upstream_status"),
|
| 813 |
+
"upstream_error": resp.get("upstream_error"),
|
| 814 |
+
}
|
| 815 |
+
)
|
| 816 |
+
res.proxy_log_tail = _safe_read(sbx, PROXY_LOG)[-2000:]
|
| 817 |
+
res.agent_log_tail = _safe_read(sbx, AGENT_LOG)[-4000:]
|
| 818 |
+
except Exception as exc: # noqa: BLE001
|
| 819 |
+
res.error = f"{type(exc).__name__}: {exc}"
|
| 820 |
+
print(f"[{ep.label}] ERROR {res.error}", flush=True)
|
| 821 |
+
finally:
|
| 822 |
+
try:
|
| 823 |
+
sbx.kill()
|
| 824 |
+
except Exception:
|
| 825 |
+
pass
|
| 826 |
+
res.wall_s = time.time() - started
|
| 827 |
+
return res
|
| 828 |
+
|
| 829 |
+
|
| 830 |
+
# ---------------------------------------------------------------------------
|
| 831 |
+
# Reporting.
|
| 832 |
+
# ---------------------------------------------------------------------------
|
| 833 |
+
|
| 834 |
+
|
| 835 |
+
def _format_summary(results: list[RunResult]) -> str:
|
| 836 |
+
lines: list[str] = []
|
| 837 |
+
sep = "-" * 110
|
| 838 |
+
lines.append(sep)
|
| 839 |
+
lines.append(
|
| 840 |
+
f"{'endpoint':<10} {'model':<42} {'reward':<8} {'pass':<6} "
|
| 841 |
+
f"{'turns':<6} {'tokens':<8} {'mean-logp':<11} {'wall':<8}"
|
| 842 |
+
)
|
| 843 |
+
lines.append(sep)
|
| 844 |
+
for r in results:
|
| 845 |
+
reward = f"{r.reward:.2f}" if r.reward is not None else "-"
|
| 846 |
+
pass_str = f"{r.passed}/{len(MODULES)}"
|
| 847 |
+
mean_lp = (
|
| 848 |
+
f"{r.logprobs.mean_logprob:+.3f}"
|
| 849 |
+
if r.logprobs.mean_logprob is not None
|
| 850 |
+
else "-"
|
| 851 |
+
)
|
| 852 |
+
lines.append(
|
| 853 |
+
f"{r.endpoint:<10} {r.model[:42]:<42} {reward:<8} {pass_str:<6} "
|
| 854 |
+
f"{r.logprobs.n_turns:<6} {r.logprobs.total_completion_tokens:<8} "
|
| 855 |
+
f"{mean_lp:<11} {r.wall_s:<7.1f}s"
|
| 856 |
+
)
|
| 857 |
+
lines.append(sep)
|
| 858 |
+
lines.append("")
|
| 859 |
+
lines.append("per-file results:")
|
| 860 |
+
for r in results:
|
| 861 |
+
per_file = " ".join(
|
| 862 |
+
f"{m}={r.tests.get(m, '?')}" for m in MODULES
|
| 863 |
+
)
|
| 864 |
+
lines.append(f" {r.endpoint:<10} {per_file}")
|
| 865 |
+
if r.files_extra:
|
| 866 |
+
lines.append(
|
| 867 |
+
f" {' ':<10} extras: {', '.join(sorted(r.files_extra))}"
|
| 868 |
+
)
|
| 869 |
+
if r.error:
|
| 870 |
+
lines.append(f" {' ':<10} ERROR: {r.error[:200]}")
|
| 871 |
+
return "\n".join(lines)
|
| 872 |
+
|
| 873 |
+
|
| 874 |
+
def _save_artifact(r: RunResult, out_dir: Path) -> Path:
|
| 875 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
| 876 |
+
ts = int(time.time())
|
| 877 |
+
target = out_dir / f"sorting_{r.endpoint}_{ts}.json"
|
| 878 |
+
target.write_text(json.dumps(asdict(r), indent=2, default=str))
|
| 879 |
+
return target
|
| 880 |
+
|
| 881 |
+
|
| 882 |
+
# ---------------------------------------------------------------------------
|
| 883 |
+
# CLI.
|
| 884 |
+
# ---------------------------------------------------------------------------
|
| 885 |
+
|
| 886 |
+
|
| 887 |
+
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
| 888 |
+
p = argparse.ArgumentParser(
|
| 889 |
+
prog="test_five_sorts_e2e",
|
| 890 |
+
description=(
|
| 891 |
+
"Run opencode end-to-end against vLLM / OpenAI / HF Router, "
|
| 892 |
+
"write 5 sorting algorithms in 5 files, verify them, return "
|
| 893 |
+
"logprobs + tests + filesystem."
|
| 894 |
+
),
|
| 895 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 896 |
+
)
|
| 897 |
+
p.add_argument(
|
| 898 |
+
"--endpoint",
|
| 899 |
+
choices=["vllm", "openai", "hf_router", "all"],
|
| 900 |
+
default="all",
|
| 901 |
+
help="Which endpoint to test (default: all configured endpoints).",
|
| 902 |
+
)
|
| 903 |
+
p.add_argument(
|
| 904 |
+
"--mode",
|
| 905 |
+
choices=["transparent_proxy", "black_box"],
|
| 906 |
+
default="transparent_proxy",
|
| 907 |
+
help=(
|
| 908 |
+
"transparent_proxy captures per-token logprobs; black_box skips "
|
| 909 |
+
"the proxy. Default: transparent_proxy."
|
| 910 |
+
),
|
| 911 |
+
)
|
| 912 |
+
p.add_argument(
|
| 913 |
+
"--agent-timeout",
|
| 914 |
+
type=float,
|
| 915 |
+
default=600.0,
|
| 916 |
+
help="Seconds to wait for opencode to finish (default 600).",
|
| 917 |
+
)
|
| 918 |
+
p.add_argument(
|
| 919 |
+
"--max-tokens-cap",
|
| 920 |
+
type=int,
|
| 921 |
+
default=4096,
|
| 922 |
+
help="Per-turn max_tokens clamp on forwarded requests (default 4096).",
|
| 923 |
+
)
|
| 924 |
+
p.add_argument(
|
| 925 |
+
"--top-logprobs",
|
| 926 |
+
type=int,
|
| 927 |
+
default=5,
|
| 928 |
+
help="Top-k logprobs requested from the upstream (HF Router cap is 5).",
|
| 929 |
+
)
|
| 930 |
+
p.add_argument(
|
| 931 |
+
"--disable-thinking",
|
| 932 |
+
choices=["auto", "on", "off"],
|
| 933 |
+
default="auto",
|
| 934 |
+
help=(
|
| 935 |
+
"Inject ``chat_template_kwargs.enable_thinking=false`` on "
|
| 936 |
+
"forwarded requests. ``auto`` = on for vllm, off for openai / "
|
| 937 |
+
"hf_router (default). ``on`` / ``off`` forces it for every "
|
| 938 |
+
"endpoint."
|
| 939 |
+
),
|
| 940 |
+
)
|
| 941 |
+
p.add_argument(
|
| 942 |
+
"--save-artifacts",
|
| 943 |
+
action="store_true",
|
| 944 |
+
help="Dump per-run JSON to envs/opencode_env/tests/_artifacts/.",
|
| 945 |
+
)
|
| 946 |
+
p.add_argument(
|
| 947 |
+
"--instruction-override",
|
| 948 |
+
default=None,
|
| 949 |
+
help="Replace the default 5-sorts instruction.",
|
| 950 |
+
)
|
| 951 |
+
p.add_argument(
|
| 952 |
+
"--no-summary-files",
|
| 953 |
+
action="store_true",
|
| 954 |
+
help="Skip printing file contents in the summary.",
|
| 955 |
+
)
|
| 956 |
+
p.add_argument(
|
| 957 |
+
"--template",
|
| 958 |
+
default=None,
|
| 959 |
+
help=(
|
| 960 |
+
"E2B template name to use (e.g. ``opencode-rl`` after running "
|
| 961 |
+
"build_e2b_template.py). When set, skips opencode install + "
|
| 962 |
+
"pip-deps install (already in the template) — saves ~2 min "
|
| 963 |
+
"per rollout."
|
| 964 |
+
),
|
| 965 |
+
)
|
| 966 |
+
return p.parse_args(argv)
|
| 967 |
+
|
| 968 |
+
|
| 969 |
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: run the 5-sorts task against each configured endpoint.

    Exit codes: 0 when every endpoint reaches reward=1.0, 1 when at least
    one run fell short (missing reward, reward < 1.0, or an error), and 2
    when the environment is unusable (no E2B_API_KEY or no endpoints ran).
    """
    args = _parse_args(argv)

    # The sandbox backend is E2B; nothing below can run without a key.
    e2b_api_key = os.environ.get("E2B_API_KEY")
    if not e2b_api_key:
        print(
            "ERROR: E2B_API_KEY is required (set it in .env or your shell).",
            file=sys.stderr,
        )
        return 2

    print(f"Loading env from {_DOTENV_PATH}")
    endpoints, skipped = _resolve_endpoints()
    if args.endpoint != "all":
        # Narrow to the single endpoint requested on the command line.
        endpoints = [e for e in endpoints if e.label == args.endpoint]

    instruction = args.instruction_override or INSTRUCTION

    runs: list[RunResult] = []
    for ep in endpoints:
        # --disable-thinking on/off forces the flag; any other value defers
        # to the endpoint's own default.
        if args.disable_thinking == "on":
            disable_thinking = True
        elif args.disable_thinking == "off":
            disable_thinking = False
        else:
            disable_thinking = ep.disable_thinking_default
        runs.append(
            run_one(
                ep,
                mode=args.mode,
                agent_timeout_s=args.agent_timeout,
                max_tokens_cap=args.max_tokens_cap,
                top_logprobs=args.top_logprobs,
                disable_thinking=disable_thinking,
                instruction=instruction,
                e2b_api_key=e2b_api_key,
                template=args.template,
            )
        )

    print()
    print(_format_summary(runs))

    if skipped:
        print("\nSkipped (not configured):")
        for s in skipped:
            print(f" - {s}")

    if not args.no_summary_files:
        for r in runs:
            print(f"\n=== files written by {r.endpoint} ({r.model}) ===")
            for fname, src in r.files.items():
                # Count lines once via splitlines() — the same way the
                # preview is built — so the "... N more lines" tally is
                # exact even when the file has no trailing newline (the old
                # newline count undercounted by one in that case).
                lines = src.splitlines()
                print(f"--- {fname} (first 20 lines) ---")
                print("\n".join(lines[:20]))
                if len(lines) > 20:
                    print(f"... ({len(lines) - 20} more lines)")

    if args.save_artifacts:
        out_dir = _ENV_DIR / "tests" / "_artifacts"
        for r in runs:
            print(f"saved {_save_artifact(r, out_dir)}")

    if not runs:
        print("\nNo endpoints ran. Fill in .env and re-run.")
        return 2

    failed = [r for r in runs if r.reward is None or r.reward < 1.0 or r.error]
    if failed:
        print(f"\n{len(failed)}/{len(runs)} endpoint(s) did not reach reward=1.0.")
        return 1
    print(f"\nAll {len(runs)} endpoint(s) reached reward=1.0.")
    return 0
|
| 1042 |
+
|
| 1043 |
+
|
| 1044 |
+
if __name__ == "__main__":
    # Propagate main()'s exit code (0 ok / 1 failures / 2 unusable env).
    sys.exit(main())
|
tests/test_harness.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Unit tests for OpenCodeSession / OpenCodeSessionFactory (no sandbox)."""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import pytest
|
| 12 |
+
|
| 13 |
+
from opencode_env.config import OpenCodeConfig
|
| 14 |
+
from opencode_env.harness import OpenCodeSession, OpenCodeSessionFactory
|
| 15 |
+
from opencode_env.sandbox.base import ExecResult
|
| 16 |
+
from opencode_env.task import OpenCodeTask
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class _FakeBgJob:
|
| 20 |
+
def __init__(self) -> None:
|
| 21 |
+
self.pid = 123
|
| 22 |
+
self._killed = False
|
| 23 |
+
|
| 24 |
+
def wait(self, timeout: float | None = None) -> int:
|
| 25 |
+
return 0
|
| 26 |
+
|
| 27 |
+
def kill(self) -> None:
|
| 28 |
+
self._killed = True
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class _FakeSandbox:
    """In-memory sandbox that records every interaction."""

    def __init__(self, *, install_exit: int = 0, setup_exit: int = 0) -> None:
        self.sandbox_id = "fake-sbx"
        # Each of these captures one kind of interaction for later asserts.
        self.exec_calls: list[tuple[str, dict | None]] = []
        self.written: dict[str, str] = {}
        self.bg_calls: list[tuple[str, dict | None]] = []
        self.killed = False
        self._install_exit = install_exit
        self._setup_exit = setup_exit

    def exec(self, cmd, *, envs=None, cwd=None, timeout=60):
        """Record the command and synthesize a canned ExecResult."""
        self.exec_calls.append((cmd, envs))
        # Health probe: the factory issues ``echo ok`` up to 15 times before
        # doing anything else. The fake sandbox is "ready" on the first try.
        if cmd.strip() == "echo ok":
            return ExecResult(0, "ok\n", "")
        # The opencode installer gets the configurable install exit code.
        if "opencode.ai/install" in cmd:
            return ExecResult(self._install_exit, "opencode 0.0.0\n", "")
        # Everything else (task setup etc.) gets the setup exit code.
        return ExecResult(self._setup_exit, "", "")

    def start_bg(self, cmd, *, envs=None, cwd=None):
        """Record a background launch and hand back a fake job handle."""
        self.bg_calls.append((cmd, envs))
        return _FakeBgJob()

    def write_text(self, path, content):
        self.written[path] = content

    def read_text(self, path):
        # Unknown paths read back as empty, mirroring a best-effort read.
        return self.written.get(path, "")

    def exists(self, path):
        return path in self.written

    def kill(self):
        self.killed = True
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class _FakeBackend:
|
| 71 |
+
def __init__(self, sandbox: _FakeSandbox) -> None:
|
| 72 |
+
self._sandbox = sandbox
|
| 73 |
+
self.create_calls = 0
|
| 74 |
+
|
| 75 |
+
def create(self, *, timeout_s=900, envs=None, metadata=None):
|
| 76 |
+
self.create_calls += 1
|
| 77 |
+
return self._sandbox
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _config(**overrides) -> OpenCodeConfig:
    """Build a minimal OpenCodeConfig, applying keyword overrides on top."""
    defaults = {
        "provider": "openai",
        "base_url": "https://api.openai.com/v1",
        "api_key": "sk-fake",
        "model": "openai/gpt-5.3-codex",
    }
    return OpenCodeConfig(**{**defaults, **overrides})
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def test_factory_bootstraps_and_starts_agent():
    """create() provisions the sandbox, installs opencode, writes config +
    instruction, and launches the agent with the LLM env injected."""
    sbx = _FakeSandbox()
    backend = _FakeBackend(sbx)
    factory = OpenCodeSessionFactory(config=_config(), sandbox_backend=backend)

    session = factory.create(task="solve fizzbuzz")

    assert backend.create_calls == 1
    assert any("opencode.ai/install" in c for c, _ in sbx.exec_calls)
    assert "/home/user/.config/opencode/opencode.json" in sbx.written
    assert sbx.written["/home/user/task/instruction.md"] == "solve fizzbuzz"
    assert len(sbx.bg_calls) == 1, "agent must be started in background"
    # OPENAI_BASE_URL must be injected into the process env
    _, envs = sbx.bg_calls[0]
    assert envs["OPENAI_BASE_URL"] == "https://api.openai.com/v1"
    assert envs["OPENAI_API_KEY"] == "sk-fake"
    assert isinstance(session, OpenCodeSession)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def test_factory_runs_task_setup_shell():
    """A task's setup_shell command is executed exactly once in the sandbox."""
    sbx = _FakeSandbox()
    factory = OpenCodeSessionFactory(
        config=_config(), sandbox_backend=_FakeBackend(sbx)
    )
    task = OpenCodeTask(instruction="x", setup_shell="pip install pytest")

    factory.create(task=task)

    setup_cmds = [c for c, _ in sbx.exec_calls if "pip install" in c]
    assert setup_cmds == ["pip install pytest"]
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def test_factory_uploads_extra_files():
    """Files listed in task.upload_files land verbatim in the sandbox."""
    sbx = _FakeSandbox()
    factory = OpenCodeSessionFactory(
        config=_config(), sandbox_backend=_FakeBackend(sbx)
    )
    task = OpenCodeTask(
        instruction="run it",
        upload_files={"/home/user/workdir/hello.py": "print('hi')"},
    )

    factory.create(task=task)

    assert sbx.written["/home/user/workdir/hello.py"] == "print('hi')"
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def test_factory_kills_sandbox_on_install_failure():
    """A failing opencode install raises AND tears the sandbox down (no leak)."""
    sbx = _FakeSandbox(install_exit=1)
    factory = OpenCodeSessionFactory(
        config=_config(), sandbox_backend=_FakeBackend(sbx)
    )

    with pytest.raises(RuntimeError, match="install failed"):
        factory.create(task="x")
    assert sbx.killed
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def test_factory_accepts_transparent_proxy_mode():
    """'transparent_proxy' is a valid factory mode and is stored as given."""
    f = OpenCodeSessionFactory(
        config=_config(),
        sandbox_backend=_FakeBackend(_FakeSandbox()),
        mode="transparent_proxy",
    )
    assert f._mode == "transparent_proxy"
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def test_factory_rejects_unknown_mode():
    """An unrecognized mode string fails fast with ValueError at construction."""
    with pytest.raises(ValueError, match="Unknown mode"):
        OpenCodeSessionFactory(
            config=_config(),
            sandbox_backend=_FakeBackend(_FakeSandbox()),
            mode="bogus",  # type: ignore[arg-type]
        )
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def test_session_initial_messages():
    """initial_messages() is a single user message carrying the instruction."""
    sbx = _FakeSandbox()
    session = OpenCodeSession(
        sandbox=sbx,
        config=_config(),
        task=OpenCodeTask(instruction="hi"),
    )
    assert session.initial_messages() == [{"role": "user", "content": "hi"}]
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def test_session_verify_without_verifier_returns_none_reward():
    """Without a user verifier, verify() is done but assigns no reward."""
    sbx = _FakeSandbox()
    session = OpenCodeSession(
        sandbox=sbx,
        config=_config(),
        task=OpenCodeTask(instruction="x"),
    )
    result = session.verify(transcript=[])
    assert result.env_reward is None
    assert result.done is True
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def test_session_verify_calls_user_verifier():
    """verify() invokes the user verifier with (sandbox, task) and passes
    its VerifyResult (reward + metrics) straight through."""
    from openenv.core.harness import VerifyResult

    sbx = _FakeSandbox()
    calls = []

    def verifier(sandbox, task):
        # Record what the session handed us so we can assert on it below.
        calls.append((sandbox.sandbox_id, task.instruction))
        return VerifyResult(env_reward=1.0, done=True, metrics={"tests": "pass"})

    session = OpenCodeSession(
        sandbox=sbx,
        config=_config(),
        task=OpenCodeTask(instruction="do"),
        verifier=verifier,
    )
    result = session.verify(transcript=[])
    assert calls == [("fake-sbx", "do")]
    assert result.env_reward == 1.0
    assert result.metrics == {"tests": "pass"}
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def test_session_close_kills_job_and_sandbox():
    """close() clears the background job handle and kills the sandbox."""
    sbx = _FakeSandbox()
    session = OpenCodeSession(
        sandbox=sbx,
        config=_config(),
        task=OpenCodeTask(instruction="x"),
    )
    session._bg_job = _FakeBgJob()
    session.close()
    assert session._bg_job is None
    assert sbx.killed
|
tests/test_inference_endpoints.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Inference probe for the three LLM endpoints opencode runs against.
|
| 8 |
+
|
| 9 |
+
For each of vLLM, OpenAI, and the HF Inference Router, fires one
|
| 10 |
+
``/v1/chat/completions`` request with ``logprobs=true`` and verifies:
|
| 11 |
+
|
| 12 |
+
1. HTTP status is 200.
|
| 13 |
+
2. The response carries either ``message.content`` or ``message.tool_calls``.
|
| 14 |
+
3. ``choices[0].logprobs.content`` is non-null with at least one entry.
|
| 15 |
+
4. The first token's ``top_logprobs`` has the requested top-k count.
|
| 16 |
+
|
| 17 |
+
Endpoints are read from the sibling ``.env`` file (``envs/opencode_env/.env``).
|
| 18 |
+
A missing config skips that endpoint instead of failing the suite.
|
| 19 |
+
|
| 20 |
+
Run as pytest::
|
| 21 |
+
|
| 22 |
+
PYTHONPATH=src:envs/opencode_env uv run pytest \\
|
| 23 |
+
envs/opencode_env/tests/test_inference_endpoints.py -v -s
|
| 24 |
+
|
| 25 |
+
Run as a standalone script (prints a summary table)::
|
| 26 |
+
|
| 27 |
+
python envs/opencode_env/tests/test_inference_endpoints.py
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
from __future__ import annotations
|
| 31 |
+
|
| 32 |
+
import os
|
| 33 |
+
import sys
|
| 34 |
+
from dataclasses import dataclass, field
|
| 35 |
+
from pathlib import Path
|
| 36 |
+
from typing import Any
|
| 37 |
+
|
| 38 |
+
import httpx
|
| 39 |
+
import pytest
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
# .env loader — no python-dotenv dep, since the package keeps deps minimal.
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _load_env_file(env_path: Path) -> None:
|
| 48 |
+
"""Populate ``os.environ`` from ``KEY=VALUE`` lines in ``env_path``.
|
| 49 |
+
|
| 50 |
+
Existing process env vars take precedence so a shell ``export`` always
|
| 51 |
+
wins over the ``.env`` file. Lines starting with ``#`` and blank lines
|
| 52 |
+
are ignored. Surrounding single/double quotes on values are stripped.
|
| 53 |
+
"""
|
| 54 |
+
if not env_path.exists():
|
| 55 |
+
return
|
| 56 |
+
for raw in env_path.read_text().splitlines():
|
| 57 |
+
line = raw.strip()
|
| 58 |
+
if not line or line.startswith("#") or "=" not in line:
|
| 59 |
+
continue
|
| 60 |
+
key, _, value = line.partition("=")
|
| 61 |
+
key = key.strip()
|
| 62 |
+
value = value.strip().strip('"').strip("'")
|
| 63 |
+
if key and key not in os.environ:
|
| 64 |
+
os.environ[key] = value
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# Resolve envs/opencode_env/.env relative to this test file and load it once
# at import time so the endpoint specs below can see the variables.
_ENV_PATH = Path(__file__).resolve().parents[1] / ".env"
_load_env_file(_ENV_PATH)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# ---------------------------------------------------------------------------
|
| 72 |
+
# Endpoint specs — one per kind of LLM endpoint we exercise.
|
| 73 |
+
# ---------------------------------------------------------------------------
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@dataclass
class EndpointSpec:
    """Describes where one endpoint's connection info lives in the env."""

    label: str
    base_url_env: str
    model_env: str
    api_key_env: str
    default_base_url: str | None = None
    default_model: str | None = None
    default_api_key: str = ""

    def resolve(self) -> "EndpointConfig | None":
        """Return a concrete EndpointConfig, or None if anything is missing.

        For each field the environment variable wins, then the spec's
        default; an empty result for any of the three means "not configured".
        """
        env = os.environ
        url = env.get(self.base_url_env) or self.default_base_url or ""
        model_name = env.get(self.model_env) or self.default_model or ""
        key = env.get(self.api_key_env) or self.default_api_key
        if url and model_name and key:
            return EndpointConfig(
                label=self.label, base_url=url, model=model_name, api_key=key
            )
        return None
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@dataclass
class EndpointConfig:
    """Fully-resolved connection info for one chat-completions endpoint."""

    label: str  # endpoint name ("vllm" / "openai" / "hf_router")
    base_url: str  # API root, with or without a trailing /v1
    model: str  # model identifier sent in the request body
    api_key: str  # bearer token for the Authorization header
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _resolve_chat_completions_url(base_url: str) -> str:
|
| 108 |
+
"""Build the fully-qualified ``/v1/chat/completions`` URL.
|
| 109 |
+
|
| 110 |
+
Mirrors :func:`opencode_env.interception._resolve_upstream_url`: if the
|
| 111 |
+
base already ends in ``/v1`` (or includes a path), only ``/chat/completions``
|
| 112 |
+
is appended; otherwise the full ``/v1/chat/completions`` path is used.
|
| 113 |
+
"""
|
| 114 |
+
base = base_url.rstrip("/")
|
| 115 |
+
if base.endswith("/v1"):
|
| 116 |
+
return f"{base}/chat/completions"
|
| 117 |
+
return f"{base}/v1/chat/completions"
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Defaults below mirror what the OpenCode harness primitive uses by default.
|
| 121 |
+
# .env values override anything specified here.
|
| 122 |
+
ENDPOINT_SPECS: list[EndpointSpec] = [
    # Local / self-hosted vLLM: no default URL, so VLLM_URL must be set
    # for this endpoint to be considered configured.
    EndpointSpec(
        label="vllm",
        base_url_env="VLLM_URL",
        model_env="VLLM_MODEL",
        api_key_env="VLLM_API_KEY",
        default_api_key="intercepted",
        default_model="Qwen/Qwen3.5-4B",
    ),
    # OpenAI platform endpoint; only the API key has no default.
    EndpointSpec(
        label="openai",
        base_url_env="OPENAI_BASE_URL",
        model_env="OPENAI_MODEL",
        api_key_env="OPENAI_API_KEY",
        default_base_url="https://api.openai.com/v1",
        default_model="gpt-4o-mini",
    ),
    # HF Inference Router; the model string carries a ``:provider`` suffix
    # (here ``:nscale``) that selects the serving backend.
    EndpointSpec(
        label="hf_router",
        base_url_env="HF_ROUTER_BASE_URL",
        model_env="HF_ROUTER_MODEL",
        api_key_env="HF_ROUTER_API_KEY",
        default_base_url="https://router.huggingface.co/v1",
        default_model="Qwen/Qwen3-4B-Instruct-2507:nscale",
    ),
]
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
# ---------------------------------------------------------------------------
|
| 151 |
+
# Probe — one HTTP round trip per call; pure function, no side effects.
|
| 152 |
+
# ---------------------------------------------------------------------------
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
@dataclass
class ProbeResult:
    """Outcome of one probe() round trip against a single endpoint."""

    label: str  # endpoint label ("vllm" / "openai" / "hf_router")
    base_url: str  # base URL the request was sent to
    model: str  # model requested
    status: int  # HTTP status; 0 when the request never completed
    ok: bool  # True only for a 200 with a parseable JSON body
    completion_text: str = ""  # choices[0].message.content, "" if absent
    has_tool_calls: bool = False  # message.tool_calls was non-empty
    has_logprobs: bool = False  # choices[0].logprobs.content was non-empty
    top_logprobs_n: int = 0  # len(top_logprobs) on the first token
    first_token: str = ""  # first generated token, when logprobs present
    first_logprob: float | None = None  # its logprob value, when provided
    latency_s: float = 0.0  # wall-clock seconds for the HTTP round trip
    error: str = ""  # failure detail (exception text or truncated body)
    raw_response: dict[str, Any] = field(default_factory=dict)  # full JSON
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def probe(
    cfg: EndpointConfig,
    *,
    top_logprobs: int = 5,
    max_tokens: int = 16,
    timeout_s: float = 90.0,
) -> ProbeResult:
    """Send one chat-completions request and return what the endpoint did.

    Never raises. Network / 4xx / 5xx errors land in ``ProbeResult.error`` so
    the caller can render a table without try/except scaffolding.
    """
    import time

    url = _resolve_chat_completions_url(cfg.base_url)
    # A deterministic, tiny prompt: temperature 0, hard token cap, and
    # logprobs requested so we can tell whether the provider honors them.
    body: dict[str, Any] = {
        "model": cfg.model,
        "messages": [{"role": "user", "content": "Reply with a single word: hi"}],
        "max_tokens": max_tokens,
        "logprobs": True,
        "top_logprobs": top_logprobs,
        "temperature": 0,
    }
    headers = {
        "Authorization": f"Bearer {cfg.api_key}",
        "Content-Type": "application/json",
    }

    start = time.time()
    try:
        r = httpx.post(url, json=body, headers=headers, timeout=timeout_s)
    except Exception as exc:  # noqa: BLE001
        # Transport-level failure (DNS, refused, timeout): status 0.
        return ProbeResult(
            label=cfg.label,
            base_url=cfg.base_url,
            model=cfg.model,
            status=0,
            ok=False,
            error=f"{type(exc).__name__}: {exc}",
            latency_s=time.time() - start,
        )
    latency = time.time() - start

    if r.status_code != 200:
        # HTTP error: keep (a truncated slice of) the body for the table.
        return ProbeResult(
            label=cfg.label,
            base_url=cfg.base_url,
            model=cfg.model,
            status=r.status_code,
            ok=False,
            error=r.text[:600],
            latency_s=latency,
        )

    try:
        data = r.json()
    except Exception as exc:  # noqa: BLE001
        return ProbeResult(
            label=cfg.label,
            base_url=cfg.base_url,
            model=cfg.model,
            status=r.status_code,
            ok=False,
            error=f"non-JSON body: {exc}",
            latency_s=latency,
        )

    # Defensive extraction: every level may be missing or null.
    choice = (data.get("choices") or [{}])[0]
    msg = choice.get("message") or {}
    completion_text = msg.get("content") or ""
    has_tool_calls = bool(msg.get("tool_calls"))
    lp = choice.get("logprobs")
    content_lp = lp.get("content") if isinstance(lp, dict) else None
    has_logprobs = bool(content_lp)

    first_token = ""
    first_logprob: float | None = None
    top_n = 0
    if has_logprobs and content_lp:
        # Inspect only the first token: enough to prove logprobs are real
        # and that the requested top-k alternatives came back.
        first = content_lp[0]
        first_token = str(first.get("token", ""))
        lp_val = first.get("logprob")
        if lp_val is not None:
            first_logprob = float(lp_val)
        top_n = len(first.get("top_logprobs") or [])

    return ProbeResult(
        label=cfg.label,
        base_url=cfg.base_url,
        model=cfg.model,
        status=r.status_code,
        ok=True,
        completion_text=completion_text,
        has_tool_calls=has_tool_calls,
        has_logprobs=has_logprobs,
        top_logprobs_n=top_n,
        first_token=first_token,
        first_logprob=first_logprob,
        latency_s=latency,
        raw_response=data,
    )
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
# ---------------------------------------------------------------------------
|
| 277 |
+
# pytest entrypoints — one parametrized test per endpoint.
|
| 278 |
+
# ---------------------------------------------------------------------------
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
@pytest.mark.parametrize(
    "spec", ENDPOINT_SPECS, ids=[s.label for s in ENDPOINT_SPECS]
)
def test_endpoint_responds(spec: EndpointSpec) -> None:
    """Endpoint accepts a chat-completions call and returns a 2xx body."""
    cfg = spec.resolve()
    if cfg is None:
        # Missing configuration is a skip, not a failure — the suite must
        # stay green on machines with only some endpoints set up.
        pytest.skip(
            f"{spec.label} not configured (set {spec.base_url_env} / "
            f"{spec.model_env} / {spec.api_key_env} in .env)"
        )

    result = probe(cfg)
    assert result.ok, f"{cfg.label}: HTTP {result.status} — {result.error}"
    # ``logprobs.content`` populated implies the model generated at least one
    # token (either visible content, a tool-call argument, or a reasoning
    # token for Qwen3-thinking variants). That is the signal we want — empty
    # completion + empty tool_calls is fine when reasoning tokens are present.
    assert result.has_logprobs or result.completion_text or result.has_tool_calls, (
        f"{cfg.label}: model produced no output at all. "
        f"Response: {str(result.raw_response)[:500]}"
    )
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
@pytest.mark.parametrize(
    "spec", ENDPOINT_SPECS, ids=[s.label for s in ENDPOINT_SPECS]
)
def test_endpoint_returns_logprobs(spec: EndpointSpec) -> None:
    """Endpoint honors ``logprobs=true`` and returns per-token logprobs.

    Failing this test means the endpoint silently drops logprobs (HF Router
    providers like Novita / Hyperbolic / Featherless behave this way) — the
    transparent proxy has nothing to capture and Mode B GRPO will train on
    empty per-token logps.
    """
    cfg = spec.resolve()
    if cfg is None:
        # Skip (not fail) endpoints the local .env does not configure.
        pytest.skip(
            f"{spec.label} not configured (set {spec.base_url_env} / "
            f"{spec.model_env} / {spec.api_key_env} in .env)"
        )

    result = probe(cfg)
    assert result.ok, f"{cfg.label}: HTTP {result.status} — {result.error}"
    assert result.has_logprobs, (
        f"{cfg.label}: endpoint returned 200 but logprobs.content is null. "
        f"This provider does not support logprobs. Pick a different provider "
        f"(together / nscale / scaleway) or run opencode in mode='black_box'."
    )
    assert result.top_logprobs_n >= 1, (
        f"{cfg.label}: top_logprobs has {result.top_logprobs_n} entries, "
        f"expected >= 1"
    )
    assert result.first_logprob is not None, (
        f"{cfg.label}: first token has no logprob value"
    )
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
# ---------------------------------------------------------------------------
|
| 340 |
+
# Standalone runner — prints a summary table.
|
| 341 |
+
# ---------------------------------------------------------------------------
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def _format_summary(results: list[ProbeResult], skipped: list[str]) -> str:
    """Render probe results as a fixed-width text table, plus skip notes."""
    rows: list[str] = []
    rows.append("-" * 96)
    rows.append(
        f"{'endpoint':<10} {'status':<7} {'logprobs':<14} {'top-n':<6} "
        f"{'first-token':<14} {'first-logp':<11} {'latency':<8} notes"
    )
    rows.append("-" * 96)
    for r in results:
        # status 0 means the request never completed (transport error).
        if r.status == 0:
            status_str = "ERR"
        else:
            status_str = str(r.status)

        if not r.ok:
            lp_str = "n/a"
        elif r.has_logprobs:
            lp_str = f"yes ({r.top_logprobs_n})"
        else:
            # 200 response whose logprobs the provider silently discarded.
            lp_str = "DROPPED"

        first_tok_str = repr(r.first_token) if r.first_token else "-"
        first_lp_str = (
            f"{r.first_logprob:+.3f}" if r.first_logprob is not None else "-"
        )
        latency_str = f"{r.latency_s:.2f}s"
        notes = ""
        if not r.ok:
            notes = r.error[:50].replace("\n", " ")
        elif not r.has_logprobs:
            notes = "silent logprob drop"

        rows.append(
            f"{r.label:<10} {status_str:<7} {lp_str:<14} "
            f"{r.top_logprobs_n:<6} {first_tok_str:<14} "
            f"{first_lp_str:<11} {latency_str:<8} {notes}"
        )
    rows.append("-" * 96)
    if skipped:
        rows.append("")
        rows.append("Skipped (not configured in .env):")
        for s in skipped:
            rows.append(f" - {s}")
    return "\n".join(rows)
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def main() -> int:
    """Probe every configured endpoint and print a summary table.

    Exit codes: 0 all configured endpoints passed, 1 at least one failed or
    lacked logprobs, 2 no endpoint was configured at all.
    """
    print(f"Loading env from {_ENV_PATH}\n")

    results: list[ProbeResult] = []
    skipped: list[str] = []
    for spec in ENDPOINT_SPECS:
        cfg = spec.resolve()
        if cfg is None:
            # Track the missing env var names so the summary can say how
            # to enable this endpoint.
            skipped.append(
                f"{spec.label} (set {spec.base_url_env} / {spec.model_env} / "
                f"{spec.api_key_env})"
            )
            continue
        print(f"-> probing {cfg.label}: {cfg.base_url} model={cfg.model}")
        r = probe(cfg)
        results.append(r)
        if not r.ok:
            print(f" HTTP {r.status}: {r.error[:200]}")
        else:
            print(
                f" HTTP {r.status} logprobs={r.has_logprobs} "
                f"top_n={r.top_logprobs_n} "
                f"content={r.completion_text!r:.60}"
            )
        print()

    print(_format_summary(results, skipped))

    if not results:
        print("\nNo endpoints configured. Fill in .env and re-run.")
        return 2
    # "bad" covers both hard failures and the silent-logprob-drop case.
    bad = [r for r in results if not r.ok or not r.has_logprobs]
    if bad:
        print(f"\n{len(bad)}/{len(results)} endpoint(s) failed or lack logprobs.")
        return 1
    print(f"\nAll {len(results)} configured endpoint(s) passed.")
    return 0
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
if __name__ == "__main__":
    # Propagate main()'s exit code (0 ok / 1 bad endpoints / 2 none set up).
    sys.exit(main())
|
tests/test_interception.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Unit tests for the interception proxy (no sandbox, no real LLM)."""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
import socket
|
| 14 |
+
from contextlib import closing
|
| 15 |
+
|
| 16 |
+
import httpx
|
| 17 |
+
import pytest
|
| 18 |
+
import uvicorn
|
| 19 |
+
from fastapi import FastAPI, Request
|
| 20 |
+
|
| 21 |
+
from opencode_env.sandbox.interception import (
|
| 22 |
+
InterceptionProxy,
|
| 23 |
+
ProxyConfig,
|
| 24 |
+
_build_turn_record,
|
| 25 |
+
_strip_logprobs,
|
| 26 |
+
read_trace,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _free_port() -> int:
|
| 31 |
+
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
|
| 32 |
+
s.bind(("127.0.0.1", 0))
|
| 33 |
+
return s.getsockname()[1]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _make_upstream_app(response_payload: dict) -> FastAPI:
    """Build a minimal OpenAI-compatible upstream fake.

    Every POST body sent to ``/v1/chat/completions`` is recorded on
    ``app.state.received``; the canned *response_payload* is always returned.
    """
    app = FastAPI()
    captured: list[dict] = []

    @app.post("/v1/chat/completions")
    async def handler(request: Request):
        captured.append(await request.json())
        return response_payload

    app.state.received = captured
    return app
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _run_upstream(app: FastAPI, port: int) -> uvicorn.Server:
    """Launch *app* with uvicorn on a daemon thread and wait for it to accept TCP.

    Polls the port for up to ~5 seconds; raises RuntimeError if the server
    never becomes reachable.
    """
    import threading
    import time

    server = uvicorn.Server(
        uvicorn.Config(app, host="127.0.0.1", port=port, log_level="warning", lifespan="on")
    )
    threading.Thread(target=server.run, daemon=True).start()

    deadline = time.time() + 5
    while time.time() < deadline:
        try:
            with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as probe:
                probe.settimeout(0.2)
                if probe.connect_ex(("127.0.0.1", port)) == 0:
                    return server
        except OSError:
            # Transient socket errors while the server boots; keep polling.
            pass
        time.sleep(0.05)
    raise RuntimeError("upstream failed to start")
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Canned OpenAI chat-completion payload with two logprob entries; shared by the
# stripping, turn-record, and proxy round-trip tests below.
_FAKE_RESPONSE = {
    "id": "chatcmpl-fake",
    "object": "chat.completion",
    "model": "test-model",
    "choices": [
        {
            "index": 0,
            "finish_reason": "stop",
            "message": {"role": "assistant", "content": "hi"},
            "logprobs": {
                "content": [
                    {"token": "h", "logprob": -0.1, "top_logprobs": []},
                    {"token": "i", "logprob": -0.2, "top_logprobs": []},
                ]
            },
        }
    ],
}
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def test_strip_logprobs_removes_only_logprobs_key():
    """Stripping drops the logprobs block but leaves the rest of the choice intact."""
    cleaned = _strip_logprobs(_FAKE_RESPONSE)
    first_choice = cleaned["choices"][0]
    assert "logprobs" not in first_choice
    assert first_choice["message"]["content"] == "hi"
    assert first_choice["finish_reason"] == "stop"
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def test_build_turn_record_extracts_logprobs():
    """Tokens, per-token logps, and finish_reason are lifted out of the response."""
    rec = _build_turn_record(
        turn_idx=1,
        request_body={"model": "test", "messages": []},
        response_json=_FAKE_RESPONSE,
        latency_s=0.25,
    )
    assert rec.completion_tokens == ["h", "i"]
    assert rec.per_token_logps == [-0.1, -0.2]
    assert rec.finish_reason == "stop"
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def test_read_trace_returns_empty_list_when_missing(tmp_path):
    """A trace path that was never written yields an empty list, not an error."""
    missing = tmp_path / "nonexistent.jsonl"
    assert read_trace(missing) == []
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def test_proxy_forwards_captures_and_strips(tmp_path):
    """End-to-end proxy round trip against a fake upstream.

    Verifies that the proxy (1) forwards the request with ``logprobs`` /
    ``top_logprobs`` injected, (2) strips logprobs from the response the
    client sees, and (3) writes a trace record with the captured per-token
    logprobs.
    """
    upstream_port = _free_port()
    proxy_port = _free_port()
    trace = tmp_path / "trace.jsonl"

    upstream_app = _make_upstream_app(_FAKE_RESPONSE)
    upstream_server = _run_upstream(upstream_app, upstream_port)
    try:
        cfg = ProxyConfig(
            upstream_url=f"http://127.0.0.1:{upstream_port}",
            upstream_api_key="test-key",
            trace_path=str(trace),
            host="127.0.0.1",
            port=proxy_port,
            top_logprobs=5,
        )

        with InterceptionProxy(cfg) as proxy:
            assert proxy.url == f"http://127.0.0.1:{proxy_port}/v1"
            # Sanity: healthz
            r = httpx.get(f"http://127.0.0.1:{proxy_port}/healthz")
            assert r.status_code == 200
            # Chat completion round trip
            req_body = {
                "model": "openai_compatible/foo",
                "messages": [{"role": "user", "content": "hi"}],
                "temperature": 0.0,
            }
            r = httpx.post(
                f"http://127.0.0.1:{proxy_port}/v1/chat/completions",
                json=req_body,
                headers={"Authorization": "Bearer whatever"},
                timeout=10,
            )
            assert r.status_code == 200
            returned = r.json()
            # logprobs stripped from what opencode sees
            assert "logprobs" not in returned["choices"][0]
            assert returned["choices"][0]["message"]["content"] == "hi"

            # Upstream got logprobs=true injected
            forwarded = upstream_app.state.received
            assert len(forwarded) == 1
            assert forwarded[0]["logprobs"] is True
            assert forwarded[0]["top_logprobs"] == 5
            # TODO(review): the fake upstream records only request bodies, so
            # the Authorization rewrite to upstream_api_key is not asserted
            # here; extend _make_upstream_app to capture headers to cover it.

            # Trace file has one line with captured logprobs
            records = read_trace(trace)
            assert len(records) == 1
            rec = records[0]
            assert rec["turn"] == 1
            assert rec["completion_tokens"] == ["h", "i"]
            assert rec["per_token_logps"] == [-0.1, -0.2]
            assert rec["finish_reason"] == "stop"
            assert rec["request"]["messages"][0]["content"] == "hi"
    finally:
        # Always signal shutdown so a failed assertion cannot leak the
        # background uvicorn server thread into later tests.
        upstream_server.should_exit = True
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def test_proxy_handles_invalid_json_body(tmp_path):
    """A non-JSON request body is rejected with HTTP 400 instead of crashing."""
    upstream_port = _free_port()
    proxy_port = _free_port()
    upstream_server = _run_upstream(_make_upstream_app(_FAKE_RESPONSE), upstream_port)
    try:
        cfg = ProxyConfig(
            upstream_url=f"http://127.0.0.1:{upstream_port}",
            trace_path=str(tmp_path / "trace.jsonl"),
            host="127.0.0.1",
            port=proxy_port,
        )
        with InterceptionProxy(cfg):
            r = httpx.post(
                f"http://127.0.0.1:{proxy_port}/v1/chat/completions",
                content=b"not json",
                headers={"Content-Type": "application/json"},
                timeout=10,
            )
            assert r.status_code == 400
    finally:
        # Always signal shutdown so a failed assertion cannot leak the
        # background uvicorn server thread into later tests.
        upstream_server.should_exit = True
|
tests/test_opencode_runtime.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
|
| 11 |
+
from opencode_env.config import OpenCodeConfig
|
| 12 |
+
from opencode_env.opencode_runtime import (
|
| 13 |
+
build_env_vars,
|
| 14 |
+
build_install_cmd,
|
| 15 |
+
build_opencode_json,
|
| 16 |
+
build_run_cmd,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _openai_cfg(**overrides) -> OpenCodeConfig:
    """Build an OpenAI-flavoured OpenCodeConfig, applying keyword overrides."""
    kwargs = {
        "provider": "openai",
        "base_url": "https://api.openai.com/v1",
        "api_key": "sk-test",
        "model": "openai/gpt-5.3-codex",
        **overrides,
    }
    return OpenCodeConfig(**kwargs)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_opencode_json_has_schema_and_provider_block():
    """The generated opencode.json declares schema, model, and provider block."""
    doc = json.loads(build_opencode_json(_openai_cfg()))
    assert doc["$schema"] == "https://opencode.ai/config.json"
    assert doc["model"] == "intercepted/gpt-5.3-codex"
    intercepted = doc["provider"]["intercepted"]
    assert intercepted["npm"] == "@ai-sdk/openai"
    opts = intercepted["options"]
    assert opts["baseURL"] == "https://api.openai.com/v1"
    assert opts["apiKey"] == "sk-test"
    assert opts["timeout"] == 600_000


def test_opencode_json_disables_tools_by_default():
    """webfetch and question tools are switched off unless configured."""
    doc = json.loads(build_opencode_json(_openai_cfg()))
    assert doc["tools"] == {"webfetch": False, "question": False}


def test_opencode_json_extra_is_deep_merged():
    """extra_opencode_json merges recursively instead of replacing subtrees."""
    extra = {"theme": "dark", "provider": {"intercepted": {"options": {"custom": 1}}}}
    doc = json.loads(build_opencode_json(_openai_cfg(extra_opencode_json=extra)))
    assert doc["theme"] == "dark"
    # Deep merge keeps the pre-existing nested keys alongside the injected one.
    opts = doc["provider"]["intercepted"]["options"]
    assert opts["baseURL"] == "https://api.openai.com/v1"
    assert opts["custom"] == 1
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def test_install_cmd_pins_version_when_not_latest():
    """An explicit version is exported as OPENCODE_VERSION before installing."""
    cmd = build_install_cmd(_openai_cfg(opencode_version="0.5.3"))
    assert "OPENCODE_VERSION=0.5.3" in cmd
    assert "curl -fsSL https://opencode.ai/install | bash" in cmd
    assert "opencode --version" in cmd
    assert "/home/user/.config/opencode" in cmd


def test_install_cmd_respects_sandbox_home():
    """Config paths follow sandbox_home rather than the default /home/user."""
    cmd = build_install_cmd(_openai_cfg(sandbox_home="/root"))
    assert "/root/.config/opencode" in cmd
    assert "/home/user" not in cmd


def test_install_cmd_omits_version_env_when_latest():
    """'latest' means no version pin appears in the install command."""
    cmd = build_install_cmd(_openai_cfg(opencode_version="latest"))
    assert "OPENCODE_VERSION" not in cmd
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_run_cmd_uses_json_format_by_default():
    """The default run command emits JSON output and tees it to the agent log."""
    cmd = build_run_cmd(_openai_cfg())
    assert "opencode run --format json" in cmd
    assert '"$(cat /home/user/task/instruction.md)"' in cmd
    assert "tee /home/user/logs/agent/opencode.jsonl" in cmd


def test_run_cmd_default_format_has_no_flag():
    """run_format='default' drops the --format flag entirely."""
    cmd = build_run_cmd(_openai_cfg(run_format="default"))
    assert "--format" not in cmd
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_env_vars_default_to_config_url():
    """Without an override, env vars point at the configured base URL and key."""
    env = build_env_vars(_openai_cfg())
    assert env["OPENAI_BASE_URL"] == "https://api.openai.com/v1"
    assert env["OPENAI_API_KEY"] == "sk-test"
    assert env["OPENCODE_CONFIG"] == "/home/user/.config/opencode/opencode.json"


def test_env_vars_respect_proxy_override():
    """base_url_override rewires OPENAI_BASE_URL; extra_env entries pass through."""
    cfg = _openai_cfg(extra_env={"EXTRA": "yes"})
    env = build_env_vars(cfg, base_url_override="http://localhost:7000/v1")
    assert env["OPENAI_BASE_URL"] == "http://localhost:7000/v1"
    assert env["EXTRA"] == "yes"
|
tests/test_sandbox_base.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Structural tests for the sandbox Protocols (no live sandbox needed)."""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from opencode_env.sandbox import (
|
| 12 |
+
ExecResult,
|
| 13 |
+
SandboxBackend,
|
| 14 |
+
SandboxHandle,
|
| 15 |
+
E2BSandboxBackend,
|
| 16 |
+
E2BSandboxHandle,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_e2b_classes_import():
    """The e2b backend classes import without an API key or any live call."""
    assert E2BSandboxBackend is not None
    assert E2BSandboxHandle is not None


def test_exec_result_dataclass():
    """ExecResult carries the exit code and captured output streams."""
    result = ExecResult(exit_code=0, stdout="ok", stderr="")
    assert result.exit_code == 0
    assert result.stdout == "ok"


def test_e2b_backend_requires_api_key(monkeypatch):
    """Constructing the backend without E2B_API_KEY raises a clear error."""
    monkeypatch.delenv("E2B_API_KEY", raising=False)
    import pytest

    with pytest.raises(RuntimeError, match="E2B_API_KEY"):
        E2BSandboxBackend()


def test_protocols_are_declared():
    """Static check: the Protocols expose the expected method surface."""
    assert hasattr(SandboxBackend, "create")
    for method_name in ("exec", "start_bg", "kill"):
        assert hasattr(SandboxHandle, method_name)
tests/test_task.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
from opencode_env.task import OpenCodeTask
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_coerce_from_string():
    """A bare string becomes the instruction with all other fields defaulted."""
    task = OpenCodeTask.coerce("write fizzbuzz")
    assert task.instruction == "write fizzbuzz"
    assert task.setup_shell is None
    assert task.upload_files == {}


def test_coerce_from_dict():
    """Dict keys map field-for-field onto the task."""
    payload = {
        "instruction": "run tests",
        "setup_shell": "pip install pytest",
        "upload_files": {"/home/user/workdir/hello.py": "print('hi')"},
        "metadata": {"task_id": "hello_001"},
    }
    task = OpenCodeTask.coerce(payload)
    assert task.instruction == "run tests"
    assert task.setup_shell == "pip install pytest"
    assert task.metadata["task_id"] == "hello_001"


def test_coerce_passes_through_existing_task():
    """An existing OpenCodeTask is returned as the same object, not copied."""
    existing = OpenCodeTask(instruction="x")
    assert OpenCodeTask.coerce(existing) is existing


def test_coerce_rejects_bad_type():
    """Unsupported input types raise TypeError."""
    with pytest.raises(TypeError):
        OpenCodeTask.coerce(42)
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|