Commit 70f2179 (verified) by AdithyaSK (HF Staff)
Parent: 59114e2

Upload folder using huggingface_hub
Dockerfile ADDED
@@ -0,0 +1,69 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+ #
+ # opencode_env Dockerfile — mirrors the standard OpenEnv multi-stage uv
+ # build used by echo_env / repl_env / jupyter_agent.
+ #
+ # Build:
+ #   docker build -t opencode-env .
+ #
+ # Run:
+ #   docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env
+
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+ FROM ${BASE_IMAGE} AS builder
+
+ WORKDIR /app
+
+ ARG BUILD_MODE=in-repo
+
+ COPY . /app/env
+ WORKDIR /app/env
+
+ RUN if ! command -v uv >/dev/null 2>&1; then \
+         curl -LsSf https://astral.sh/uv/install.sh | sh && \
+         mv /root/.local/bin/uv /usr/local/bin/uv && \
+         mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+     fi
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+         git \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then \
+         uv sync --frozen --no-install-project --no-editable; \
+     else \
+         uv sync --no-install-project --no-editable; \
+     fi
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     if [ -f uv.lock ]; then \
+         uv sync --frozen --no-editable; \
+     else \
+         uv sync --no-editable; \
+     fi
+
+ # ── runtime stage ────────────────────────────────────────────────────────────
+ FROM ${BASE_IMAGE}
+
+ WORKDIR /app
+
+ COPY --from=builder /app/env/.venv /app/.venv
+ COPY --from=builder /app/env /app/env
+
+ ENV PATH="/app/.venv/bin:$PATH"
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+     CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
+
+ EXPOSE 8000
+
+ ENV ENABLE_WEB_INTERFACE=true
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,264 @@
  ---
- title: Opencode Env
- emoji:
- colorFrom: red
  colorTo: purple
  sdk: docker
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: OpenCode Environment Server
+ emoji: 🛠️
+ colorFrom: indigo
  colorTo: purple
  sdk: docker
  pinned: false
+ app_port: 8000
+ base_path: /web
+ tags:
+   - openenv
+ short_description: OpenCode coding agent in an E2B sandbox with logprob capture
  ---

+ # OpenCode Environment for OpenEnv
+
+ `opencode_env` runs the [OpenCode](https://opencode.ai) coding agent inside
+ an isolated [E2B](https://e2b.dev) sandbox against any OpenAI-compatible
+ LLM endpoint, optionally capturing per-token logprobs for GRPO training.
+
+ The env is **task-agnostic** — every rollout is configured at call time
+ with a uniform Task shape:
+
+ - **`instruction`** — prompt for the agent
+ - **`setup`** — list of bash commands run *before* the agent (pip
+   install, git clone, file downloads — anything you need staged in the
+   sandbox)
+ - **`verify`** — list of bash commands run *after* the agent (asserts,
+   pytest invocations, score-file writes)
+
+ Reward = `passed_verify / total_verify`, unless a `verify` command writes
+ a float to `/home/user/logs/verifier/reward.txt`, in which case that value
+ overrides the ratio.
+
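+ As a concrete sketch of that rule (illustrative pseudocode, not the actual
+ server source; field names follow `CommandResult` in `models.py`):
+
+ ```python
+ def resolve_reward(verify_results, reward_file_text=None):
+     """passed/total over verify commands; an explicit reward file wins."""
+     if reward_file_text is not None:
+         try:
+             return float(reward_file_text.strip())  # override from reward.txt
+         except ValueError:
+             pass  # malformed override: fall back to the pass ratio
+     if not verify_results:
+         return None
+     passed = sum(1 for r in verify_results if r.exit_code == 0)
+     return passed / len(verify_results)
+ ```
+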
+ ## Quick Start
+
+ ### As a deployed env (HTTP / MCP)
+
+ ```python
+ import os
+ from opencode_env import OpenCodeEnv
+
+ with OpenCodeEnv(base_url="http://localhost:8000") as env:
+     env.reset()
+     result = env.run_rollout(
+         endpoint="openai",  # shorthand → server resolves
+         instruction=(
+             "Create binary_search.py exposing def binary_search(arr, target) -> int "
+             "that returns the index of target in arr, or -1 if absent. Use a "
+             "relative path."
+         ),
+         setup=[],
+         verify=[
+             "test -f /home/user/workdir/binary_search.py",
+             "python -c \"import sys; sys.path.insert(0, '/home/user/workdir'); "
+             "import binary_search; "
+             "assert binary_search.binary_search([1,2,3], 2) == 1\"",
+         ],
+         task_id="binary_search_v1",
+         template="opencode-rl",  # prebaked E2B template
+     )
+     print("reward:", result.reward)
+     print("turns:", len(result.proxy_turns))
+     print("files:", list(result.files.keys()))
+ ```
+
+ The Space-deployed variant works the same — point `base_url` at
+ `https://<user>-opencode-env.hf.space` and set the relevant secrets in
+ the Space settings.
+
+ ### As an in-process primitive
+
+ ```python
+ import os
+ from opencode_env import (
+     OpenCodeConfig, OpenCodeSessionFactory, OpenCodeTask, E2BSandboxBackend,
+ )
+
+ factory = OpenCodeSessionFactory(
+     config=OpenCodeConfig(
+         provider="openai_compatible",
+         base_url="https://api.openai.com/v1",
+         api_key=os.environ["OPENAI_API_KEY"],
+         model="gpt-4o-mini",
+     ),
+     sandbox_backend=E2BSandboxBackend(),
+     mode="transparent_proxy",  # or "black_box"
+ )
+ session = factory.create(task=OpenCodeTask(instruction="..."))
+ session.wait_for_completion()
+ turns = session.fetch_proxy_trace()  # per-turn (tokens, logprobs)
+ session.close()
+ ```
+
+ ## Building the Docker Image
+
+ The Dockerfile lives at `server/Dockerfile`. Use the `openenv` CLI from
+ the env root:
+
+ ```bash
+ cd envs/opencode_env
+
+ openenv validate                # check pyproject.toml + openenv.yaml + server/app.py + uv.lock
+ openenv build -t opencode-env   # builds the image (uses server/Dockerfile)
+
+ # run locally with E2B credentials
+ docker run -p 8000:8000 -e E2B_API_KEY=e2b_... opencode-env
+
+ # push to HF Spaces (Docker variant)
+ openenv push --repo-id <user>/opencode-env
+ ```
+
+ Or build directly without the CLI:
+
+ ```bash
+ docker build -t opencode-env -f envs/opencode_env/server/Dockerfile envs/opencode_env
+ ```
+
+ The image:
+
+ - Runs `uvicorn server.app:app --host 0.0.0.0 --port 8000`.
+ - Exposes the MCP API at `/mcp` and `/step`, the Gradio UI at `/web`,
+   health at `/health`, and OpenAPI docs at `/docs`.
+ - Reads `E2B_API_KEY` and (optionally) endpoint-specific env vars at
+   runtime (see [Environment Variables](#environment-variables)).
+
+ ## The MCP Tool: `run_rollout`
+
+ Single tool, two ways to specify the LLM endpoint:
+
+ **Option A — endpoint shorthand (recommended)**: pass
+ `endpoint="vllm"` (or `"openai"` / `"hf_router"`). The server resolves
+ `base_url`, `api_key`, and `model` from env vars plus catalog defaults.
+ Any explicit field overrides the catalog.
+
+ **Option B — fully explicit**: pass `base_url` + `api_key` + `model`
+ directly; see the sketch after the table below.
+
+ | Arg | Type | Default | Notes |
+ |---|---|---|---|
+ | `endpoint` | `str` | `""` | One of `"vllm"` / `"openai"` / `"hf_router"`. |
+ | `base_url` / `api_key` / `model` | `str` | `""` | Override or supply explicitly. |
+ | `instruction` | `str` | required | Prompt passed to `opencode run`. |
+ | `setup` | `list[str]` | `[]` | Bash commands run **before** the agent. |
+ | `verify` | `list[str]` | `[]` | Bash commands run **after** the agent. |
+ | `task_id` | `str` | `""` | Echoed back in the result. |
+ | `mode` | `str` | `"transparent_proxy"` | Or `"black_box"` (no logprobs). |
+ | `disable_thinking` | `bool \| None` | `None` (catalog default) | Inject `chat_template_kwargs.enable_thinking=false`. |
+ | `max_tokens_cap` | `int` | `4096` | Per-turn `max_tokens` clamp. |
+ | `top_logprobs` | `int` | `5` | HF Router caps at 5; OpenAI allows 0–20; vLLM is unbounded. |
+ | `agent_timeout_s` | `float` | `600.0` | Hard wall-clock budget for opencode. |
+ | `template` | `str` | `""` | E2B template name; `"opencode-rl"` skips ~2 min of install per rollout. |
+
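+ A fully explicit (Option B) call is the Quick Start call with the endpoint
+ fields spelled out (a sketch, assuming the same client and env vars):
+
+ ```python
+ result = env.run_rollout(
+     base_url="https://api.openai.com/v1",  # explicit OpenAI-compatible endpoint
+     api_key=os.environ["OPENAI_API_KEY"],
+     model="gpt-4o-mini",
+     instruction="Create fizzbuzz.py that prints FizzBuzz for 1..100.",
+     verify=["test -f /home/user/workdir/fizzbuzz.py"],
+     mode="black_box",  # quick smoke test: skip the logprob proxy
+ )
+ ```
+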
+ Returns `RolloutResult` JSON with: `reward`, `setup_results[]`,
+ `verify_results[]`, `proxy_turns[]`, `files{}`, `agent_log_tail`,
+ `proxy_log_tail`, `wall_s`, `agent_exit_code`, `sandbox_id`, `error`.
+
+ ## Two Operating Modes
+
+ | Mode | What it does | Best for |
+ |---|---|---|
+ | **`transparent_proxy`** (default) | In-sandbox proxy at `localhost:7000` forwards opencode's LLM calls to `base_url`, injects `logprobs=true`, captures per-turn `(messages, completion_tokens, logprobs)` to `proxy_trace.jsonl`. | GRPO / RL training, observability, top-k distillation. |
+ | **`black_box`** | No proxy. opencode talks straight to `base_url`. | Smoke tests, eval, SFT data collection. |
+
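+ The captured turns surface in `RolloutResult.proxy_turns`. A hypothetical
+ consumption sketch (field names per `RolloutTurn` in `models.py`):
+
+ ```python
+ for t in result.proxy_turns:
+     # one forwarded /v1/chat/completions call == one turn
+     seq_logp = sum(t.per_token_logps)  # sequence log-likelihood for GRPO
+     print(t.turn, t.finish_reason, len(t.completion_tokens), seq_logp)
+ ```
+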
+ ## Environment Variables
+
+ The server reads these at runtime. Local dev auto-loads them from a
+ sibling `.env` file; on HF Spaces, set them as **Space secrets**.
+
+ | Variable | Required | Purpose |
+ |---|---|---|
+ | `E2B_API_KEY` | **yes**, for any rollout | E2B sandbox credentials. |
+ | `MAX_CONCURRENT_ENVS` | no | Env-instance pool size. Default `4`. |
+ | `ENABLE_WEB_INTERFACE` | no | Set `false` to disable the `/web` Gradio mount. Default `true`. |
+ | **vLLM endpoint** | | |
+ | `VLLM_URL` | required for `endpoint="vllm"` | OpenAI-compatible base URL. |
+ | `VLLM_API_KEY` | no | Defaults to `intercepted`. |
+ | `VLLM_MODEL` | no | Defaults to `Qwen/Qwen3.5-4B`. |
+ | **OpenAI endpoint** | | |
+ | `OPENAI_API_KEY` | required for `endpoint="openai"` | Standard OpenAI key. |
+ | `OPENAI_BASE_URL` | no | Defaults to `https://api.openai.com/v1`. |
+ | `OPENAI_MODEL` | no | Defaults to `gpt-4o-mini` (gpt-5.x and o-series refuse logprobs). |
+ | **HF Router endpoint** | | |
+ | `HF_ROUTER_API_KEY` | required for `endpoint="hf_router"` | HF user token. |
+ | `HF_ROUTER_BASE_URL` | no | Defaults to `https://router.huggingface.co/v1`. |
+ | `HF_ROUTER_MODEL` | no | Defaults to `Qwen/Qwen3-4B-Instruct-2507:nscale`. |
+
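+ A minimal local `.env` sketch (all values are placeholders):
+
+ ```bash
+ E2B_API_KEY=e2b_...
+ OPENAI_API_KEY=sk-...
+ VLLM_URL=http://localhost:8001/v1   # hypothetical self-hosted vLLM
+ ```
+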
+ Pick `provider:` suffixes that actually return logprobs:
+ **Together / Nscale / Scaleway / SambaNova / Cerebras**. Avoid Novita /
+ Hyperbolic / Featherless (they silently drop logprobs) and Groq (HTTP 400).
+
+ ## Pre-baked E2B Template
+
+ The first rollout in a fresh E2B sandbox spends ~2 min installing
+ opencode and the proxy's Python deps. Build a one-time template that
+ ships those pre-installed:
+
+ ```bash
+ .venv/bin/python envs/opencode_env/sandbox/build_template.py
+ # → builds the `opencode-rl` template in your E2B account (~1m20s, one-time)
+ ```
+
+ After this, pass `template="opencode-rl"` on every `run_rollout` call —
+ each rollout drops to ~20–30s end-to-end.
+
+ ## Tests
+
+ A cheap pre-flight (no E2B, no opencode — it just hits each LLM endpoint
+ once with a tiny request to confirm it returns logprobs):
+
+ ```bash
+ .venv/bin/python envs/opencode_env/tests/test_inference_endpoints.py
+ ```
+
+ Multi-endpoint end-to-end (spawns one E2B sandbox per endpoint, runs
+ opencode on a sorting task, prints a comparison table):
+
+ ```bash
+ .venv/bin/python envs/opencode_env/tests/test_five_sorts_e2e.py \
+     --endpoint all --template opencode-rl
+ ```
+
+ ## Project Structure
+
+ ```
+ opencode_env/
+ ├── README.md                   # this file
+ ├── openenv.yaml                # OpenEnv space spec
+ ├── pyproject.toml              # deps + ``server`` entrypoint
+ ├── uv.lock                     # frozen deps (required for openenv validate)
+ ├── .gitignore / .dockerignore  # excludes .env / __pycache__ / artifacts
+ ├── __init__.py                 # re-exports primitive + client + models
+ │
+ ├── client.py                   # OpenCodeEnv(MCPToolClient)
+ ├── models.py                   # RolloutResult / RolloutTurn / OpenCodeState
+ │
+ ├── config.py                   # OpenCodeConfig (primitive)
+ ├── harness.py                  # OpenCodeSession / OpenCodeSessionFactory (CLI-only)
+ ├── opencode_runtime.py         # opencode.json builder + cmds
+ ├── task.py                     # OpenCodeTask
+ │
+ ├── server/
+ │   ├── __init__.py
+ │   ├── app.py                  # FastAPI factory; mounts Gradio at /web
+ │   ├── opencode_environment.py # MCPEnvironment with single ``run_rollout`` tool
+ │   ├── gradio_ui.py            # the /web Gradio Blocks UI
+ │   ├── catalog.py              # endpoint shorthand resolver
+ │   └── Dockerfile              # multi-stage uv build (used by ``openenv build``)
+ │
+ ├── sandbox/
+ │   ├── __init__.py
+ │   ├── base.py                 # SandboxBackend / SandboxHandle Protocols
+ │   ├── e2b.py                  # E2B implementation
+ │   ├── interception.py         # in-sandbox FastAPI proxy (logprob capture)
+ │   └── build_template.py       # one-time E2B template builder
+ │
+ └── tests/                      # pre-flight + e2e + unit tests
+ ```
+
+ ## References
+
+ - [OpenEnv docs](https://meta-pytorch.org/OpenEnv/)
+ - [OpenCode CLI](https://opencode.ai/docs/cli/)
+ - [E2B Python SDK](https://e2b.dev/docs)
+ - [HF Inference Providers logprob matrix](../../../DOCS/HF/hf_inference_providers_logprobs.md)
__init__.py ADDED
@@ -0,0 +1,56 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """OpenCode environment for OpenEnv.
+
+ Two layers in this package:
+
+ 1. **Harness primitive** — :class:`OpenCodeSessionFactory` /
+    :class:`OpenCodeSession` / :class:`OpenCodeConfig` /
+    :class:`E2BSandboxBackend`. Used in-process to drive one rollout
+    inside an E2B sandbox. See ``harness.py``.
+
+ 2. **Deployable env** — :class:`OpenCodeEnv` (MCP client) talks to the
+    FastAPI server at ``server/app.py`` over HTTP. Use this when the
+    sandbox + agent live behind an HTTP boundary (e.g. an HF Space).
+    See ``client.py`` and ``server/``.
+ """
+
+ from openenv.core.env_server.mcp_types import CallToolAction, ListToolsAction
+
+ from .client import OpenCodeEnv
+ from .config import OpenCodeConfig, Provider
+ from .harness import OpenCodeSession, OpenCodeSessionFactory
+ from .models import (
+     CommandResult,
+     OpenCodeState,
+     RolloutResult,
+     RolloutTurn,
+ )
+ from .sandbox import E2BSandboxBackend, SandboxBackend, SandboxHandle
+ from .task import OpenCodeTask
+
+ __all__ = [
+     # Deployed-env client
+     "OpenCodeEnv",
+     "CallToolAction",
+     "ListToolsAction",
+     # HTTP API models
+     "CommandResult",
+     "OpenCodeState",
+     "RolloutResult",
+     "RolloutTurn",
+     # Harness primitive
+     "OpenCodeConfig",
+     "OpenCodeSession",
+     "OpenCodeSessionFactory",
+     "OpenCodeTask",
+     "Provider",
+     # Sandbox backend
+     "E2BSandboxBackend",
+     "SandboxBackend",
+     "SandboxHandle",
+ ]
client.py ADDED
@@ -0,0 +1,168 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Client for the deployed opencode_env server.
+
+ The server exposes a single MCP tool ``run_rollout`` that runs one OpenCode
+ rollout in an E2B sandbox and returns a JSON-serialized :class:`RolloutResult`.
+
+ Example::
+
+     import os
+
+     from opencode_env import OpenCodeEnv
+
+     with OpenCodeEnv(base_url="https://adithya-sk-opencode-env.hf.space") as env:
+         env.reset()
+         result = env.run_rollout(
+             base_url="https://api.openai.com/v1",
+             api_key=os.environ["OPENAI_API_KEY"],
+             model="gpt-4o-mini",
+             instruction="Create binary_search.py exposing def binary_search(arr, target) -> int...",
+             setup=[],
+             verify=["python /home/user/test.py"],
+             task_id="binary_search_v1",
+         )
+         print(result.reward, len(result.proxy_turns))
+ """
+
+ from __future__ import annotations
+
+ import json
+ from typing import Any
+
+ from openenv.core.mcp_client import MCPToolClient
+
+ try:
+     from .models import RolloutResult
+ except ImportError:  # pragma: no cover
+     from models import RolloutResult  # type: ignore
+
+
+ class OpenCodeEnv(MCPToolClient):
+     """Typed client for the opencode_env MCP server.
+
+     Inherits ``reset`` / ``call_tool`` / ``list_tools`` / ``from_docker_image``
+     / context-manager semantics from :class:`MCPToolClient`.
+     """
+
+     def run_rollout(
+         self,
+         *,
+         # Endpoint — pass either the shorthand selector OR explicit fields.
+         endpoint: str = "",  # "vllm" | "openai" | "hf_router"
+         base_url: str = "",
+         api_key: str = "",
+         model: str = "",
+         # Task — the "list of bash commands" shape
+         instruction: str,
+         setup: list[str] | None = None,
+         verify: list[str] | None = None,
+         # Bookkeeping / tunables
+         task_id: str = "",
+         mode: str = "transparent_proxy",
+         disable_thinking: bool | None = None,
+         max_tokens_cap: int = 4096,
+         top_logprobs: int = 5,
+         agent_timeout_s: float = 600.0,
+         template: str = "",
+     ) -> RolloutResult:
+         """Run one OpenCode rollout and return the typed result.
+
+         Args:
+             base_url: OpenAI-compatible LLM endpoint (with trailing /v1).
+             api_key: Bearer token for the LLM. Use ``"intercepted"`` for vLLM
+                 if it doesn't enforce auth.
+             model: Model id understood by the LLM endpoint
+                 (e.g. ``"gpt-4o-mini"``, ``"Qwen/Qwen3.5-4B"``,
+                 ``"Qwen/Qwen3-4B-Instruct-2507:nscale"``).
+             instruction: Prompt passed to ``opencode run``.
+             setup: Bash commands run sequentially **before** the agent starts.
+                 Each command runs in the sandbox; a non-zero exit aborts setup.
+             verify: Bash commands run sequentially **after** the agent exits.
+                 Reward = ``passed_count / total`` unless any command writes a
+                 float to ``/home/user/logs/verifier/reward.txt`` (override).
+             task_id: Echoed back in the result for traceability.
+             mode: ``"transparent_proxy"`` (captures per-token logprobs via
+                 an in-sandbox FastAPI proxy) or ``"black_box"`` (no proxy).
+             disable_thinking: Inject
+                 ``chat_template_kwargs.enable_thinking=false`` on forwarded
+                 requests. Needed for Qwen3.5 on vLLM; harmless on Instruct
+                 variants; rejected by OpenAI direct.
+             max_tokens_cap: Clamp on per-turn ``max_tokens``. OpenCode asks
+                 for ~32k by default; gpt-4o-mini caps at 16k.
+             top_logprobs: Top-k logprobs requested upstream. HF Router caps
+                 at 5; OpenAI accepts up to 20; vLLM is unbounded.
+             agent_timeout_s: Hard wall-clock budget for one ``opencode run``.
+             template: E2B template name (e.g. ``"opencode-rl"``). Empty
+                 string uses the default (slow) base image.
+
+         Returns:
+             A :class:`RolloutResult` with reward, per-turn logprobs, file
+             outputs, setup/verify results, and diagnostic tails.
+         """
+         raw = self.call_tool(
+             "run_rollout",
+             endpoint=endpoint,
+             base_url=base_url,
+             api_key=api_key,
+             model=model,
+             instruction=instruction,
+             setup=list(setup or []),
+             verify=list(verify or []),
+             task_id=task_id,
+             mode=mode,
+             disable_thinking=disable_thinking,
+             max_tokens_cap=max_tokens_cap,
+             top_logprobs=top_logprobs,
+             agent_timeout_s=agent_timeout_s,
+             template=template,
+         )
+         return RolloutResult.model_validate_json(_extract_text(raw))
+
+
+ def _extract_text(result: Any) -> str:
+     """Pull the JSON text out of whatever shape the MCP layer returns.
+
+     Handles the three shapes :meth:`MCPToolClient.call_tool` may surface:
+     a raw string, a ``CallToolObservation``-like object with
+     ``.result.content[0].text``, or a dict with ``content[0]["text"]``.
+     """
+     if isinstance(result, str):
+         return result
+
+     inner = getattr(result, "result", None)
+     if inner is not None:
+         content = getattr(inner, "content", None)
+         if content:
+             first = content[0]
+             text = getattr(first, "text", None)
+             if isinstance(text, str):
+                 return text
+             if isinstance(first, dict) and "text" in first:
+                 return first["text"]
+
+     if isinstance(result, dict):
+         content = result.get("content")
+         if isinstance(content, list) and content:
+             first = content[0]
+             if isinstance(first, dict) and "text" in first:
+                 return first["text"]
+         nested = result.get("result")
+         if isinstance(nested, dict):
+             content = nested.get("content")
+             if isinstance(content, list) and content:
+                 first = content[0]
+                 if isinstance(first, dict) and "text" in first:
+                     return first["text"]
+         return json.dumps(result, default=str)
+
+     content = getattr(result, "content", None)
+     if content:
+         first = content[0]
+         text = getattr(first, "text", None)
+         if isinstance(text, str):
+             return text
+
+     return str(result)
config.py ADDED
@@ -0,0 +1,79 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Configuration model for the OpenCode harness primitive."""
+
+ from __future__ import annotations
+
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, Field
+
+
+ Provider = Literal["openai_compatible", "openai", "anthropic"]
+
+
+ class OpenCodeConfig(BaseModel):
+     """All configuration required to launch one OpenCode rollout in a sandbox.
+
+     Field names are provider-agnostic. The primitive maps ``provider`` onto the
+     correct ``opencode.json`` provider block (``@ai-sdk/openai-compatible``,
+     ``@ai-sdk/openai``, or ``@ai-sdk/anthropic``) and injects ``base_url`` /
+     ``api_key`` into it.
+     """
+
+     # --- LLM endpoint ---------------------------------------------------------
+     provider: Provider = "openai_compatible"
+     base_url: str
+     api_key: str = "intercepted"
+     model: str = "intercepted/model"
+     request_timeout_ms: int = 600_000
+
+     # --- OpenCode CLI ---------------------------------------------------------
+     opencode_version: str = "latest"
+     disabled_tools: list[str] = Field(
+         default_factory=lambda: ["webfetch", "question"]
+     )
+     enabled_tools: list[str] | None = None
+     system_prompt: str | None = None
+     extra_opencode_json: dict[str, Any] = Field(default_factory=dict)
+
+     # --- CLI invocation -------------------------------------------------------
+     run_format: Literal["default", "json"] = "json"
+     agent_timeout_s: float = 900.0
+     extra_env: dict[str, str] = Field(default_factory=dict)
+     extra_setup_shell: str | None = None
+
+     # --- Sandbox paths --------------------------------------------------------
+     # Root directory inside the sandbox where the primitive writes config,
+     # task files, and logs. E2B's default user is ``user`` with home
+     # ``/home/user``. Override when using a root-privileged backend (Docker).
+     sandbox_home: str = "/home/user"
+
+     # --- Transparent-proxy tuning --------------------------------------------
+     # Cap ``max_tokens`` / ``max_completion_tokens`` on forwarded requests.
+     # OpenCode defaults to a very large number (~32000) which exceeds some
+     # provider limits (e.g. gpt-4o-mini = 16384). Only used in
+     # ``mode="transparent_proxy"``. ``None`` disables the cap.
+     proxy_max_tokens_cap: int | None = 16384
+     # Per-turn top-k logprobs the proxy requests from the upstream.
+     proxy_top_logprobs: int = 5
+     # Disable reasoning/thinking mode for Qwen3 / Qwen3.5 models. Proxy sets
+     # ``extra_body.chat_template_kwargs.enable_thinking=false`` on forwarded
+     # requests. Ignored by providers that don't support the field.
+     proxy_disable_thinking: bool = False
+
+
+ _PROVIDER_NPM = {
+     "openai_compatible": "@ai-sdk/openai-compatible",
+     "openai": "@ai-sdk/openai",
+     "anthropic": "@ai-sdk/anthropic",
+ }
+
+
+ def provider_npm_package(provider: Provider) -> str:
+     """Return the AI SDK npm package opencode should use for a provider."""
+     return _PROVIDER_NPM[provider]
harness.py ADDED
@@ -0,0 +1,525 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """OpenCode session factory + session implementation.
+
+ Implements the :class:`ResourceSessionFactory` / :class:`ResourceSession`
+ contracts from ``openenv.core.harness`` (PR #471). The session wraps one
+ sandbox running the ``opencode`` CLI agent.
+
+ Two operating modes:
+
+ - ``mode="black_box"`` (the factory default) — opencode talks directly to
+   ``config.base_url``. No proxy, no logprob capture. Use for smoke tests /
+   SFT / eval.
+ - ``mode="transparent_proxy"`` — an in-sandbox FastAPI proxy
+   sits between opencode and the upstream LLM. It injects ``logprobs=true``
+   on every request and writes per-turn ``(messages, completion_tokens,
+   per_token_logps)`` to ``proxy_trace.jsonl`` for GRPO consumption.
+
+ Single driver path: opencode is started as a background subprocess via
+ ``opencode run --format json --dangerously-skip-permissions ...`` and we
+ poll its exit code. The previous ``opencode serve`` driver was removed —
+ the opencode CLI is the only path now.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Any, Callable, Literal
+
+ from openenv.core.env_server.mcp_types import Tool
+ from openenv.core.harness import (
+     Message,
+     ResourceSession,
+     ResourceSessionFactory,
+     ToolResult,
+     VerifyResult,
+ )
+
+ from .config import OpenCodeConfig
+ from .opencode_runtime import (
+     agent_log_path,
+     build_env_vars,
+     build_install_cmd,
+     build_opencode_json,
+     build_run_cmd,
+     instruction_path,
+     opencode_config_path,
+     system_prompt_path,
+ )
+ from .sandbox.base import BgJob, SandboxBackend, SandboxHandle
+ from .task import OpenCodeTask
+
+
+ # Inside-sandbox proxy paths (Mode B).
+ _PROXY_PORT = 7000
+ _PROXY_TRACE_PATH = "/home/user/logs/agent/proxy_trace.jsonl"
+ _PROXY_LOG_PATH = "/home/user/logs/agent/proxy.log"
+
+ # Where the proxy source lives on disk (in this repo). Uploaded into the
+ # sandbox at /home/user/proxy/interception.py before each rollout, unless
+ # the sandbox was created from a template that already has it baked in.
+ _PROXY_SOURCE_PATH = Path(__file__).parent / "sandbox" / "interception.py"
+
+
+ Verifier = Callable[[SandboxHandle, OpenCodeTask], VerifyResult]
+
+
+ class OpenCodeSession(ResourceSession):
+     """One live OpenCode rollout inside a sandbox.
+
+     The session is created already-running: :meth:`OpenCodeSessionFactory.create`
+     calls :meth:`start_agent` before returning. Typical usage::
+
+         session = factory.create(task)
+         session.wait_for_completion()
+         result = session.verify([])
+         session.close()
+     """
+
+     def __init__(
+         self,
+         *,
+         sandbox: SandboxHandle,
+         config: OpenCodeConfig,
+         task: OpenCodeTask,
+         verifier: Verifier | None = None,
+         base_url_override: str | None = None,
+         proxy_trace_path: str | None = None,
+         proxy_bg_job: BgJob | None = None,
+     ) -> None:
+         self.sandbox = sandbox
+         self.config = config
+         self.task = task
+         self._verifier = verifier
+         self._base_url_override = base_url_override
+         self._bg_job: BgJob | None = None
+         self._proxy_trace_path = proxy_trace_path
+         self._proxy_bg_job = proxy_bg_job
+
+     # ------------------------------------------------------------------
+     # ResourceSession contract (PR #471)
+     # ------------------------------------------------------------------
+     def initial_messages(self) -> list[Message]:
+         return [{"role": "user", "content": self.task.instruction}]
+
+     def list_tools(self) -> list[Tool]:
+         # OpenCode owns its own tool loop — none are exposed to the harness.
+         return []
+
+     def call_tool(self, name: str, arguments: dict[str, Any]) -> ToolResult:
+         return ToolResult(
+             error=(
+                 "OpenCodeSession does not expose external tool calls; the "
+                 "CLI agent owns its own tool loop."
+             )
+         )
+
+     def verify(
+         self,
+         transcript: list[Message],
+         final_state: Any | None = None,
+     ) -> VerifyResult:
+         if self._verifier is None:
+             return VerifyResult(env_reward=None, done=True)
+         return self._verifier(self.sandbox, self.task)
+
+     def close(self) -> None:
+         if self._bg_job is not None:
+             try:
+                 self._bg_job.kill()
+             except Exception:
+                 pass
+             self._bg_job = None
+         if self._proxy_bg_job is not None:
+             try:
+                 self._proxy_bg_job.kill()
+             except Exception:
+                 pass
+             self._proxy_bg_job = None
+         self.sandbox.kill()
+
+     # ------------------------------------------------------------------
+     # OpenCode-specific session API
+     # ------------------------------------------------------------------
+     def start_agent(self) -> None:
+         """Launch ``opencode run`` as a background subprocess in the sandbox."""
+         if self._bg_job is not None:
+             return
+         cmd = build_run_cmd(self.config)
+         envs = build_env_vars(self.config, base_url_override=self._base_url_override)
+         self._bg_job = self.sandbox.start_bg(cmd, envs=envs)
+
+     def wait_for_completion(self, timeout_s: float | None = None) -> int:
+         """Block until the agent exits, returning its exit code."""
+         budget = timeout_s if timeout_s is not None else self.config.agent_timeout_s
+         if self._bg_job is None:
+             raise RuntimeError("Agent not started; call start_agent() first.")
+         return self._bg_job.wait(timeout=budget)
+
+     def fetch_trace(self) -> str:
+         """Return the raw ``opencode run`` log (JSON lines when ``run_format="json"``)."""
+         return self.sandbox.read_text(agent_log_path(self.config))
+
+     def fetch_proxy_trace(self) -> list[dict[str, Any]]:
+         """Return per-turn proxy-captured records (Mode B only).
+
+         Each entry has ``request``, ``response``, ``completion_tokens``,
+         ``completion_token_ids``, ``per_token_logps``, ``finish_reason``,
+         and ``latency_s``. Returns ``[]`` in Mode A.
+         """
+         if self._proxy_trace_path is None:
+             return []
+         try:
+             content = self.sandbox.read_text(self._proxy_trace_path)
+         except Exception:
+             return []
+         import json as _json
+
+         records: list[dict[str, Any]] = []
+         for line in content.splitlines():
+             line = line.strip()
+             if not line:
+                 continue
+             records.append(_json.loads(line))
+         return records
+
+
+ class OpenCodeSessionFactory(ResourceSessionFactory):
+     """Produce isolated per-rollout :class:`OpenCodeSession` instances.
+
+     The factory owns sandbox provisioning, the opencode install, config
+     injection, and (Mode B) proxy startup. Each :meth:`create` call returns
+     a fresh sandbox with a running agent.
+     """
+
+     def __init__(
+         self,
+         *,
+         config: OpenCodeConfig,
+         sandbox_backend: SandboxBackend,
+         mode: Literal["black_box", "transparent_proxy"] = "black_box",
+         verifier: Verifier | None = None,
+         install_timeout_s: int = 240,
+         setup_timeout_s: int = 300,
+     ) -> None:
+         if mode not in {"black_box", "transparent_proxy"}:
+             raise ValueError(f"Unknown mode: {mode!r}")
+         self._config = config
+         self._backend = sandbox_backend
+         self._mode = mode
+         self._verifier = verifier
+         self._install_timeout_s = install_timeout_s
+         self._setup_timeout_s = setup_timeout_s
+
+     def create(
+         self,
+         task: Any,
+         seed: int | None = None,
+         episode_id: str | None = None,
+     ) -> OpenCodeSession:
+         import logging
+
+         _log = logging.getLogger(__name__)
+
+         oc_task = OpenCodeTask.coerce(task)
+         sandbox_timeout = int(self._config.agent_timeout_s) + 300
+
+         _log.info(
+             "factory.create: creating sandbox timeout=%ds mode=%s",
+             sandbox_timeout, self._mode,
+         )
+         sandbox = self._backend.create(
+             timeout_s=sandbox_timeout,
+             metadata={"episode_id": episode_id} if episode_id else None,
+         )
+         sid = (
+             getattr(sandbox, "sandbox_id", None)
+             or getattr(getattr(sandbox, "raw", None), "sandbox_id", "?")
+         )
+         _log.info("factory.create: sandbox=%s — bootstrapping…", sid)
+         try:
+             self._bootstrap_sandbox(sandbox, oc_task)
+         except Exception as exc:
+             _log.error("factory.create: bootstrap failed: %r", exc)
+             sandbox.kill()
+             raise
+
+         base_url_override: str | None = None
+         proxy_trace_path: str | None = None
+         proxy_bg_job: BgJob | None = None
+         if self._mode == "transparent_proxy":
+             _log.info(
+                 "factory.create: starting interception proxy on :%d → %s",
+                 _PROXY_PORT, self._config.base_url,
+             )
+             proxy_bg_job, base_url_override, proxy_trace_path = self._start_proxy(
+                 sandbox
+             )
+             _log.info("factory.create: proxy up at %s", base_url_override)
+             # Rewrite opencode.json so opencode points at the proxy. Force
+             # ``openai_compatible`` so opencode hits ``/v1/chat/completions``
+             # (which the proxy serves) rather than provider-specific paths.
+             from .config import OpenCodeConfig as _OCC
+
+             proxy_cfg = _OCC(
+                 **{
+                     **self._config.model_dump(),
+                     "provider": "openai_compatible",
+                     "base_url": base_url_override,
+                 }
+             )
+             sandbox.write_text(
+                 opencode_config_path(self._config),
+                 build_opencode_json(proxy_cfg),
+             )
+
+         session = OpenCodeSession(
+             sandbox=sandbox,
+             config=self._config,
+             task=oc_task,
+             verifier=self._verifier,
+             base_url_override=base_url_override,
+             proxy_trace_path=proxy_trace_path,
+             proxy_bg_job=proxy_bg_job,
+         )
+         session.start_agent()
+         return session
+
+     # ------------------------------------------------------------------
+     def _wait_for_sandbox_ready(
+         self,
+         sandbox: SandboxHandle,
+         *,
+         attempts: int = 15,
+         delay_s: float = 1.0,
+     ) -> None:
+         """Probe the sandbox until ``echo ok`` succeeds.
+
+         E2B (and other backends) sometimes return the handle before the
+         guest is fully ready. Issue ``echo ok`` with short timeouts until
+         it succeeds. Returns silently on success; raises ``RuntimeError``
+         on prolonged failure.
+         """
+         import time
+
+         last_err = ""
+         for _ in range(attempts):
+             try:
+                 r = sandbox.exec("echo ok", timeout=5)
+                 if r.exit_code == 0 and "ok" in (r.stdout or ""):
+                     return
+                 last_err = (r.stderr or r.stdout or "").strip() or f"exit={r.exit_code}"
+             except Exception as exc:  # noqa: BLE001
+                 last_err = f"{type(exc).__name__}: {exc}"
+             time.sleep(delay_s)
+         raise RuntimeError(
+             f"sandbox did not become ready within {attempts * delay_s:.0f}s "
+             f"(last error: {last_err})"
+         )
+
+     def _exec_with_retry(
+         self,
+         sandbox: SandboxHandle,
+         cmd: str,
+         *,
+         timeout: float,
+         attempts: int = 3,
+         backoff_s: float = 3.0,
+         label: str = "cmd",
+     ):
+         """Run ``sandbox.exec`` with exponential backoff on transient failure.
+
+         Transient = ``exit_code != 0`` AND empty stderr (SIGKILL / network
+         blip signature) OR an exception during exec. Final failure is raised
+         as ``RuntimeError`` carrying the last exit code + stderr.
+         """
+         import time
+
+         last_stdout = ""
+         last_stderr = ""
+         last_exit = 0
+         for i in range(attempts):
+             try:
+                 r = sandbox.exec(cmd, timeout=timeout)
+                 if r.exit_code == 0:
+                     return r
+                 last_stdout = r.stdout or ""
+                 last_stderr = r.stderr or ""
+                 last_exit = r.exit_code
+                 if last_stderr.strip():
+                     break
+             except Exception as exc:  # noqa: BLE001
+                 last_stderr = f"{type(exc).__name__}: {exc}"
+                 last_exit = -1
+             if i + 1 < attempts:
+                 time.sleep(backoff_s * (2**i))
+         raise RuntimeError(
+             f"{label} failed after {attempts} attempts "
+             f"(exit={last_exit}, stderr={last_stderr!r}, stdout_tail={last_stdout[-400:]!r})"
+         )
+
+     def _opencode_already_installed(self, sandbox: SandboxHandle) -> bool:
+         """Cheap probe — returns True if opencode is on disk in the sandbox.
+
+         Used to skip the slow ``curl install`` step when running against a
+         prebaked template that already ships opencode.
+         """
+         try:
+             r = sandbox.exec(
+                 "/home/user/.opencode/bin/opencode --version",
+                 timeout=10,
+             )
+             return r.exit_code == 0
+         except Exception:
+             return False
+
+     def _bootstrap_sandbox(
+         self,
+         sandbox: SandboxHandle,
+         task: OpenCodeTask,
+     ) -> None:
+         """Install opencode, write config + task files, run optional setup."""
+
+         # Stage 1: wait for the sandbox to be responsive.
+         self._wait_for_sandbox_ready(sandbox)
+
+         # Stage 2: install opencode (skipped if a prebaked template already
+         # has it). curl|bash is flaky — retry with backoff.
+         if not self._opencode_already_installed(sandbox):
+             self._exec_with_retry(
+                 sandbox,
+                 build_install_cmd(self._config),
+                 timeout=self._install_timeout_s,
+                 attempts=3,
+                 backoff_s=3.0,
+                 label="opencode install",
+             )
+
+         sandbox.write_text(
+             opencode_config_path(self._config),
+             build_opencode_json(self._config),
+         )
+         sandbox.write_text(instruction_path(self._config), task.instruction)
+
+         if self._config.system_prompt:
+             sandbox.write_text(
+                 system_prompt_path(self._config),
+                 self._config.system_prompt,
+             )
+
+         for remote_path, content in task.upload_files.items():
+             sandbox.write_text(remote_path, content)
+
+         if self._config.extra_setup_shell:
+             self._exec_with_retry(
+                 sandbox,
+                 self._config.extra_setup_shell,
+                 timeout=self._setup_timeout_s,
+                 attempts=2,
+                 backoff_s=2.0,
+                 label="extra_setup_shell",
+             )
+
+         if task.setup_shell:
+             r = sandbox.exec(task.setup_shell, timeout=self._setup_timeout_s)
+             if r.exit_code != 0:
+                 raise RuntimeError(
+                     f"task.setup_shell failed ({r.exit_code}): {r.stderr}"
+                 )
+
+     def _start_proxy(
+         self,
+         sandbox: SandboxHandle,
+     ) -> tuple[BgJob, str, str]:
+         """Install proxy deps + start the proxy as a bg job inside the sandbox.
+
+         Returns ``(proxy_bg_job, base_url_override, proxy_trace_path)``.
+         Skips the pip install + source-upload steps when the prebaked
+         template already has them in place.
+         """
+         proxy_already_present = sandbox.exists(
+             "/home/user/proxy/interception.py"
+         )
+
+         if not proxy_already_present:
+             # Install proxy deps (idempotent on retries).
+             self._exec_with_retry(
+                 sandbox,
+                 "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' "
+                 "'httpx>=0.27' 2>&1 | tail -20",
+                 timeout=180,
+                 attempts=3,
+                 backoff_s=2.0,
+                 label="proxy deps install",
+             )
+             # Upload the proxy module into the sandbox.
+             sandbox.write_text(
+                 "/home/user/proxy/interception.py",
+                 _PROXY_SOURCE_PATH.read_text(),
+             )
+             sandbox.write_text("/home/user/proxy/__init__.py", "")
+
+         cap_flag = ""
+         if self._config.proxy_max_tokens_cap is not None:
+             cap_flag = f"--max-tokens-cap {self._config.proxy_max_tokens_cap} "
+         thinking_flag = ""
+         if self._config.proxy_disable_thinking:
+             thinking_flag = "--disable-thinking "
+         # Force the upstream model id on every forwarded request — opencode's
+         # internal title-gen call sometimes strips the provider prefix.
+         model_override_flag = ""
+         if self._config.model:
+             model_override_flag = f"--model-override '{self._config.model}' "
+         proxy_cmd = (
+             "cd /home/user/proxy && "
+             "python interception.py "
+             f"--upstream-url {self._config.base_url} "
+             f"--upstream-api-key {self._config.api_key} "
+             f"--trace {_PROXY_TRACE_PATH} "
+             f"--port {_PROXY_PORT} "
+             f"--top-logprobs {self._config.proxy_top_logprobs} "
+             f"{cap_flag}"
+             f"{thinking_flag}"
+             f"{model_override_flag}"
+             f"> {_PROXY_LOG_PATH} 2>&1"
+         )
+         proxy_job = sandbox.start_bg(proxy_cmd)
+
+         # Wait for the proxy to start listening. Cold uvicorn boot inside
+         # E2B can take anywhere from <1s to ~30s depending on cache state.
+         import time
+
+         attempts = 120
+         interval_s = 0.5
+         for _ in range(attempts):
+             r = sandbox.exec(
+                 f"curl -sf http://127.0.0.1:{_PROXY_PORT}/healthz",
+                 timeout=5,
+             )
+             if r.exit_code == 0:
+                 break
+             time.sleep(interval_s)
+         else:
+             log = ""
+             try:
+                 log = sandbox.read_text(_PROXY_LOG_PATH)
+             except Exception:
+                 pass
+             proxy_job.kill()
+             raise RuntimeError(
+                 f"proxy did not start within {attempts * interval_s:.0f}s. "
+                 f"log:\n{log[-2000:]}"
+             )
+
+         base_url_override = f"http://127.0.0.1:{_PROXY_PORT}/v1"
+         return proxy_job, base_url_override, _PROXY_TRACE_PATH
+
+
+ __all__ = [
+     "OpenCodeSession",
+     "OpenCodeSessionFactory",
+     "OpenCodeTask",
+     "Verifier",
+ ]
models.py ADDED
@@ -0,0 +1,93 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Pydantic models for the deployed opencode_env HTTP server.
+
+ The server exposes a single MCP tool ``run_rollout`` that takes a Task
+ (instruction + setup commands + verify commands) plus an LLM endpoint
+ config, runs one OpenCode rollout end-to-end inside an E2B sandbox, and
+ returns a :class:`RolloutResult` JSON.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from openenv.core.env_server.types import State
+ from pydantic import BaseModel, Field
+
+
+ class RolloutTurn(BaseModel):
+     """One intercepted LLM turn captured by the in-sandbox proxy (Mode B)."""
+
+     turn: int
+     finish_reason: str | None = None
+     completion_tokens: list[str] = Field(default_factory=list)
+     completion_token_ids: list[int] = Field(default_factory=list)
+     per_token_logps: list[float] = Field(default_factory=list)
+     latency_s: float = 0.0
+     timestamp: float = 0.0
+     upstream_status: int | None = None
+     upstream_error: dict[str, Any] | None = None
+
+
+ class CommandResult(BaseModel):
+     """Outcome of one bash command in setup/verify."""
+
+     cmd: str
+     exit_code: int
+     stdout: str = ""
+     stderr: str = ""
+     duration_s: float = 0.0
+
+
+ class RolloutResult(BaseModel):
+     """Full payload returned from one ``run_rollout`` invocation.
+
+     The trainer (or any client) decodes this from the MCP tool result JSON
+     and feeds ``proxy_turns`` + ``reward`` into GRPO.
+     """
+
+     # Identifiers
+     task_id: str = ""
+     sandbox_id: str = ""
+
+     # Scalars
+     reward: float | None = None
+     agent_exit_code: int | None = None
+     wall_s: float = 0.0
+     mode: str = "transparent_proxy"
+
+     # Per-step results
+     setup_results: list[CommandResult] = Field(default_factory=list)
+     verify_results: list[CommandResult] = Field(default_factory=list)
+
+     # Per-turn LLM trajectory (empty in black_box mode)
+     proxy_turns: list[RolloutTurn] = Field(default_factory=list)
+
+     # Filesystem the agent produced (path -> contents, truncated)
+     files: dict[str, str] = Field(default_factory=dict)
+     files_extra: list[str] = Field(default_factory=list)
+
+     # Diagnostic tails
+     agent_log_tail: str = ""
+     proxy_log_tail: str = ""
+
+     # Error surfacing
+     error: str | None = None
+
+
+ class OpenCodeState(State):
+     """Per-session env state across calls to one OpenCodeEnvironment instance.
+
+     Each HTTP session gets its own env (``SUPPORTS_CONCURRENT_SESSIONS=True``
+     on the server class), so this state is per-session.
+     """
+
+     rollouts_completed: int = 0
+     last_reward: float | None = None
+     last_task_id: str | None = None
+     last_sandbox_id: str | None = None
opencode_runtime.py ADDED
@@ -0,0 +1,150 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Pure builders for OpenCode sandbox bootstrap artifacts.
+
+ These functions produce the exact files and shell commands the sandbox needs to
+ run OpenCode against a configured LLM endpoint. No IO, no sandbox coupling —
+ the sandbox backend is responsible for writing files and running commands.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from typing import Any
+
+ from .config import OpenCodeConfig, provider_npm_package
+
+
+ def opencode_config_path(config: OpenCodeConfig) -> str:
+     return f"{config.sandbox_home}/.config/opencode/opencode.json"
+
+
+ def instruction_path(config: OpenCodeConfig) -> str:
+     return f"{config.sandbox_home}/task/instruction.md"
+
+
+ def agent_log_path(config: OpenCodeConfig) -> str:
+     return f"{config.sandbox_home}/logs/agent/opencode.jsonl"
+
+
+ def system_prompt_path(config: OpenCodeConfig) -> str:
+     return f"{config.sandbox_home}/task/system.md"
+
+
+ def verifier_reward_path(config: OpenCodeConfig) -> str:
+     return f"{config.sandbox_home}/logs/verifier/reward.txt"
+
+
+ def workdir_path(config: OpenCodeConfig) -> str:
+     return f"{config.sandbox_home}/workdir"
+
+
+ def build_opencode_json(config: OpenCodeConfig) -> str:
+     """Return the serialized ``opencode.json`` the sandbox should install.
+
+     The provider block is keyed by a stable internal name (``intercepted``) so
+     the same ``model`` string works across providers. Deep-merges
+     ``config.extra_opencode_json`` last so callers can override anything.
+     """
+     provider_name = "intercepted"
+     provider_block: dict[str, Any] = {
+         "npm": provider_npm_package(config.provider),
+         "name": "Intercepted",
+         "options": {
+             "baseURL": config.base_url,
+             "apiKey": config.api_key,
+             "timeout": config.request_timeout_ms,
+         },
+         "models": {
+             config.model.split("/", 1)[-1]: {"name": "Intercepted Model"},
+         },
+     }
+
+     doc: dict[str, Any] = {
+         "$schema": "https://opencode.ai/config.json",
+         "model": f"{provider_name}/{config.model.split('/', 1)[-1]}",
+         "provider": {provider_name: provider_block},
+     }
+
+     tools = _build_tools_block(config)
+     if tools:
+         doc["tools"] = tools
+
+     _deep_merge(doc, config.extra_opencode_json)
+     return json.dumps(doc, indent=2)
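+
+ # For orientation, the emitted document for the defaults
+ # (provider="openai_compatible", model="intercepted/model") looks roughly
+ # like this (an illustrative, abridged sketch of the output):
+ #
+ #   {
+ #     "$schema": "https://opencode.ai/config.json",
+ #     "model": "intercepted/model",
+ #     "provider": {
+ #       "intercepted": {
+ #         "npm": "@ai-sdk/openai-compatible",
+ #         "options": {"baseURL": ..., "apiKey": ..., "timeout": 600000},
+ #         "models": {"model": {"name": "Intercepted Model"}}
+ #       }
+ #     }
+ #   }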
+
+
+ def build_install_cmd(config: OpenCodeConfig) -> str:
+     """Return the shell command that installs OpenCode + ensures PATH.
+
+     The upstream installer honors ``OPENCODE_VERSION=x.y.z`` for pinning;
+     leaving it unset tracks ``latest``.
+     """
+     version_env = ""
+     if config.opencode_version and config.opencode_version != "latest":
+         version_env = f"OPENCODE_VERSION={config.opencode_version} "
+     home = config.sandbox_home
+     return (
+         "set -e && "
+         f"mkdir -p {home}/.config/opencode {home}/logs/agent {home}/logs/verifier {home}/task {home}/workdir && "
+         f"{version_env}curl -fsSL https://opencode.ai/install | bash && "
+         'export PATH="$HOME/.opencode/bin:$PATH" && '
+         "opencode --version"
+     )
+
+
+ def build_run_cmd(config: OpenCodeConfig) -> str:
+     """Return the shell command that launches OpenCode against a task."""
+     format_flag = "--format json" if config.run_format == "json" else ""
+     return (
+         'export PATH="$HOME/.opencode/bin:$PATH" && '
+         f"cd {workdir_path(config)} && "
+         f'opencode run {format_flag} "$(cat {instruction_path(config)})" '
+         f"2>&1 | tee {agent_log_path(config)}"
+     ).strip()
+
+
+ def build_env_vars(
+     config: OpenCodeConfig, *, base_url_override: str | None = None
+ ) -> dict[str, str]:
+     """Return env vars to set on the OpenCode process.
+
+     When a proxy is wrapping ``config.base_url``, the factory passes the
+     proxy's local URL via ``base_url_override`` so the sandbox process points
+     at the proxy and the opencode.json on disk stays consistent with what the
+     proxy forwards to.
+     """
+     env = dict(config.extra_env)
+     env["OPENAI_BASE_URL"] = base_url_override or config.base_url
+     env["OPENAI_API_KEY"] = config.api_key
+     env["OPENCODE_CONFIG"] = opencode_config_path(config)
+     return env
+
+
+ def _build_tools_block(config: OpenCodeConfig) -> dict[str, bool]:
+     """Translate enabled/disabled lists into opencode's ``tools`` map."""
+     if config.enabled_tools is not None:
+         # Whitelist: everything not listed is disabled. OpenCode treats missing
+         # keys as "default enabled", so we only need to explicitly disable the
+         # ones we want off. Without a full known-tool list we can't do a true
+         # whitelist; document this as a known limitation and require the caller
+         # to rely on ``disabled_tools`` for full control.
+         return {tool: True for tool in config.enabled_tools}
+     return {tool: False for tool in config.disabled_tools}
+
+
+ def _deep_merge(dst: dict[str, Any], src: dict[str, Any]) -> None:
+     """Recursively merge ``src`` into ``dst`` in place."""
+     for key, value in src.items():
+         if isinstance(value, dict) and isinstance(dst.get(key), dict):
+             _deep_merge(dst[key], value)
+         else:
+             dst[key] = value
openenv.yaml ADDED
@@ -0,0 +1,6 @@
+ spec_version: 1
+ name: opencode_env
+ type: space
+ runtime: fastapi
+ app: server.app:app
+ port: 8000
pyproject.toml ADDED
@@ -0,0 +1,55 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ [build-system]
+ requires = ["setuptools>=45", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "openenv-opencode-env"
+ version = "0.1.0"
+ description = "OpenCode coding-agent environment for OpenEnv — runs the OpenCode CLI in an E2B sandbox against any OpenAI-compatible LLM, optionally capturing per-token logprobs."
+ requires-python = ">=3.10"
+ dependencies = [
+     # Core OpenEnv (server + MCP)
+     "openenv-core[core]>=0.2.2",
+     "fastapi>=0.115.0",
+     "uvicorn[standard]>=0.24.0",
+     "pydantic>=2.0.0",
+     "fastmcp>=2.0.0",
+     "requests>=2.31.0",
+
+     # Web UI
+     "gradio>=4.0.0",
+
+     # OpenCode harness primitive — sandbox + proxy + agent driver
+     "httpx>=0.27.0",
+     "e2b>=1.0.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=8.0.0",
+     "pytest-asyncio>=0.23.0",
+     "pytest-cov>=4.0.0",
+ ]
+
+ [project.scripts]
+ # Server entrypoint — enables ``uv run --project . server``.
+ server = "opencode_env.server.app:main"
+
+ [tool.setuptools]
+ include-package-data = true
+ packages = [
+     "opencode_env",
+     "opencode_env.sandbox",
+     "opencode_env.server",
+     "opencode_env.tests",
+ ]
+ package-dir = { "opencode_env" = ".", "opencode_env.sandbox" = "sandbox", "opencode_env.server" = "server", "opencode_env.tests" = "tests" }
+
+ [tool.setuptools.package-data]
+ opencode_env = ["**/*.md"]
sandbox/__init__.py ADDED
@@ -0,0 +1,25 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Sandbox backends for the OpenCode harness.
+
+ The primitive ships with :class:`E2BSandboxBackend` as the default; any backend
+ that satisfies the :class:`SandboxBackend` / :class:`SandboxHandle` protocols
+ can be swapped in.
+ """
+
+ from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle
+ from .e2b import E2BBgJob, E2BSandboxBackend, E2BSandboxHandle
+
+ __all__ = [
+     "BgJob",
+     "ExecResult",
+     "SandboxBackend",
+     "SandboxHandle",
+     "E2BBgJob",
+     "E2BSandboxBackend",
+     "E2BSandboxHandle",
+ ]
sandbox/base.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Sandbox backend protocol.
8
+
9
+ A ``SandboxBackend`` produces ``SandboxHandle`` instances that the harness uses
10
+ to stage files, run the OpenCode install, launch the agent as a background
11
+ process, and later tear the sandbox down.
12
+
13
+ Backends can be implemented against any provider (E2B, Docker, Modal, Prime)
14
+ as long as they satisfy the Protocols defined here.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from dataclasses import dataclass
20
+ from typing import Any, Protocol, runtime_checkable
21
+
22
+
23
+ @dataclass
24
+ class ExecResult:
25
+ """Result of a synchronous command inside a sandbox."""
26
+
27
+ exit_code: int
28
+ stdout: str
29
+ stderr: str
30
+
31
+
32
+ @runtime_checkable
33
+ class BgJob(Protocol):
34
+ """Handle to a background process running inside a sandbox."""
35
+
36
+ @property
37
+ def pid(self) -> int: ...
38
+
39
+ def wait(self, timeout: float | None = None) -> int:
40
+ """Block until the process exits, returning its exit code.
41
+
42
+ Implementations must raise ``TimeoutError`` if ``timeout`` elapses
43
+ before the process exits.
44
+ """
45
+
46
+ def kill(self) -> None:
47
+ """Terminate the process."""
48
+
49
+
50
+ @runtime_checkable
51
+ class SandboxHandle(Protocol):
52
+ """Opaque handle to one live sandbox."""
53
+
54
+ @property
55
+ def sandbox_id(self) -> str: ...
56
+
57
+ def exec(
58
+ self,
59
+ cmd: str,
60
+ *,
61
+ envs: dict[str, str] | None = None,
62
+ cwd: str | None = None,
63
+ timeout: float | None = 60,
64
+ ) -> ExecResult:
65
+ """Run a shell command synchronously and return its result."""
66
+
67
+ def start_bg(
68
+ self,
69
+ cmd: str,
70
+ *,
71
+ envs: dict[str, str] | None = None,
72
+ cwd: str | None = None,
73
+ ) -> BgJob:
74
+ """Launch a background process and return a handle."""
75
+
76
+ def write_text(self, path: str, content: str) -> None:
77
+ """Write text to ``path`` inside the sandbox (parent dirs auto-created)."""
78
+
79
+ def read_text(self, path: str) -> str:
80
+ """Read ``path`` as text from the sandbox."""
81
+
82
+ def exists(self, path: str) -> bool:
83
+ """Return whether ``path`` exists in the sandbox."""
84
+
85
+ def kill(self) -> None:
86
+ """Terminate the sandbox and release resources."""
87
+
88
+
89
+ @runtime_checkable
90
+ class SandboxBackend(Protocol):
91
+ """Factory for fresh sandbox instances."""
92
+
93
+ def create(
94
+ self,
95
+ *,
96
+ timeout_s: int = 900,
97
+ envs: dict[str, str] | None = None,
98
+ metadata: dict[str, str] | None = None,
99
+ ) -> SandboxHandle:
100
+ """Create and return a new, ready-to-use sandbox."""
sandbox/build_template.py ADDED
@@ -0,0 +1,142 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Build a pre-baked E2B template with opencode + proxy deps already installed.
+
+ Run-time per rollout drops from ~3 min (cold install) to ~30s once the
+ template is built, because we skip:
+
+ - ``curl https://opencode.ai/install | bash`` (~30-90s)
+ - ``pip install fastapi uvicorn httpx`` (~30-60s)
+ - directory layout setup
+ - copying the proxy source
+
+ The template ships:
+
+ - opencode CLI at ``/home/user/.opencode/bin/opencode``
+ - Python deps for the in-sandbox proxy
+ - The proxy source at ``/home/user/proxy/interception.py``
+ - Pre-created dirs: ``~/.config/opencode``, ``~/logs/{agent,verifier}``,
+   ``~/task``, ``~/workdir``, ``~/proxy``
+ - Default workdir: ``/home/user/workdir``
+
+ Usage::
+
+     .venv/bin/python envs/opencode_env/tests/build_e2b_template.py
+     # → builds (or rebuilds) ``opencode-rl`` template, prints template id
+
+ Then ``test_five_sorts_e2e.py`` will use it via ``--template opencode-rl``.
+
+ Requires ``E2B_API_KEY`` in the environment. First build is ~3-8 min;
+ subsequent builds reuse the cache and can finish in <60s.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import os
+ import sys
+ from pathlib import Path
+
+ from e2b import Template, default_build_logger
+
+
+ _ENV_DIR = Path(__file__).resolve().parent
+ _PROXY_SOURCE = _ENV_DIR / "interception.py"
+
+
+ def _load_env(path: Path) -> None:
+     if not path.exists():
+         return
+     for raw in path.read_text().splitlines():
+         line = raw.strip()
+         if not line or line.startswith("#") or "=" not in line:
+             continue
+         k, _, v = line.partition("=")
+         k = k.strip()
+         v = v.strip().strip('"').strip("'")
+         if k and k not in os.environ:
+             os.environ[k] = v
+
+
+ def build_template(name: str, *, skip_cache: bool = False) -> str:
+     if not _PROXY_SOURCE.exists():
+         raise RuntimeError(f"proxy source missing at {_PROXY_SOURCE}")
+
+     # Template.copy() resolves relative paths against the caller's source
+     # file directory. This script lives next to ``interception.py`` so the
+     # bare filename works.
+
+     # Stage 1 (root): system-wide pip deps for the proxy.
+     # Stage 2 (user): opencode install + dir layout + proxy copy.
+     template = (
+         Template()
+         .from_python_image("3.12")
+         .pip_install(
+             [
+                 "fastapi>=0.104",
+                 "uvicorn[standard]>=0.24",
+                 "httpx>=0.27",
+             ]
+         )
+         .set_user("user")
+         .run_cmd("curl -fsSL https://opencode.ai/install | bash")
+         .run_cmd("/home/user/.opencode/bin/opencode --version")
+         .make_dir("/home/user/.config/opencode")
+         .make_dir("/home/user/logs/agent")
+         .make_dir("/home/user/logs/verifier")
+         .make_dir("/home/user/task")
+         .make_dir("/home/user/workdir")
+         .make_dir("/home/user/proxy")
+         .copy("interception.py", "/home/user/proxy/interception.py")
+         .set_workdir("/home/user/workdir")
+     )
+     if skip_cache:
+         template = template.skip_cache()
+
+     info = Template.build(
+         template,
+         name,
+         cpu_count=2,
+         memory_mb=2048,
+         on_build_logs=default_build_logger(),
+     )
+     return info.template_id if hasattr(info, "template_id") else str(info)
+
+
+ def main(argv: list[str] | None = None) -> int:
+     p = argparse.ArgumentParser(prog="build_e2b_template")
+     p.add_argument(
+         "--name",
+         default="opencode-rl",
+         help="Template name (default: opencode-rl).",
+     )
+     p.add_argument(
+         "--skip-cache",
+         action="store_true",
+         help="Force a clean rebuild, ignoring cache.",
+     )
+     args = p.parse_args(argv)
+
+     _load_env(_ENV_DIR / ".env")
+     if not os.environ.get("E2B_API_KEY"):
+         print("ERROR: E2B_API_KEY required.", file=sys.stderr)
+         return 2
+
+     print(f"Building template '{args.name}' "
+           f"(proxy source: {_PROXY_SOURCE})")
+     print(f"Skip cache: {args.skip_cache}")
+     print()
+
+     template_id = build_template(args.name, skip_cache=args.skip_cache)
+     print()
+     print(f"Built. Template id/name: {template_id}")
+     print(f"Use in code: Sandbox.create(template='{args.name}')")
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
sandbox/e2b.py ADDED
@@ -0,0 +1,192 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """E2B implementation of :class:`SandboxBackend`."""
+
+ from __future__ import annotations
+
+ import os
+ import threading
+ from pathlib import PurePosixPath
+
+ from e2b import Sandbox
+ from e2b.sandbox_sync.commands.command_handle import CommandHandle
+
+ from .base import BgJob, ExecResult, SandboxBackend, SandboxHandle
+
+
+ class E2BBgJob:
+     """Wraps an E2B ``CommandHandle`` to satisfy :class:`BgJob`.
+
+     The E2B SDK's ``CommandHandle.wait()`` blocks indefinitely with no native
+     timeout. We poll in a worker thread and raise ``TimeoutError`` if the
+     process does not exit within the caller-supplied budget.
+     """
+
+     def __init__(self, handle: CommandHandle) -> None:
+         self._handle = handle
+         self._result: "object | None" = None
+         self._error: BaseException | None = None
+         self._thread = threading.Thread(target=self._run, daemon=True)
+         self._thread.start()
+
+     def _run(self) -> None:
+         try:
+             self._result = self._handle.wait()
+         except BaseException as exc:  # noqa: BLE001
+             self._error = exc
+
+     @property
+     def pid(self) -> int:
+         return self._handle.pid
+
+     def wait(self, timeout: float | None = None) -> int:
+         self._thread.join(timeout)
+         if self._thread.is_alive():
+             raise TimeoutError(
+                 f"Background command did not exit within {timeout}s"
+             )
+         if self._error is not None:
+             # E2B raises CommandExitException on non-zero; treat as exit code.
+             code = getattr(self._error, "exit_code", None)
+             if code is None:
+                 raise self._error
+             return int(code)
+         return int(self._result.exit_code) if self._result is not None else 0
+
+     def kill(self) -> None:
+         try:
+             self._handle.kill()
+         except Exception:
+             pass
+
+
+ class E2BSandboxHandle:
+     """Wraps a live ``e2b.Sandbox`` to satisfy :class:`SandboxHandle`."""
+
+     def __init__(self, sandbox: Sandbox) -> None:
+         self._sbx = sandbox
+
+     @property
+     def sandbox_id(self) -> str:
+         return self._sbx.sandbox_id
+
+     @property
+     def raw(self) -> Sandbox:
+         """Escape hatch for callers that need the underlying SDK object."""
+         return self._sbx
+
+     def exec(
+         self,
+         cmd: str,
+         *,
+         envs: dict[str, str] | None = None,
+         cwd: str | None = None,
+         timeout: float | None = 60,
+     ) -> ExecResult:
+         from e2b.sandbox.commands.command_handle import CommandExitException
+
+         try:
+             result = self._sbx.commands.run(
+                 cmd,
+                 envs=envs,
+                 cwd=cwd,
+                 timeout=timeout,
+                 background=False,
+             )
+             return ExecResult(
+                 exit_code=result.exit_code,
+                 stdout=result.stdout,
+                 stderr=result.stderr,
+             )
+         except CommandExitException as exc:
+             # Non-zero exit codes are expected in many contexts (e.g. polling
+             # healthz before the server is up). Surface them as a proper
+             # ExecResult instead of an exception.
+             return ExecResult(
+                 exit_code=int(getattr(exc, "exit_code", 1)),
+                 stdout=str(getattr(exc, "stdout", "") or ""),
+                 stderr=str(getattr(exc, "stderr", "") or str(exc)),
+             )
+
+     def start_bg(
+         self,
+         cmd: str,
+         *,
+         envs: dict[str, str] | None = None,
+         cwd: str | None = None,
+         timeout: float = 0,
+     ) -> BgJob:
+         """Start a background command.
+
+         ``timeout=0`` disables E2B's server-side command deadline (the default
+         is 60s, which would otherwise kill long-running agent processes).
+         Sandbox lifetime still bounds the job.
+         """
+         handle = self._sbx.commands.run(
+             cmd,
+             envs=envs,
+             cwd=cwd,
+             background=True,
+             timeout=timeout,
+         )
+         return E2BBgJob(handle)
+
+     def write_text(self, path: str, content: str) -> None:
+         parent = str(PurePosixPath(path).parent)
+         if parent not in ("", "/"):
+             self._sbx.files.make_dir(parent)
+         self._sbx.files.write(path, content)
+
+     def read_text(self, path: str) -> str:
+         return self._sbx.files.read(path)
+
+     def exists(self, path: str) -> bool:
+         return self._sbx.files.exists(path)
+
+     def kill(self) -> None:
+         self._sbx.kill()
+
+
+ class E2BSandboxBackend:
+     """Creates E2B sandboxes for OpenCode rollouts.
+
+     The backend uses the E2B default base template unless ``template`` is
+     provided. Resource sizing and other E2B-specific options can be forwarded
+     via ``sandbox_kwargs``.
+     """
+
+     def __init__(
+         self,
+         *,
+         api_key: str | None = None,
+         template: str | None = None,
+         sandbox_kwargs: dict | None = None,
+     ) -> None:
+         self._api_key = api_key or os.environ.get("E2B_API_KEY")
+         if not self._api_key:
+             raise RuntimeError(
+                 "E2BSandboxBackend requires an api_key or E2B_API_KEY env var."
+             )
+         self._template = template
+         self._sandbox_kwargs = sandbox_kwargs or {}
+
+     def create(
+         self,
+         *,
+         timeout_s: int = 900,
+         envs: dict[str, str] | None = None,
+         metadata: dict[str, str] | None = None,
+     ) -> SandboxHandle:
+         sbx = Sandbox.create(
+             template=self._template,
+             timeout=timeout_s,
+             envs=envs,
+             metadata=metadata,
+             api_key=self._api_key,
+             **self._sandbox_kwargs,
+         )
+         return E2BSandboxHandle(sbx)
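For reference, a hypothetical driver snippet (not in this commit) exercising the backend end to end; it assumes `E2B_API_KEY` is exported, the env directory is on `PYTHONPATH`, and the `opencode-rl` template from `build_template.py` has been built:

```python
# Create a sandbox from the backend above, stage a file, run a command,
# and tear the sandbox down.
from sandbox.e2b import E2BSandboxBackend

backend = E2BSandboxBackend(template="opencode-rl")
sbx = backend.create(timeout_s=300, metadata={"task_id": "demo"})
try:
    sbx.write_text("/home/user/workdir/hello.py", "print('hi from the sandbox')\n")
    result = sbx.exec("python /home/user/workdir/hello.py", timeout=30)
    print(result.exit_code, result.stdout.strip())
finally:
    sbx.kill()
```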
sandbox/interception.py ADDED
@@ -0,0 +1,642 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Transparent OpenAI-compatible forwarding proxy with logprob capture.
+
+ The proxy is a small FastAPI app that OpenCode talks to instead of the upstream
+ LLM endpoint. It:
+
+ 1. Forwards every ``POST /v1/chat/completions`` request to the real upstream
+    URL, injecting ``logprobs=true`` and ``top_logprobs=N`` so the upstream
+    returns per-token logprobs.
+ 2. Captures each ``(request, response, logprobs)`` triple to a JSON-lines
+    trace file.
+ 3. Returns the upstream response to OpenCode verbatim (minus the ``logprobs``
+    field, which we strip so the CLI never sees anything unexpected).
+
+ The proxy is stateless beyond the trace file. One proxy instance runs per
+ session, normally inside the sandbox on ``localhost:7000``.
+
+ Run standalone::
+
+     python -m opencode_env.interception \\
+         --upstream-url https://vllm.example/v1 \\
+         --upstream-api-key intercepted \\
+         --trace /tmp/trace.jsonl \\
+         --port 7000
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import asyncio
+ import copy
+ import json
+ import os
+ import socket
+ import threading
+ import time
+ from contextlib import closing
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any
+
+ import httpx
+ import uvicorn
+ from fastapi import FastAPI, Request, Response
+ from fastapi.responses import JSONResponse, StreamingResponse
+
+
+ CHAT_COMPLETIONS_PATH = "/v1/chat/completions"
+
+
+ @dataclass
+ class ProxyConfig:
+     """Runtime configuration for one :class:`InterceptionProxy`."""
+
+     upstream_url: str
+     upstream_api_key: str = "intercepted"
+     trace_path: str = "/tmp/opencode-proxy-trace.jsonl"
+     host: str = "127.0.0.1"
+     port: int = 7000
+     top_logprobs: int = 5
+     request_timeout_s: float = 600.0
+     # Cap ``max_tokens`` before forwarding. OpenCode historically asks for very
+     # large values (e.g. 32000) that exceed gpt-4o-mini's 16384 cap; capping
+     # here avoids spurious upstream 400s without requiring the caller to know
+     # per-model limits.
+     max_tokens_cap: int | None = 16384
+     # Disable Qwen-style reasoning/thinking by injecting
+     # ``chat_template_kwargs.enable_thinking=false`` into forwarded requests.
+     disable_thinking: bool = False
+     # Override the ``model`` field on every forwarded request. Some opencode
+     # builds emit a stripped model id (e.g. ``Qwen3.5-4B`` instead of the
+     # ``Qwen/Qwen3.5-4B`` the upstream serves) for their internal
+     # title-generation call. Setting this to the exact upstream model id
+     # bypasses that mismatch.
+     model_override: str | None = None
+
+
+ @dataclass
+ class TurnRecord:
+     """One intercepted turn, written to the trace file as JSON-lines."""
+
+     turn: int
+     request: dict[str, Any]
+     response: dict[str, Any]
+     logprobs: list[dict[str, Any]] | None
+     completion_tokens: list[str]
+     completion_token_ids: list[int]
+     per_token_logps: list[float]
+     finish_reason: str | None
+     latency_s: float
+     timestamp: float = field(default_factory=time.time)
+
+     def to_json(self) -> str:
+         return json.dumps(self.__dict__, default=str)
+
+
+ def _build_app(cfg: ProxyConfig) -> FastAPI:
+     """Construct the FastAPI app that serves one proxy session."""
+
+     app = FastAPI(title="opencode-interception-proxy")
+     state: dict[str, Any] = {"turn": 0, "lock": asyncio.Lock()}
+
+     # HTTP client reused across requests. ``None`` auth header — we let each
+     # request carry its own ``Authorization`` populated from ``upstream_api_key``.
+     client = httpx.AsyncClient(timeout=cfg.request_timeout_s)
+     trace_file = open(cfg.trace_path, "a", buffering=1)
+
+     @app.get("/healthz")
+     def healthz() -> dict[str, str]:
+         return {"status": "ok"}
+
+     @app.post(CHAT_COMPLETIONS_PATH)
+     async def chat_completions(request: Request) -> Response:
+         raw_body = await request.body()
+         try:
+             body = json.loads(raw_body)
+         except json.JSONDecodeError:
+             return JSONResponse(
+                 status_code=400, content={"error": "invalid json body"}
+             )
+
+         forwarded_body = _prepare_forwarded_body(body, cfg)
+         headers = {
+             "Content-Type": "application/json",
+             "Authorization": f"Bearer {cfg.upstream_api_key}",
+         }
+         upstream_url = _resolve_upstream_url(cfg.upstream_url)
+
+         async with state["lock"]:
+             state["turn"] += 1
+             turn_idx = state["turn"]
+
+         if forwarded_body.get("stream"):
+             return await _proxy_streaming(
+                 client=client,
+                 upstream_url=upstream_url,
+                 headers=headers,
+                 forwarded_body=forwarded_body,
+                 original_body=body,
+                 trace_file=trace_file,
+                 turn_idx=turn_idx,
+             )
+         return await _proxy_unary(
+             client=client,
+             upstream_url=upstream_url,
+             headers=headers,
+             forwarded_body=forwarded_body,
+             original_body=body,
+             trace_file=trace_file,
+             turn_idx=turn_idx,
+         )
+
+     @app.on_event("shutdown")
+     async def _shutdown() -> None:
+         await client.aclose()
+         trace_file.close()
+
+     return app
+
+
+ def _prepare_forwarded_body(body: dict[str, Any], cfg: ProxyConfig) -> dict[str, Any]:
+     """Return the body we actually send upstream.
+
+     - Injects ``logprobs=true`` + ``top_logprobs`` so the upstream emits
+       per-token logprobs.
+     - Caps ``max_tokens`` / ``max_completion_tokens`` to ``max_tokens_cap``.
+     - For models that reject ``max_tokens`` (e.g. gpt-5.x), translates to
+       ``max_completion_tokens``.
+     """
+     forwarded = copy.deepcopy(body)
+     forwarded.setdefault("logprobs", True)
+     forwarded.setdefault("top_logprobs", cfg.top_logprobs)
+
+     # GPT-5.x and newer: ``max_tokens`` is rejected; must use
+     # ``max_completion_tokens``. Detect via model string so we don't break
+     # gpt-4.x or vLLM-hosted models that accept ``max_tokens``.
+     model = str(forwarded.get("model", ""))
+     needs_translation = _model_uses_max_completion_tokens(model)
+     if needs_translation and "max_tokens" in forwarded:
+         value = forwarded.pop("max_tokens")
+         forwarded.setdefault("max_completion_tokens", value)
+
+     if cfg.max_tokens_cap is not None:
+         for key in ("max_tokens", "max_completion_tokens"):
+             value = forwarded.get(key)
+             if isinstance(value, int) and value > cfg.max_tokens_cap:
+                 forwarded[key] = cfg.max_tokens_cap
+
+     if cfg.disable_thinking:
+         # vLLM applies chat_template_kwargs to the tokenizer's chat template
+         # for Qwen3/Qwen3.5 models, turning off <think>...</think> generation.
+         extra = forwarded.setdefault("chat_template_kwargs", {})
+         extra.setdefault("enable_thinking", False)
+
+     if cfg.model_override:
+         forwarded["model"] = cfg.model_override
+
+     return forwarded
+
+
+ def _model_uses_max_completion_tokens(model: str) -> bool:
+     """Heuristic: ``True`` for models that reject ``max_tokens``."""
+     # Strip a provider prefix opencode may have prepended (e.g. "intercepted/").
+     bare = model.split("/", 1)[-1].lower()
+     return bare.startswith(("gpt-5", "o1", "o3", "o4"))
+
+
+ def _resolve_upstream_url(upstream: str) -> str:
+     """Build the fully qualified chat-completions URL from a base URL."""
+     base = upstream.rstrip("/")
+     if base.endswith("/v1"):
+         return f"{base}/chat/completions"
+     return f"{base}{CHAT_COMPLETIONS_PATH}"
+
+
+ async def _proxy_unary(
+     *,
+     client: httpx.AsyncClient,
+     upstream_url: str,
+     headers: dict[str, str],
+     forwarded_body: dict[str, Any],
+     original_body: dict[str, Any],
+     trace_file: Any,
+     turn_idx: int,
+ ) -> Response:
+     start = time.time()
+     upstream_response = await client.post(
+         upstream_url, content=json.dumps(forwarded_body), headers=headers
+     )
+     latency = time.time() - start
+     try:
+         response_json = upstream_response.json()
+     except Exception:
+         return Response(
+             content=upstream_response.content,
+             status_code=upstream_response.status_code,
+             media_type=upstream_response.headers.get(
+                 "content-type", "application/json"
+             ),
+         )
+
+     record = _build_turn_record(
+         turn_idx=turn_idx,
+         request_body=forwarded_body,
+         response_json=response_json,
+         latency_s=latency,
+     )
+     trace_file.write(record.to_json() + "\n")
+     sanitized = _strip_logprobs(response_json)
+     return JSONResponse(content=sanitized, status_code=upstream_response.status_code)
+
+
+ async def _proxy_streaming(
+     *,
+     client: httpx.AsyncClient,
+     upstream_url: str,
+     headers: dict[str, str],
+     forwarded_body: dict[str, Any],
+     original_body: dict[str, Any],
+     trace_file: Any,
+     turn_idx: int,
+ ) -> Response:
+     """Forward an SSE stream while accumulating the full response.
+
+     Opens the upstream stream and inspects the status. On non-2xx, reads the
+     full body (an error JSON, not SSE) and returns it to the caller as a
+     regular JSON response — previously we silently emitted an empty
+     ``text/event-stream`` which opencode interpreted as an empty assistant
+     turn. Both the error body and the latency are written to the trace file
+     so debugging a broken rollout doesn't require another round-trip.
+     """
+
+     start = time.time()
+
+     # Open the stream outside the generator so we can branch on status before
+     # committing to a streaming response shape.
+     upstream_cm = client.stream(
+         "POST",
+         upstream_url,
+         content=json.dumps(forwarded_body),
+         headers=headers,
+     )
+     upstream = await upstream_cm.__aenter__()
+
+     if upstream.status_code >= 400:
+         # Upstream responded with an error body (not SSE). Read it fully and
+         # return as a non-streaming JSON payload.
+         error_bytes = await upstream.aread()
+         await upstream_cm.__aexit__(None, None, None)
+         latency = time.time() - start
+         try:
+             error_json = json.loads(error_bytes.decode() or "{}")
+         except Exception:
+             error_json = {"error": error_bytes.decode(errors="replace")[:4000]}
+         record = _build_turn_record(
+             turn_idx=turn_idx,
+             request_body=forwarded_body,
+             response_json={
+                 "choices": [],
+                 "usage": None,
+                 "upstream_status": upstream.status_code,
+                 "upstream_error": error_json,
+             },
+             latency_s=latency,
+         )
+         trace_file.write(record.to_json() + "\n")
+         print(
+             f"[proxy] turn {turn_idx}: upstream {upstream.status_code}: "
+             f"{str(error_json)[:400]}",
+             flush=True,
+         )
+         return JSONResponse(content=error_json, status_code=upstream.status_code)
+
+     async def _stream() -> Any:
+         accumulated: dict[str, Any] = {
+             "content_by_idx": {},
+             "tool_calls_by_idx": {},
+             "finish_by_idx": {},
+             "logprobs_by_idx": {},
+         }
+         last_chunk: dict[str, Any] = {}
+         try:
+             async for line in upstream.aiter_lines():
+                 if not line:
+                     yield "\n"
+                     continue
+                 yield line + "\n"
+                 if not line.startswith("data:"):
+                     continue
+                 data = line[len("data:"):].strip()
+                 if data == "[DONE]":
+                     continue
+                 try:
+                     chunk = json.loads(data)
+                 except json.JSONDecodeError:
+                     continue
+                 last_chunk = chunk
+                 _accumulate_stream_chunk(chunk, accumulated)
+         finally:
+             await upstream_cm.__aexit__(None, None, None)
+
+         latency = time.time() - start
+         response_json = _assemble_streamed_response(last_chunk, accumulated)
+         record = _build_turn_record(
+             turn_idx=turn_idx,
+             request_body=forwarded_body,
+             response_json=response_json,
+             latency_s=latency,
+         )
+         trace_file.write(record.to_json() + "\n")
+
+     return StreamingResponse(_stream(), media_type="text/event-stream")
+
+
+ def _accumulate_stream_chunk(chunk: dict[str, Any], acc: dict[str, Any]) -> None:
+     for choice in chunk.get("choices", []) or []:
+         idx = choice.get("index", 0)
+         delta = choice.get("delta") or {}
+         content = delta.get("content")
+         if content:
+             acc["content_by_idx"].setdefault(idx, []).append(content)
+         # HF-Router's Qwen thinking mode streams the chain-of-thought under a
+         # separate ``reasoning`` field (per Together/Scaleway). Accumulate it
+         # so the assembled response surfaces it — otherwise it's dropped and
+         # proxy_turn observability is lost for thinking-mode rollouts.
+         reasoning = delta.get("reasoning")
+         if reasoning:
+             acc.setdefault("reasoning_by_idx", {}).setdefault(idx, []).append(reasoning)
+         for tc in delta.get("tool_calls") or []:
+             tc_idx = tc.get("index", 0)
+             bucket = acc["tool_calls_by_idx"].setdefault(
+                 (idx, tc_idx),
+                 {"id": None, "type": "function", "function": {"name": "", "arguments": ""}},
+             )
+             if tc.get("id"):
+                 bucket["id"] = tc["id"]
+             fn = tc.get("function") or {}
+             if fn.get("name"):
+                 bucket["function"]["name"] += fn["name"]
+             if fn.get("arguments"):
+                 bucket["function"]["arguments"] += fn["arguments"]
+         if choice.get("finish_reason"):
+             acc["finish_by_idx"][idx] = choice["finish_reason"]
+         lp = choice.get("logprobs") or {}
+         content_lp = lp.get("content")
+         if content_lp:
+             acc["logprobs_by_idx"].setdefault(idx, []).extend(content_lp)
+
+
+ def _assemble_streamed_response(
+     last_chunk: dict[str, Any], acc: dict[str, Any]
+ ) -> dict[str, Any]:
+     indices = sorted(
+         set(acc["content_by_idx"])
+         | set(acc["finish_by_idx"])
+         | {k[0] for k in acc["tool_calls_by_idx"]}
+         | set(acc["logprobs_by_idx"])
+         | {0}
+     )
+     choices: list[dict[str, Any]] = []
+     for idx in indices:
+         tool_calls = [
+             acc["tool_calls_by_idx"][k]
+             for k in sorted(acc["tool_calls_by_idx"])
+             if k[0] == idx
+         ]
+         message: dict[str, Any] = {"role": "assistant"}
+         content = "".join(acc["content_by_idx"].get(idx, []))
+         if content:
+             message["content"] = content
+         reasoning = "".join((acc.get("reasoning_by_idx") or {}).get(idx, []))
+         if reasoning:
+             message["reasoning"] = reasoning
+         if tool_calls:
+             message["tool_calls"] = tool_calls
+         choice: dict[str, Any] = {
+             "index": idx,
+             "message": message,
+             "finish_reason": acc["finish_by_idx"].get(idx),
+         }
+         if acc["logprobs_by_idx"].get(idx):
+             choice["logprobs"] = {"content": acc["logprobs_by_idx"][idx]}
+         choices.append(choice)
+     return {
+         "id": last_chunk.get("id", ""),
+         "object": "chat.completion",
+         "model": last_chunk.get("model", ""),
+         "choices": choices,
+         "usage": last_chunk.get("usage"),
+     }
+
+
+ def _build_turn_record(
+     *,
+     turn_idx: int,
+     request_body: dict[str, Any],
+     response_json: dict[str, Any],
+     latency_s: float,
+ ) -> TurnRecord:
+     """Extract per-token logprobs into a normalized :class:`TurnRecord`."""
+
+     choice = (response_json.get("choices") or [{}])[0]
+     logprobs_field = choice.get("logprobs") or {}
+     content_lp = logprobs_field.get("content") or []
+
+     tokens: list[str] = []
+     token_ids: list[int] = []
+     per_token_logps: list[float] = []
+     for entry in content_lp:
+         tokens.append(entry.get("token", ""))
+         # OpenAI returns no raw token ids; vLLM returns them as ``token_id``.
+         token_id = entry.get("token_id")
+         if token_id is not None:
+             token_ids.append(int(token_id))
+         lp = entry.get("logprob")
+         if lp is not None:
+             per_token_logps.append(float(lp))
+
+     return TurnRecord(
+         turn=turn_idx,
+         request=request_body,
+         response=response_json,
+         logprobs=content_lp,
+         completion_tokens=tokens,
+         completion_token_ids=token_ids,
+         per_token_logps=per_token_logps,
+         finish_reason=choice.get("finish_reason"),
+         latency_s=latency_s,
+     )
+
+
+ def _strip_logprobs(response_json: dict[str, Any]) -> dict[str, Any]:
+     """Return a copy of the response with ``choices[*].logprobs`` removed."""
+
+     out = dict(response_json)
+     choices = out.get("choices")
+     if isinstance(choices, list):
+         out["choices"] = [
+             {k: v for k, v in (ch or {}).items() if k != "logprobs"}
+             for ch in choices
+         ]
+     return out
+
+
+ # ---------------------------------------------------------------------------
+ # Standalone runner (used inside the sandbox)
+ # ---------------------------------------------------------------------------
+
+
+ def serve(cfg: ProxyConfig) -> None:
+     """Start the proxy and block (for use as the sandbox-side entry point)."""
+
+     app = _build_app(cfg)
+     uvicorn.run(app, host=cfg.host, port=cfg.port, log_level="warning")
+
+
+ class InterceptionProxy:
+     """Thread-backed controller for running the proxy locally.
+
+     Used by unit tests and by any in-process driver that wants a short-lived
+     proxy on the local machine. Inside a sandbox we invoke :func:`serve`
+     directly via ``python -m opencode_env.interception``.
+     """
+
+     def __init__(self, cfg: ProxyConfig) -> None:
+         self._cfg = cfg
+         self._server: uvicorn.Server | None = None
+         self._thread: threading.Thread | None = None
+         self._ready = threading.Event()
+
+     @property
+     def url(self) -> str:
+         return f"http://{self._cfg.host}:{self._cfg.port}/v1"
+
+     @property
+     def config(self) -> ProxyConfig:
+         return self._cfg
+
+     def start(self) -> None:
+         app = _build_app(self._cfg)
+         config = uvicorn.Config(
+             app,
+             host=self._cfg.host,
+             port=self._cfg.port,
+             log_level="warning",
+             lifespan="on",
+         )
+         self._server = uvicorn.Server(config)
+         self._thread = threading.Thread(
+             target=self._run_server, daemon=True
+         )
+         self._thread.start()
+         # Wait for the server to accept connections.
+         deadline = time.time() + 10
+         while time.time() < deadline:
+             if _port_open(self._cfg.host, self._cfg.port):
+                 self._ready.set()
+                 return
+             time.sleep(0.05)
+         raise RuntimeError("InterceptionProxy failed to start within 10s")
+
+     def _run_server(self) -> None:
+         assert self._server is not None
+         self._server.run()
+
+     def stop(self) -> None:
+         if self._server is None:
+             return
+         self._server.should_exit = True
+         if self._thread is not None:
+             self._thread.join(timeout=5)
+         self._server = None
+         self._thread = None
+
+     def __enter__(self) -> "InterceptionProxy":
+         self.start()
+         return self
+
+     def __exit__(self, *exc) -> None:
+         self.stop()
+
+
+ def _port_open(host: str, port: int) -> bool:
+     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+         s.settimeout(0.2)
+         return s.connect_ex((host, port)) == 0
+
+
+ # ---------------------------------------------------------------------------
+ # Trace reader (used by the session to pull captured turns back)
+ # ---------------------------------------------------------------------------
+
+
+ def read_trace(path: str | os.PathLike) -> list[dict[str, Any]]:
+     """Read a proxy trace file into a list of dicts."""
+
+     trace: list[dict[str, Any]] = []
+     p = Path(path)
+     if not p.exists():
+         return trace
+     for line in p.read_text().splitlines():
+         line = line.strip()
+         if not line:
+             continue
+         trace.append(json.loads(line))
+     return trace
+
+
+ # ---------------------------------------------------------------------------
+ # CLI entry point
+ # ---------------------------------------------------------------------------
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(prog="opencode_env.interception")
+     parser.add_argument("--upstream-url", required=True)
+     parser.add_argument("--upstream-api-key", default="intercepted")
+     parser.add_argument("--trace", default="/tmp/opencode-proxy-trace.jsonl")
+     parser.add_argument("--host", default="127.0.0.1")
+     parser.add_argument("--port", type=int, default=7000)
+     parser.add_argument("--top-logprobs", type=int, default=5)
+     parser.add_argument("--request-timeout", type=float, default=600.0)
+     parser.add_argument(
+         "--max-tokens-cap",
+         type=int,
+         default=None,
+         help="Clamp max_tokens/max_completion_tokens on forwarded requests.",
+     )
+     parser.add_argument(
+         "--disable-thinking",
+         action="store_true",
+         help="Inject chat_template_kwargs.enable_thinking=false (Qwen3/Qwen3.5).",
+     )
+     parser.add_argument(
+         "--model-override",
+         default=None,
+         help="Rewrite the `model` field on every forwarded request.",
+     )
+     args = parser.parse_args()
+
+     cfg = ProxyConfig(
+         upstream_url=args.upstream_url,
+         upstream_api_key=args.upstream_api_key,
+         trace_path=args.trace,
+         host=args.host,
+         port=args.port,
+         top_logprobs=args.top_logprobs,
+         request_timeout_s=args.request_timeout,
+         max_tokens_cap=args.max_tokens_cap,
+         disable_thinking=args.disable_thinking,
+         model_override=args.model_override,
+     )
+     serve(cfg)
+
+
+ if __name__ == "__main__":
+     main()
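A sketch of consuming the trace from the driver side: `read_trace` plus a couple of aggregates like the ones the UI reports. The path is the default `trace_path`, and the import assumes the env directory is on `PYTHONPATH`.

```python
# Pull captured turns back out of the JSON-lines trace and summarize them.
from sandbox.interception import read_trace

turns = read_trace("/tmp/opencode-proxy-trace.jsonl")
all_logps = [lp for t in turns for lp in (t.get("per_token_logps") or [])]
print(f"{len(turns)} turns, {len(all_logps)} scored tokens")
if all_logps:
    print(f"mean logprob: {sum(all_logps) / len(all_logps):+.4f}")
```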
server/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Server-side package for the deployed opencode_env."""
server/app.py ADDED
@@ -0,0 +1,118 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """FastAPI app for the opencode_env MCP server.
+
+ Mirrors the standard OpenEnv pattern (echo_env / repl_env / jupyter_agent)
+ plus the custom Gradio UI mounted at ``/web`` per the
+ ``customizing-web-ui`` doc.
+
+ Usage::
+
+     # Local dev:
+     E2B_API_KEY=... uvicorn server.app:app --host 0.0.0.0 --port 8000
+
+     # Docker:
+     docker run -p 8000:8000 -e E2B_API_KEY=... opencode-env
+
+     # HF Space: deploys via the root ``Dockerfile``.
+
+ The ``ENABLE_WEB_INTERFACE`` env var is set to ``true`` automatically so
+ the UI is always reachable at ``/web``. Set it to ``false`` to disable.
+ """
+
+ from __future__ import annotations
+
+ import os
+ from pathlib import Path
+
+
+ def _load_env_file() -> None:
+     """Lightweight ``.env`` loader (no python-dotenv dep).
+
+     Loads ``../.env`` (env dir's ``.env``) into ``os.environ`` for local
+     development convenience. Existing process env vars take precedence so
+     HF Space secrets always win.
+     """
+     candidate = Path(__file__).resolve().parents[1] / ".env"
+     if not candidate.exists():
+         return
+     for raw in candidate.read_text().splitlines():
+         line = raw.strip()
+         if not line or line.startswith("#") or "=" not in line:
+             continue
+         k, _, v = line.partition("=")
+         k = k.strip()
+         v = v.strip().strip('"').strip("'")
+         if k and k not in os.environ:
+             os.environ[k] = v
+
+
+ _load_env_file()
+
+
+ try:
+     from openenv.core.env_server.http_server import create_app
+     from openenv.core.env_server.mcp_types import (
+         CallToolAction,
+         CallToolObservation,
+     )
+
+     from .gradio_ui import opencode_gradio_builder
+     from .opencode_environment import OpenCodeEnvironment
+ except ImportError:  # pragma: no cover
+     from openenv.core.env_server.http_server import create_app
+     from openenv.core.env_server.mcp_types import (
+         CallToolAction,
+         CallToolObservation,
+     )
+     from server.gradio_ui import opencode_gradio_builder  # type: ignore
+     from server.opencode_environment import OpenCodeEnvironment  # type: ignore
+
+
+ # Always expose the Gradio UI at /web. Set ENABLE_WEB_INTERFACE=false to
+ # disable (e.g., on HF Spaces where you want the API only).
+ os.environ.setdefault("ENABLE_WEB_INTERFACE", "true")
+
+
+ def _custom_gradio_builder(
+     web_manager,
+     action_fields,
+     metadata,
+     is_chat_env,
+     title,
+     quick_start_md,
+ ):
+     """Hand off to ``server.gradio_ui.opencode_gradio_builder``."""
+     return opencode_gradio_builder(
+         web_manager,
+         action_fields,
+         metadata,
+         is_chat_env,
+         title or "opencode_env",
+         quick_start_md,
+     )
+
+
+ app = create_app(
+     OpenCodeEnvironment,
+     CallToolAction,
+     CallToolObservation,
+     env_name="opencode_env",
+     max_concurrent_envs=int(os.getenv("MAX_CONCURRENT_ENVS", "4")),
+     gradio_builder=_custom_gradio_builder,
+ )
+
+
+ def main() -> None:
+     """Entrypoint for ``uv run --project . server`` and direct invocation."""
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=8000)
+
+
+ if __name__ == "__main__":
+     main()
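A quick smoke check against a locally running instance (a sketch; it assumes the server was started as in the docstring above, and uses the `/health` route the Docker healthcheck polls):

```python
# Probe the deployed app: /health for liveness, /web for the Gradio console.
import requests

base = "http://localhost:8000"
print(requests.get(f"{base}/health").status_code)  # expect 200 once the app is up
# The Gradio console is served at f"{base}/web" when ENABLE_WEB_INTERFACE=true.
```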
server/catalog.py ADDED
@@ -0,0 +1,149 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Endpoint shorthand catalog.
+
+ Lets the MCP tool ``run_rollout`` and the Gradio UI accept a short endpoint
+ label (``vllm`` / ``openai`` / ``hf_router``) and resolve the actual
+ ``base_url`` / ``api_key`` / ``model`` from environment variables (with
+ sane defaults). Explicit overrides on the call always win.
+ """
+
+ from __future__ import annotations
+
+ import os
+ from dataclasses import dataclass
+
+
+ ENDPOINT_KINDS = ("vllm", "openai", "hf_router")
+
+
+ @dataclass(frozen=True)
+ class _EndpointSpec:
+     base_url_env: str
+     api_key_env: str
+     model_env: str
+     default_base_url: str | None
+     default_api_key: str | None
+     default_model: str | None
+     disable_thinking_default: bool
+
+
+ _CATALOG: dict[str, _EndpointSpec] = {
+     "vllm": _EndpointSpec(
+         base_url_env="VLLM_URL",
+         api_key_env="VLLM_API_KEY",
+         model_env="VLLM_MODEL",
+         default_base_url=None,  # cluster URL must be set
+         default_api_key="intercepted",  # vLLM rarely enforces auth
+         default_model="Qwen/Qwen3.5-4B",
+         disable_thinking_default=True,  # Qwen3.5 thinks by default
+     ),
+     "openai": _EndpointSpec(
+         base_url_env="OPENAI_BASE_URL",
+         api_key_env="OPENAI_API_KEY",
+         model_env="OPENAI_MODEL",
+         default_base_url="https://api.openai.com/v1",
+         default_api_key=None,
+         default_model="gpt-4o-mini",
+         disable_thinking_default=False,  # OpenAI rejects unknown kwargs
+     ),
+     "hf_router": _EndpointSpec(
+         base_url_env="HF_ROUTER_BASE_URL",
+         api_key_env="HF_ROUTER_API_KEY",
+         model_env="HF_ROUTER_MODEL",
+         default_base_url="https://router.huggingface.co/v1",
+         default_api_key=None,
+         default_model="Qwen/Qwen3-4B-Instruct-2507:nscale",
+         disable_thinking_default=False,  # Instruct variant doesn't think
+     ),
+ }
+
+
+ @dataclass(frozen=True)
+ class ResolvedEndpoint:
+     kind: str
+     base_url: str
+     api_key: str
+     model: str
+     disable_thinking_default: bool
+
+
+ def resolve_endpoint(
+     kind: str,
+     *,
+     base_url: str = "",
+     api_key: str = "",
+     model: str = "",
+ ) -> ResolvedEndpoint:
+     """Resolve an endpoint shorthand into concrete (base_url, api_key, model).
+
+     Precedence per field: **explicit arg > env var > catalog default**.
+     Always normalizes to a ``/v1`` base URL.
+
+     Raises ``ValueError`` for unknown kinds, missing creds, or missing model.
+     """
+     spec = _CATALOG.get(kind)
+     if spec is None:
+         raise ValueError(
+             f"unknown endpoint kind: {kind!r}; expected one of {ENDPOINT_KINDS}"
+         )
+     base = (
+         base_url or os.environ.get(spec.base_url_env) or spec.default_base_url or ""
+     ).rstrip("/")
+     if not base:
+         raise ValueError(
+             f"{kind}: no base_url (set {spec.base_url_env} env var or pass "
+             "base_url=...)"
+         )
+     if not base.endswith("/v1"):
+         base = f"{base}/v1"
+
+     key = api_key or os.environ.get(spec.api_key_env) or spec.default_api_key or ""
+     if not key:
+         raise ValueError(
+             f"{kind}: no api_key (set {spec.api_key_env} env var or pass api_key=...)"
+         )
+
+     mdl = model or os.environ.get(spec.model_env) or spec.default_model or ""
+     if not mdl:
+         raise ValueError(
+             f"{kind}: no model (set {spec.model_env} env var or pass model=...)"
+         )
+
+     return ResolvedEndpoint(
+         kind=kind,
+         base_url=base,
+         api_key=key,
+         model=mdl,
+         disable_thinking_default=spec.disable_thinking_default,
+     )
+
+
+ def catalog_summary() -> list[dict[str, object]]:
+     """Return a JSON-friendly view of the catalog (for the UI dropdown)."""
+     out: list[dict[str, object]] = []
+     for kind, spec in _CATALOG.items():
+         out.append(
+             {
+                 "kind": kind,
+                 "base_url_env": spec.base_url_env,
+                 "api_key_env": spec.api_key_env,
+                 "model_env": spec.model_env,
+                 "default_base_url": spec.default_base_url,
+                 "default_model": spec.default_model,
+                 "disable_thinking_default": spec.disable_thinking_default,
+                 "configured": _is_configured(spec),
+             }
+         )
+     return out
+
+
+ def _is_configured(spec: _EndpointSpec) -> bool:
+     base = os.environ.get(spec.base_url_env) or spec.default_base_url or ""
+     key = os.environ.get(spec.api_key_env) or spec.default_api_key or ""
+     model = os.environ.get(spec.model_env) or spec.default_model or ""
+     return bool(base and key and model)
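To make the precedence rules concrete, a small sketch (the key value is a placeholder, not a working credential; the import assumes the env directory is on `PYTHONPATH`):

```python
# Explicit args beat env vars, which beat catalog defaults.
import os
from server.catalog import resolve_endpoint

os.environ.setdefault("OPENAI_API_KEY", "sk-example")  # placeholder key
ep = resolve_endpoint("openai")                  # env / catalog defaults
print(ep.base_url, ep.model)                     # https://api.openai.com/v1 gpt-4o-mini
ep = resolve_endpoint("openai", model="gpt-4o")  # explicit override wins
print(ep.model)                                  # gpt-4o
```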
server/gradio_ui.py ADDED
@@ -0,0 +1,453 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Minimal Gradio UI for opencode_env.
+
+ Mounts under the standard OpenEnv ``/web`` path via the
+ ``gradio_builder=`` callback documented at
+ https://meta-pytorch.org/OpenEnv/customizing-web-ui.html.
+
+ One page with:
+ - endpoint selector (``vllm`` / ``openai`` / ``hf_router``) — the catalog
+   resolves the actual base_url / api_key / model from env vars.
+ - instruction + setup (bash, one cmd per line) + verify (bash, one cmd
+   per line) textareas — the same Task shape the MCP tool accepts.
+ - Tunables (mode, disable_thinking, max_tokens_cap, top_logprobs,
+   agent_timeout_s, template).
+ - Preset buttons for the ready-made example tasks.
+ - Run button → result panel with reward, setup/verify per-command
+   results, file outputs, logprob stats, agent + proxy log tails,
+   and the raw RolloutResult JSON.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from typing import Any
+
+ import gradio as gr
+
+ try:
+     from .catalog import ENDPOINT_KINDS, catalog_summary, resolve_endpoint
+     from .opencode_environment import OpenCodeEnvironment
+ except ImportError:  # pragma: no cover
+     from server.catalog import ENDPOINT_KINDS, catalog_summary, resolve_endpoint  # type: ignore
+     from server.opencode_environment import OpenCodeEnvironment  # type: ignore
+
+
+ # ────────────────────────────────────────────────────────────────────────────
+ # Preset task examples — each fills (instruction, setup, verify).
+ # ────────────────────────────────────────────────────────────────────────────
+
+ PRESETS: dict[str, dict[str, str]] = {
+     "binary_search": {
+         "instruction": (
+             "Create a single Python file named `binary_search.py` in the "
+             "current working directory. Use the relative path `binary_search.py`. "
+             "Expose exactly one function:\n"
+             "    def binary_search(arr: list[int], target: int) -> int\n"
+             "Return the index of `target` in the sorted list `arr`, or -1 if "
+             "absent. Use the binary-search algorithm; do not call list.index."
+         ),
+         "setup": "",
+         "verify": (
+             "test -f /home/user/workdir/binary_search.py\n"
+             "python -c \"import sys; sys.path.insert(0, '/home/user/workdir'); "
+             "import binary_search; "
+             "assert binary_search.binary_search([1,2,3,4,5], 3) == 2; "
+             "assert binary_search.binary_search([1,2,3], 99) == -1; "
+             "assert binary_search.binary_search([], 1) == -1; "
+             "print('OK')\""
+         ),
+     },
+     "fizzbuzz": {
+         "instruction": (
+             "Create `fizzbuzz.py` in the current directory exposing "
+             "`def fizzbuzz(n: int) -> list[str]` that returns the FizzBuzz "
+             "sequence for the integers 1..n. 'Fizz' for multiples of 3, 'Buzz' "
+             "for 5, 'FizzBuzz' for both, otherwise the number as a string."
+         ),
+         "setup": "",
+         "verify": (
+             "test -f /home/user/workdir/fizzbuzz.py\n"
+             "python -c \"import sys; sys.path.insert(0, '/home/user/workdir'); "
+             "import fizzbuzz; "
+             "assert fizzbuzz.fizzbuzz(5) == ['1','2','Fizz','4','Buzz']; "
+             "assert fizzbuzz.fizzbuzz(15)[-1] == 'FizzBuzz'; "
+             "print('OK')\""
+         ),
+     },
+     "pandas_csv": {
+         "instruction": (
+             "Read `/home/user/data/numbers.csv` (a CSV with a single column "
+             "`x` of integers) using pandas. Compute the mean of the `x` "
+             "column and write it as a single float to `/home/user/workdir/mean.txt` "
+             "(no extra characters, no newline)."
+         ),
+         "setup": (
+             "pip install --quiet pandas\n"
+             "mkdir -p /home/user/data\n"
+             "printf 'x\\n1\\n2\\n3\\n4\\n5\\n6\\n7\\n8\\n9\\n10\\n' > /home/user/data/numbers.csv"
+         ),
+         "verify": (
+             "test -f /home/user/workdir/mean.txt\n"
+             "python -c \"v=float(open('/home/user/workdir/mean.txt').read().strip()); "
+             "assert abs(v-5.5) < 1e-6, v; print('mean=', v)\""
+         ),
+     },
+ }
+
+
+ # ────────────────────────────────────────────────────────────────────────────
+ # Result rendering helpers
+ # ────────────────────────────────────────────────────────────────────────────
+
+
+ def _split_commands(text: str) -> list[str]:
+     return [line for line in (text or "").splitlines() if line.strip()]
+
+
+ def _badge_for_reward(reward: float | None) -> str:
+     if reward is None:
+         return "**reward**: _n/a_"
+     if reward >= 0.999:
+         emoji = "[PASS]"
+     elif reward > 0.0:
+         emoji = "[PARTIAL]"
+     else:
+         emoji = "[FAIL]"
+     return f"### {emoji} reward = `{reward:.2f}`"
+
+
+ def _summary_md(result: dict[str, Any]) -> str:
+     parts = [_badge_for_reward(result.get("reward"))]
+     parts.append(
+         f"**sandbox**: `{result.get('sandbox_id') or 'n/a'}` · "
+         f"**wall**: `{result.get('wall_s', 0):.1f}s` · "
+         f"**agent_exit**: `{result.get('agent_exit_code')}` · "
+         f"**mode**: `{result.get('mode', 'n/a')}`"
+     )
+     if result.get("error"):
+         parts.append(f"**error**: `{result['error']}`")
+     return "\n\n".join(parts)
+
+
+ def _command_rows(items: list[dict[str, Any]]) -> list[list[str]]:
+     rows: list[list[str]] = []
+     for it in items or []:
+         cmd = it.get("cmd", "")
+         rows.append(
+             [
+                 cmd if len(cmd) <= 80 else cmd[:77] + "...",
+                 str(it.get("exit_code", "")),
+                 f"{it.get('duration_s', 0):.2f}s",
+                 # ``or [""]`` guards a non-zero exit with empty stderr.
+                 ((it.get("stderr") or "").splitlines() or [""])[-1][:80] if it.get("exit_code") else "",
+             ]
+         )
+     return rows
+
+
+ def _logprobs_md(turns: list[dict[str, Any]]) -> str:
+     if not turns:
+         return "_No proxy turns captured._\n\nThis is normal in `black_box` mode. In `transparent_proxy` mode, an empty list usually means the agent never made an LLM call (check the agent log)."
+     n = len(turns)
+     productive = sum(1 for t in turns if t.get("completion_tokens"))
+     total_toks = sum(len(t.get("completion_tokens") or []) for t in turns)
+     all_lps = [
+         float(x)
+         for t in turns
+         for x in (t.get("per_token_logps") or [])
+         if x is not None
+     ]
+     mean_lp = (sum(all_lps) / len(all_lps)) if all_lps else None
+     lines = [
+         f"**turns**: `{n}` · **productive**: `{productive}` · "
+         f"**total_completion_tokens**: `{total_toks}`",
+     ]
+     if mean_lp is not None:
+         lines.append(f"**mean_logprob**: `{mean_lp:+.4f}`")
+     finishes: dict[str, int] = {}
+     for t in turns:
+         f = t.get("finish_reason") or "unknown"
+         finishes[f] = finishes.get(f, 0) + 1
+     if finishes:
+         lines.append(
+             "**finish_reasons**: " + " ".join(f"`{k}={v}`" for k, v in finishes.items())
+         )
+     productive_rows = [t for t in turns if t.get("completion_tokens")]
+     if productive_rows:
+         first = productive_rows[0]
+         toks = first["completion_tokens"][:10]
+         lps = first.get("per_token_logps") or []
+         lines.append(
+             f"\n**first productive turn (first 10 tokens)**\n\n"
+             f"```\n"
+             + "\n".join(
+                 f" {tok!r:<14} {lp:+.3f}" if i < len(lps) else f" {tok!r:<14} -"
+                 for i, (tok, lp) in enumerate(zip(toks, lps + [None] * len(toks)))
+             )
+             + "\n```"
+         )
+     return "\n\n".join(lines)
+
+
+ def _files_md(files: dict[str, str]) -> str:
+     if not files:
+         return "_No files in the workdir._"
+     chunks = []
+     for path, content in files.items():
+         chunks.append(f"**`{path}`**\n```python\n{content[:4000]}\n```")
+     return "\n\n".join(chunks)
+
+
+ def _catalog_banner() -> str:
+     rows = ["**Endpoint catalog (env vars + defaults)**", ""]
+     rows.append("| kind | base_url | model | env vars | configured |")
+     rows.append("|---|---|---|---|---|")
+     for entry in catalog_summary():
+         envs = (
+             f"`{entry['base_url_env']}`<br/>`{entry['api_key_env']}`<br/>"
+             f"`{entry['model_env']}`"
+         )
+         ok = "yes" if entry["configured"] else "**no**"
+         rows.append(
+             f"| `{entry['kind']}` | `{entry['default_base_url'] or '-'}` | "
+             f"`{entry['default_model'] or '-'}` | {envs} | {ok} |"
+         )
+     return "\n".join(rows)
+
+
+ # ────────────────────────────────────────────────────────────────────────────
+ # Builder
+ # ────────────────────────────────────────────────────────────────────────────
+
+
+ def opencode_gradio_builder(
+     web_manager,  # noqa: ARG001 (unused: we instantiate the env directly)
+     action_fields,  # noqa: ARG001
+     metadata,  # noqa: ARG001
+     is_chat_env,  # noqa: ARG001
+     title,
+     quick_start_md,  # noqa: ARG001
+ ) -> gr.Blocks:
+     """Build the opencode_env console.
+
+     Compatible with ``create_app(..., gradio_builder=...)``. We ignore
+     ``web_manager`` and instantiate :class:`OpenCodeEnvironment` ourselves
+     inside the run handler — opencode_env's run_rollout doesn't need any
+     per-session state beyond the env's own bookkeeping, and instantiating
+     is cheap (no sandbox is created until the tool fires).
+     """
+
+     def run(
+         endpoint: str,
+         model: str,
+         base_url: str,
+         api_key: str,
+         instruction: str,
+         setup_text: str,
+         verify_text: str,
+         mode: str,
+         disable_thinking: str,
+         template: str,
+         max_tokens_cap: int,
+         top_logprobs: int,
+         agent_timeout_s: float,
+         progress: gr.Progress = gr.Progress(),
+     ):
+         progress(0.05, desc="resolving endpoint…")
+         try:
+             resolved = resolve_endpoint(
+                 endpoint, base_url=base_url, api_key=api_key, model=model
+             )
+         except ValueError as exc:
+             err = f"endpoint resolution failed: {exc}"
+             return (err, [], [], "", "", "", {"error": err})
+
+         # Translate "auto" / "on" / "off" into bool / None.
+         if disable_thinking == "on":
+             dt: bool | None = True
+         elif disable_thinking == "off":
+             dt = False
+         else:
+             dt = None  # let the catalog default win
+
+         progress(0.10, desc=f"{resolved.kind}: {resolved.model}")
+         env = OpenCodeEnvironment()
+
+         progress(0.15, desc="creating sandbox + running agent…")
+         try:
+             payload = env._run_rollout_impl(
+                 base_url=resolved.base_url,
+                 api_key=resolved.api_key,
+                 model=resolved.model,
+                 instruction=instruction,
+                 setup=_split_commands(setup_text),
+                 verify=_split_commands(verify_text),
+                 task_id="ui",
+                 mode=mode,
+                 disable_thinking=(
+                     dt if dt is not None else resolved.disable_thinking_default
+                 ),
+                 max_tokens_cap=int(max_tokens_cap),
+                 top_logprobs=int(top_logprobs),
+                 agent_timeout_s=float(agent_timeout_s),
+                 template=template,
+             )
+         except Exception as exc:  # noqa: BLE001
+             err = f"{type(exc).__name__}: {exc}"
+             return (err, [], [], "", "", "", {"error": err})
+
+         progress(0.95, desc="rendering result…")
+         result = json.loads(payload)
+
+         return (
+             _summary_md(result),
+             _command_rows(result.get("setup_results") or []),
+             _command_rows(result.get("verify_results") or []),
+             _files_md(result.get("files") or {}),
+             _logprobs_md(result.get("proxy_turns") or []),
+             (
+                 f"### agent log (tail)\n```\n{result.get('agent_log_tail', '')[:4000]}\n```\n\n"
+                 f"### proxy log (tail)\n```\n{result.get('proxy_log_tail', '')[:4000]}\n```"
+             ),
+             result,
+         )
+
+     def apply_preset(name: str) -> tuple[str, str, str]:
+         p = PRESETS.get(name) or {"instruction": "", "setup": "", "verify": ""}
+         return p["instruction"], p["setup"], p["verify"]
+
+     with gr.Blocks(title=title or "opencode_env") as app:
+         gr.Markdown(f"# {title or 'opencode_env'}")
+         gr.Markdown(
+             "Run one OpenCode rollout in an E2B sandbox against your chosen "
+             "LLM endpoint. Pick an endpoint, write the task as `(instruction, "
+             "setup, verify)`, and inspect the reward + per-token logprobs."
+         )
+
+         gr.Markdown(_catalog_banner())
+
+         with gr.Row():
+             endpoint = gr.Dropdown(
+                 choices=list(ENDPOINT_KINDS),
+                 value="openai",
+                 label="Endpoint",
+                 scale=1,
+             )
+             model = gr.Textbox(
+                 label="Model (blank → catalog default)", placeholder="gpt-4o-mini",
+                 scale=2,
+             )
+         with gr.Row():
+             base_url = gr.Textbox(
+                 label="Base URL (blank → env / catalog default)",
+                 placeholder="https://api.openai.com/v1", scale=2,
+             )
+             api_key = gr.Textbox(
+                 label="API key (blank → server env var)",
+                 placeholder="(server env)", type="password", scale=1,
+             )
+
+         instruction = gr.Textbox(
356
+ label="Instruction (the prompt opencode runs)",
357
+ lines=4,
358
+ value=PRESETS["binary_search"]["instruction"],
359
+ )
360
+
361
+ with gr.Row():
362
+ setup_text = gr.Textbox(
363
+ label="Setup (one bash command per line — runs BEFORE the agent)",
364
+ lines=5,
365
+ value=PRESETS["binary_search"]["setup"],
366
+ )
367
+ verify_text = gr.Textbox(
368
+ label="Verify (one bash command per line — runs AFTER the agent)",
369
+ lines=5,
370
+ value=PRESETS["binary_search"]["verify"],
371
+ )
372
+
373
+ with gr.Row():
374
+ preset_bs = gr.Button("preset · binary_search", size="sm")
375
+ preset_fb = gr.Button("preset · fizzbuzz", size="sm")
376
+ preset_pd = gr.Button("preset · pandas_csv", size="sm")
377
+
378
+ with gr.Accordion("Tunables", open=False):
379
+ with gr.Row():
380
+ mode = gr.Dropdown(
381
+ choices=["transparent_proxy", "black_box"],
382
+ value="transparent_proxy",
383
+ label="mode",
384
+ )
385
+ disable_thinking = gr.Dropdown(
386
+ choices=["auto", "on", "off"],
387
+ value="auto",
388
+ label="disable_thinking",
389
+ )
390
+ template = gr.Textbox(
391
+ label="E2B template (e.g. opencode-rl)",
392
+ placeholder="(blank → cold install per rollout)",
393
+ )
394
+ with gr.Row():
395
+ max_tokens_cap = gr.Number(value=4096, label="max_tokens_cap", precision=0)
396
+ top_logprobs = gr.Number(value=5, label="top_logprobs", precision=0)
397
+ agent_timeout_s = gr.Number(value=600, label="agent_timeout_s", precision=0)
398
+
399
+ run_btn = gr.Button("Run rollout", variant="primary")
400
+
401
+ gr.Markdown("---")
402
+ summary_md = gr.Markdown("_Submit a rollout above to see results._")
403
+
404
+ with gr.Tabs():
405
+ with gr.Tab("Setup"):
406
+ setup_table = gr.Dataframe(
407
+ headers=["cmd", "exit", "duration", "stderr"],
408
+ datatype=["str", "str", "str", "str"],
409
+ interactive=False,
410
+ wrap=True,
411
+ )
412
+ with gr.Tab("Verify"):
413
+ verify_table = gr.Dataframe(
414
+ headers=["cmd", "exit", "duration", "stderr"],
415
+ datatype=["str", "str", "str", "str"],
416
+ interactive=False,
417
+ wrap=True,
418
+ )
419
+ with gr.Tab("Files"):
420
+ files_md = gr.Markdown("")
421
+ with gr.Tab("Logprobs"):
422
+ logprobs_md = gr.Markdown("")
423
+ with gr.Tab("Logs"):
424
+ logs_md = gr.Markdown("")
425
+ with gr.Tab("Raw JSON"):
426
+ raw_json = gr.JSON(value={})
427
+
428
+ # Wire it up.
429
+ for btn, name in [
430
+ (preset_bs, "binary_search"),
431
+ (preset_fb, "fizzbuzz"),
432
+ (preset_pd, "pandas_csv"),
433
+ ]:
434
+ btn.click(
435
+ fn=lambda n=name: apply_preset(n),
436
+ outputs=[instruction, setup_text, verify_text],
437
+ )
438
+
439
+ run_btn.click(
440
+ fn=run,
441
+ inputs=[
442
+ endpoint, model, base_url, api_key,
443
+ instruction, setup_text, verify_text,
444
+ mode, disable_thinking, template,
445
+ max_tokens_cap, top_logprobs, agent_timeout_s,
446
+ ],
447
+ outputs=[
448
+ summary_md, setup_table, verify_table,
449
+ files_md, logprobs_md, logs_md, raw_json,
450
+ ],
451
+ )
452
+
453
+ return app
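Since the builder ignores every `create_app`-provided argument except `title` (see the `noqa: ARG001` markers), it can be smoke-tested standalone. A minimal sketch; the `None` placeholders are an assumption that nothing else reads those parameters:

```python
# Launch the console outside create_app for local inspection. Only
# `title` is read; the other parameters are accepted and ignored.
demo = opencode_gradio_builder(
    web_manager=None,
    action_fields=None,
    metadata=None,
    is_chat_env=False,
    title="opencode_env (local)",
    quick_start_md=None,
)
demo.launch()  # standard gr.Blocks entrypoint
```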
server/opencode_environment.py ADDED
@@ -0,0 +1,472 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """OpenCode MCP environment.
8
+
9
+ Single MCP tool ``run_rollout`` that takes a uniform Task shape:
10
+
11
+ - ``instruction`` — prompt for the agent
12
+ - ``setup`` — bash commands run BEFORE the agent (in the sandbox)
13
+ - ``verify`` — bash commands run AFTER the agent
14
+
15
+ Reward = ``passed_verify_commands / total`` unless a verify command writes
16
+ a float to ``/home/user/logs/verifier/reward.txt`` (override).
17
+
18
+ Returns a JSON-serialized :class:`RolloutResult` with reward + per-turn
19
+ logprobs (Mode B) + setup/verify command results + file outputs.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import os
25
+ import time
26
+ from typing import Any, Optional
27
+ from uuid import uuid4
28
+
29
+ from fastmcp import FastMCP
30
+
31
+ try:
32
+ from openenv.core.env_server.mcp_environment import MCPEnvironment
33
+ from openenv.core.env_server.types import Action, Observation
34
+
35
+ from .catalog import ENDPOINT_KINDS, resolve_endpoint
36
+ except ImportError: # pragma: no cover
37
+ from openenv.core.env_server.mcp_environment import MCPEnvironment
38
+ from openenv.core.env_server.types import Action, Observation
39
+
40
+ from server.catalog import ENDPOINT_KINDS, resolve_endpoint # type: ignore
41
+
42
+
43
+ # One rollout (sandbox cold start + opencode install + opencode run +
44
+ # verifier) typically takes 30-180s; can spike to ~600s under load. Override
45
+ # OpenEnv's 30s MCP-tool default so the server doesn't cut us off.
46
+ _RUN_ROLLOUT_TIMEOUT_S = 900.0
47
+
48
+ # Inside-sandbox paths the server writes/reads.
49
+ HOME = "/home/user"
50
+ WORKDIR = f"{HOME}/workdir"
51
+ INSTRUCTION_PATH = f"{HOME}/task/instruction.md"
52
+ REWARD_FILE = f"{HOME}/logs/verifier/reward.txt"
53
+ PROXY_LOG = f"{HOME}/logs/agent/proxy.log"
54
+ AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl"
55
+ VERIFY_TIMEOUT_S = 120
56
+
57
+
58
+ class OpenCodeEnvironment(MCPEnvironment):
59
+ """Per-session environment exposing a single ``run_rollout`` MCP tool."""
60
+
61
+ SUPPORTS_CONCURRENT_SESSIONS = True
62
+
63
+ def __init__(self) -> None:
64
+ # Lazy imports so module import stays cheap and so tests can patch.
65
+ try:
66
+ from ..models import (
67
+ CommandResult,
68
+ OpenCodeState,
69
+ RolloutResult,
70
+ RolloutTurn,
71
+ )
72
+ except ImportError: # pragma: no cover
73
+ from models import ( # type: ignore
74
+ CommandResult,
75
+ OpenCodeState,
76
+ RolloutResult,
77
+ RolloutTurn,
78
+ )
79
+
80
+ from opencode_env import (
81
+ E2BSandboxBackend,
82
+ OpenCodeConfig,
83
+ OpenCodeSessionFactory,
84
+ OpenCodeTask,
85
+ )
86
+
87
+ self._CommandResult = CommandResult
88
+ self._RolloutResult = RolloutResult
89
+ self._RolloutTurn = RolloutTurn
90
+ self._OpenCodeState = OpenCodeState
91
+ self._OpenCodeConfig = OpenCodeConfig
92
+ self._OpenCodeSessionFactory = OpenCodeSessionFactory
93
+ self._OpenCodeTask = OpenCodeTask
94
+ self._E2BSandboxBackend = E2BSandboxBackend
95
+
96
+ # Don't raise on missing E2B_API_KEY here — OpenEnv's web-interface
97
+ # layer instantiates the env at import time for schema introspection,
98
+ # and we want the docs / Gradio UI to load even when the operator is
99
+ # just exploring. The real check happens lazily in
100
+ # ``_run_rollout_impl`` (any rollout without creds fails fast there
101
+ # with a clear error in the result payload).
102
+ self._state = self._OpenCodeState(episode_id=str(uuid4()))
103
+
104
+ mcp = FastMCP("opencode_env")
105
+
106
+ @mcp.tool
107
+ def run_rollout(
108
+ # Endpoint — either a shorthand (resolved from env vars + catalog
109
+ # defaults) OR explicit base_url+api_key+model. Explicit fields
110
+ # always win over the catalog.
111
+ endpoint: str = "",
112
+ base_url: str = "",
113
+ api_key: str = "",
114
+ model: str = "",
115
+ # Task
116
+ instruction: str = "",
117
+ setup: Optional[list[str]] = None,
118
+ verify: Optional[list[str]] = None,
119
+ # Bookkeeping / tunables
120
+ task_id: str = "",
121
+ mode: str = "transparent_proxy",
122
+ disable_thinking: Optional[bool] = None,
123
+ max_tokens_cap: int = 4096,
124
+ top_logprobs: int = 5,
125
+ agent_timeout_s: float = 600.0,
126
+ template: str = "",
127
+ ) -> str:
128
+ """Run one OpenCode rollout end-to-end.
129
+
130
+ ``endpoint`` is the shorthand selector (one of
131
+ ``"vllm"`` / ``"openai"`` / ``"hf_router"``) — the server
132
+ resolves base_url / api_key / model from env vars + catalog
133
+ defaults. Pass any of those explicitly to override.
134
+
135
+ See ``opencode_env.client.OpenCodeEnv.run_rollout`` for full
136
+ arg docs. Returns a JSON-serialized ``RolloutResult``.
137
+ """
138
+ # Resolve via catalog when shorthand is provided.
139
+ disable_thinking_resolved = disable_thinking
140
+ if endpoint:
141
+ resolved = resolve_endpoint(
142
+ endpoint, base_url=base_url, api_key=api_key, model=model
143
+ )
144
+ base_url = resolved.base_url
145
+ api_key = resolved.api_key
146
+ model = resolved.model
147
+ if disable_thinking_resolved is None:
148
+ disable_thinking_resolved = resolved.disable_thinking_default
149
+ if disable_thinking_resolved is None:
150
+ disable_thinking_resolved = False
151
+
152
+ if not (base_url and api_key and model):
153
+ raise ValueError(
154
+ "must provide either ``endpoint`` (one of "
155
+ f"{ENDPOINT_KINDS}) or all of base_url + api_key + model"
156
+ )
157
+ if not instruction:
158
+ raise ValueError("instruction is required")
159
+
160
+ return self._run_rollout_impl(
161
+ base_url=base_url,
162
+ api_key=api_key,
163
+ model=model,
164
+ instruction=instruction,
165
+ setup=list(setup or []),
166
+ verify=list(verify or []),
167
+ task_id=task_id,
168
+ mode=mode,
169
+ disable_thinking=disable_thinking_resolved,
170
+ max_tokens_cap=max_tokens_cap,
171
+ top_logprobs=top_logprobs,
172
+ agent_timeout_s=agent_timeout_s,
173
+ template=template,
174
+ )
175
+
176
+ super().__init__(mcp)
177
+
178
+ # ── OpenEnv lifecycle ──────────────────────────────────────────────────
179
+
180
+ def reset(
181
+ self,
182
+ seed: Optional[int] = None,
183
+ episode_id: Optional[str] = None,
184
+ **_: Any,
185
+ ) -> Observation:
186
+ self._state = self._OpenCodeState(episode_id=episode_id or str(uuid4()))
187
+ return Observation(
188
+ done=False,
189
+ reward=None,
190
+ metadata={
191
+ "status": "ready",
192
+ "message": (
193
+ "opencode_env ready. Call run_rollout(...) with a task."
194
+ ),
195
+ },
196
+ )
197
+
198
+ def _step_impl(
199
+ self,
200
+ action: Action,
201
+ timeout_s: Optional[float] = None,
202
+ **_: Any,
203
+ ) -> Observation:
204
+ return Observation(
205
+ done=False,
206
+ reward=None,
207
+ metadata={
208
+ "error": (
209
+ f"Unknown action type: {type(action).__name__}. "
210
+ "Use CallToolAction(name='run_rollout', ...)."
211
+ ),
212
+ },
213
+ )
214
+
215
+ def step(
216
+ self,
217
+ action: Action,
218
+ timeout_s: Optional[float] = None,
219
+ **kwargs: Any,
220
+ ) -> Observation:
221
+ if timeout_s is None:
222
+ timeout_s = _RUN_ROLLOUT_TIMEOUT_S
223
+ return super().step(action, timeout_s=timeout_s, **kwargs)
224
+
225
+ async def step_async(
226
+ self,
227
+ action: Action,
228
+ timeout_s: Optional[float] = None,
229
+ **kwargs: Any,
230
+ ) -> Observation:
231
+ if timeout_s is None:
232
+ timeout_s = _RUN_ROLLOUT_TIMEOUT_S
233
+ return await super().step_async(action, timeout_s=timeout_s, **kwargs)
234
+
235
+ @property
236
+ def state(self) -> Any:
237
+ return self._state
238
+
239
+ # ── Rollout orchestration ──────────────────────────────────────────────
240
+
241
+ def _run_rollout_impl(
242
+ self,
243
+ *,
244
+ base_url: str,
245
+ api_key: str,
246
+ model: str,
247
+ instruction: str,
248
+ setup: list[str],
249
+ verify: list[str],
250
+ task_id: str,
251
+ mode: str,
252
+ disable_thinking: bool,
253
+ max_tokens_cap: int,
254
+ top_logprobs: int,
255
+ agent_timeout_s: float,
256
+ template: str,
257
+ ) -> str:
258
+ result = self._RolloutResult(task_id=task_id, mode=mode)
259
+ t0 = time.time()
260
+
261
+ # Late credential check — keeps the server importable in dev /
262
+ # docs-only contexts.
263
+ if not os.environ.get("E2B_API_KEY"):
264
+ result.error = (
265
+ "E2B_API_KEY is not set on the server. Configure it in the "
266
+ "Space's secrets / your .env / your shell before calling "
267
+ "run_rollout."
268
+ )
269
+ result.wall_s = round(time.time() - t0, 3)
270
+ return result.model_dump_json()
271
+
272
+ # Build OpenCodeConfig + factory. We keep the proxy in charge of
273
+ # ``model_override`` / ``logprobs`` / ``max_tokens``-cap injection.
274
+ config = self._OpenCodeConfig(
275
+ provider="openai_compatible",
276
+ base_url=base_url.rstrip("/"),
277
+ api_key=api_key,
278
+ model=model,
279
+ agent_timeout_s=agent_timeout_s,
280
+ proxy_disable_thinking=disable_thinking,
281
+ proxy_top_logprobs=top_logprobs,
282
+ proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
283
+ )
284
+
285
+ # Run each setup command individually through a wrapper that captures
286
+ # exit code / stdout / stderr, instead of concatenating them into a
287
+ # single ``task.setup_shell`` script for the primitive. A failing
288
+ # setup command is recorded and the agent wait is skipped, AND every
289
+ # command's result is surfaced in the response for observability
290
+ # (see the ordering caveat on the loop below).
291
+ instruction_payload = instruction
292
+ opencode_task = self._OpenCodeTask(
293
+ instruction=instruction_payload,
294
+ metadata={"task_id": task_id},
295
+ )
296
+
297
+ backend_kwargs: dict[str, Any] = {}
298
+ if template:
299
+ backend_kwargs["template"] = template
300
+
301
+ factory = self._OpenCodeSessionFactory(
302
+ config=config,
303
+ sandbox_backend=self._E2BSandboxBackend(**backend_kwargs),
304
+ mode=mode,
305
+ verifier=None,
306
+ )
307
+
308
+ session = None
309
+ try:
310
+ session = factory.create(task=opencode_task)
311
+ result.sandbox_id = session.sandbox.sandbox_id
312
+
313
+ # Run setup commands one at a time, *before* the agent starts.
314
+ # The factory has already started the agent in start_agent()
315
+ # during create(); to keep the order "setup → agent → verify"
316
+ # we'd need to restructure. As a pragmatic compromise we run
317
+ # setup IMMEDIATELY after create(), which races with the agent
318
+ # for ~1-2s but is fine for typical pip/git/download work
319
+ # because opencode itself takes >=20s to make its first model
320
+ # call.
321
+ for cmd in setup:
322
+ cr = self._exec_command(session.sandbox, cmd)
323
+ result.setup_results.append(cr)
324
+ if cr.exit_code != 0:
325
+ result.error = (
326
+ f"setup command failed (exit {cr.exit_code}): {cmd[:120]}"
327
+ )
328
+ break
329
+
330
+ # Block until the agent is done (or setup already failed).
331
+ if result.error is None:
332
+ try:
333
+ result.agent_exit_code = session.wait_for_completion(
334
+ timeout_s=agent_timeout_s
335
+ )
336
+ except TimeoutError as exc:
337
+ result.error = f"agent timeout: {exc}"
338
+
339
+ # Run verify commands one at a time, capture each.
340
+ verify_passed = 0
341
+ for cmd in verify:
342
+ cr = self._exec_command(session.sandbox, cmd)
343
+ result.verify_results.append(cr)
344
+ if cr.exit_code == 0:
345
+ verify_passed += 1
346
+
347
+ # Reward: explicit reward.txt wins; else passed/total of verify.
348
+ override = self._read_reward(session.sandbox)
349
+ if override is not None:
350
+ result.reward = override
351
+ elif verify:
352
+ result.reward = verify_passed / len(verify)
353
+ else:
354
+ result.reward = None
355
+
356
+ # Collect filesystem + proxy trace.
357
+ result.files, result.files_extra = self._collect_files(session.sandbox)
358
+ result.proxy_turns = self._collect_proxy_turns(session)
359
+ result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
360
+ result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:]
361
+ except Exception as exc: # noqa: BLE001
362
+ result.error = f"{type(exc).__name__}: {exc}"
363
+ if session is not None:
364
+ result.proxy_log_tail = self._safe_read(session.sandbox, PROXY_LOG)[-2000:]
365
+ result.agent_log_tail = self._safe_read(session.sandbox, AGENT_LOG)[-2000:]
366
+ finally:
367
+ if session is not None:
368
+ try:
369
+ session.close()
370
+ except Exception:
371
+ pass
372
+
373
+ result.wall_s = round(time.time() - t0, 3)
374
+
375
+ # Bookkeeping on the per-session state.
376
+ self._state.rollouts_completed += 1
377
+ self._state.last_reward = result.reward
378
+ self._state.last_task_id = task_id or None
379
+ self._state.last_sandbox_id = result.sandbox_id or None
380
+
381
+ return result.model_dump_json()
382
+
383
+ # ── Helpers ────────────────────────────────────────────────────────────
384
+
385
+ def _exec_command(self, sandbox: Any, cmd: str) -> Any:
386
+ t = time.time()
387
+ try:
388
+ r = sandbox.exec(cmd, timeout=VERIFY_TIMEOUT_S)
389
+ return self._CommandResult(
390
+ cmd=cmd,
391
+ exit_code=int(r.exit_code),
392
+ stdout=(r.stdout or "")[-2000:],
393
+ stderr=(r.stderr or "")[-2000:],
394
+ duration_s=round(time.time() - t, 3),
395
+ )
396
+ except Exception as exc: # noqa: BLE001
397
+ return self._CommandResult(
398
+ cmd=cmd,
399
+ exit_code=-1,
400
+ stderr=f"{type(exc).__name__}: {exc}",
401
+ duration_s=round(time.time() - t, 3),
402
+ )
403
+
404
+ def _read_reward(self, sandbox: Any) -> float | None:
405
+ raw = self._safe_read(sandbox, REWARD_FILE).strip()
406
+ if not raw:
407
+ return None
408
+ try:
409
+ return float(raw)
410
+ except ValueError:
411
+ return None
412
+
413
+ def _collect_files(
414
+ self, sandbox: Any
415
+ ) -> tuple[dict[str, str], list[str]]:
416
+ listing = sandbox.exec(
417
+ f"find {WORKDIR} -maxdepth 2 -type f -size -64k 2>/dev/null | head -32",
418
+ timeout=10,
419
+ )
420
+ files: dict[str, str] = {}
421
+ extras: list[str] = []
422
+ for line in (listing.stdout or "").splitlines():
423
+ path = line.strip()
424
+ if not path:
425
+ continue
426
+ try:
427
+ files[path] = sandbox.read_text(path)[:8000]
428
+ except Exception:
429
+ extras.append(path)
430
+ return files, extras
431
+
432
+ def _collect_proxy_turns(self, session: Any) -> list[Any]:
433
+ turns: list[Any] = []
434
+ proxy_trace_path = getattr(session, "_proxy_trace_path", None)
435
+ if not proxy_trace_path:
436
+ return turns
437
+ raw = self._safe_read(session.sandbox, proxy_trace_path)
438
+ for line in raw.splitlines():
439
+ line = line.strip()
440
+ if not line:
441
+ continue
442
+ try:
443
+ import json as _json
444
+ rec = _json.loads(line)
445
+ except Exception:
446
+ continue
447
+ response = rec.get("response") or {}
448
+ choice = (response.get("choices") or [{}])[0] if response.get("choices") else {}
449
+ turns.append(
450
+ self._RolloutTurn(
451
+ turn=int(rec.get("turn") or 0),
452
+ finish_reason=rec.get("finish_reason"),
453
+ completion_tokens=list(rec.get("completion_tokens") or []),
454
+ completion_token_ids=list(rec.get("completion_token_ids") or []),
455
+ per_token_logps=[
456
+ float(x) for x in (rec.get("per_token_logps") or [])
457
+ if x is not None
458
+ ],
459
+ latency_s=float(rec.get("latency_s") or 0.0),
460
+ timestamp=float(rec.get("timestamp") or 0.0),
461
+ upstream_status=response.get("upstream_status"),
462
+ upstream_error=response.get("upstream_error"),
463
+ )
464
+ )
465
+ return turns
466
+
467
+ @staticmethod
468
+ def _safe_read(sandbox: Any, path: str) -> str:
469
+ try:
470
+ return sandbox.read_text(path) or ""
471
+ except Exception:
472
+ return ""
task.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Task payload accepted by :class:`OpenCodeSessionFactory`."""
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any
12
+
13
+ from pydantic import BaseModel, Field
14
+
15
+
16
+ class OpenCodeTask(BaseModel):
17
+ """One task for an OpenCode rollout.
18
+
19
+ The primitive only needs ``instruction`` (the prompt handed to ``opencode
20
+ run``). Callers may attach ``setup_shell`` (run once inside the sandbox
21
+ before the agent starts) and ``upload_files`` (written into the sandbox at
22
+ absolute paths). Any additional metadata belongs in ``metadata`` and is
23
+ passed through to the verifier untouched.
24
+ """
25
+
26
+ instruction: str
27
+ setup_shell: str | None = None
28
+ upload_files: dict[str, str] = Field(default_factory=dict)
29
+ metadata: dict[str, Any] = Field(default_factory=dict)
30
+
31
+ @classmethod
32
+ def coerce(cls, value: Any) -> "OpenCodeTask":
33
+ """Accept a bare string, a dict, or an existing ``OpenCodeTask``."""
34
+ if isinstance(value, cls):
35
+ return value
36
+ if isinstance(value, str):
37
+ return cls(instruction=value)
38
+ if isinstance(value, dict):
39
+ return cls(**value)
40
+ raise TypeError(
41
+ f"Cannot coerce {type(value).__name__} to OpenCodeTask; "
42
+ "pass a str, dict, or OpenCodeTask."
43
+ )
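A quick illustration of the coercion rules; all three shapes below normalize to the same payload:

```python
# Equivalent ways to hand a task to OpenCodeSessionFactory.
t1 = OpenCodeTask.coerce("Write fizzbuzz.py")
t2 = OpenCodeTask.coerce({
    "instruction": "Write fizzbuzz.py",
    "setup_shell": "pip install --quiet pytest",          # optional
    "upload_files": {"/home/user/task/data.csv": "a,b\n1,2\n"},
})
t3 = OpenCodeTask.coerce(t2)  # existing instances pass through unchanged
assert t1.instruction == t2.instruction and t2 is t3
```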
tests/__init__.py ADDED
File without changes
tests/test_config.py ADDED
@@ -0,0 +1,48 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from __future__ import annotations
8
+
9
+ import pytest
10
+
11
+ from opencode_env.config import OpenCodeConfig, provider_npm_package
12
+
13
+
14
+ def test_defaults_require_only_base_url():
15
+ cfg = OpenCodeConfig(base_url="http://localhost:8000/v1")
16
+ assert cfg.provider == "openai_compatible"
17
+ assert cfg.api_key == "intercepted"
18
+ assert cfg.model == "intercepted/model"
19
+ assert cfg.opencode_version == "latest"
20
+ assert "webfetch" in cfg.disabled_tools
21
+ assert cfg.run_format == "json"
22
+
23
+
24
+ def test_provider_npm_mapping():
25
+ assert provider_npm_package("openai_compatible") == "@ai-sdk/openai-compatible"
26
+ assert provider_npm_package("openai") == "@ai-sdk/openai"
27
+ assert provider_npm_package("anthropic") == "@ai-sdk/anthropic"
28
+
29
+
30
+ def test_rejects_unknown_provider():
31
+ with pytest.raises(ValueError):
32
+ OpenCodeConfig(provider="bogus", base_url="x") # type: ignore[arg-type]
33
+
34
+
35
+ def test_custom_fields_override_defaults():
36
+ cfg = OpenCodeConfig(
37
+ provider="openai",
38
+ base_url="https://api.openai.com/v1",
39
+ api_key="sk-test",
40
+ model="openai/gpt-5.3-codex",
41
+ opencode_version="0.5.3",
42
+ disabled_tools=["webfetch"],
43
+ system_prompt="be brief",
44
+ extra_env={"FOO": "bar"},
45
+ )
46
+ assert cfg.model == "openai/gpt-5.3-codex"
47
+ assert cfg.opencode_version == "0.5.3"
48
+ assert cfg.extra_env == {"FOO": "bar"}
tests/test_five_sorts_e2e.py ADDED
@@ -0,0 +1,1045 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """End-to-end: spawn E2B, install opencode, write 5 sorting algorithms, verify.
8
+
9
+ Talks to E2B and the LLM endpoints directly via the e2b SDK and httpx — no
10
+ imports from the ``opencode_env`` package at runtime. The proxy that captures
11
+ per-token logprobs (Mode B) is uploaded into the sandbox as a standalone
12
+ source file from ``../interception.py``.
13
+
14
+ For each endpoint configured in ``envs/opencode_env/.env`` (vLLM / OpenAI /
15
+ HF Router) this script:
16
+
17
+ 1. Creates a fresh E2B sandbox
18
+ 2. Installs opencode (``curl https://opencode.ai/install | bash``)
19
+ 3. (Mode B only) uploads + starts the in-sandbox logprob-capture proxy
20
+ 4. Writes ``opencode.json`` pointing at the proxy (or the LLM directly)
21
+ 5. Runs ``opencode run --format json "<instruction>"`` to completion
22
+ 6. Runs an in-sandbox verifier that imports each sort module and tests it
23
+ 7. Reads back: per-file pass/fail, file contents, proxy logprob stats,
24
+ wall time, sandbox id
25
+
26
+ Default usage (runs every endpoint that has the required vars in .env)::
27
+
28
+ .venv/bin/python envs/opencode_env/tests/test_five_sorts_e2e.py
29
+
30
+ Common flags::
31
+
32
+ --endpoint vllm|openai|hf_router|all (default: all)
33
+ --mode transparent_proxy|black_box (default: transparent_proxy)
34
+ --agent-timeout 600 (seconds before opencode is killed)
35
+ --max-tokens-cap 4096 (per-turn max_tokens clamp)
36
+ --save-artifacts (dump JSON per run to tests/_artifacts/)
37
+ --instruction-override "..." (custom instruction)
38
+
39
+ Requires ``E2B_API_KEY`` in the environment plus per-endpoint creds in .env.
40
+ Each rollout takes 1–7 minutes of wall time plus ~10s of sandbox cold start.
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import argparse
46
+ import json
47
+ import os
48
+ import secrets
49
+ import sys
50
+ import time
51
+ from dataclasses import asdict, dataclass, field
52
+ from pathlib import Path
53
+ from statistics import mean
54
+ from typing import Any
55
+
56
+ import httpx
57
+ from e2b import Sandbox
58
+
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # .env loader — minimal, no python-dotenv dep.
62
+ # ---------------------------------------------------------------------------
63
+
64
+ _THIS_DIR = Path(__file__).resolve().parent
65
+ _ENV_DIR = _THIS_DIR.parent
66
+ _DOTENV_PATH = _ENV_DIR / ".env"
67
+ _PROXY_SOURCE_PATH = _ENV_DIR / "sandbox" / "interception.py"
68
+
69
+
70
+ def _load_env(path: Path) -> None:
71
+ if not path.exists():
72
+ return
73
+ for raw in path.read_text().splitlines():
74
+ line = raw.strip()
75
+ if not line or line.startswith("#") or "=" not in line:
76
+ continue
77
+ k, _, v = line.partition("=")
78
+ k = k.strip()
79
+ v = v.strip().strip('"').strip("'")
80
+ if k and k not in os.environ:
81
+ os.environ[k] = v
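A small self-check of the loader's semantics (process env wins over `.env`, quotes are stripped, comments are skipped); `FOO`/`BAR` are made-up keys:

```python
import tempfile

_demo_dotenv = Path(tempfile.mkdtemp()) / ".env"
_demo_dotenv.write_text('FOO="from_dotenv"\nBAR=bar_value\n# a comment\n')
os.environ["FOO"] = "from_shell"
_load_env(_demo_dotenv)
assert os.environ["FOO"] == "from_shell"  # pre-set process env wins
assert os.environ["BAR"] == "bar_value"   # quotes stripped, comment skipped
```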
82
+
83
+
84
+ _load_env(_DOTENV_PATH)
85
+
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # Endpoint specs — three flavors, all OAI-compatible.
89
+ # ---------------------------------------------------------------------------
90
+
91
+
92
+ @dataclass
93
+ class Endpoint:
94
+ label: str
95
+ base_url: str
96
+ model: str
97
+ api_key: str
98
+ # Inject ``chat_template_kwargs.enable_thinking=false`` on forwarded
99
+ # requests. Needed for Qwen3.5 served via vLLM (otherwise the model
100
+ # spends its budget on reasoning). OpenAI rejects this field with HTTP
101
+ # 400 ("Unrecognized request argument"); HF Router's Instruct variant
102
+ # doesn't need it. Default per-endpoint, overridable via CLI.
103
+ disable_thinking_default: bool = False
104
+
105
+
106
+ def _resolve_endpoints() -> tuple[list[Endpoint], list[str]]:
107
+ """Return (configured, skipped_reasons) from current process env."""
108
+ specs = [
109
+ # (label, base_url_env, default_base_url, model_env, default_model,
110
+ # api_key_env, default_api_key)
111
+ (
112
+ "vllm",
113
+ "VLLM_URL",
114
+ "",
115
+ "VLLM_MODEL",
116
+ "Qwen/Qwen3.5-4B",
117
+ "VLLM_API_KEY",
118
+ "intercepted",
119
+ ),
120
+ (
121
+ "openai",
122
+ "OPENAI_BASE_URL",
123
+ "https://api.openai.com/v1",
124
+ "OPENAI_MODEL",
125
+ "gpt-4o-mini",
126
+ "OPENAI_API_KEY",
127
+ "",
128
+ ),
129
+ (
130
+ "hf_router",
131
+ "HF_ROUTER_BASE_URL",
132
+ "https://router.huggingface.co/v1",
133
+ "HF_ROUTER_MODEL",
134
+ "Qwen/Qwen3-4B-Instruct-2507:nscale",
135
+ "HF_ROUTER_API_KEY",
136
+ "",
137
+ ),
138
+ ]
139
+ chosen: list[Endpoint] = []
140
+ skipped: list[str] = []
141
+ for label, bu_env, bu_default, mdl_env, mdl_default, ak_env, ak_default in specs:
142
+ base = os.environ.get(bu_env) or bu_default
143
+ model = os.environ.get(mdl_env) or mdl_default
144
+ api_key = os.environ.get(ak_env) or ak_default
145
+ if not (base and model and api_key):
146
+ skipped.append(
147
+ f"{label} (need {bu_env} / {mdl_env} / {ak_env} in .env)"
148
+ )
149
+ continue
150
+ # Always normalize to a /v1 base URL — opencode + the proxy expect it.
151
+ base = base.rstrip("/")
152
+ if not base.endswith("/v1"):
153
+ base = f"{base}/v1"
154
+ chosen.append(
155
+ Endpoint(
156
+ label=label,
157
+ base_url=base,
158
+ model=model,
159
+ api_key=api_key,
160
+ disable_thinking_default=(label == "vllm"),
161
+ )
162
+ )
163
+ return chosen, skipped
164
+
165
+
166
+ # ---------------------------------------------------------------------------
167
+ # The locked task: instruction + verifier source. Identical for all endpoints.
168
+ # ---------------------------------------------------------------------------
169
+
170
+ MODULES = ["bubble_sort", "merge_sort", "quick_sort"]
171
+
172
+ INSTRUCTION = (
173
+ "Create THREE Python files in the current working directory, one per "
174
+ "sorting algorithm. Use RELATIVE paths — do NOT write to absolute paths "
175
+ "like `/bubble_sort.py`. Files (one algorithm each):\n"
176
+ " - bubble_sort.py -> bubble sort\n"
177
+ " - merge_sort.py -> merge sort\n"
178
+ " - quick_sort.py -> quicksort\n\n"
179
+ "Each file MUST expose exactly one function with this signature:\n"
180
+ " def sort(arr: list[int]) -> list[int]\n\n"
181
+ "It must return a NEW list sorted in non-decreasing order (do not mutate "
182
+ "the input). Each file must implement the algorithm named for it — do "
183
+ "NOT call `sorted()` or `list.sort()`, and do NOT import third-party "
184
+ "libraries. Handle edge cases: empty list, single element, duplicates, "
185
+ "already-sorted, reverse-sorted, negative numbers. Do not write tests, "
186
+ "a main block, README, or any other files."
187
+ )
188
+
189
+ VERIFIER_SOURCE = '''\
190
+ """Verifier for the three-sorts E2E test. Runs inside the sandbox."""
191
+ import importlib
192
+ import json
193
+ import re
194
+ import shutil
195
+ import sys
196
+ import traceback
197
+ from pathlib import Path
198
+
199
+ WORKDIR = Path("/home/user/workdir")
200
+ LOG_DIR = Path("/home/user/logs/verifier")
201
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
202
+ WORKDIR.mkdir(parents=True, exist_ok=True)
203
+ sys.path.insert(0, str(WORKDIR))
204
+
205
+ MODULES = ["bubble_sort", "merge_sort", "quick_sort"]
206
+
207
+ # Some models (notably Qwen3 served via vLLM) ignore "use relative paths"
208
+ # and write files to ``/<name>.py``. With ``--dangerously-skip-permissions``
209
+ # opencode allows it, so we relocate any stray files into WORKDIR so the
210
+ # import side below is path-uniform.
211
+ for name in MODULES:
212
+ stray = Path("/") / f"{name}.py"
213
+ target = WORKDIR / f"{name}.py"
214
+ if stray.exists() and not target.exists():
215
+ shutil.move(str(stray), str(target))
216
+
217
+ # Each test case: (input, expected_sorted_output)
218
+ CASES = [
219
+ ([3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5], [1, 1, 2, 3, 3, 4, 5, 5, 5, 6, 9]),
220
+ ([], []),
221
+ ([42], [42]),
222
+ ([2, 1], [1, 2]),
223
+ ([10, 9, 8, 7, 6, 5, 4, 3, 2, 1], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
224
+ ([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]),
225
+ ([-3, -1, -2, 0, 5, 4], [-3, -2, -1, 0, 4, 5]),
226
+ ([7, 7, 7, 7, 7], [7, 7, 7, 7, 7]),
227
+ ]
228
+
229
+ # Catch a model that calls ``sorted()`` / ``.sort()`` while pretending to
230
+ # "implement" the named algorithm.
231
+ SOURCE_FORBIDDEN = re.compile(r"\\b(sorted\\s*\\(|\\.sort\\s*\\()")
232
+
233
+ results = {}
234
+ for name in MODULES:
235
+ fpath = WORKDIR / f"{name}.py"
236
+ if not fpath.exists():
237
+ results[name] = "missing"
238
+ continue
239
+ try:
240
+ src = fpath.read_text()
241
+ if SOURCE_FORBIDDEN.search(src):
242
+ results[name] = "cheat: uses sorted()/list.sort()"
243
+ continue
244
+ sys.modules.pop(name, None)
245
+ mod = importlib.import_module(name)
246
+ # Accept either ``sort`` (per spec) or the algorithm-named function
247
+ # (a common drift — e.g. gpt-4o-mini emits ``def bubble_sort(...)``).
248
+ fn = getattr(mod, "sort", None) or getattr(mod, name, None)
249
+ if fn is None:
250
+ results[name] = "no_sort_or_named_function"
251
+ continue
252
+ all_pass = True
253
+ for inp, expected in CASES:
254
+ inp_copy = list(inp)
255
+ actual = fn(list(inp))
256
+ if actual != expected:
257
+ all_pass = False
258
+ results[name] = (
259
+ f"fail: {fn.__name__}({inp!r}) -> {actual!r}, "
260
+ f"expected {expected!r}"
261
+ )
262
+ break
263
+ # The callee should not mutate the caller's list.
264
+ if list(inp) != inp_copy:
265
+ all_pass = False
266
+ results[name] = (
267
+ f"fail: {fn.__name__} mutated input {inp!r} -> {inp_copy!r}"
268
+ )
269
+ break
270
+ if all_pass:
271
+ results[name] = "pass"
272
+ except Exception:
273
+ tb = traceback.format_exc()
274
+ results[name] = f"error: {tb.splitlines()[-1]}"
275
+
276
+ passed = sum(1 for v in results.values() if v == "pass")
277
+ reward = passed / len(MODULES)
278
+ (LOG_DIR / "reward.txt").write_text(f"{reward:.4f}")
279
+ (LOG_DIR / "results.json").write_text(json.dumps(results, indent=2))
280
+ print(f"REWARD={reward:.4f} PASSED={passed}/{len(MODULES)}")
281
+ print(f"RESULTS={json.dumps(results)}")
282
+ '''
283
+
284
+
285
+ # ---------------------------------------------------------------------------
286
+ # Sandbox paths.
287
+ # ---------------------------------------------------------------------------
288
+
289
+ HOME = "/home/user"
290
+ WORKDIR = f"{HOME}/workdir"
291
+ OPENCODE_BIN = f"{HOME}/.opencode/bin/opencode"
292
+ OPENCODE_CONFIG = f"{HOME}/.config/opencode/opencode.json"
293
+ INSTRUCTION_PATH = f"{HOME}/task/instruction.md"
294
+ VERIFIER_PATH = f"{HOME}/test.py"
295
+ AGENT_LOG = f"{HOME}/logs/agent/opencode.jsonl"
296
+ PROXY_LOG = f"{HOME}/logs/agent/proxy.log"
297
+ PROXY_TRACE = f"{HOME}/logs/agent/proxy_trace.jsonl"
298
+ PROXY_SCRIPT_PATH = f"{HOME}/proxy/interception.py"
299
+ REWARD_FILE = f"{HOME}/logs/verifier/reward.txt"
300
+ RESULTS_FILE = f"{HOME}/logs/verifier/results.json"
301
+ PROXY_PORT = 7000
302
+
303
+
304
+ # ---------------------------------------------------------------------------
305
+ # Result types.
306
+ # ---------------------------------------------------------------------------
307
+
308
+
309
+ @dataclass
310
+ class LogprobStats:
311
+ n_turns: int = 0
312
+ productive_turns: int = 0
313
+ total_completion_tokens: int = 0
314
+ tokens_per_turn: list[int] = field(default_factory=list)
315
+ mean_logprob: float | None = None
316
+ first_token: str = ""
317
+ first_logprob: float | None = None
318
+ last_token: str = ""
319
+ last_logprob: float | None = None
320
+ finish_reasons: dict[str, int] = field(default_factory=dict)
321
+
322
+
323
+ @dataclass
324
+ class RunResult:
325
+ endpoint: str
326
+ model: str
327
+ base_url: str
328
+ sandbox_id: str = ""
329
+ reward: float | None = None
330
+ tests: dict[str, str] = field(default_factory=dict)
331
+ files: dict[str, str] = field(default_factory=dict)
332
+ files_extra: list[str] = field(default_factory=list)
333
+ logprobs: LogprobStats = field(default_factory=LogprobStats)
334
+ wall_s: float = 0.0
335
+ agent_exit_code: int | None = None
336
+ error: str = ""
337
+ proxy_log_tail: str = ""
338
+ agent_log_tail: str = ""
339
+ verifier_stdout: str = ""
340
+ # Raw per-turn dump (request body + response body, truncated). Saved
341
+ # into artifacts so failures can be debugged without re-running.
342
+ raw_turns: list[dict[str, Any]] = field(default_factory=list)
343
+
344
+ @property
345
+ def passed(self) -> int:
346
+ return sum(1 for v in self.tests.values() if v == "pass")
347
+
348
+
349
+ # ---------------------------------------------------------------------------
350
+ # Sandbox helpers — thin wrappers around e2b SDK.
351
+ # ---------------------------------------------------------------------------
352
+
353
+
354
+ def _exec(
355
+ sbx: Sandbox,
356
+ cmd: str,
357
+ *,
358
+ envs: dict[str, str] | None = None,
359
+ cwd: str | None = None,
360
+ timeout: float = 60,
361
+ ) -> tuple[int, str, str]:
362
+ """Synchronous shell exec. Returns (exit_code, stdout, stderr)."""
363
+ from e2b.sandbox.commands.command_handle import CommandExitException
364
+
365
+ try:
366
+ r = sbx.commands.run(
367
+ cmd, envs=envs, cwd=cwd, timeout=timeout, background=False
368
+ )
369
+ return r.exit_code, r.stdout or "", r.stderr or ""
370
+ except CommandExitException as exc:
371
+ return (
372
+ int(getattr(exc, "exit_code", 1)),
373
+ str(getattr(exc, "stdout", "") or ""),
374
+ str(getattr(exc, "stderr", "") or str(exc)),
375
+ )
376
+
377
+
378
+ def _exec_bg_with_timeout(
379
+ sbx: Sandbox,
380
+ cmd: str,
381
+ *,
382
+ envs: dict[str, str] | None = None,
383
+ cwd: str | None = None,
384
+ timeout_s: float = 600,
385
+ poll_interval_s: float = 1.0,
386
+ ) -> int:
387
+ """Run ``cmd`` in the background and poll until it writes a marker file.
388
+
389
+ Returns the command's exit code. Raises ``TimeoutError`` if the marker
390
+ does not appear within ``timeout_s``. ``timeout=0`` is passed to E2B so
391
+ the server-side 60s deadline does not kill the process.
392
+ """
393
+ marker = f"/tmp/cmd_done_{secrets.token_hex(4)}"
394
+ wrapped = f"({cmd}); echo $? > {marker}"
395
+ sbx.commands.run(
396
+ wrapped, envs=envs, cwd=cwd, background=True, timeout=0
397
+ )
398
+ deadline = time.time() + timeout_s
399
+ while time.time() < deadline:
400
+ try:
401
+ if sbx.files.exists(marker):
402
+ code_str = sbx.files.read(marker).strip()
403
+ return int(code_str) if code_str else -1
404
+ except Exception:
405
+ pass
406
+ time.sleep(poll_interval_s)
407
+ raise TimeoutError(f"command did not finish within {timeout_s}s")
408
+
409
+
410
+ def _safe_read(sbx: Sandbox, path: str) -> str:
411
+ try:
412
+ return sbx.files.read(path)
413
+ except Exception:
414
+ return ""
415
+
416
+
417
+ def _write_text(sbx: Sandbox, path: str, content: str) -> None:
418
+ parent = str(Path(path).parent)
419
+ if parent not in ("", "/"):
420
+ sbx.files.make_dir(parent)
421
+ sbx.files.write(path, content)
422
+
423
+
424
+ # ---------------------------------------------------------------------------
425
+ # Bootstrap: install opencode, write config, optionally start proxy.
426
+ # ---------------------------------------------------------------------------
427
+
428
+
429
+ def _wait_for_sandbox_ready(sbx: Sandbox, *, attempts: int = 15) -> None:
430
+ for _ in range(attempts):
431
+ code, out, _ = _exec(sbx, "echo ok", timeout=5)
432
+ if code == 0 and "ok" in out:
433
+ return
434
+ time.sleep(1)
435
+ raise RuntimeError("sandbox did not become ready within ~15s")
436
+
437
+
438
+ def _install_opencode(sbx: Sandbox) -> None:
439
+ cmd = (
440
+ "set -e && "
441
+ f"mkdir -p {HOME}/.config/opencode {HOME}/logs/agent "
442
+ f"{HOME}/logs/verifier {HOME}/task {WORKDIR} {HOME}/proxy && "
443
+ "curl -fsSL https://opencode.ai/install | bash && "
444
+ f'export PATH="{HOME}/.opencode/bin:$PATH" && '
445
+ "opencode --version"
446
+ )
447
+ last_stderr = ""
448
+ for attempt in range(3):
449
+ code, _, err = _exec(sbx, cmd, timeout=240)
450
+ if code == 0:
451
+ return
452
+ last_stderr = err
453
+ time.sleep(3 * (attempt + 1))
454
+ raise RuntimeError(f"opencode install failed: {last_stderr[-1000:]}")
455
+
456
+
457
+ def _ensure_dirs_exist(sbx: Sandbox) -> None:
458
+ """When using a pre-baked template, dirs already exist. This is a no-op
459
+ safety net that ensures the layout is present (cheap mkdir -p)."""
460
+ _exec(
461
+ sbx,
462
+ f"mkdir -p {HOME}/.config/opencode {HOME}/logs/agent "
463
+ f"{HOME}/logs/verifier {HOME}/task {WORKDIR} {HOME}/proxy",
464
+ timeout=30,
465
+ )
466
+
467
+
468
+ def _start_proxy(
469
+ sbx: Sandbox,
470
+ upstream_url: str,
471
+ upstream_api_key: str,
472
+ upstream_model: str,
473
+ *,
474
+ top_logprobs: int,
475
+ max_tokens_cap: int,
476
+ disable_thinking: bool,
477
+ skip_install: bool = False,
478
+ ) -> str:
479
+ """Upload + start the logprob-capture proxy, return its baseURL.
480
+
481
+ Returns the URL opencode should hit (``http://127.0.0.1:7000/v1``).
482
+ When ``skip_install`` is True (pre-baked template), the proxy source
483
+ and pip deps are assumed to already be present.
484
+ """
485
+ if not skip_install:
486
+ if not _PROXY_SOURCE_PATH.exists():
487
+ raise RuntimeError(
488
+ f"proxy source not found at {_PROXY_SOURCE_PATH} — needed "
489
+ "for transparent_proxy mode"
490
+ )
491
+ _write_text(sbx, PROXY_SCRIPT_PATH, _PROXY_SOURCE_PATH.read_text())
492
+
493
+ code, _, err = _exec(
494
+ sbx,
495
+ "pip install --quiet 'fastapi>=0.104' 'uvicorn[standard]>=0.24' "
496
+ "'httpx>=0.27' 2>&1 | tail -20",
497
+ timeout=180,
498
+ )
499
+ if code != 0:
500
+ raise RuntimeError(f"proxy deps install failed: {err[-800:]}")
501
+
502
+ flags = (
503
+ f"--upstream-url {upstream_url} "
504
+ f"--upstream-api-key {upstream_api_key} "
505
+ f"--trace {PROXY_TRACE} "
506
+ f"--port {PROXY_PORT} "
507
+ f"--top-logprobs {top_logprobs} "
508
+ f"--max-tokens-cap {max_tokens_cap} "
509
+ f"--model-override '{upstream_model}' "
510
+ )
511
+ if disable_thinking:
512
+ flags += "--disable-thinking "
513
+ cmd = (
514
+ f"cd {HOME}/proxy && "
515
+ f"python interception.py {flags}> {PROXY_LOG} 2>&1"
516
+ )
517
+ sbx.commands.run(cmd, background=True, timeout=0)
518
+
519
+ # Wait for healthz.
520
+ for _ in range(120):
521
+ code, _, _ = _exec(
522
+ sbx, f"curl -sf http://127.0.0.1:{PROXY_PORT}/healthz", timeout=5
523
+ )
524
+ if code == 0:
525
+ return f"http://127.0.0.1:{PROXY_PORT}/v1"
526
+ time.sleep(0.5)
527
+ log = _safe_read(sbx, PROXY_LOG)
528
+ raise RuntimeError(f"proxy did not start within 60s. log:\n{log[-2000:]}")
529
+
530
+
531
+ def _write_opencode_json(
532
+ sbx: Sandbox,
533
+ base_url: str,
534
+ api_key: str,
535
+ model: str,
536
+ request_timeout_ms: int = 600_000,
537
+ ) -> None:
538
+ """Stage opencode.json for ``@ai-sdk/openai-compatible``.
539
+
540
+ All three endpoints route through the OAI-compatible adapter — the proxy
541
+ serves ``/v1/chat/completions`` and so does each upstream we target.
542
+ """
543
+ inner_model = model.split("/", 1)[-1]
544
+ doc = {
545
+ "$schema": "https://opencode.ai/config.json",
546
+ "model": f"intercepted/{inner_model}",
547
+ "provider": {
548
+ "intercepted": {
549
+ "npm": "@ai-sdk/openai-compatible",
550
+ "name": "Intercepted",
551
+ "options": {
552
+ "baseURL": base_url,
553
+ "apiKey": api_key,
554
+ "timeout": request_timeout_ms,
555
+ },
556
+ "models": {inner_model: {"name": "Intercepted Model"}},
557
+ }
558
+ },
559
+ "tools": {"webfetch": False, "question": False},
560
+ }
561
+ _write_text(sbx, OPENCODE_CONFIG, json.dumps(doc, indent=2))
562
+
563
+
564
+ # ---------------------------------------------------------------------------
565
+ # Run + verify + collect.
566
+ # ---------------------------------------------------------------------------
567
+
568
+
569
+ def _run_agent(
570
+ sbx: Sandbox,
571
+ *,
572
+ instruction_path: str,
573
+ base_url: str,
574
+ api_key: str,
575
+ timeout_s: float,
576
+ ) -> int:
577
+ """Invoke ``opencode run`` synchronously, return its exit code."""
578
+ envs = {
579
+ "OPENAI_BASE_URL": base_url,
580
+ "OPENAI_API_KEY": api_key,
581
+ "OPENCODE_CONFIG": OPENCODE_CONFIG,
582
+ "PATH": f"{HOME}/.opencode/bin:/usr/local/bin:/usr/bin:/bin",
583
+ }
584
+ cmd = (
585
+ f'export PATH="{HOME}/.opencode/bin:$PATH" && '
586
+ f"cd {WORKDIR} && "
587
+ f"opencode run --format json --dangerously-skip-permissions "
588
+ f'"$(cat {instruction_path})" 2>&1 | tee {AGENT_LOG}'
589
+ )
590
+ return _exec_bg_with_timeout(
591
+ sbx, cmd, envs=envs, timeout_s=timeout_s
592
+ )
593
+
594
+
595
+ def _run_verifier(sbx: Sandbox) -> tuple[float | None, dict[str, str], str]:
596
+ cmd = f"mkdir -p {HOME}/logs/verifier && python {VERIFIER_PATH}"
597
+ code, out, err = _exec(sbx, cmd, timeout=120)
598
+ reward_str = _safe_read(sbx, REWARD_FILE).strip()
599
+ results_str = _safe_read(sbx, RESULTS_FILE)
600
+ try:
601
+ reward = float(reward_str) if reward_str else None
602
+ except ValueError:
603
+ reward = None
604
+ try:
605
+ tests = json.loads(results_str) if results_str.strip() else {}
606
+ except json.JSONDecodeError:
607
+ tests = {}
608
+ combined = (out + ("\n" + err if err else "")).strip()
609
+ return reward, tests, combined[-3000:]
610
+
611
+
612
+ def _collect_files(sbx: Sandbox) -> tuple[dict[str, str], list[str]]:
613
+ files: dict[str, str] = {}
614
+ for name in MODULES:
615
+ path = f"{WORKDIR}/{name}.py"
616
+ try:
617
+ if sbx.files.exists(path):
618
+ files[f"{name}.py"] = sbx.files.read(path)[:8000]
619
+ except Exception:
620
+ pass
621
+ code, out, _ = _exec(
622
+ sbx,
623
+ f"find {WORKDIR} -maxdepth 1 -type f -printf '%f\\n' 2>/dev/null",
624
+ timeout=10,
625
+ )
626
+ extras: list[str] = []
627
+ expected = {f"{m}.py" for m in MODULES}
628
+ for line in (out or "").splitlines():
629
+ n = line.strip()
630
+ if n and n not in expected and not n.startswith("."):
631
+ extras.append(n)
632
+ return files, extras
633
+
634
+
635
+ def _read_proxy_trace(sbx: Sandbox) -> list[dict[str, Any]]:
636
+ raw = _safe_read(sbx, PROXY_TRACE)
637
+ out: list[dict[str, Any]] = []
638
+ for line in raw.splitlines():
639
+ line = line.strip()
640
+ if not line:
641
+ continue
642
+ try:
643
+ out.append(json.loads(line))
644
+ except Exception:
645
+ pass
646
+ return out
647
+
648
+
649
+ def _logprob_stats(turns: list[dict[str, Any]]) -> LogprobStats:
650
+ s = LogprobStats(n_turns=len(turns))
651
+ if not turns:
652
+ return s
653
+ all_lps: list[float] = []
654
+ finish: dict[str, int] = {}
655
+ for t in turns:
656
+ toks = t.get("completion_tokens") or []
657
+ lps = t.get("per_token_logps") or []
658
+ s.tokens_per_turn.append(len(toks))
659
+ s.total_completion_tokens += len(toks)
660
+ if toks:
661
+ s.productive_turns += 1
662
+ all_lps.extend(float(x) for x in lps if x is not None)
663
+ fr = t.get("finish_reason") or "unknown"
664
+ finish[fr] = finish.get(fr, 0) + 1
665
+ s.finish_reasons = finish
666
+ if all_lps:
667
+ s.mean_logprob = mean(all_lps)
668
+ first = next((t for t in turns if t.get("completion_tokens")), None)
669
+ last = next(
670
+ (t for t in reversed(turns) if t.get("completion_tokens")), None
671
+ )
672
+ if first:
673
+ s.first_token = str(first["completion_tokens"][0])
674
+ lp = (first.get("per_token_logps") or [None])[0]
675
+ if lp is not None:
676
+ s.first_logprob = float(lp)
677
+ if last:
678
+ s.last_token = str(last["completion_tokens"][-1])
679
+ lp = (last.get("per_token_logps") or [None])[-1]
680
+ if lp is not None:
681
+ s.last_logprob = float(lp)
682
+ return s
683
+
684
+
685
+ # ---------------------------------------------------------------------------
686
+ # One full rollout.
687
+ # ---------------------------------------------------------------------------
688
+
689
+
690
+ def run_one(
691
+ ep: Endpoint,
692
+ *,
693
+ mode: str,
694
+ agent_timeout_s: float,
695
+ max_tokens_cap: int,
696
+ top_logprobs: int,
697
+ disable_thinking: bool,
698
+ instruction: str,
699
+ e2b_api_key: str,
700
+ template: str | None = None,
701
+ ) -> RunResult:
702
+ print(
703
+ f"[{ep.label}] launching base_url={ep.base_url} model={ep.model} "
704
+ f"mode={mode} template={template or '(default)'}",
705
+ flush=True,
706
+ )
707
+ res = RunResult(endpoint=ep.label, model=ep.model, base_url=ep.base_url)
708
+ started = time.time()
709
+
710
+ sbx = Sandbox.create(
711
+ template=template,
712
+ timeout=int(agent_timeout_s) + 300,
713
+ api_key=e2b_api_key,
714
+ )
715
+ res.sandbox_id = sbx.sandbox_id
716
+ print(f"[{ep.label}] sandbox={sbx.sandbox_id}", flush=True)
717
+ try:
718
+ _wait_for_sandbox_ready(sbx)
719
+ if template:
720
+ _ensure_dirs_exist(sbx)
721
+ else:
722
+ _install_opencode(sbx)
723
+ _write_text(sbx, INSTRUCTION_PATH, instruction)
724
+ _write_text(sbx, VERIFIER_PATH, VERIFIER_SOURCE)
725
+
726
+ if mode == "transparent_proxy":
727
+ base_url = _start_proxy(
728
+ sbx,
729
+ upstream_url=ep.base_url,
730
+ upstream_api_key=ep.api_key,
731
+ upstream_model=ep.model,
732
+ top_logprobs=top_logprobs,
733
+ max_tokens_cap=max_tokens_cap,
734
+ disable_thinking=disable_thinking,
735
+ skip_install=bool(template),
736
+ )
737
+ else:
738
+ base_url = ep.base_url
739
+
740
+ _write_opencode_json(
741
+ sbx,
742
+ base_url=base_url,
743
+ api_key=ep.api_key if mode == "black_box" else "intercepted",
744
+ model=ep.model,
745
+ )
746
+
747
+ try:
748
+ res.agent_exit_code = _run_agent(
749
+ sbx,
750
+ instruction_path=INSTRUCTION_PATH,
751
+ base_url=base_url,
752
+ api_key=ep.api_key if mode == "black_box" else "intercepted",
753
+ timeout_s=agent_timeout_s,
754
+ )
755
+ print(
756
+ f"[{ep.label}] agent exit_code={res.agent_exit_code}",
757
+ flush=True,
758
+ )
759
+ except TimeoutError as exc:
760
+ res.error = f"agent timeout: {exc}"
761
+ print(f"[{ep.label}] {res.error}", flush=True)
762
+
763
+ reward, tests, vstdout = _run_verifier(sbx)
764
+ res.reward, res.tests, res.verifier_stdout = reward, tests, vstdout
765
+ res.files, res.files_extra = _collect_files(sbx)
766
+ turns = _read_proxy_trace(sbx)
767
+ res.logprobs = _logprob_stats(turns)
768
+ # Capture truncated request/response per turn for debugging. Strip
769
+ # large/noisy fields (full token logprobs, raw bytes) to keep the
770
+ # artifact readable.
771
+ for t in turns:
772
+ req = t.get("request") or {}
773
+ resp = t.get("response") or {}
774
+ res.raw_turns.append(
775
+ {
776
+ "turn": t.get("turn"),
777
+ "finish_reason": t.get("finish_reason"),
778
+ "latency_s": t.get("latency_s"),
779
+ "request_messages": req.get("messages", [])[-6:],
780
+ "request_tools": [
781
+ (tool.get("function") or {}).get("name", "?")
782
+ for tool in (req.get("tools") or [])
783
+ ],
784
+ "request_temperature": req.get("temperature"),
785
+ "request_max_tokens": req.get("max_tokens")
786
+ or req.get("max_completion_tokens"),
787
+ "response_choices": [
788
+ {
789
+ "finish_reason": ch.get("finish_reason"),
790
+ "message_content": (ch.get("message") or {}).get(
791
+ "content"
792
+ ),
793
+ "tool_calls": [
794
+ {
795
+ "name": (tc.get("function") or {}).get(
796
+ "name"
797
+ ),
798
+ "arguments": str(
799
+ (tc.get("function") or {}).get(
800
+ "arguments", ""
801
+ )
802
+ )[:500],
803
+ }
804
+ for tc in (
805
+ (ch.get("message") or {}).get("tool_calls")
806
+ or []
807
+ )
808
+ ],
809
+ }
810
+ for ch in (resp.get("choices") or [])
811
+ ],
812
+ "upstream_status": resp.get("upstream_status"),
813
+ "upstream_error": resp.get("upstream_error"),
814
+ }
815
+ )
816
+ res.proxy_log_tail = _safe_read(sbx, PROXY_LOG)[-2000:]
817
+ res.agent_log_tail = _safe_read(sbx, AGENT_LOG)[-4000:]
818
+ except Exception as exc: # noqa: BLE001
819
+ res.error = f"{type(exc).__name__}: {exc}"
820
+ print(f"[{ep.label}] ERROR {res.error}", flush=True)
821
+ finally:
822
+ try:
823
+ sbx.kill()
824
+ except Exception:
825
+ pass
826
+ res.wall_s = time.time() - started
827
+ return res
828
+
829
+
830
+ # ---------------------------------------------------------------------------
831
+ # Reporting.
832
+ # ---------------------------------------------------------------------------
833
+
834
+
835
+ def _format_summary(results: list[RunResult]) -> str:
836
+ lines: list[str] = []
837
+ sep = "-" * 110
838
+ lines.append(sep)
839
+ lines.append(
840
+ f"{'endpoint':<10} {'model':<42} {'reward':<8} {'pass':<6} "
841
+ f"{'turns':<6} {'tokens':<8} {'mean-logp':<11} {'wall':<8}"
842
+ )
843
+ lines.append(sep)
844
+ for r in results:
845
+ reward = f"{r.reward:.2f}" if r.reward is not None else "-"
846
+ pass_str = f"{r.passed}/{len(MODULES)}"
847
+ mean_lp = (
848
+ f"{r.logprobs.mean_logprob:+.3f}"
849
+ if r.logprobs.mean_logprob is not None
850
+ else "-"
851
+ )
852
+ lines.append(
853
+ f"{r.endpoint:<10} {r.model[:42]:<42} {reward:<8} {pass_str:<6} "
854
+ f"{r.logprobs.n_turns:<6} {r.logprobs.total_completion_tokens:<8} "
855
+ f"{mean_lp:<11} {r.wall_s:<7.1f}s"
856
+ )
857
+ lines.append(sep)
858
+ lines.append("")
859
+ lines.append("per-file results:")
860
+ for r in results:
861
+ per_file = " ".join(
862
+ f"{m}={r.tests.get(m, '?')}" for m in MODULES
863
+ )
864
+ lines.append(f" {r.endpoint:<10} {per_file}")
865
+ if r.files_extra:
866
+ lines.append(
867
+ f" {' ':<10} extras: {', '.join(sorted(r.files_extra))}"
868
+ )
869
+ if r.error:
870
+ lines.append(f" {' ':<10} ERROR: {r.error[:200]}")
871
+ return "\n".join(lines)
872
+
873
+
874
+ def _save_artifact(r: RunResult, out_dir: Path) -> Path:
875
+ out_dir.mkdir(parents=True, exist_ok=True)
876
+ ts = int(time.time())
877
+ target = out_dir / f"sorting_{r.endpoint}_{ts}.json"
878
+ target.write_text(json.dumps(asdict(r), indent=2, default=str))
879
+ return target
880
+
881
+
882
+ # ---------------------------------------------------------------------------
883
+ # CLI.
884
+ # ---------------------------------------------------------------------------
885
+
886
+
887
+ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
888
+ p = argparse.ArgumentParser(
889
+ prog="test_five_sorts_e2e",
890
+ description=(
891
+ "Run opencode end-to-end against vLLM / OpenAI / HF Router, "
892
+ "write 5 sorting algorithms in 5 files, verify them, return "
893
+ "logprobs + tests + filesystem."
894
+ ),
895
+ formatter_class=argparse.RawDescriptionHelpFormatter,
896
+ )
897
+ p.add_argument(
898
+ "--endpoint",
899
+ choices=["vllm", "openai", "hf_router", "all"],
900
+ default="all",
901
+ help="Which endpoint to test (default: all configured endpoints).",
902
+ )
903
+ p.add_argument(
904
+ "--mode",
905
+ choices=["transparent_proxy", "black_box"],
906
+ default="transparent_proxy",
907
+ help=(
908
+ "transparent_proxy captures per-token logprobs; black_box skips "
909
+ "the proxy. Default: transparent_proxy."
910
+ ),
911
+ )
912
+ p.add_argument(
913
+ "--agent-timeout",
914
+ type=float,
915
+ default=600.0,
916
+ help="Seconds to wait for opencode to finish (default 600).",
917
+ )
918
+ p.add_argument(
919
+ "--max-tokens-cap",
920
+ type=int,
921
+ default=4096,
922
+ help="Per-turn max_tokens clamp on forwarded requests (default 4096).",
923
+ )
924
+ p.add_argument(
925
+ "--top-logprobs",
926
+ type=int,
927
+ default=5,
928
+ help="Top-k logprobs requested from the upstream (HF Router cap is 5).",
929
+ )
930
+ p.add_argument(
931
+ "--disable-thinking",
932
+ choices=["auto", "on", "off"],
933
+ default="auto",
934
+ help=(
935
+ "Inject ``chat_template_kwargs.enable_thinking=false`` on "
936
+ "forwarded requests. ``auto`` = on for vllm, off for openai / "
937
+ "hf_router (default). ``on`` / ``off`` forces it for every "
938
+ "endpoint."
939
+ ),
940
+ )
941
+ p.add_argument(
942
+ "--save-artifacts",
943
+ action="store_true",
944
+ help="Dump per-run JSON to envs/opencode_env/tests/_artifacts/.",
945
+ )
946
+ p.add_argument(
947
+ "--instruction-override",
948
+ default=None,
949
+ help="Replace the default 5-sorts instruction.",
950
+ )
951
+ p.add_argument(
952
+ "--no-summary-files",
953
+ action="store_true",
954
+ help="Skip printing file contents in the summary.",
955
+ )
956
+ p.add_argument(
957
+ "--template",
958
+ default=None,
959
+ help=(
960
+ "E2B template name to use (e.g. ``opencode-rl`` after running "
961
+ "build_e2b_template.py). When set, skips opencode install + "
962
+ "pip-deps install (already in the template) — saves ~2 min "
963
+ "per rollout."
964
+ ),
965
+ )
966
+ return p.parse_args(argv)
967
+
968
+
969
+ def main(argv: list[str] | None = None) -> int:
970
+ args = _parse_args(argv)
971
+
972
+ e2b_api_key = os.environ.get("E2B_API_KEY")
973
+ if not e2b_api_key:
974
+ print(
975
+ "ERROR: E2B_API_KEY is required (set it in .env or your shell).",
976
+ file=sys.stderr,
977
+ )
978
+ return 2
979
+
980
+ print(f"Loading env from {_DOTENV_PATH}")
981
+ endpoints, skipped = _resolve_endpoints()
982
+ if args.endpoint != "all":
983
+ endpoints = [e for e in endpoints if e.label == args.endpoint]
984
+
985
+ instruction = args.instruction_override or INSTRUCTION
986
+
987
+ runs: list[RunResult] = []
988
+ for ep in endpoints:
989
+ if args.disable_thinking == "on":
990
+ disable_thinking = True
991
+ elif args.disable_thinking == "off":
992
+ disable_thinking = False
993
+ else:
994
+ disable_thinking = ep.disable_thinking_default
995
+ runs.append(
996
+ run_one(
997
+ ep,
998
+ mode=args.mode,
999
+ agent_timeout_s=args.agent_timeout,
1000
+ max_tokens_cap=args.max_tokens_cap,
1001
+ top_logprobs=args.top_logprobs,
1002
+ disable_thinking=disable_thinking,
1003
+ instruction=instruction,
1004
+ e2b_api_key=e2b_api_key,
1005
+ template=args.template,
1006
+ )
1007
+ )
1008
+
1009
+ print()
1010
+ print(_format_summary(runs))
1011
+
1012
+ if skipped:
1013
+ print("\nSkipped (not configured):")
1014
+ for s in skipped:
1015
+ print(f" - {s}")
1016
+
1017
+ if not args.no_summary_files:
1018
+ for r in runs:
1019
+ print(f"\n=== files written by {r.endpoint} ({r.model}) ===")
1020
+ for fname, src in r.files.items():
1021
+ head = "\n".join(src.splitlines()[:20])
1022
+ print(f"--- {fname} (first 20 lines) ---")
1023
+ print(head)
1024
+ if src.count("\n") > 20:
1025
+ print(f"... ({src.count(chr(10)) - 20} more lines)")
1026
+
1027
+ if args.save_artifacts:
1028
+ out_dir = _ENV_DIR / "tests" / "_artifacts"
1029
+ for r in runs:
1030
+ print(f"saved {_save_artifact(r, out_dir)}")
1031
+
1032
+ if not runs:
1033
+ print("\nNo endpoints ran. Fill in .env and re-run.")
1034
+ return 2
1035
+
1036
+ failed = [r for r in runs if r.reward is None or r.reward < 1.0 or r.error]
1037
+ if failed:
1038
+ print(f"\n{len(failed)}/{len(runs)} endpoint(s) did not reach reward=1.0.")
1039
+ return 1
1040
+ print(f"\nAll {len(runs)} endpoint(s) reached reward=1.0.")
1041
+ return 0
1042
+
1043
+
1044
+ if __name__ == "__main__":
1045
+ sys.exit(main())
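
A minimal sketch of driving the harness above in-process rather than from the shell. The import path is an assumption (it depends on how the tests directory lands on sys.path); the exit-code contract is the one main() implements above: 0 when every endpoint reaches reward=1.0, 1 when any run falls short or errors, 2 when nothing is configured.

```python
# Hypothetical in-process driver; assumes the script is importable as
# ``test_five_sorts_e2e`` and that E2B_API_KEY is exported.
from test_five_sorts_e2e import main

code = main(["--endpoint", "vllm", "--agent-timeout", "900", "--save-artifacts"])
if code == 2:
    print("not configured: set E2B_API_KEY and at least one endpoint in .env")
elif code == 1:
    print("at least one endpoint did not reach reward=1.0")
```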
tests/test_harness.py ADDED
@@ -0,0 +1,221 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Unit tests for OpenCodeSession / OpenCodeSessionFactory (no sandbox)."""
+
+ from __future__ import annotations
+
+ import pytest
+
+ from opencode_env.config import OpenCodeConfig
+ from opencode_env.harness import OpenCodeSession, OpenCodeSessionFactory
+ from opencode_env.sandbox.base import ExecResult
+ from opencode_env.task import OpenCodeTask
+
+
+ class _FakeBgJob:
+     def __init__(self) -> None:
+         self.pid = 123
+         self._killed = False
+
+     def wait(self, timeout: float | None = None) -> int:
+         return 0
+
+     def kill(self) -> None:
+         self._killed = True
+
+
+ class _FakeSandbox:
+     """In-memory sandbox that records every interaction."""
+
+     def __init__(self, *, install_exit: int = 0, setup_exit: int = 0) -> None:
+         self.sandbox_id = "fake-sbx"
+         self.exec_calls: list[tuple[str, dict | None]] = []
+         self.written: dict[str, str] = {}
+         self.bg_calls: list[tuple[str, dict | None]] = []
+         self.killed = False
+         self._install_exit = install_exit
+         self._setup_exit = setup_exit
+
+     def exec(self, cmd, *, envs=None, cwd=None, timeout=60):
+         self.exec_calls.append((cmd, envs))
+         # Health probe: the factory issues ``echo ok`` up to 15 times before
+         # doing anything else. The fake sandbox is "ready" on the first try.
+         if cmd.strip() == "echo ok":
+             return ExecResult(0, "ok\n", "")
+         if "opencode.ai/install" in cmd:
+             return ExecResult(self._install_exit, "opencode 0.0.0\n", "")
+         return ExecResult(self._setup_exit, "", "")
+
+     def start_bg(self, cmd, *, envs=None, cwd=None):
+         self.bg_calls.append((cmd, envs))
+         return _FakeBgJob()
+
+     def write_text(self, path, content):
+         self.written[path] = content
+
+     def read_text(self, path):
+         return self.written.get(path, "")
+
+     def exists(self, path):
+         return path in self.written
+
+     def kill(self):
+         self.killed = True
+
+
+ class _FakeBackend:
+     def __init__(self, sandbox: _FakeSandbox) -> None:
+         self._sandbox = sandbox
+         self.create_calls = 0
+
+     def create(self, *, timeout_s=900, envs=None, metadata=None):
+         self.create_calls += 1
+         return self._sandbox
+
+
+ def _config(**overrides) -> OpenCodeConfig:
+     base = dict(
+         provider="openai",
+         base_url="https://api.openai.com/v1",
+         api_key="sk-fake",
+         model="openai/gpt-5.3-codex",
+     )
+     base.update(overrides)
+     return OpenCodeConfig(**base)
+
+
+ def test_factory_bootstraps_and_starts_agent():
+     sbx = _FakeSandbox()
+     backend = _FakeBackend(sbx)
+     factory = OpenCodeSessionFactory(config=_config(), sandbox_backend=backend)
+
+     session = factory.create(task="solve fizzbuzz")
+
+     assert backend.create_calls == 1
+     assert any("opencode.ai/install" in c for c, _ in sbx.exec_calls)
+     assert "/home/user/.config/opencode/opencode.json" in sbx.written
+     assert sbx.written["/home/user/task/instruction.md"] == "solve fizzbuzz"
+     assert len(sbx.bg_calls) == 1, "agent must be started in background"
+     # OPENAI_BASE_URL must be injected into the process env
+     _, envs = sbx.bg_calls[0]
+     assert envs["OPENAI_BASE_URL"] == "https://api.openai.com/v1"
+     assert envs["OPENAI_API_KEY"] == "sk-fake"
+     assert isinstance(session, OpenCodeSession)
+
+
+ def test_factory_runs_task_setup_shell():
+     sbx = _FakeSandbox()
+     factory = OpenCodeSessionFactory(
+         config=_config(), sandbox_backend=_FakeBackend(sbx)
+     )
+     task = OpenCodeTask(instruction="x", setup_shell="pip install pytest")
+
+     factory.create(task=task)
+
+     setup_cmds = [c for c, _ in sbx.exec_calls if "pip install" in c]
+     assert setup_cmds == ["pip install pytest"]
+
+
+ def test_factory_uploads_extra_files():
+     sbx = _FakeSandbox()
+     factory = OpenCodeSessionFactory(
+         config=_config(), sandbox_backend=_FakeBackend(sbx)
+     )
+     task = OpenCodeTask(
+         instruction="run it",
+         upload_files={"/home/user/workdir/hello.py": "print('hi')"},
+     )
+
+     factory.create(task=task)
+
+     assert sbx.written["/home/user/workdir/hello.py"] == "print('hi')"
+
+
+ def test_factory_kills_sandbox_on_install_failure():
+     sbx = _FakeSandbox(install_exit=1)
+     factory = OpenCodeSessionFactory(
+         config=_config(), sandbox_backend=_FakeBackend(sbx)
+     )
+
+     with pytest.raises(RuntimeError, match="install failed"):
+         factory.create(task="x")
+     assert sbx.killed
+
+
+ def test_factory_accepts_transparent_proxy_mode():
+     f = OpenCodeSessionFactory(
+         config=_config(),
+         sandbox_backend=_FakeBackend(_FakeSandbox()),
+         mode="transparent_proxy",
+     )
+     assert f._mode == "transparent_proxy"
+
+
+ def test_factory_rejects_unknown_mode():
+     with pytest.raises(ValueError, match="Unknown mode"):
+         OpenCodeSessionFactory(
+             config=_config(),
+             sandbox_backend=_FakeBackend(_FakeSandbox()),
+             mode="bogus",  # type: ignore[arg-type]
+         )
+
+
+ def test_session_initial_messages():
+     sbx = _FakeSandbox()
+     session = OpenCodeSession(
+         sandbox=sbx,
+         config=_config(),
+         task=OpenCodeTask(instruction="hi"),
+     )
+     assert session.initial_messages() == [{"role": "user", "content": "hi"}]
+
+
+ def test_session_verify_without_verifier_returns_none_reward():
+     sbx = _FakeSandbox()
+     session = OpenCodeSession(
+         sandbox=sbx,
+         config=_config(),
+         task=OpenCodeTask(instruction="x"),
+     )
+     result = session.verify(transcript=[])
+     assert result.env_reward is None
+     assert result.done is True
+
+
+ def test_session_verify_calls_user_verifier():
+     from openenv.core.harness import VerifyResult
+
+     sbx = _FakeSandbox()
+     calls = []
+
+     def verifier(sandbox, task):
+         calls.append((sandbox.sandbox_id, task.instruction))
+         return VerifyResult(env_reward=1.0, done=True, metrics={"tests": "pass"})
+
+     session = OpenCodeSession(
+         sandbox=sbx,
+         config=_config(),
+         task=OpenCodeTask(instruction="do"),
+         verifier=verifier,
+     )
+     result = session.verify(transcript=[])
+     assert calls == [("fake-sbx", "do")]
+     assert result.env_reward == 1.0
+     assert result.metrics == {"tests": "pass"}
+
+
+ def test_session_close_kills_job_and_sandbox():
+     sbx = _FakeSandbox()
+     session = OpenCodeSession(
+         sandbox=sbx,
+         config=_config(),
+         task=OpenCodeTask(instruction="x"),
+     )
+     session._bg_job = _FakeBgJob()
+     session.close()
+     assert session._bg_job is None
+     assert sbx.killed
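
The fakes above make new factory tests cheap to add. A sketch of one more in the same style, using only names defined in this file (the specific assertion is illustrative, not part of the commit):

```python
def test_factory_writes_config_and_instruction():
    sbx = _FakeSandbox()
    factory = OpenCodeSessionFactory(config=_config(), sandbox_backend=_FakeBackend(sbx))
    factory.create(task="noop")
    # Both artifacts must exist for the background agent to do anything useful.
    assert "/home/user/.config/opencode/opencode.json" in sbx.written
    assert sbx.written["/home/user/task/instruction.md"] == "noop"
```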
tests/test_inference_endpoints.py ADDED
@@ -0,0 +1,430 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Inference probe for the three LLM endpoints opencode runs against.
+
+ For each of vLLM, OpenAI, and the HF Inference Router, fires one
+ ``/v1/chat/completions`` request with ``logprobs=true`` and verifies:
+
+ 1. HTTP status is 200.
+ 2. The response carries either ``message.content`` or ``message.tool_calls``.
+ 3. ``choices[0].logprobs.content`` is non-null with at least one entry.
+ 4. The first token's ``top_logprobs`` has the requested top-k count.
+
+ Endpoints are read from the sibling ``.env`` file (``envs/opencode_env/.env``).
+ A missing config skips that endpoint instead of failing the suite.
+
+ Run as pytest::
+
+     PYTHONPATH=src:envs/opencode_env uv run pytest \\
+         envs/opencode_env/tests/test_inference_endpoints.py -v -s
+
+ Run as a standalone script (prints a summary table)::
+
+     python envs/opencode_env/tests/test_inference_endpoints.py
+ """
+
+ from __future__ import annotations
+
+ import os
+ import sys
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any
+
+ import httpx
+ import pytest
+
+
+ # ---------------------------------------------------------------------------
+ # .env loader — no python-dotenv dep, since the package keeps deps minimal.
+ # ---------------------------------------------------------------------------
+
+
+ def _load_env_file(env_path: Path) -> None:
+     """Populate ``os.environ`` from ``KEY=VALUE`` lines in ``env_path``.
+
+     Existing process env vars take precedence so a shell ``export`` always
+     wins over the ``.env`` file. Lines starting with ``#`` and blank lines
+     are ignored. Surrounding single/double quotes on values are stripped.
+     """
+     if not env_path.exists():
+         return
+     for raw in env_path.read_text().splitlines():
+         line = raw.strip()
+         if not line or line.startswith("#") or "=" not in line:
+             continue
+         key, _, value = line.partition("=")
+         key = key.strip()
+         value = value.strip().strip('"').strip("'")
+         if key and key not in os.environ:
+             os.environ[key] = value
+
+
+ _ENV_PATH = Path(__file__).resolve().parents[1] / ".env"
+ _load_env_file(_ENV_PATH)
+
+
+ # ---------------------------------------------------------------------------
+ # Endpoint specs — one per kind of LLM endpoint we exercise.
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class EndpointSpec:
+     """Reads one endpoint's connection info out of the environment."""
+
+     label: str
+     base_url_env: str
+     model_env: str
+     api_key_env: str
+     default_base_url: str | None = None
+     default_model: str | None = None
+     default_api_key: str = ""
+
+     def resolve(self) -> "EndpointConfig | None":
+         base_url = os.environ.get(self.base_url_env) or self.default_base_url or ""
+         model = os.environ.get(self.model_env) or self.default_model or ""
+         api_key = os.environ.get(self.api_key_env) or self.default_api_key
+         if not base_url or not model or not api_key:
+             return None
+         return EndpointConfig(
+             label=self.label, base_url=base_url, model=model, api_key=api_key
+         )
+
+
+ @dataclass
+ class EndpointConfig:
+     label: str
+     base_url: str
+     model: str
+     api_key: str
+
+
+ def _resolve_chat_completions_url(base_url: str) -> str:
+     """Build the fully-qualified ``/v1/chat/completions`` URL.
+
+     Mirrors :func:`opencode_env.sandbox.interception._resolve_upstream_url`:
+     if the base already ends in ``/v1``, only ``/chat/completions`` is
+     appended; otherwise the full ``/v1/chat/completions`` path is used.
+     """
+     base = base_url.rstrip("/")
+     if base.endswith("/v1"):
+         return f"{base}/chat/completions"
+     return f"{base}/v1/chat/completions"
+
+
+ # Defaults below mirror what the OpenCode harness primitive uses by default.
+ # .env values override anything specified here.
+ ENDPOINT_SPECS: list[EndpointSpec] = [
+     EndpointSpec(
+         label="vllm",
+         base_url_env="VLLM_URL",
+         model_env="VLLM_MODEL",
+         api_key_env="VLLM_API_KEY",
+         default_api_key="intercepted",
+         default_model="Qwen/Qwen3.5-4B",
+     ),
+     EndpointSpec(
+         label="openai",
+         base_url_env="OPENAI_BASE_URL",
+         model_env="OPENAI_MODEL",
+         api_key_env="OPENAI_API_KEY",
+         default_base_url="https://api.openai.com/v1",
+         default_model="gpt-4o-mini",
+     ),
+     EndpointSpec(
+         label="hf_router",
+         base_url_env="HF_ROUTER_BASE_URL",
+         model_env="HF_ROUTER_MODEL",
+         api_key_env="HF_ROUTER_API_KEY",
+         default_base_url="https://router.huggingface.co/v1",
+         default_model="Qwen/Qwen3-4B-Instruct-2507:nscale",
+     ),
+ ]
+
+
+ # ---------------------------------------------------------------------------
+ # Probe — one HTTP round trip per call; pure function, no side effects.
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class ProbeResult:
+     label: str
+     base_url: str
+     model: str
+     status: int
+     ok: bool
+     completion_text: str = ""
+     has_tool_calls: bool = False
+     has_logprobs: bool = False
+     top_logprobs_n: int = 0
+     first_token: str = ""
+     first_logprob: float | None = None
+     latency_s: float = 0.0
+     error: str = ""
+     raw_response: dict[str, Any] = field(default_factory=dict)
+
+
+ def probe(
+     cfg: EndpointConfig,
+     *,
+     top_logprobs: int = 5,
+     max_tokens: int = 16,
+     timeout_s: float = 90.0,
+ ) -> ProbeResult:
+     """Send one chat-completions request and return what the endpoint did.
+
+     Never raises. Network / 4xx / 5xx errors land in ``ProbeResult.error`` so
+     the caller can render a table without try/except scaffolding.
+     """
+     import time
+
+     url = _resolve_chat_completions_url(cfg.base_url)
+     body: dict[str, Any] = {
+         "model": cfg.model,
+         "messages": [{"role": "user", "content": "Reply with a single word: hi"}],
+         "max_tokens": max_tokens,
+         "logprobs": True,
+         "top_logprobs": top_logprobs,
+         "temperature": 0,
+     }
+     headers = {
+         "Authorization": f"Bearer {cfg.api_key}",
+         "Content-Type": "application/json",
+     }
+
+     start = time.time()
+     try:
+         r = httpx.post(url, json=body, headers=headers, timeout=timeout_s)
+     except Exception as exc:  # noqa: BLE001
+         return ProbeResult(
+             label=cfg.label,
+             base_url=cfg.base_url,
+             model=cfg.model,
+             status=0,
+             ok=False,
+             error=f"{type(exc).__name__}: {exc}",
+             latency_s=time.time() - start,
+         )
+     latency = time.time() - start
+
+     if r.status_code != 200:
+         return ProbeResult(
+             label=cfg.label,
+             base_url=cfg.base_url,
+             model=cfg.model,
+             status=r.status_code,
+             ok=False,
+             error=r.text[:600],
+             latency_s=latency,
+         )
+
+     try:
+         data = r.json()
+     except Exception as exc:  # noqa: BLE001
+         return ProbeResult(
+             label=cfg.label,
+             base_url=cfg.base_url,
+             model=cfg.model,
+             status=r.status_code,
+             ok=False,
+             error=f"non-JSON body: {exc}",
+             latency_s=latency,
+         )
+
+     choice = (data.get("choices") or [{}])[0]
+     msg = choice.get("message") or {}
+     completion_text = msg.get("content") or ""
+     has_tool_calls = bool(msg.get("tool_calls"))
+     lp = choice.get("logprobs")
+     content_lp = lp.get("content") if isinstance(lp, dict) else None
+     has_logprobs = bool(content_lp)
+
+     first_token = ""
+     first_logprob: float | None = None
+     top_n = 0
+     if has_logprobs and content_lp:
+         first = content_lp[0]
+         first_token = str(first.get("token", ""))
+         lp_val = first.get("logprob")
+         if lp_val is not None:
+             first_logprob = float(lp_val)
+         top_n = len(first.get("top_logprobs") or [])
+
+     return ProbeResult(
+         label=cfg.label,
+         base_url=cfg.base_url,
+         model=cfg.model,
+         status=r.status_code,
+         ok=True,
+         completion_text=completion_text,
+         has_tool_calls=has_tool_calls,
+         has_logprobs=has_logprobs,
+         top_logprobs_n=top_n,
+         first_token=first_token,
+         first_logprob=first_logprob,
+         latency_s=latency,
+         raw_response=data,
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # pytest entrypoints — one parametrized test per endpoint.
+ # ---------------------------------------------------------------------------
+
+
+ @pytest.mark.parametrize(
+     "spec", ENDPOINT_SPECS, ids=[s.label for s in ENDPOINT_SPECS]
+ )
+ def test_endpoint_responds(spec: EndpointSpec) -> None:
+     """Endpoint accepts a chat-completions call and returns a 2xx body."""
+     cfg = spec.resolve()
+     if cfg is None:
+         pytest.skip(
+             f"{spec.label} not configured (set {spec.base_url_env} / "
+             f"{spec.model_env} / {spec.api_key_env} in .env)"
+         )
+
+     result = probe(cfg)
+     assert result.ok, f"{cfg.label}: HTTP {result.status} — {result.error}"
+     # ``logprobs.content`` populated implies the model generated at least one
+     # token (either visible content, a tool-call argument, or a reasoning
+     # token for Qwen3-thinking variants). That is the signal we want — empty
+     # completion + empty tool_calls is fine when reasoning tokens are present.
+     assert result.has_logprobs or result.completion_text or result.has_tool_calls, (
+         f"{cfg.label}: model produced no output at all. "
+         f"Response: {str(result.raw_response)[:500]}"
+     )
+
+
+ @pytest.mark.parametrize(
+     "spec", ENDPOINT_SPECS, ids=[s.label for s in ENDPOINT_SPECS]
+ )
+ def test_endpoint_returns_logprobs(spec: EndpointSpec) -> None:
+     """Endpoint honors ``logprobs=true`` and returns per-token logprobs.
+
+     Failing this test means the endpoint silently drops logprobs (HF Router
+     providers like Novita / Hyperbolic / Featherless behave this way) — the
+     transparent proxy has nothing to capture and Mode B GRPO will train on
+     empty per-token logps.
+     """
+     cfg = spec.resolve()
+     if cfg is None:
+         pytest.skip(
+             f"{spec.label} not configured (set {spec.base_url_env} / "
+             f"{spec.model_env} / {spec.api_key_env} in .env)"
+         )
+
+     result = probe(cfg)
+     assert result.ok, f"{cfg.label}: HTTP {result.status} — {result.error}"
+     assert result.has_logprobs, (
+         f"{cfg.label}: endpoint returned 200 but logprobs.content is null. "
+         f"This provider does not support logprobs. Pick a different provider "
+         f"(together / nscale / scaleway) or run opencode in mode='black_box'."
+     )
+     assert result.top_logprobs_n >= 1, (
+         f"{cfg.label}: top_logprobs has {result.top_logprobs_n} entries, "
+         f"expected >= 1"
+     )
+     assert result.first_logprob is not None, (
+         f"{cfg.label}: first token has no logprob value"
+     )
+
+
+ # ---------------------------------------------------------------------------
+ # Standalone runner — prints a summary table.
+ # ---------------------------------------------------------------------------
+
+
+ def _format_summary(results: list[ProbeResult], skipped: list[str]) -> str:
+     rows: list[str] = []
+     rows.append("-" * 96)
+     rows.append(
+         f"{'endpoint':<10} {'status':<7} {'logprobs':<14} {'top-n':<6} "
+         f"{'first-token':<14} {'first-logp':<11} {'latency':<8} notes"
+     )
+     rows.append("-" * 96)
+     for r in results:
+         if r.status == 0:
+             status_str = "ERR"
+         else:
+             status_str = str(r.status)
+
+         if not r.ok:
+             lp_str = "n/a"
+         elif r.has_logprobs:
+             lp_str = f"yes ({r.top_logprobs_n})"
+         else:
+             lp_str = "DROPPED"
+
+         first_tok_str = repr(r.first_token) if r.first_token else "-"
+         first_lp_str = (
+             f"{r.first_logprob:+.3f}" if r.first_logprob is not None else "-"
+         )
+         latency_str = f"{r.latency_s:.2f}s"
+         notes = ""
+         if not r.ok:
+             notes = r.error[:50].replace("\n", " ")
+         elif not r.has_logprobs:
+             notes = "silent logprob drop"
+
+         rows.append(
+             f"{r.label:<10} {status_str:<7} {lp_str:<14} "
+             f"{r.top_logprobs_n:<6} {first_tok_str:<14} "
+             f"{first_lp_str:<11} {latency_str:<8} {notes}"
+         )
+     rows.append("-" * 96)
+     if skipped:
+         rows.append("")
+         rows.append("Skipped (not configured in .env):")
+         for s in skipped:
+             rows.append(f" - {s}")
+     return "\n".join(rows)
+
+
+ def main() -> int:
+     print(f"Loading env from {_ENV_PATH}\n")
+
+     results: list[ProbeResult] = []
+     skipped: list[str] = []
+     for spec in ENDPOINT_SPECS:
+         cfg = spec.resolve()
+         if cfg is None:
+             skipped.append(
+                 f"{spec.label} (set {spec.base_url_env} / {spec.model_env} / "
+                 f"{spec.api_key_env})"
+             )
+             continue
+         print(f"-> probing {cfg.label}: {cfg.base_url} model={cfg.model}")
+         r = probe(cfg)
+         results.append(r)
+         if not r.ok:
+             print(f" HTTP {r.status}: {r.error[:200]}")
+         else:
+             print(
+                 f" HTTP {r.status} logprobs={r.has_logprobs} "
+                 f"top_n={r.top_logprobs_n} "
+                 f"content={r.completion_text!r:.60}"
+             )
+         print()
+
+     print(_format_summary(results, skipped))
+
+     if not results:
+         print("\nNo endpoints configured. Fill in .env and re-run.")
+         return 2
+     bad = [r for r in results if not r.ok or not r.has_logprobs]
+     if bad:
+         print(f"\n{len(bad)}/{len(results)} endpoint(s) failed or lack logprobs.")
+         return 1
+     print(f"\nAll {len(results)} configured endpoint(s) passed.")
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
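
For a quick check outside pytest, probe() can also be driven directly; EndpointConfig and probe() are defined in the file above, while the localhost URL and model name here are placeholders:

```python
cfg = EndpointConfig(
    label="vllm",
    base_url="http://localhost:8000/v1",
    model="Qwen/Qwen3-4B",
    api_key="intercepted",
)
res = probe(cfg, top_logprobs=5, max_tokens=8)
print(res.status, res.has_logprobs, res.top_logprobs_n, res.first_token)
```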
tests/test_interception.py ADDED
@@ -0,0 +1,198 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Unit tests for the interception proxy (no sandbox, no real LLM)."""
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import socket
+ from contextlib import closing
+
+ import httpx
+ import pytest
+ import uvicorn
+ from fastapi import FastAPI, Request
+
+ from opencode_env.sandbox.interception import (
+     InterceptionProxy,
+     ProxyConfig,
+     _build_turn_record,
+     _strip_logprobs,
+     read_trace,
+ )
+
+
+ def _free_port() -> int:
+     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+         s.bind(("127.0.0.1", 0))
+         return s.getsockname()[1]
+
+
+ def _make_upstream_app(response_payload: dict) -> FastAPI:
+     app = FastAPI()
+     received: list[dict] = []
+
+     @app.post("/v1/chat/completions")
+     async def handler(request: Request):
+         body = await request.json()
+         received.append(body)
+         return response_payload
+
+     app.state.received = received
+     return app
+
+
+ def _run_upstream(app: FastAPI, port: int) -> uvicorn.Server:
+     config = uvicorn.Config(
+         app, host="127.0.0.1", port=port, log_level="warning", lifespan="on"
+     )
+     server = uvicorn.Server(config)
+     import threading
+
+     t = threading.Thread(target=server.run, daemon=True)
+     t.start()
+     import time
+
+     deadline = time.time() + 5
+     while time.time() < deadline:
+         try:
+             with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+                 s.settimeout(0.2)
+                 if s.connect_ex(("127.0.0.1", port)) == 0:
+                     return server
+         except OSError:
+             pass
+         time.sleep(0.05)
+     raise RuntimeError("upstream failed to start")
+
+
+ _FAKE_RESPONSE = {
+     "id": "chatcmpl-fake",
+     "object": "chat.completion",
+     "model": "test-model",
+     "choices": [
+         {
+             "index": 0,
+             "finish_reason": "stop",
+             "message": {"role": "assistant", "content": "hi"},
+             "logprobs": {
+                 "content": [
+                     {"token": "h", "logprob": -0.1, "top_logprobs": []},
+                     {"token": "i", "logprob": -0.2, "top_logprobs": []},
+                 ]
+             },
+         }
+     ],
+ }
+
+
+ def test_strip_logprobs_removes_only_logprobs_key():
+     sanitized = _strip_logprobs(_FAKE_RESPONSE)
+     choice = sanitized["choices"][0]
+     assert "logprobs" not in choice
+     assert choice["message"]["content"] == "hi"
+     assert choice["finish_reason"] == "stop"
+
+
+ def test_build_turn_record_extracts_logprobs():
+     record = _build_turn_record(
+         turn_idx=1,
+         request_body={"model": "test", "messages": []},
+         response_json=_FAKE_RESPONSE,
+         latency_s=0.25,
+     )
+     assert record.completion_tokens == ["h", "i"]
+     assert record.per_token_logps == [-0.1, -0.2]
+     assert record.finish_reason == "stop"
+
+
+ def test_read_trace_returns_empty_list_when_missing(tmp_path):
+     assert read_trace(tmp_path / "nonexistent.jsonl") == []
+
+
+ def test_proxy_forwards_captures_and_strips(tmp_path):
+     upstream_port = _free_port()
+     proxy_port = _free_port()
+     trace = tmp_path / "trace.jsonl"
+
+     upstream_app = _make_upstream_app(_FAKE_RESPONSE)
+     upstream_server = _run_upstream(upstream_app, upstream_port)
+
+     cfg = ProxyConfig(
+         upstream_url=f"http://127.0.0.1:{upstream_port}",
+         upstream_api_key="test-key",
+         trace_path=str(trace),
+         host="127.0.0.1",
+         port=proxy_port,
+         top_logprobs=5,
+     )
+
+     with InterceptionProxy(cfg) as proxy:
+         assert proxy.url == f"http://127.0.0.1:{proxy_port}/v1"
+         # Sanity: healthz
+         r = httpx.get(f"http://127.0.0.1:{proxy_port}/healthz")
+         assert r.status_code == 200
+         # Chat completion round trip
+         req_body = {
+             "model": "openai_compatible/foo",
+             "messages": [{"role": "user", "content": "hi"}],
+             "temperature": 0.0,
+         }
+         r = httpx.post(
+             f"http://127.0.0.1:{proxy_port}/v1/chat/completions",
+             json=req_body,
+             headers={"Authorization": "Bearer whatever"},
+             timeout=10,
+         )
+         assert r.status_code == 200
+         returned = r.json()
+         # logprobs stripped from what opencode sees
+         assert "logprobs" not in returned["choices"][0]
+         assert returned["choices"][0]["message"]["content"] == "hi"
+
+         # Upstream got logprobs=true injected
+         forwarded = upstream_app.state.received
+         assert len(forwarded) == 1
+         assert forwarded[0]["logprobs"] is True
+         assert forwarded[0]["top_logprobs"] == 5
+         # Authorization rewriting (the proxy swaps in upstream_api_key) is
+         # not asserted here: the fake upstream records bodies, not headers.
+
+         # Trace file has one line with captured logprobs
+         records = read_trace(trace)
+         assert len(records) == 1
+         rec = records[0]
+         assert rec["turn"] == 1
+         assert rec["completion_tokens"] == ["h", "i"]
+         assert rec["per_token_logps"] == [-0.1, -0.2]
+         assert rec["finish_reason"] == "stop"
+         assert rec["request"]["messages"][0]["content"] == "hi"
+
+     upstream_server.should_exit = True
+
+
+ def test_proxy_handles_invalid_json_body(tmp_path):
+     upstream_port = _free_port()
+     proxy_port = _free_port()
+     upstream_server = _run_upstream(_make_upstream_app(_FAKE_RESPONSE), upstream_port)
+
+     cfg = ProxyConfig(
+         upstream_url=f"http://127.0.0.1:{upstream_port}",
+         trace_path=str(tmp_path / "trace.jsonl"),
+         host="127.0.0.1",
+         port=proxy_port,
+     )
+     with InterceptionProxy(cfg):
+         r = httpx.post(
+             f"http://127.0.0.1:{proxy_port}/v1/chat/completions",
+             content=b"not json",
+             headers={"Content-Type": "application/json"},
+             timeout=10,
+         )
+         assert r.status_code == 400
+
+     upstream_server.should_exit = True
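
Once the proxy has written a trace, the per-turn records flatten naturally into the token and logp arrays a GRPO-style trainer consumes. A sketch using read_trace() from the imports above (the trace path is a placeholder):

```python
records = read_trace("trace.jsonl")
tokens = [tok for rec in records for tok in rec["completion_tokens"]]
logps = [lp for rec in records for lp in rec["per_token_logps"]]
mean_logp = sum(logps) / len(logps) if logps else None
print(len(tokens), mean_logp)
```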
tests/test_opencode_runtime.py ADDED
@@ -0,0 +1,107 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from __future__ import annotations
+
+ import json
+
+ from opencode_env.config import OpenCodeConfig
+ from opencode_env.opencode_runtime import (
+     build_env_vars,
+     build_install_cmd,
+     build_opencode_json,
+     build_run_cmd,
+ )
+
+
+ def _openai_cfg(**overrides) -> OpenCodeConfig:
+     base = dict(
+         provider="openai",
+         base_url="https://api.openai.com/v1",
+         api_key="sk-test",
+         model="openai/gpt-5.3-codex",
+     )
+     base.update(overrides)
+     return OpenCodeConfig(**base)
+
+
+ def test_opencode_json_has_schema_and_provider_block():
+     cfg = _openai_cfg()
+     doc = json.loads(build_opencode_json(cfg))
+     assert doc["$schema"] == "https://opencode.ai/config.json"
+     assert doc["model"] == "intercepted/gpt-5.3-codex"
+     provider = doc["provider"]["intercepted"]
+     assert provider["npm"] == "@ai-sdk/openai"
+     assert provider["options"]["baseURL"] == "https://api.openai.com/v1"
+     assert provider["options"]["apiKey"] == "sk-test"
+     assert provider["options"]["timeout"] == 600_000
+
+
+ def test_opencode_json_disables_tools_by_default():
+     cfg = _openai_cfg()
+     doc = json.loads(build_opencode_json(cfg))
+     assert doc["tools"] == {"webfetch": False, "question": False}
+
+
+ def test_opencode_json_extra_is_deep_merged():
+     cfg = _openai_cfg(
+         extra_opencode_json={
+             "theme": "dark",
+             "provider": {"intercepted": {"options": {"custom": 1}}},
+         }
+     )
+     doc = json.loads(build_opencode_json(cfg))
+     assert doc["theme"] == "dark"
+     # Deep merge preserves other keys in the nested options block
+     options = doc["provider"]["intercepted"]["options"]
+     assert options["baseURL"] == "https://api.openai.com/v1"
+     assert options["custom"] == 1
+
+
+ def test_install_cmd_pins_version_when_not_latest():
+     cfg = _openai_cfg(opencode_version="0.5.3")
+     cmd = build_install_cmd(cfg)
+     assert "OPENCODE_VERSION=0.5.3" in cmd
+     assert "curl -fsSL https://opencode.ai/install | bash" in cmd
+     assert "opencode --version" in cmd
+     assert "/home/user/.config/opencode" in cmd
+
+
+ def test_install_cmd_respects_sandbox_home():
+     cfg = _openai_cfg(sandbox_home="/root")
+     cmd = build_install_cmd(cfg)
+     assert "/root/.config/opencode" in cmd
+     assert "/home/user" not in cmd
+
+
+ def test_install_cmd_omits_version_env_when_latest():
+     cfg = _openai_cfg(opencode_version="latest")
+     cmd = build_install_cmd(cfg)
+     assert "OPENCODE_VERSION" not in cmd
+
+
+ def test_run_cmd_uses_json_format_by_default():
+     cfg = _openai_cfg()
+     cmd = build_run_cmd(cfg)
+     assert "opencode run --format json" in cmd
+     assert '"$(cat /home/user/task/instruction.md)"' in cmd
+     assert "tee /home/user/logs/agent/opencode.jsonl" in cmd
+
+
+ def test_run_cmd_default_format_has_no_flag():
+     cfg = _openai_cfg(run_format="default")
+     cmd = build_run_cmd(cfg)
+     assert "--format" not in cmd
+
+
+ def test_env_vars_default_to_config_url():
+     cfg = _openai_cfg()
+     env = build_env_vars(cfg)
+     assert env["OPENAI_BASE_URL"] == "https://api.openai.com/v1"
+     assert env["OPENAI_API_KEY"] == "sk-test"
+     assert env["OPENCODE_CONFIG"] == "/home/user/.config/opencode/opencode.json"
+
+
+ def test_env_vars_respect_proxy_override():
+     cfg = _openai_cfg(extra_env={"EXTRA": "yes"})
+     env = build_env_vars(cfg, base_url_override="http://localhost:7000/v1")
+     assert env["OPENAI_BASE_URL"] == "http://localhost:7000/v1"
+     assert env["EXTRA"] == "yes"
tests/test_sandbox_base.py ADDED
@@ -0,0 +1,44 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """Structural tests for the sandbox Protocols (no live sandbox needed)."""
+
+ from __future__ import annotations
+
+ import pytest
+
+ from opencode_env.sandbox import (
+     E2BSandboxBackend,
+     E2BSandboxHandle,
+     ExecResult,
+     SandboxBackend,
+     SandboxHandle,
+ )
+
+
+ def test_e2b_classes_import():
+     # Ensure the e2b backend imports without needing an API key or live call.
+     assert E2BSandboxBackend is not None
+     assert E2BSandboxHandle is not None
+
+
+ def test_exec_result_dataclass():
+     r = ExecResult(exit_code=0, stdout="ok", stderr="")
+     assert r.exit_code == 0
+     assert r.stdout == "ok"
+
+
+ def test_e2b_backend_requires_api_key(monkeypatch):
+     monkeypatch.delenv("E2B_API_KEY", raising=False)
+     with pytest.raises(RuntimeError, match="E2B_API_KEY"):
+         E2BSandboxBackend()
+
+
+ def test_protocols_are_declared():
+     # Static: the Protocols should be importable and non-empty.
+     assert hasattr(SandboxBackend, "create")
+     assert hasattr(SandboxHandle, "exec")
+     assert hasattr(SandboxHandle, "start_bg")
+     assert hasattr(SandboxHandle, "kill")
tests/test_task.py ADDED
@@ -0,0 +1,42 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from __future__ import annotations
+
+ import pytest
+
+ from opencode_env.task import OpenCodeTask
+
+
+ def test_coerce_from_string():
+     task = OpenCodeTask.coerce("write fizzbuzz")
+     assert task.instruction == "write fizzbuzz"
+     assert task.setup_shell is None
+     assert task.upload_files == {}
+
+
+ def test_coerce_from_dict():
+     task = OpenCodeTask.coerce(
+         {
+             "instruction": "run tests",
+             "setup_shell": "pip install pytest",
+             "upload_files": {"/home/user/workdir/hello.py": "print('hi')"},
+             "metadata": {"task_id": "hello_001"},
+         }
+     )
+     assert task.instruction == "run tests"
+     assert task.setup_shell == "pip install pytest"
+     assert task.metadata["task_id"] == "hello_001"
+
+
+ def test_coerce_passes_through_existing_task():
+     existing = OpenCodeTask(instruction="x")
+     assert OpenCodeTask.coerce(existing) is existing
+
+
+ def test_coerce_rejects_bad_type():
+     with pytest.raises(TypeError):
+         OpenCodeTask.coerce(42)
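
Taken together, these tests pin down the coercion contract: strings, dicts, and existing tasks all normalize to OpenCodeTask, and anything else raises. In short:

```python
for raw in ("write fizzbuzz", {"instruction": "run tests"}, OpenCodeTask(instruction="x")):
    assert isinstance(OpenCodeTask.coerce(raw), OpenCodeTask)
```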
uv.lock ADDED
The diff for this file is too large to render and is omitted here.