AdithyaSK (HF Staff) committed
Commit d4d3fde · verified · 1 Parent(s): 19f2fe7

Upload folder using huggingface_hub
.dockerignore ADDED
@@ -0,0 +1,10 @@
+.venv
+__pycache__
+*.pyc
+*.pyo
+*.egg-info
+.env
+.git
+.gitignore
+README.md
+*.md
.env.example ADDED
@@ -0,0 +1,11 @@
+# Required
+E2B_API_KEY=e2b_...
+
+# Optional - for Mode A rollouts against real OpenAI
+OPENAI_API_KEY=sk-...
+
+# Optional - max concurrent sandbox sessions per environment (default: 4)
+MAX_CONCURRENT_ENVS=4
+
+# Optional - enable the Gradio UI mounted at /
+ENABLE_WEB_INTERFACE=true
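
For reference, a minimal sketch of how an entrypoint could consume these variables. The variable names come from this file; the parsing, defaults, and module shape are illustrative, not the server's actual code:

```python
import os

# Illustrative only — mirrors the contract documented in .env.example.
E2B_API_KEY = os.environ["E2B_API_KEY"]                # required
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")  # optional, Mode A only
MAX_CONCURRENT_ENVS = int(os.environ.get("MAX_CONCURRENT_ENVS", "4"))
ENABLE_WEB_INTERFACE = os.environ.get("ENABLE_WEB_INTERFACE", "").lower() == "true"
```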
.gitignore ADDED
@@ -0,0 +1,9 @@
+.venv/
+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+.env
+*.egg-info
+.pytest_cache/
+.gradio/
README.md CHANGED
@@ -6,7 +6,6 @@ colorTo: pink
 sdk: docker
 app_port: 8000
 pinned: false
-base_path: /web
 ---
 
 # opencode-openenv
local_ui.py ADDED
@@ -0,0 +1,517 @@
+"""Chat-style Gradio UI for a locally-running ``opencode serve``.
+
+Prereq: ``opencode serve`` on http://127.0.0.1:4096.
+
+Run:
+    uv run --with gradio --with httpx python local_ui.py
+"""
+
+from __future__ import annotations
+
+import html as _html
+import json
+import threading
+import time
+from typing import Any, Generator
+
+import gradio as gr
+import httpx
+
+
+BASE = "http://127.0.0.1:4096"
+
+
+# ── HTTP helpers ────────────────────────────────────────────────────────────
+
+
+def _get(path: str, **kw) -> Any:
+    r = httpx.get(f"{BASE}{path}", timeout=15, **kw)
+    r.raise_for_status()
+    return r.json()
+
+
+def _create_session() -> str:
+    return httpx.post(f"{BASE}/session", json={"title": "gradio"}, timeout=15).json()["id"]
+
+
+def _fire_async(sid: str, prompt: str) -> None:
+    httpx.post(
+        f"{BASE}/session/{sid}/prompt_async",
+        json={"parts": [{"type": "text", "text": prompt}]},
+        timeout=30,
+    ).raise_for_status()
+
+
+def _abort(sid: str) -> None:
+    try:
+        httpx.post(f"{BASE}/session/{sid}/abort", timeout=10)
+    except Exception:
+        pass
+
+
+def _session_diff(sid: str) -> list[dict]:
+    try:
+        return _get(f"/session/{sid}/diff") or []
+    except Exception:
+        return []
+
+
+def _session_todo(sid: str) -> list[dict]:
+    try:
+        return _get(f"/session/{sid}/todo") or []
+    except Exception:
+        return []
+
+
+# ── Server identity ────────────────────────────────────────────────────────
+
+
+def _banner() -> str:
+    try:
+        h = _get("/global/health")
+        c = _get("/config")
+        prov = (c.get("provider") or {}).get("vllm") or {}
+        opts = prov.get("options") or {}
+        model = c.get("model") or "?"
+        base_url = opts.get("baseURL") or "?"
+        limit = next(iter(prov.get("models", {}).values()), {}).get("limit") or {}
+        try:
+            tools = _get("/experimental/tool/ids") or []
+        except Exception:
+            tools = []
+        tool_line = (
+            f"<div class='tools'>tools: {', '.join(_esc(t) for t in tools)}</div>"
+            if tools else ""
+        )
+        return (
+            "<div class='banner'>"
+            f"<span class='chip ok'>opencode v{_esc(h.get('version','?'))}</span> "
+            f"<span class='chip'>model: <code>{_esc(model)}</code></span> "
+            f"<span class='chip'>baseURL: <code>{_esc(base_url)}</code></span> "
+            f"<span class='chip'>ctx: <code>{limit.get('context','?')}</code></span> "
+            f"<span class='chip'>out: <code>{limit.get('output','?')}</code></span>"
+            f"</div>{tool_line}"
+        )
+    except Exception as exc:
+        return f"<div class='banner'><span class='chip err'>server unreachable: {_esc(exc)}</span></div>"
+
+
+# ── SSE ────────────────────────────────────────────────────────────────────
+
+
+def _stream(sid_filter: str, events: list, stop: threading.Event) -> None:
+    """Tail GET /event, append every frame (caller filters)."""
+    try:
+        with httpx.stream("GET", f"{BASE}/event", timeout=None) as r:
+            for line in r.iter_lines():
+                if stop.is_set():
+                    return
+                if not line or not line.startswith("data:"):
+                    continue
+                try:
+                    events.append(json.loads(line[5:].strip()))
+                except Exception:
+                    pass
+    except Exception:
+        return
+
+
+# ── Part + delta assembly ──────────────────────────────────────────────────
+
+
+def _assemble(events: list[dict]) -> tuple[list[dict], list[str]]:
+    """Reduce events to ordered parts and collect any error reasons.
+
+    - ``message.part.updated`` is authoritative per ``part.id``.
+    - ``message.part.delta`` frames for a text part whose last snapshot is
+      shorter than the accumulated delta are appended live so streaming
+      looks smooth.
+    """
+    order: list[str] = []
+    latest: dict[str, dict] = {}
+    deltas: dict[str, str] = {}
+    errors: list[str] = []
+    for ev in events:
+        t = ev.get("type")
+        props = ev.get("properties") or {}
+        if t == "message.part.updated":
+            p = props.get("part") or {}
+            pid = p.get("id")
+            if not pid:
+                continue
+            if pid not in latest:
+                order.append(pid)
+            latest[pid] = p
+            if (p.get("state") or {}).get("status") == "error":
+                err = (p.get("state") or {}).get("error") or "tool error"
+                errors.append(f"{p.get('tool','?')}: {err}")
+        elif t == "message.part.delta":
+            p = props.get("part") or {}
+            pid = p.get("partID") or p.get("id")
+            if not pid:
+                continue
+            delta = p.get("delta") or p.get("text") or ""
+            if isinstance(delta, str) and delta:
+                deltas[pid] = deltas.get(pid, "") + delta
+        elif t in ("error", "client.error"):
+            errors.append(_esc(props.get("reason") or ev.get("reason") or "unknown"))
+
+    # Splice in any deltas that exceed the latest snapshot (live streaming).
+    parts: list[dict] = []
+    for pid in order:
+        p = dict(latest[pid])
+        if p.get("type") == "text" and pid in deltas:
+            if len(deltas[pid]) > len(p.get("text") or ""):
+                p["text"] = deltas[pid]
+        parts.append(p)
+    return parts, errors
+
+
+# ── Rendering ──────────────────────────────────────────────────────────────
+
+
+def _esc(s: Any) -> str:
+    return _html.escape("" if s is None else str(s))
+
+
+def _cap(s: str, n: int = 6000) -> str:
+    if len(s) <= n:
+        return s
+    return s[:n] + f"\n… ({len(s) - n} chars hidden)"
+
+
+def _fmt_tool(name: str, state: dict, raw: dict) -> str:
+    status = (state or {}).get("status") or "?"
+    inp = (state or {}).get("input") or raw.get("input") or {}
+    out = (state or {}).get("output") or raw.get("output") or ""
+    badge = {"completed": "ok", "error": "err", "running": "run"}.get(status, "")
+
+    if name == "read":
+        summary = f"📖 read <code>{_esc(inp.get('filePath') or inp.get('path'))}</code>"
+        body = f"<pre>{_esc(_cap(str(out)))}</pre>"
+    elif name == "write":
+        path = inp.get("filePath") or inp.get("path")
+        content = inp.get("content") or ""
+        summary = f"✍️ write <code>{_esc(path)}</code> ({len(content)} chars)"
+        body = f"<pre>{_esc(_cap(content))}</pre>"
+    elif name == "edit":
+        path = inp.get("filePath") or inp.get("path")
+        old = inp.get("oldString") or ""
+        new = inp.get("newString") or ""
+        summary = f"✏️ edit <code>{_esc(path)}</code>"
+        body = (
+            f"<div class='lbl'>- old</div><pre class='del'>{_esc(_cap(old, 3000))}</pre>"
+            f"<div class='lbl'>+ new</div><pre class='add'>{_esc(_cap(new, 3000))}</pre>"
+        )
+        if out:
+            body += f"<div class='lbl'>output</div><pre>{_esc(_cap(str(out), 2000))}</pre>"
+    elif name == "bash":
+        cmd = inp.get("command") or inp.get("cmd") or ""
+        summary = f"⚡ bash <code>{_esc(cmd[:160])}</code>"
+        body = f"<pre>{_esc(_cap(str(out)))}</pre>"
+    elif name in ("glob", "find"):
+        pattern = inp.get("pattern") or inp.get("query") or ""
+        summary = f"🔎 {name} <code>{_esc(pattern)}</code>"
+        body = f"<pre>{_esc(_cap(str(out), 4000))}</pre>"
+    elif name == "grep":
+        pattern = inp.get("pattern") or ""
+        path = inp.get("path") or ""
+        summary = f"🔎 grep <code>{_esc(pattern)}</code>" + (
+            f" in <code>{_esc(path)}</code>" if path else ""
+        )
+        body = f"<pre>{_esc(_cap(str(out), 4000))}</pre>"
+    elif name == "todowrite":
+        todos = inp.get("todos") or []
+        summary = f"📝 todowrite ({len(todos)} items)"
+        body = "<ul>" + "".join(
+            f"<li>{_todo_icon(t.get('status'))} {_esc(t.get('content'))}</li>"
+            for t in todos
+        ) + "</ul>"
+    elif name == "task":
+        desc = inp.get("description") or inp.get("prompt") or ""
+        summary = f"🧩 task — {_esc(desc[:160])}"
+        body = f"<pre>{_esc(_cap(str(out), 4000))}</pre>"
+    elif name == "webfetch":
+        summary = f"🌐 webfetch <code>{_esc(inp.get('url'))}</code>"
+        body = f"<pre>{_esc(_cap(str(out), 4000))}</pre>"
+    else:
+        summary = f"🔧 {_esc(name)}"
+        body = (
+            f"<div class='lbl'>input</div><pre>{_esc(_cap(json.dumps(inp, indent=2, default=str), 4000))}</pre>"
+            f"<div class='lbl'>output</div><pre>{_esc(_cap(str(out), 4000))}</pre>"
+        )
+    return (
+        "<details class='tool' open>"
+        f"<summary>{summary} <span class='badge {badge}'>{_esc(status)}</span></summary>"
+        f"<div class='tbody'>{body}</div>"
+        "</details>"
+    )
+
+
+def _todo_icon(status: str | None) -> str:
+    return {"completed": "✅", "in_progress": "🔄"}.get(status or "", "⏳")
+
+
+def _render_transcript(parts: list[dict], errors: list[str]) -> str:
+    out: list[str] = []
+    if errors:
+        out.append(
+            "<div class='errbox'><b>⚠️ errors</b><ul>"
+            + "".join(f"<li>{_esc(e)}</li>" for e in errors[:8])
+            + "</ul></div>"
+        )
+    if not parts:
+        out.append("<div class='empty'>waiting for first part…</div>")
+        return "".join(out)
+    out.append("<div class='chat'>")
+    for p in parts:
+        t = p.get("type")
+        if t == "step-start":
+            out.append("<div class='step'>── new step ──</div>")
+        elif t == "reasoning":
+            txt = (p.get("text") or "").strip()
+            if txt:
+                out.append(
+                    "<details class='reasoning'><summary>🧠 reasoning</summary>"
+                    f"<pre>{_esc(_cap(txt, 4000))}</pre></details>"
+                )
+        elif t == "text":
+            txt = (p.get("text") or "").strip()
+            if txt:
+                out.append(f"<div class='assistant'><pre>{_esc(txt)}</pre></div>")
+        elif t == "tool":
+            out.append(_fmt_tool(p.get("tool") or "?", p.get("state") or {}, p))
+        elif t == "step-finish":
+            tokens = p.get("tokens") or (p.get("state") or {}).get("tokens") or {}
+            if tokens:
+                out.append(f"<div class='stepfin'>tokens: {_esc(json.dumps(tokens, default=str))}</div>")
+    out.append("</div>")
+    return "".join(out)
+
+
+def _render_todo(todos: list[dict]) -> str:
+    if not todos:
+        return ""
+    items = "".join(
+        f"<li>{_todo_icon(t.get('status'))} {_esc(t.get('content') or t.get('text',''))}</li>"
+        for t in todos
+    )
+    return f"<div class='todostrip'><b>plan</b><ul>{items}</ul></div>"
+
+
+def _render_diff(diffs: list[dict]) -> str:
+    if not diffs:
+        return ""
+    blocks = []
+    for d in diffs:
+        path = d.get("path") or d.get("file") or "?"
+        patch = d.get("patch") or d.get("diff") or ""
+        blocks.append(
+            f"<details class='diff'><summary>{_esc(path)}</summary>"
+            f"<pre>{_esc(_cap(patch, 6000))}</pre></details>"
+        )
+    return (
+        "<details class='diff-wrap' open>"
+        f"<summary>📋 session diff ({len(diffs)} files)</summary>"
+        f"{''.join(blocks)}</details>"
+    )
+
+
+# ── State ──────────────────────────────────────────────────────────────────
+
+
+class _State:
+    sid: str = ""  # empty → next Run creates a new session
+    stop: threading.Event | None = None
+    events: list[dict] = []  # reset per session
+    sse_thread: threading.Thread | None = None
+
+
+_STATE = _State()
+
+
+def _ensure_session() -> str:
+    """Create a session if none exists; reuse across runs for multi-turn."""
+    if _STATE.sid:
+        return _STATE.sid
+    _STATE.sid = _create_session()
+    _STATE.stop = threading.Event()
+    _STATE.events = []
+    _STATE.sse_thread = threading.Thread(
+        target=_stream, args=(_STATE.sid, _STATE.events, _STATE.stop), daemon=True
+    )
+    _STATE.sse_thread.start()
+    time.sleep(0.15)
+    return _STATE.sid
+
+
+def _new_session_cb() -> tuple[str, str, str, str]:
+    """Tear down any existing SSE and clear state. Next Run opens a fresh session."""
+    if _STATE.stop:
+        _STATE.stop.set()
+    if _STATE.sid:
+        _abort(_STATE.sid)
+    _STATE.sid = ""
+    _STATE.stop = None
+    _STATE.events = []
+    return (
+        "✨ new session — Run to start",
+        "",  # transcript
+        "",  # todo
+        "",  # diff
+    )
+
+
+# ── Main ───────────────────────────────────────────────────────────────────
+
+
+def run(prompt: str) -> Generator[tuple[str, str, str, str], None, None]:
+    try:
+        sid = _ensure_session()
+    except Exception as exc:
+        yield f"❌ session create failed: {exc}", "", "", ""
+        return
+
+    # Snapshot the event index BEFORE firing — "idle for THIS turn" must be
+    # scoped to events that arrive after the prompt is sent, otherwise the
+    # idle frame from the previous turn fires the break immediately.
+    turn_start = len(_STATE.events)
+
+    try:
+        _fire_async(sid, prompt)
+    except Exception as exc:
+        yield f"❌ prompt failed: {exc}", "", "", ""
+        return
+
+    t0 = time.time()
+    last_todo_refresh = 0.0
+    todos: list[dict] = []
+
+    while time.time() - t0 < 600:
+        new_events = _STATE.events[turn_start:]
+        idle = any(e.get("type") in ("session.idle", "idle") for e in new_events)
+        parts, errors = _assemble(_STATE.events)
+
+        if time.time() - last_todo_refresh > 3.0:
+            todos = _session_todo(sid)
+            last_todo_refresh = time.time()
+
+        status = (
+            f"{'✅ idle' if idle else '⚡ running'} · "
+            f"session <code>{sid[:18]}…</code> · "
+            f"{time.time()-t0:.1f}s · {len(parts)} parts · {len(_STATE.events)} events"
+        )
+
+        diff_html = ""
+        if idle:
+            diff_html = _render_diff(_session_diff(sid))
+
+        yield status, _render_transcript(parts, errors), _render_todo(todos), diff_html
+
+        if idle:
+            break
+        time.sleep(0.4)
+
+
+def abort_cb() -> str:
+    if _STATE.sid:
+        _abort(_STATE.sid)
+    # leave SSE open so user sees the abort-related events; actual teardown on new session
+    return "⏹ aborted (session kept — click New session to clear)"
+
+
+def refresh_banner() -> str:
+    return _banner()
+
+
+# ── CSS ────────────────────────────────────────────────────────────────────
+
+_CSS = """
+.banner { margin:4px 0 2px; }
+.tools { font-size:11px; color:#888; margin:2px 0 8px; }
+.chip { display:inline-block; padding:2px 8px; margin:2px; border-radius:10px;
+        background:#2b2d31; color:#ddd; font-size:12px; }
+.chip.ok { background:#1f6f43; }
+.chip.err { background:#7a1e1e; }
+.chip code { background:transparent; color:#9ad; }
+.errbox { background:#2a1414; border:1px solid #7a1e1e; border-radius:6px;
+          padding:6px 10px; margin:6px 0; color:#f88; font-size:13px; }
+.errbox ul { margin:2px 0 0 18px; }
+.chat { font-size:14px; }
+.assistant pre { background:#0e1013; padding:10px; border-radius:8px;
+                 white-space:pre-wrap; color:#eee; margin:6px 0; }
+.reasoning { opacity:0.8; margin:4px 0; }
+.reasoning pre { background:#0a0b0d; color:#aab; padding:8px; white-space:pre-wrap; }
+.tool { border:1px solid #2a2f3a; border-radius:8px; padding:6px 10px;
+        margin:6px 0; background:#12161c; }
+.tool summary { cursor:pointer; color:#ddd; }
+.tool code { background:#222; color:#9cf; padding:1px 4px; border-radius:3px; }
+.tbody { margin-top:6px; }
+.tbody pre { background:#0a0b0d; padding:8px; border-radius:4px;
+             white-space:pre-wrap; max-height:400px; overflow:auto;
+             font-size:12px; color:#ddd; margin:2px 0; }
+.tbody pre.add { border-left:3px solid #2e6; }
+.tbody pre.del { border-left:3px solid #e53; }
+.tbody .lbl { color:#888; font-size:11px; margin-top:6px; }
+.badge { padding:1px 6px; border-radius:8px; font-size:11px;
+         background:#333; color:#ddd; }
+.badge.ok { background:#1f6f43; color:white; }
+.badge.err { background:#7a1e1e; color:white; }
+.badge.run { background:#7a5c1e; color:white; }
+.step { color:#555; text-align:center; margin:10px 0; font-size:11px; }
+.stepfin { color:#666; font-size:11px; margin:4px 0 12px; }
+.empty { color:#666; font-style:italic; padding:12px; }
+.todostrip { background:#14181e; border:1px solid #2a2f3a; border-radius:6px;
+             padding:6px 10px; margin:6px 0; font-size:13px; }
+.todostrip ul { margin:4px 0 0 18px; }
+.diff-wrap { margin:8px 0; }
+.diff summary { cursor:pointer; color:#9ad; font-family:monospace; }
+.diff pre { background:#0a0b0d; padding:8px; border-radius:4px;
+            white-space:pre; font-size:12px; color:#ddd; overflow:auto; }
+"""
+
+
+# ── Layout ─────────────────────────────────────────────────────────────────
+
+
+with gr.Blocks(title="opencode serve", css=_CSS) as demo:
+    banner_html = gr.HTML(value="_(loading…)_")
+    status_md = gr.Markdown()
+    todo_html = gr.HTML()
+    transcript_html = gr.HTML(value="<div class='empty'>run a prompt to start</div>")
+    diff_html = gr.HTML()
+
+    with gr.Row():
+        prompt = gr.Textbox(
+            label="Prompt",
+            value="Write fizzbuzz.py that prints FizzBuzz for 1..15 and run it.",
+            lines=3,
+            scale=5,
+        )
+        run_btn = gr.Button("▶ Run", variant="primary", scale=1)
+        with gr.Column(scale=1, min_width=120):
+            abort_btn = gr.Button("⏹ Abort", variant="stop")
+            new_btn = gr.Button("✨ New session")
+
+    run_btn.click(
+        run,
+        inputs=[prompt],
+        outputs=[status_md, transcript_html, todo_html, diff_html],
+    )
+    abort_btn.click(abort_cb, outputs=[status_md])
+    new_btn.click(
+        _new_session_cb,
+        outputs=[status_md, transcript_html, todo_html, diff_html],
+    )
+    demo.load(refresh_banner, outputs=[banner_html])


+if __name__ == "__main__":
+    import os
+
+    demo.queue().launch(
+        server_name="0.0.0.0",
+        server_port=int(os.environ.get("GRADIO_PORT", "7861")),
+        share=True,
+        show_error=True,
+    )
server/catalog.py ADDED
@@ -0,0 +1,200 @@
+"""Curated Qwen model catalog for the OpenCode OpenEnv server.
+
+Lives in the server (not the primitive) because routing decisions —
+which HF router backend to pick for a given Qwen repo, what counts as
+the "default" model, whether a model supports thinking — are
+deployment concerns, not harness concerns. The primitive remains
+provider-agnostic; this catalog is what the Gradio UI and the MCP
+tools consult to turn a UI selection into a concrete
+``(base_url, api_key, model_string, disable_thinking)`` quadruple.
+
+Backends supported:
+
+- ``vllm`` — user-supplied OpenAI-compatible endpoint (e.g. cloudflared
+  tunnel to ``vllm serve``, or a colocated vLLM server).
+- ``hf_router`` — Hugging Face Inference Providers router at
+  ``https://router.huggingface.co/v1``. Auth via ``HF_TOKEN``.
+  Model id carries a ``:provider`` suffix to pick the HF
+  backend (``:together``, ``:scaleway``, ``:nscale``, ...).
+
+Only HF providers verified to return ``logprobs`` are listed (see
+``DOCS/HF/hf_inference_providers_logprobs.md``).
+"""
+
+from __future__ import annotations
+
+from typing import Literal
+
+from pydantic import BaseModel
+
+
+BackendKind = Literal["vllm", "hf_router"]
+
+HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1"
+
+
+class CatalogModel(BaseModel):
+    """One model entry in the curated Qwen catalog."""
+
+    #: Canonical HF-Hub repo id (no ``:provider`` suffix).
+    repo: str
+    #: Backend kind — drives routing + auth shape.
+    backend: BackendKind
+    #: For ``hf_router`` entries, the ``:<provider>`` suffix HF uses to
+    #: force a specific backend inference provider. Empty for ``vllm``.
+    hf_route: str = ""
+    #: Whether this model supports Qwen-style thinking mode.
+    supports_thinking: bool = False
+    #: Short human-readable label for UI dropdowns.
+    label: str = ""
+
+    @property
+    def dropdown_key(self) -> str:
+        """Stable unique key for UI selectors."""
+        if self.backend == "hf_router":
+            return f"hf-router://{self.repo}{self.hf_route}"
+        return f"vllm://{self.repo}"
+
+    @property
+    def opencode_model_string(self) -> str:
+        """Model id opencode should send to the endpoint.
+
+        For HF router we bake the ``:provider`` suffix into the model
+        string so the HF router picks the right backend.
+        """
+        if self.backend == "hf_router":
+            return f"{self.repo}{self.hf_route}"
+        return self.repo
+
+
+# Ordered: self-hosted vLLM first (default), then HF router options.
+CATALOG: list[CatalogModel] = [
+    # --- Local vLLM (tunneled or colocated) ---
+    CatalogModel(
+        repo="Qwen/Qwen3.5-4B",
+        backend="vllm",
+        supports_thinking=True,
+        label="Qwen3.5-4B (self-hosted vLLM)",
+    ),
+    # --- HF Inference Router (Together / Scaleway / Nscale) ---
+    CatalogModel(
+        repo="Qwen/Qwen3.5-397B-A17B",
+        backend="hf_router",
+        hf_route=":together",
+        supports_thinking=True,
+        label="Qwen3.5-397B-A17B — HF/Together",
+    ),
+    CatalogModel(
+        repo="Qwen/Qwen3.5-397B-A17B",
+        backend="hf_router",
+        hf_route=":scaleway",
+        supports_thinking=True,
+        label="Qwen3.5-397B-A17B — HF/Scaleway",
+    ),
+    CatalogModel(
+        repo="Qwen/Qwen3-Coder-480B-A35B-Instruct",
+        backend="hf_router",
+        hf_route=":together",
+        supports_thinking=False,
+        label="Qwen3-Coder-480B — HF/Together",
+    ),
+    CatalogModel(
+        repo="Qwen/Qwen3-235B-A22B-Instruct-2507",
+        backend="hf_router",
+        hf_route=":nscale",
+        supports_thinking=False,
+        label="Qwen3-235B-A22B-2507 — HF/Nscale",
+    ),
+    CatalogModel(
+        repo="Qwen/Qwen3-4B-Instruct-2507",
+        backend="hf_router",
+        hf_route=":nscale",
+        supports_thinking=False,
+        label="Qwen3-4B-Instruct-2507 — HF/Nscale",
+    ),
+    CatalogModel(
+        repo="Qwen/Qwen3-Coder-30B-A3B-Instruct",
+        backend="hf_router",
+        hf_route=":scaleway",
+        supports_thinking=False,
+        label="Qwen3-Coder-30B-A3B — HF/Scaleway",
+    ),
+]
+
+
+def by_key(key: str) -> CatalogModel:
+    """Look up a catalog entry by ``dropdown_key``.
+
+    Falls back to synthesising an ad-hoc entry from the key's prefix so
+    users can enter a custom vLLM model id or a custom HF-router model
+    id without editing the catalog:
+
+    - ``"vllm://<repo>"`` → ad-hoc vllm entry with ``repo`` as the model id.
+    - ``"hf-router://<repo>[:<provider>]"`` → ad-hoc hf_router entry; the
+      provider suffix (if present) is preserved verbatim in ``hf_route``.
+    """
+    for m in CATALOG:
+        if m.dropdown_key == key:
+            return m
+    if key.startswith("vllm://"):
+        repo = key[len("vllm://"):].strip()
+        if not repo:
+            raise KeyError(f"missing model id in key: {key!r}")
+        return CatalogModel(
+            repo=repo, backend="vllm", supports_thinking=False,
+            label=f"{repo} (custom vLLM)",
+        )
+    if key.startswith("hf-router://"):
+        rest = key[len("hf-router://"):].strip()
+        if not rest:
+            raise KeyError(f"missing model id in key: {key!r}")
+        if ":" in rest:
+            repo, _, suffix = rest.partition(":")
+            hf_route = ":" + suffix
+        else:
+            repo, hf_route = rest, ""
+        return CatalogModel(
+            repo=repo, backend="hf_router", hf_route=hf_route,
+            supports_thinking=False,
+            label=f"{repo}{hf_route} (custom HF Router)",
+        )
+    raise KeyError(f"unknown model key: {key!r}")
+
+
+def default_model() -> CatalogModel:
+    """First entry (self-hosted vLLM 4B)."""
+    return CATALOG[0]
+
+
+def resolve_endpoint(
+    model_key: str,
+    *,
+    vllm_url: str = "",
+    hf_token: str = "",
+) -> tuple[str, str, str, "CatalogModel"]:
+    """Translate a UI selection into ``(base_url, api_key, model_string, entry)``.
+
+    Raises ``ValueError`` with a clear message when a required secret is
+    missing so the UI can render a precise "please fill in X" message.
+    """
+    m = by_key(model_key)
+    if m.backend == "vllm":
+        vllm_url = (vllm_url or "").strip()
+        if not vllm_url:
+            raise ValueError(
+                f"model {m.dropdown_key!r} requires a vLLM base URL "
+                "(the tunneled or in-cluster /v1 endpoint)."
+            )
+        base = vllm_url.rstrip("/")
+        if not base.endswith("/v1"):
+            base = base + "/v1"
+        return base, "anything", m.opencode_model_string, m
+    if m.backend == "hf_router":
+        hf_token = (hf_token or "").strip()
+        if not hf_token:
+            raise ValueError(
+                f"model {m.dropdown_key!r} requires an HF token "
+                "(hf_... from https://huggingface.co/settings/tokens)."
+            )
+        return HF_ROUTER_BASE_URL, hf_token, m.opencode_model_string, m
+    raise ValueError(f"unknown backend: {m.backend}")
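
A short usage sketch of the catalog API above (assuming `server/` is importable as a package; the tunnel URL is a placeholder):

```python
from server.catalog import by_key, default_model, resolve_endpoint

# Curated default: the self-hosted vLLM entry. "anything" is the dummy
# API key resolve_endpoint returns for vLLM backends.
base, key, model, entry = resolve_endpoint(
    default_model().dropdown_key,
    vllm_url="https://my-tunnel.example/v1",  # placeholder
)
assert key == "anything" and model == "Qwen/Qwen3.5-4B"

# Ad-hoc key: the ":provider" suffix survives into hf_route and is baked
# into the model string the HF router sees.
m = by_key("hf-router://Qwen/Qwen3-8B:together")
assert m.opencode_model_string == "Qwen/Qwen3-8B:together"
```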
server/gradio_ui.py CHANGED
@@ -15,11 +15,27 @@ ticker instead of a frozen page.
 from __future__ import annotations
 
 import json
+import os
 import time
 from typing import Any
 
 import gradio as gr
 
+try:
+    from .catalog import CATALOG, by_key, default_model, resolve_endpoint
+    from .transcript import (
+        TRANSCRIPT_CSS,
+        collect_parts_from_messages,
+        render_transcript,
+    )
+except ImportError:  # pragma: no cover — support running as a script
+    from catalog import CATALOG, by_key, default_model, resolve_endpoint  # type: ignore
+    from transcript import (  # type: ignore
+        TRANSCRIPT_CSS,
+        collect_parts_from_messages,
+        render_transcript,
+    )
+
 
 # ── Preset tasks ──────────────────────────────────────────────────────────
 # Shown in the dropdown. Each has instruction + matching bash verifier.
@@ -141,12 +157,20 @@ PRESET_TASKS: dict[str, tuple[str, str]] = {
 }
 
 
-_EXAMPLE_MODELS = [
-    "Qwen/Qwen3.5-4B",
-    "Qwen/Qwen3-Coder-Next",
-    "openai/gpt-4o-mini",
-    "openai/gpt-5.3-chat-latest",
+_HF_MODEL_CHOICES = [
+    (m.label, m.dropdown_key) for m in CATALOG if m.backend == "hf_router"
 ]
+# Sentinel value used for the "type your own HF-router id" dropdown option.
+_CUSTOM_HF_KEY = "__custom_hf__"
+_HF_MODEL_CHOICES.append(("Custom — enter HF Router model id below", _CUSTOM_HF_KEY))
+
+_DEFAULT_HF_KEY = _HF_MODEL_CHOICES[0][1]
+_HF_TOKEN_ENV = os.environ.get("HF_TOKEN", "")
+
+# Suggested / recent vllm model ids (user can type anything).
+_VLLM_MODEL_SUGGESTIONS = [
+    m.repo for m in CATALOG if m.backend == "vllm"
+] + ["Qwen/Qwen3.5-4B", "Qwen/Qwen2.5-7B-Instruct"]
 
 
 def opencode_ui_builder(
@@ -174,7 +198,7 @@ def opencode_ui_builder(
         _env_cache["instance"] = inst
         return inst
 
-    with gr.Blocks(title=title, analytics_enabled=False) as demo:
+    with gr.Blocks(title=title, analytics_enabled=False, css=TRANSCRIPT_CSS) as demo:
         gr.Markdown(
             f"# {title}\n"
             "Run one OpenCode rollout against any OpenAI-compatible endpoint. "
@@ -184,27 +208,56 @@ def opencode_ui_builder(
         )
 
         # ── Config ─────────────────────────────────────────────────────────
+        # Two backends:
+        #   1. Self-hosted vLLM — user supplies model id + base URL.
+        #   2. Hosted (HF Router) — user picks from the curated Qwen
+        #      catalog, or selects "Custom" and types their own HF-router
+        #      model id (e.g. ``Qwen/Qwen3-8B:together``).
         with gr.Row():
-            with gr.Column(scale=1):
-                vllm_url = gr.Textbox(
-                    label="vLLM / LLM base URL",
-                    value="https://<your-public-llm-host>/v1",
-                    placeholder="https://.../v1",
-                )
-                model = gr.Textbox(
-                    label="Model id",
-                    value=_EXAMPLE_MODELS[0],
-                    placeholder="Qwen/Qwen3.5-4B",
-                )
-                provider = gr.Dropdown(
-                    label="Provider",
-                    choices=["openai_compatible", "openai", "anthropic"],
-                    value="openai_compatible",
-                )
-                api_key = gr.Textbox(
-                    label="API key (ignored by vLLM)",
-                    value="intercepted",
-                    type="password",
-                )
+            with gr.Column(scale=3):
+                backend_mode = gr.Radio(
+                    label="Backend",
+                    choices=["Self-hosted vLLM", "Hosted (HF Router)"],
+                    value="Hosted (HF Router)",
+                )
+                # --- Self-hosted vLLM fields (shown only when selected) ---
+                with gr.Row(visible=False) as vllm_row:
+                    vllm_model = gr.Textbox(
+                        label="Model id (as served by your vLLM)",
+                        value=_VLLM_MODEL_SUGGESTIONS[0],
+                        placeholder="Qwen/Qwen3.5-4B",
+                        scale=1,
+                    )
+                    vllm_url = gr.Textbox(
+                        label="vLLM base URL",
+                        value="",
+                        placeholder="https://.../v1",
+                        scale=2,
+                    )
+                # --- Hosted HF Router fields (default visible) ---
+                with gr.Row(visible=True) as hf_row:
+                    hosted_model = gr.Dropdown(
+                        label="Hosted model",
+                        choices=_HF_MODEL_CHOICES,
+                        value=_DEFAULT_HF_KEY,
+                        scale=2,
+                    )
+                    hf_token = gr.Textbox(
+                        label="HF token",
+                        value=_HF_TOKEN_ENV,
+                        type="password",
+                        placeholder="hf_...",
+                        scale=2,
+                    )
+                    hosted_custom_id = gr.Textbox(
+                        label="Custom HF-router model id",
+                        value="",
+                        placeholder="Qwen/Qwen3-8B:together (org/repo[:provider])",
+                        visible=False,
+                    )
+                thinking = gr.Checkbox(
+                    label="Thinking mode (Qwen3.5 only)",
+                    value=False,
+                )
             with gr.Column(scale=1):
                 mode = gr.Dropdown(
@@ -221,82 +274,132 @@ def opencode_ui_builder(
                     minimum=60, maximum=1200, value=300, step=30,
                 )
 
-        # ── Task preset + fields ───────────────────────────────────────────
-        with gr.Row():
-            task_preset = gr.Dropdown(
-                label="Task preset",
-                choices=list(PRESET_TASKS.keys()),
-                value="hello",
-            )
-            task_id = gr.Textbox(
-                label="Task id (optional label)",
-                value="hello_demo",
-            )
-
-        instruction = gr.Textbox(
-            label="Instruction",
-            value=PRESET_TASKS["hello"][0],
-            lines=4,
-        )
-        test_script = gr.Code(
-            label="test.sh — bash verifier. Must write a float reward to /home/user/logs/verifier/reward.txt",
-            value=PRESET_TASKS["hello"][1],
-            language="shell",
-        )
-        setup_shell = gr.Textbox(
-            label="Setup shell (optional, runs before opencode)",
-            value="",
-            placeholder="e.g. pip install polars",
-        )
-
-        # Wire dropdown → populate instruction + test.sh
-        def _on_preset_change(name: str):
-            ins, tst = PRESET_TASKS.get(name, ("", ""))
-            return gr.update(value=ins), gr.update(value=tst)
-
-        task_preset.change(
-            _on_preset_change,
-            inputs=[task_preset],
-            outputs=[instruction, test_script],
+        def _on_backend_change(mode_v: str):
+            is_vllm = mode_v == "Self-hosted vLLM"
+            return (
+                gr.update(visible=is_vllm),      # vllm_row
+                gr.update(visible=not is_vllm),  # hf_row
+                gr.update(visible=False),        # hosted_custom_id reset
+            )
+
+        def _on_hosted_change(choice: str):
+            return gr.update(visible=(choice == _CUSTOM_HF_KEY))
+
+        backend_mode.change(
+            _on_backend_change,
+            inputs=[backend_mode],
+            outputs=[vllm_row, hf_row, hosted_custom_id],
         )
-
+        hosted_model.change(
+            _on_hosted_change,
+            inputs=[hosted_model],
+            outputs=[hosted_custom_id],
+        )
+
+        # ── Task fields ────────────────────────────────────────────────────
+        # Verifier (test.sh) is intentionally not surfaced here — it's only
+        # needed for scored training. For interactive use, leave it empty
+        # and just have the agent finish with something observable (e.g.
+        # "print DONE at the end"). MCP tools already accept
+        # ``test_script=""`` and skip scoring when empty.
+        instruction = gr.Textbox(
+            label="Instruction",
+            value=(
+                "Write `hello.py` in the current directory that prints "
+                "`hello` (no quotes). Then run it and print `DONE` when "
+                "you are finished."
+            ),
+            lines=4,
+        )
         with gr.Row():
-            check_btn = gr.Button("🔎 Check LLM URL")
-            run_btn = gr.Button("▶ Run rollout (streaming)", variant="primary")
-            reset_btn = gr.Button("🔄 Reset", variant="secondary")
+            task_id = gr.Textbox(
+                label="Task id (optional label)",
+                value="interactive",
+                scale=1,
+            )
+            setup_shell = gr.Textbox(
+                label="Setup shell (optional, runs before opencode)",
+                value="",
+                placeholder="e.g. pip install polars",
+                scale=3,
+            )
 
-        # ── Output panels ─────────────────────────────────────────────────
-        status = gr.Markdown()
         with gr.Row():
-            reward_out = gr.Number(label="reward", value=None, interactive=False)
-            wall_out = gr.Number(label="wall_s", value=None, interactive=False)
-            exit_out = gr.Number(label="exit_code", value=None, interactive=False)
-            turns_out = gr.Number(label="proxy_turns", value=None, interactive=False)
+            run_btn = gr.Button("▶ Run", variant="primary", scale=2)
+            abort_btn = gr.Button("⏹ Abort", variant="stop", scale=1)
+            reset_btn = gr.Button("🔄 Reset", variant="secondary", scale=1)
+            check_btn = gr.Button("🔎 Check endpoint", scale=1)
+
+        # ── Output: chat-style single-column ──────────────────────────────
+        # Transcript is the hero. The status line above it carries a
+        # sandbox-boot phase indicator so users know whether we're
+        # spawning E2B, installing opencode, or waiting for the agent.
+        # Everything else (reward, files, logprob trace, verifier, raw
+        # JSON) lives in collapsed accordions below. Matches the chat
+        # shape of local_ui.py.
+        status = gr.Markdown()
+        # Shared state: the active rollout_id so Abort and Reset can find it.
+        rollout_state = gr.State("")
+        transcript_html = gr.HTML(
+            value="<div class='empty'>run a rollout to see the transcript</div>",
+        )
 
-        with gr.Accordion("Workdir files", open=True):
+        # Hidden outputs retained only so the streaming handler's tuple
+        # shape doesn't have to change. They never render in the UI.
+        reward_out = gr.Number(visible=False)
+        wall_out = gr.Number(visible=False)
+        exit_out = gr.Number(visible=False)
+        turns_out = gr.Number(visible=False)
+        with gr.Accordion("Workdir files", open=False):
             workdir_md = gr.Markdown()
-        with gr.Accordion("Proxy trace (per turn)", open=False):
+        with gr.Accordion("Proxy trace (per turn — logprobs)", open=False):
             proxy_trace_json = gr.JSON(label=None)
-        with gr.Accordion("Verifier stdout / stderr", open=False):
-            verifier_out = gr.Textbox(label="stdout", lines=8)
-            verifier_err = gr.Textbox(label="stderr", lines=4)
+        with gr.Accordion("Diagnostics (proxy · install · agent logs)", open=False):
+            verifier_out = gr.Textbox(label="proxy/install/agent log tails", lines=12)
+            verifier_err = gr.Textbox(label="primitive error (if any)", lines=3)
        with gr.Accordion("Raw result JSON", open=False):
            raw_json = gr.JSON(label=None)
 
         # ── Streaming Run handler ─────────────────────────────────────────
         def _run_streaming(
+            backend_mode_v: str,
+            vllm_model_v: str,
             vllm_url_v: str,
-            model_v: str,
-            provider_v: str,
-            api_key_v: str,
+            hosted_model_v: str,
+            hosted_custom_id_v: str,
+            hf_token_v: str,
+            thinking_v: bool,
             mode_v: str,
             max_tokens_cap_v: int,
             agent_timeout_s_v: float,
             task_id_v: str,
             instruction_v: str,
-            test_script_v: str,
             setup_shell_v: str,
         ):
+            # Verifier is optional. For interactive use we pass an empty
+            # test_script so the finalizer skips scoring.
+            test_script_v = ""
+            # Assemble the uniform model_key from the UI's two-backend picker.
+            if backend_mode_v == "Self-hosted vLLM":
+                if not vllm_model_v.strip():
+                    yield _error_tuple("Self-hosted vLLM requires a model id.")
+                    return
+                model_key_v = f"vllm://{vllm_model_v.strip()}"
+            else:
+                if hosted_model_v == _CUSTOM_HF_KEY:
+                    cid = hosted_custom_id_v.strip()
+                    if not cid:
+                        yield _error_tuple(
+                            "Hosted 'Custom' picked but no model id entered."
+                        )
+                        return
+                    if not cid.startswith("hf-router://"):
+                        # Accept either plain "Org/Repo[:provider]" or a
+                        # fully-prefixed key.
+                        cid = f"hf-router://{cid}"
+                    model_key_v = cid
+                else:
+                    model_key_v = hosted_model_v
             """Gradio generator: yields UI updates as the rollout progresses.
 
             Uses the non-blocking fine-grained tools:
@@ -305,29 +408,52 @@ def opencode_ui_builder(
             import httpx
             from openenv.core.env_server.mcp_types import CallToolAction
 
-            # 0) Pre-flight: verify the LLM URL is reachable before burning
-            # an E2B sandbox on a URL typo.
+            # 0) Resolve the catalog pick into (base_url, api_key, model).
+            #    This validates the secret matches the selected backend.
+            try:
+                base_url, _api_key, _model, entry = resolve_endpoint(
+                    model_key_v,
+                    vllm_url=vllm_url_v,
+                    hf_token=hf_token_v,
+                )
+            except Exception as exc:
+                yield _error_tuple(f"config: {exc}")
+                return
+
+            # 1) Pre-flight: verify the endpoint is reachable before burning
+            #    an E2B sandbox on a URL typo / bad token.
             yield (
-                "🔎 **Validating LLM endpoint...**",
+                "🔎 **validating endpoint…**",
                 None, None, None, 0,
-                "", [], "", "", {"stage": "validate"},
+                "", [], "", "", {"stage": "validate", "backend": entry.backend},
+                "<div class='empty'>validating endpoint…</div>",
+                "",
             )
-            probe_url = vllm_url_v.rstrip("/")
-            if not probe_url.endswith("/v1"):
-                probe_url = probe_url + "/v1"
+            probe_headers: dict[str, str] = {}
+            if entry.backend == "hf_router":
+                probe_headers["Authorization"] = f"Bearer {hf_token_v}"
             try:
-                r = httpx.get(f"{probe_url}/models", timeout=15)
+                r = httpx.get(
+                    f"{base_url}/models", headers=probe_headers, timeout=15,
+                )
                 if r.status_code != 200:
                     yield _error_tuple(
-                        f"LLM URL {probe_url}/models returned HTTP {r.status_code}: {r.text[:200]}"
+                        f"{entry.backend} probe {base_url}/models → HTTP {r.status_code}: "
+                        f"{r.text[:200]}"
                    )
                    return
            except Exception as exc:
                yield _error_tuple(
-                    f"LLM URL unreachable: {type(exc).__name__}: {exc}"
+                    f"endpoint unreachable: {type(exc).__name__}: {exc}"
                )
                return
 
+            yield (
+                "🟡 **initialising env (creating MCP registry)…**",
+                None, None, None, 0, "", [], "", "", {"stage": "env_init"},
+                "<div class='empty'>initialising env…</div>",
+                "",
+            )
             try:
                 env = _get_env()
                 env.reset()
@@ -335,27 +461,23 @@ def opencode_ui_builder(
                 yield _error_tuple(f"env init failed: {type(exc).__name__}: {exc}")
                 return
 
-            # 1) start_rollout (returns in <1s — registry bookkeeping only)
+            # 2) start_rollout — uniform args: model_key + vllm_url + hf_token
+            #    + thinking. The env resolves via the catalog server-side.
             try:
                 start_obs = env.step(
                     CallToolAction(
                         tool_name="start_rollout",
                         arguments={
+                            "model_key": model_key_v,
                             "vllm_url": vllm_url_v,
-                            "model": model_v,
+                            "hf_token": hf_token_v,
+                            "thinking": bool(thinking_v),
                             "instruction": instruction_v,
                             "test_script": test_script_v,
                             "task_id": task_id_v,
                             "setup_shell": setup_shell_v,
                             "upload_files": {},
-                            "provider": provider_v,
-                            "api_key": api_key_v,
                             "mode": mode_v,
-                            # chat_template_kwargs.enable_thinking=false is a
-                            # harmless no-op for non-Qwen models (vLLM silently
-                            # ignores unknown template kwargs). Keep it on by
-                            # default so Qwen3/Qwen3.5 don't dump think blocks.
-                            "disable_thinking": True,
                             "max_tokens_cap": int(max_tokens_cap_v),
                             "agent_timeout_s": float(agent_timeout_s_v),
                         },
@@ -372,17 +494,21 @@ def opencode_ui_builder(
                 yield _error_tuple(f"start_rollout returned no rollout_id: {start_payload}")
                 return
 
-            # Initial UI update
+            # Initial UI update — yield the rollout_id into shared state so
+            # Abort / Reset can target the right rollout.
             yield (
-                f"**Started rollout** `{rollout_id}` — waiting for first turn…",
+                f"🟡 **rollout `{rollout_id}` started — booting sandbox…**",
                 None, None, None, 0,
                 "_(no files yet)_", [], "", "", start_payload,
+                "<div class='empty'>booting sandbox — this takes ~20–40s cold…</div>",
+                rollout_id,
             )
 
-            # 2) poll get_state every 2s, stream progress
+            # 2) Poll get_state + get_messages at 1s cadence. Show a sandbox
+            #    boot-phase label so users can tell "booting" from "stuck".
             deadline = time.time() + float(agent_timeout_s_v) + 120
+            t_started = float(start_payload.get("started_at") or time.time())
             status_str = "running"
-            state_payload: dict[str, Any] = {}
             while time.time() < deadline:
                 try:
                     state_obs = env.step(
@@ -390,27 +516,59 @@ def opencode_ui_builder(
                             tool_name="get_state",
                             arguments={"rollout_id": rollout_id},
                         ),
-                        timeout_s=30,
+                        timeout_s=20,
                     )
                     state_payload = _parse_result(state_obs)
                 except Exception as exc:
                     state_payload = {"error": f"{type(exc).__name__}: {exc}"}
 
+                # Live transcript — only meaningful once opencode serve has
+                # created its session (state_payload carries serve_session_id
+                # in that case). Before that, get_messages returns an empty
+                # list with a ``note`` field.
+                parts_list: list = []
+                transcript = "<div class='empty'>waiting for first part…</div>"
+                try:
+                    msg_obs = env.step(
+                        CallToolAction(
+                            tool_name="get_messages",
+                            arguments={"rollout_id": rollout_id},
+                        ),
+                        timeout_s=20,
+                    )
+                    msg_payload = _parse_result(msg_obs)
+                    parts_list = collect_parts_from_messages(
+                        msg_payload.get("messages") or []
+                    )
+                    if parts_list:
+                        transcript = render_transcript(parts_list)
+                except Exception:
+                    pass
+
                 status_str = state_payload.get("status", "?")
-                turns_so_far = state_payload.get("proxy_turns_so_far", 0)
-                elapsed = time.time() - float(start_payload.get("started_at") or time.time())
+                elapsed = time.time() - t_started
+                msg_count = len(
+                    (state_payload.get("messages") if isinstance(state_payload, dict) else None) or []
+                )
+                # Prefer message count from the transcript payload.
+                try:
+                    msg_count = len(msg_payload.get("messages") or [])
+                except Exception:
+                    msg_count = 0
+                phase = _boot_phase(state_payload, msg_count, len(parts_list))
 
                 yield (
-                    f"**Rollout** `{rollout_id}` · status=`{status_str}` · "
-                    f"turns so far: `{turns_so_far}` · elapsed: `{elapsed:.1f}s`",
-                    None, None, None, turns_so_far,
+                    f"{phase} · elapsed `{elapsed:.1f}s` · rollout `{rollout_id}`",
+                    None, None, None, state_payload.get("proxy_turns_so_far", 0),
                     "_(workdir populated on finalize)_",
                     [], "", "", state_payload,
+                    transcript,
+                    rollout_id,
                 )
 
                 if status_str == "done":
                     break
-                time.sleep(2.0)
+                time.sleep(1.0)
 
             # 3) finalize_rollout — run verifier + collect full result
             try:
@@ -429,6 +587,38 @@ def opencode_ui_builder(
             status_md = _summarize_status(result)
             wd_md = _render_workdir(result.get("workdir_files") or {})
             turns = result.get("proxy_turns") or []
+
+            # One last transcript fetch — captures any final parts that
+            # arrived between the last poll and session.idle.
+            final_transcript = "<div class='empty'>(transcript unavailable)</div>"
+            try:
+                msg_obs = env.step(
+                    CallToolAction(
+                        tool_name="get_messages",
+                        arguments={"rollout_id": rollout_id},
+                    ),
+                    timeout_s=30,
+                )
+                msg_payload = _parse_result(msg_obs)
+                parts = collect_parts_from_messages(msg_payload.get("messages") or [])
+                final_transcript = render_transcript(parts)
+            except Exception:
+                pass
+
+            # Diagnostics pane: concat the three log tails so failures
+            # are visible without expanding the raw JSON.
+            diag_tail = "\n".join([
+                "--- PROXY LOG TAIL ---",
+                (result.get("proxy_log_tail") or "(empty)")[-2000:],
+                "",
+                "--- INSTALL LOG TAIL ---",
+                (result.get("install_log_tail") or "(empty)")[-1000:],
+                "",
+                "--- AGENT LOG TAIL ---",
+                (result.get("agent_log_tail") or "(empty)")[-2000:],
+            ])
+            err_line = result.get("error") or ""
+
             yield (
                 status_md,
                 result.get("reward"),
@@ -437,69 +627,156 @@ def opencode_ui_builder(
                 len(turns),
                 wd_md,
                 turns,
-                (result.get("verifier_stdout") or "")[:4000],
-                (result.get("verifier_stderr") or "")[:2000],
+                diag_tail,
+                err_line,
                 result,
+                final_transcript,
+                rollout_id,
             )
 
+        _output_widgets = [
+            status, reward_out, wall_out, exit_out, turns_out,
+            workdir_md, proxy_trace_json,
+            verifier_out, verifier_err, raw_json,
+            transcript_html, rollout_state,
+        ]
         run_btn.click(
             _run_streaming,
             inputs=[
-                vllm_url, model, provider, api_key, mode,
+                backend_mode,
+                vllm_model, vllm_url,
+                hosted_model, hosted_custom_id, hf_token,
+                thinking, mode,
                 max_tokens_cap, agent_timeout_s,
-                task_id, instruction, test_script, setup_shell,
-            ],
-            outputs=[
-                status, reward_out, wall_out, exit_out, turns_out,
-                workdir_md, proxy_trace_json,
-                verifier_out, verifier_err, raw_json,
+                task_id, instruction, setup_shell,
             ],
+            outputs=_output_widgets,
        )
 
-        # Check-URL handler — cheap GET /v1/models probe. Wires here so the
-        # outputs (status, etc.) are already defined.
-        def _check_url(vllm_url_v: str) -> str:
+        # Check-endpoint handler — cheap GET /v1/models probe against the
+        # currently-configured backend.
+        def _check_endpoint(
+            backend_mode_v: str,
+            vllm_model_v: str, vllm_url_v: str,
+            hosted_model_v: str, hosted_custom_id_v: str, hf_token_v: str,
+        ) -> str:
            import httpx
-            url = vllm_url_v.rstrip("/")
-            if not url.endswith("/v1"):
-                url = url + "/v1"
-            models_url = f"{url}/models"
+            if backend_mode_v == "Self-hosted vLLM":
+                model_key_v = f"vllm://{(vllm_model_v or '').strip()}"
+            else:
+                if hosted_model_v == _CUSTOM_HF_KEY:
+                    cid = (hosted_custom_id_v or "").strip()
+                    if not cid:
+                        return "❌ custom HF model id is empty"
+                    model_key_v = cid if cid.startswith("hf-router://") else f"hf-router://{cid}"
+                else:
+                    model_key_v = hosted_model_v
             try:
-                r = httpx.get(models_url, timeout=15)
+                base_url, _key, _model, entry = resolve_endpoint(
+                    model_key_v, vllm_url=vllm_url_v, hf_token=hf_token_v,
+                )
+            except Exception as exc:
+                return f"❌ {exc}"
+            headers = {"Authorization": f"Bearer {hf_token_v}"} if entry.backend == "hf_router" else {}
+            models_url = f"{base_url}/models"
+            try:
+                r = httpx.get(models_url, headers=headers, timeout=15)
            except Exception as exc:
                return f"❌ `{models_url}` unreachable: `{type(exc).__name__}: {exc}`"
            if r.status_code != 200:
                return f"❌ `{models_url}` → HTTP {r.status_code}\n```\n{r.text[:400]}\n```"
            try:
-                body = r.json()
-                ids = [m.get("id") for m in body.get("data", []) if m.get("id")]
+                ids = [m.get("id") for m in r.json().get("data", []) if m.get("id")]
            except Exception:
                ids = []
+            hint = f" · backend=`{entry.backend}` · resolved=`{_model}`"
            if ids:
-                return f"✅ reachable · served models: `{', '.join(ids)}`"
-            return "⚠️ reachable (HTTP 200) but no `data[*].id` in response"
+                shown = ", ".join(ids[:5]) + (f", … (+{len(ids)-5} more)" if len(ids) > 5 else "")
+                return f"✅ reachable{hint} · models: `{shown}`"
+            return f"⚠️ reachable (HTTP 200) but no `data[*].id` in response{hint}"
 
-        check_btn.click(_check_url, inputs=[vllm_url], outputs=[status])
+        check_btn.click(
+            _check_endpoint,
+            inputs=[backend_mode, vllm_model, vllm_url, hosted_model, hosted_custom_id, hf_token],
+            outputs=[status],
+        )
+
+        # ── Abort handler ────────────────────────────────────────────────
+        # Fire-and-forget abort on the active rollout. Keeps the env + UI
+        # state so the user can see what the transcript looked like at the
+        # moment of abort.
+        def _abort(current_rollout_id: str) -> tuple:
+            from openenv.core.env_server.mcp_types import CallToolAction
+            if not current_rollout_id:
+                return (
+                    "⚠️ nothing to abort (no active rollout).",
+                    None, None, None, None,
+                    "", [], "", "", {"abort": "no-op"},
+                    gr.update(), current_rollout_id,
+                )
+            try:
+                env = _get_env()
+                env.step(
+                    CallToolAction(
+                        tool_name="abort_rollout",
+                        arguments={"rollout_id": current_rollout_id},
+                    ),
+                    timeout_s=30,
+                )
+            except Exception as exc:  # noqa: BLE001
+                return (
+                    f"⚠️ abort failed: `{type(exc).__name__}: {exc}`",
+                    None, None, None, None,
+                    "", [], "", "", {"abort": str(exc)},
+                    gr.update(), current_rollout_id,
+                )
+            return (
+                f"⏹ **aborted** rollout `{current_rollout_id}`",
+                None, None, None, None,
+                "", [], "", "", {"abort": current_rollout_id},
+                gr.update(), current_rollout_id,
+            )
 
-        # Reset handler — drop cached env so the next Run creates a fresh
-        # OpenCodeEnvironment (new rollout registry, new state).
-        def _reset() -> tuple:
+        abort_btn.click(
+            _abort,
+            inputs=[rollout_state],
+            outputs=_output_widgets,
+        )
+
+        # ── Reset handler ────────────────────────────────────────────────
+        # Aborts any in-flight rollout, drops the cached env so the next Run
+        # creates a fresh :class:`OpenCodeEnvironment` (new MCP registry),
+        # and clears all UI panels including the transcript.
+        def _reset(current_rollout_id: str) -> tuple:
+            from openenv.core.env_server.mcp_types import CallToolAction
+            if current_rollout_id:
+                try:
+                    env = _get_env()
+                    env.step(
+                        CallToolAction(
+                            tool_name="abort_rollout",
+                            arguments={"rollout_id": current_rollout_id},
+                        ),
+                        timeout_s=30,
+                    )
+                except Exception:
+                    # Best-effort — if abort fails, still drop the env below
+                    # so the next Run starts clean.
+                    pass
            _env_cache["instance"] = None
            return (
-                "🔄 **Reset.** Next Run will create a fresh environment.",
+                "🔄 **reset.** next Run will create a fresh environment.",
                None, None, None, None,
                "_(workdir cleared)_",
                [], "", "", {"reset": True},
+                "<div class='empty'>run a rollout to see the transcript</div>",
+                "",
            )
 
        reset_btn.click(
            _reset,
-            inputs=[],
-            outputs=[
-                status, reward_out, wall_out, exit_out, turns_out,
-                workdir_md, proxy_trace_json,
-                verifier_out, verifier_err, raw_json,
-            ],
+            inputs=[rollout_state],
+            outputs=_output_widgets,
        )
 
        return demo
@@ -508,11 +785,37 @@ def opencode_ui_builder(
 # ── Helpers ─────────────────────────────────────────────────────────────────
 
 
-def _error_tuple(msg: str) -> tuple:
+def _error_tuple(msg: str, rollout_id: str = "") -> tuple:
     return (
         f"❌ **Error:** `{msg}`",
         None, None, None, None,
         "", [], "", "", {"error": msg},
+        f"<div class='errbox'>❌ {msg}</div>",
+        rollout_id,
     )
+
+
+def _boot_phase(state: dict, msg_count: int, parts_count: int) -> str:
+    """Human-readable sandbox + session boot phase label."""
+    if state.get("error"):
+        return f"⚠️ state error: `{state.get('error')}`"
+    status = state.get("status", "?")
+    if status == "unknown":
+        return "⏳ **starting rollout…**"
+    serve_sid = state.get("serve_session_id")
+    if not serve_sid:
+        return (
+            "🟡 **booting sandbox** — spawning E2B, installing opencode,
809
+ "starting proxy + opencode serve (this takes ~20–40s cold)"
810
+ )
811
+ if msg_count == 0:
812
+ return "🟑 **creating session** β€” serve is up, prompt about to fire"
813
+ if parts_count == 0:
814
+ return "πŸ’­ **agent thinking** β€” first LLM call in flight"
815
+ turns = state.get("proxy_turns_so_far", 0)
816
+ return (
817
+ f"⚑ **running** Β· serve session `{serve_sid[:14]}…` Β· "
818
+ f"parts `{parts_count}` Β· turns `{turns}`"
819
  )
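
For reference, the label progression `_boot_phase` produces as a rollout comes up (illustrative inputs, outputs derived directly from the branches above):

    _boot_phase({"status": "unknown"}, 0, 0)
    # -> "⏳ **starting rollout…**"
    _boot_phase({"status": "running"}, 0, 0)
    # -> "🟡 **booting sandbox** — …"  (no serve_session_id yet)
    _boot_phase({"status": "running", "serve_session_id": "ses_abc123"}, 0, 0)
    # -> "🟡 **creating session** — serve is up, prompt about to fire"
    _boot_phase({"status": "running", "serve_session_id": "ses_abc123",
                 "proxy_turns_so_far": 3}, 2, 7)
    # -> "⚡ **running** · serve session `ses_abc123…` · parts `7` · turns `3`"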
server/opencode_environment.py CHANGED
@@ -31,6 +31,11 @@ from fastmcp import FastMCP
 from openenv.core.env_server.mcp_environment import MCPEnvironment
 from openenv.core.env_server.types import Action, Observation
 
+try:
+    from .catalog import resolve_endpoint
+except ImportError:  # pragma: no cover
+    from catalog import resolve_endpoint  # type: ignore
+
 load_dotenv()
 
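Note that `resolve_endpoint` lives in `server/catalog.py`, which is not part of this diff. A minimal sketch of the contract its call sites assume (names, defaults, and the entry fields are inferred from usage here and in the UI; the real catalog may differ):

    # Sketch only: inferred from call sites, not the actual server/catalog.py.
    from dataclasses import dataclass

    @dataclass
    class CatalogEntry:
        backend: str                     # "vllm" or "hf_router" (checked by the UI)
        supports_thinking: bool = False  # gates the `thinking` flag

    def resolve_endpoint(
        model_key: str, *, vllm_url: str = "", hf_token: str = ""
    ) -> tuple[str, str, str, CatalogEntry]:
        """Map a catalog key to (base_url, api_key, model_id, entry)."""
        if model_key.startswith("vllm://"):
            if not vllm_url:
                raise ValueError("vllm:// model_key requires vllm_url")
            return (vllm_url.rstrip("/"), "anything",
                    model_key[len("vllm://"):], CatalogEntry("vllm"))
        if model_key.startswith("hf-router://"):
            if not hf_token:
                raise ValueError("hf-router:// model_key requires hf_token")
            return ("https://router.huggingface.co/v1", hf_token,
                    model_key[len("hf-router://"):], CatalogEntry("hf_router"))
        raise ValueError(f"unknown model_key: {model_key!r}")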
 
@@ -168,56 +173,62 @@ class OpenCodeEnvironment(MCPEnvironment):
 
         @mcp.tool
         def run_rollout(
-            vllm_url: str,
-            model: str,
+            model_key: str,
             instruction: str,
             test_script: str,
+            vllm_url: str = "",
+            hf_token: str = "",
+            thinking: bool = False,
             task_id: str = "",
             setup_shell: str = "",
             upload_files: Optional[dict[str, str]] = None,
-            provider: str = "openai_compatible",
-            api_key: str = "intercepted",
             mode: str = "transparent_proxy",
-            disable_thinking: bool = False,
             max_tokens_cap: int = 4096,
             agent_timeout_s: float = 600.0,
         ) -> str:
             """Run one OpenCode rollout end-to-end.
 
             Args:
-                vllm_url: LLM endpoint (``https://host/v1``).
-                model: Model id the provider recognizes.
+                model_key: Catalog key — one of the entries in
+                    :data:`server.catalog.CATALOG`. Shape is
+                    ``"vllm://<repo>"`` or ``"hf-router://<repo>:<provider>"``.
                 instruction: Prompt passed to ``opencode run``.
                 test_script: Bash verifier. Must write a float reward to
                     ``/home/user/logs/verifier/reward.txt``.
+                vllm_url: Required when ``model_key`` is a ``vllm://...``
+                    entry. The tunneled or in-cluster ``/v1`` endpoint.
+                hf_token: Required when ``model_key`` is a
+                    ``hf-router://...`` entry. User's HF token.
+                thinking: Enable Qwen-style thinking mode. Ignored for
+                    models where ``supports_thinking`` is False. Passed to
+                    the proxy as ``chat_template_kwargs.enable_thinking``.
                 task_id: Optional identifier echoed back for traceability.
                 setup_shell: Optional shell run before opencode starts.
-                upload_files: Optional {remote_path: content} staged into the
-                    sandbox.
-                provider: OpenCodeConfig provider id. For vLLM use
-                    ``"openai_compatible"``; for real OpenAI ``"openai"``.
-                api_key: Provider API key. vLLM ignores this.
-                mode: ``"transparent_proxy"`` (captures per-turn logprobs) or
-                    ``"black_box"`` (direct connection, no logprobs).
-                disable_thinking: Qwen3/Qwen3.5 proxy-side thinking disable.
+                upload_files: Optional ``{remote_path: content}`` staged
+                    into the sandbox.
+                mode: ``"transparent_proxy"`` (captures per-turn logprobs)
+                    or ``"black_box"`` (direct connection, no logprobs).
                 max_tokens_cap: Clamp forwarded ``max_tokens``.
                 agent_timeout_s: Max opencode runtime in seconds.
 
             Returns:
                 JSON-serialized :class:`RolloutResult`.
             """
+            base_url, api_key, model, _entry = resolve_endpoint(
+                model_key, vllm_url=vllm_url, hf_token=hf_token
+            )
             return self._run_rollout_impl(
-                vllm_url=vllm_url,
+                vllm_url=base_url,
                 model=model,
                 instruction=instruction,
                 test_script=test_script,
                 task_id=task_id,
                 setup_shell=setup_shell,
                 upload_files=upload_files or {},
-                provider=provider,
+                provider="openai_compatible",
                 api_key=api_key,
                 mode=mode,
-                disable_thinking=disable_thinking,
+                disable_thinking=not bool(thinking),
                 max_tokens_cap=max_tokens_cap,
                 agent_timeout_s=agent_timeout_s,
             )
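With the uniform signature above, a caller needs only a catalog key plus the matching credential. A sketch of invoking the tool through an `OpenCodeEnvironment` handle, mirroring how the Gradio UI drives `get_messages` (the model key is the smoke script's default; the verifier is a throwaway example):

    from openenv.core.env_server.mcp_types import CallToolAction

    obs = env.step(
        CallToolAction(
            tool_name="run_rollout",
            arguments={
                "model_key": "hf-router://Qwen/Qwen3.5-397B-A17B:together",
                "hf_token": "hf_...",
                "instruction": "write hello.py and run it",
                # Toy verifier: reward 1.0 iff the file exists.
                "test_script": (
                    "mkdir -p /home/user/logs/verifier\n"
                    "if [ -f /home/user/workdir/hello.py ]; then v=1.0; else v=0.0; fi\n"
                    "echo $v > /home/user/logs/verifier/reward.txt"
                ),
            },
        ),
        timeout_s=900,
    )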
@@ -230,41 +241,46 @@ class OpenCodeEnvironment(MCPEnvironment):
 
         @mcp.tool
         def start_rollout(
-            vllm_url: str,
-            model: str,
+            model_key: str,
             instruction: str,
             test_script: str = "",
+            vllm_url: str = "",
+            hf_token: str = "",
+            thinking: bool = False,
             task_id: str = "",
             setup_shell: str = "",
             upload_files: Optional[dict[str, str]] = None,
-            provider: str = "openai_compatible",
-            api_key: str = "intercepted",
             mode: str = "transparent_proxy",
-            disable_thinking: bool = False,
             max_tokens_cap: int = 4096,
             agent_timeout_s: float = 600.0,
         ) -> str:
             """Start a rollout asynchronously; return a ``rollout_id`` immediately.
 
-            Spawns a background worker that creates the sandbox, installs
-            opencode, boots ``opencode serve``, and fires the instruction.
-            The caller then uses ``subscribe_events`` / ``get_state`` /
-            ``abort_rollout`` / ``finalize`` with the returned id.
+            Same uniform args as :func:`run_rollout`: ``model_key``, plus
+            ``vllm_url`` OR ``hf_token`` (depending on backend), plus
+            ``thinking``. Spawns a background worker that creates the
+            sandbox, installs opencode, boots ``opencode serve``, and
+            fires the instruction. The caller then uses
+            ``subscribe_events`` / ``get_state`` / ``abort_rollout`` /
+            ``finalize`` with the returned id.
             """
+            base_url, api_key, model, _entry = resolve_endpoint(
+                model_key, vllm_url=vllm_url, hf_token=hf_token
+            )
             rid = uuid4().hex[:12]
             handle = self._spawn_async_rollout(
                 rollout_id=rid,
-                vllm_url=vllm_url,
+                vllm_url=base_url,
                 model=model,
                 instruction=instruction,
                 test_script=test_script,
                 task_id=task_id,
                 setup_shell=setup_shell,
                 upload_files=upload_files or {},
-                provider=provider,
+                provider="openai_compatible",
                 api_key=api_key,
                 mode=mode,
-                disable_thinking=disable_thinking,
+                disable_thinking=not bool(thinking),
                 max_tokens_cap=max_tokens_cap,
                 agent_timeout_s=agent_timeout_s,
             )
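For reference, the full async lifecycle from the client side, using the env handle and `_parse_result` helper from `server/gradio_ui.py` plus the renderers from `server/transcript.py`. This is a sketch: the tool names are the ones registered in this file, `get_messages` is added a few hunks below, and the exact `start_rollout` response field (`rollout_id`) is an assumption not shown in this diff:

    import time

    from openenv.core.env_server.mcp_types import CallToolAction

    payload = _parse_result(env.step(CallToolAction(
        tool_name="start_rollout",
        arguments={
            "model_key": "vllm://Qwen/Qwen3.5-4B",
            "vllm_url": "https://my-tunnel.example/v1",
            "instruction": "write hello.py and run it",
        },
    ), timeout_s=60))
    rollout_id = payload["rollout_id"]  # assumed field name

    while True:
        msgs = _parse_result(env.step(CallToolAction(
            tool_name="get_messages",
            arguments={"rollout_id": rollout_id},
        ), timeout_s=30))
        parts = collect_parts_from_messages(msgs.get("messages") or [])
        html = render_transcript(parts)   # push into the transcript panel
        if msgs.get("status") != "running":
            break                          # "done" or "unknown"
        time.sleep(2.0)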
@@ -305,6 +321,53 @@ class OpenCodeEnvironment(MCPEnvironment):
                 "finished_at": handle.finished_at,
             })
 
+        @mcp.tool
+        def get_messages(rollout_id: str) -> str:
+            """Return the sandbox-side opencode serve transcript for a rollout.
+
+            Shape matches opencode's ``GET /session/:id/message`` —
+            ``{"messages": [{info, parts}, ...]}``. Empty ``messages`` list
+            if the rollout hasn't created its serve session yet, isn't
+            running under the ``serve`` driver, or fetching the transcript
+            failed. Designed for UI polling to render a live chat view.
+            """
+            handle = self._registry.get(rollout_id)
+            if handle is None:
+                return json.dumps({"rollout_id": rollout_id, "messages": [], "status": "unknown"})
+            session = handle.session
+            status = "done" if handle.is_done() else "running"
+            if session is None:
+                return json.dumps({
+                    "rollout_id": rollout_id,
+                    "messages": [],
+                    "status": status,
+                    "error": handle.error,
+                })
+            serve_client = getattr(session, "serve_client", None)
+            serve_sid = getattr(session, "serve_session_id", None)
+            if serve_client is None or not serve_sid:
+                return json.dumps({
+                    "rollout_id": rollout_id,
+                    "messages": [],
+                    "status": status,
+                    "note": "no serve driver (transcript unavailable)",
+                })
+            try:
+                msgs = serve_client.list_messages(serve_sid) or []
+            except Exception as exc:  # noqa: BLE001
+                return json.dumps({
+                    "rollout_id": rollout_id,
+                    "messages": [],
+                    "status": status,
+                    "error": f"list_messages failed: {type(exc).__name__}: {exc}",
+                })
+            return json.dumps({
+                "rollout_id": rollout_id,
+                "messages": msgs,
+                "status": status,
+                "serve_session_id": serve_sid,
+            })
+
         @mcp.tool
         def abort_rollout(rollout_id: str) -> str:
             """Cancel an in-flight rollout.
@@ -449,13 +512,15 @@ class OpenCodeEnvironment(MCPEnvironment):
         result = self._result_cls(task_id=task_id, mode=mode)
         t0 = time.time()
 
-        provider_model = _qualify_model(provider, model)
-
+        # Pass the resolved model id straight through — the primitive now
+        # preserves ``config.model`` verbatim as the upstream model override,
+        # so any ``_qualify_model`` wrapping here would double-prefix and
+        # cause a 404 (``openai_compatible/Qwen/Qwen3.5-4B does not exist``).
         config = self._OpenCodeConfig(
             provider=provider,
             base_url=vllm_url.rstrip("/"),
             api_key=api_key,
-            model=provider_model,
+            model=model,
             agent_timeout_s=agent_timeout_s,
             proxy_disable_thinking=disable_thinking,
             proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
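Concretely, the failure mode the comment in this hunk guards against: the old helper prepended the provider id, and the primitive now also forwards `config.model` verbatim, so the two stacked up. An illustrative sketch of the removed behavior (the real `_qualify_model` body is not shown in this diff):

    def _qualify_model(provider: str, model: str) -> str:  # old behavior, removed
        return f"{provider}/{model}"

    _qualify_model("openai_compatible", "Qwen/Qwen3.5-4B")
    # -> "openai_compatible/Qwen/Qwen3.5-4B", which vLLM rejects with a 404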
@@ -574,12 +639,13 @@ class OpenCodeEnvironment(MCPEnvironment):
             metadata={"task_id": task_id},
         )
 
-        provider_model = _qualify_model(provider, model)
+        # Pass model verbatim (no _qualify_model) — primitive now uses
+        # ``config.model`` as the upstream override directly.
         config = self._OpenCodeConfig(
             provider=provider,
             base_url=vllm_url.rstrip("/"),
             api_key=api_key,
-            model=provider_model,
+            model=model,
             agent_timeout_s=agent_timeout_s,
             proxy_disable_thinking=disable_thinking,
             proxy_max_tokens_cap=max_tokens_cap if max_tokens_cap > 0 else None,
@@ -597,12 +663,16 @@ class OpenCodeEnvironment(MCPEnvironment):
 
         def worker() -> None:
             try:
+                # serve driver: opencode serve runs inside the sandbox, the
+                # primitive fires the prompt via POST /session/:id/prompt_async,
+                # and ``list_messages(serve_session_id)`` is what powers the
+                # live chat transcript exposed via the ``get_messages`` tool.
                 factory = self._OpenCodeSessionFactory(
                     config=config,
                     sandbox_backend=self._E2BSandboxBackend(),
                     mode=mode,
                     verifier=None,
-                    driver="cli",  # serve is a state store, not an agent runner
+                    driver="serve",
                 )
                 handle.session = factory.create(task=task)
                 try:
server/sandbox_smoke.py ADDED
@@ -0,0 +1,320 @@
+"""Stand-alone E2B sandbox smoke — boot opencode serve, expose it publicly.
+
+This script isolates "can a sandbox even stand up opencode serve?" from
+the rest of the env (no MCP server, no proxy, no primitive, no UI). Good
+for when a full rollout fails and you want to rule out the sandbox path.
+
+What it does:
+  1. Create a fresh E2B sandbox.
+  2. Write ``~/.config/opencode/opencode.json`` pointing at either:
+     - the HF Router (default, just needs HF_TOKEN), or
+     - a user-provided vLLM URL.
+  3. Install opencode via the upstream one-liner.
+  4. Start ``opencode serve --port 4096 --hostname 0.0.0.0`` in bg.
+  5. ``sandbox.get_host(4096)`` → a public ``https://4096-<sbx>.e2b.app``.
+  6. Poll ``{public}/doc`` until it answers 200.
+  7. Print the public URL + ``sandbox_id`` and keep the sandbox alive so
+     you can hit it manually. Ctrl-C closes the sandbox.
+
+Usage:
+    # HF Router (default)
+    HF_TOKEN=hf_... uv run python server/sandbox_smoke.py
+
+    # or self-hosted vLLM
+    uv run python server/sandbox_smoke.py \\
+        --backend vllm \\
+        --vllm-url https://my-tunnel.example/v1 \\
+        --model Qwen/Qwen3.5-4B
+
+Once it prints the URL you can:
+
+    curl https://4096-<sbx>.e2b.app/global/health
+    curl https://4096-<sbx>.e2b.app/config
+    # create + send prompt
+    SID=$(curl -s -X POST https://4096-<sbx>.e2b.app/session \\
+        -H 'content-type: application/json' \\
+        -d '{"title":"smoke"}' | python3 -c 'import json,sys;print(json.load(sys.stdin)["id"])')
+    curl -X POST https://4096-<sbx>.e2b.app/session/$SID/prompt_async \\
+        -H 'content-type: application/json' \\
+        -d '{"parts":[{"type":"text","text":"write hello.py"}]}'
+    curl -N https://4096-<sbx>.e2b.app/event
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import signal
+import sys
+import time
+from pathlib import Path
+from typing import Any
+
+# Load the env-server's .env (E2B_API_KEY, HF_TOKEN, etc.) before importing
+# anything that needs them. Walks up from this file to find ``openenv/.env``.
+try:
+    from dotenv import load_dotenv
+
+    _env_path = Path(__file__).resolve().parent.parent / ".env"
+    if _env_path.is_file():
+        load_dotenv(_env_path, override=False)
+        print(f"loaded env from {_env_path}")
+except ImportError:
+    pass
+
+try:
+    from e2b_code_interpreter import Sandbox
+except ImportError:
+    from e2b import Sandbox  # type: ignore
+
+
+SERVE_PORT = 4096
+CONFIG_DIR = "/home/user/.config/opencode"
+CONFIG_PATH = f"{CONFIG_DIR}/opencode.json"
+LOG_DIR = "/home/user/logs/agent"
+SERVE_LOG = f"{LOG_DIR}/serve.log"
+
+
+def build_opencode_json(
+    *,
+    backend: str,
+    model_id: str,
+    base_url: str,
+    api_key: str,
+    context_limit: int = 32768,
+    output_limit: int = 16384,
+) -> str:
+    """Emit a minimal, valid opencode.json for the chosen backend."""
+    provider_id = "vllm" if backend == "vllm" else "hf-router"
+    return json.dumps({
+        "$schema": "https://opencode.ai/config.json",
+        "model": f"{provider_id}/{model_id}",
+        "provider": {
+            provider_id: {
+                "npm": "@ai-sdk/openai-compatible",
+                "name": f"{provider_id} (smoke)",
+                "options": {
+                    "baseURL": base_url,
+                    "apiKey": api_key,
+                    "timeout": 600_000,
+                },
+                "models": {
+                    model_id: {
+                        "name": model_id,
+                        "limit": {"context": context_limit, "output": output_limit},
+                    },
+                },
+            },
+        },
+        "tools": {"webfetch": False, "question": False},
+    }, indent=2)
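For reference, with ``--backend vllm --model Qwen/Qwen3.5-4B``, default limits, and a placeholder base URL, the function above emits:

    {
      "$schema": "https://opencode.ai/config.json",
      "model": "vllm/Qwen/Qwen3.5-4B",
      "provider": {
        "vllm": {
          "npm": "@ai-sdk/openai-compatible",
          "name": "vllm (smoke)",
          "options": {
            "baseURL": "https://my-tunnel.example/v1",
            "apiKey": "anything",
            "timeout": 600000
          },
          "models": {
            "Qwen/Qwen3.5-4B": {
              "name": "Qwen/Qwen3.5-4B",
              "limit": {"context": 32768, "output": 16384}
            }
          }
        }
      },
      "tools": {"webfetch": false, "question": false}
    }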
+
+_START = time.time()
+
+
+def log(msg: str) -> None:
+    """Timestamped progress line, flushed so it appears in real time."""
+    t = time.time() - _START
+    print(f"[{t:6.1f}s] {msg}", flush=True)
+
+
+def run_shell(sbx: Any, cmd: str, *, timeout_s: int = 120) -> tuple[int, str, str]:
+    """Run a shell command, return (exit_code, stdout, stderr)."""
+    out = sbx.commands.run(cmd, timeout=timeout_s)
+    return (out.exit_code, out.stdout or "", out.stderr or "")
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--backend", choices=["hf", "vllm"], default="hf")
+    ap.add_argument("--model", default="Qwen/Qwen3.5-397B-A17B:together")
+    ap.add_argument("--vllm-url", default="")
+    ap.add_argument("--hf-token", default=os.environ.get("HF_TOKEN", ""))
+    ap.add_argument("--sandbox-timeout-s", type=int, default=900)
+    ap.add_argument("--idle-hold-s", type=int, default=1200,
+                    help="keep the sandbox alive for this many seconds after boot")
+    args = ap.parse_args()
+
+    if args.backend == "hf":
+        if not args.hf_token:
+            print("ERROR: --backend hf needs --hf-token or $HF_TOKEN", file=sys.stderr)
+            return 2
+        base_url = "https://router.huggingface.co/v1"
+        api_key = args.hf_token
+    else:
+        if not args.vllm_url:
+            print("ERROR: --backend vllm needs --vllm-url", file=sys.stderr)
+            return 2
+        base_url = args.vllm_url.rstrip("/")
+        if not base_url.endswith("/v1"):
+            base_url += "/v1"
+        api_key = "anything"
+
+    if not os.environ.get("E2B_API_KEY"):
+        print("ERROR: E2B_API_KEY not set", file=sys.stderr)
+        return 2
+
+    log(f"[1/7] creating sandbox (timeout={args.sandbox_timeout_s}s) …")
+    sbx = Sandbox.create(timeout=args.sandbox_timeout_s)
+    log(f"      sandbox_id = {sbx.sandbox_id}")
+
+    try:
+        log("[2/7] mkdir config + logs …")
+        rc, out, err = run_shell(sbx, f"mkdir -p {CONFIG_DIR} {LOG_DIR}")
+        if rc != 0:
+            log(f"      FAIL rc={rc} stderr={err[:500]}")
+            return 1
+
+        log(f"[3/7] writing {CONFIG_PATH} …")
+        cfg = build_opencode_json(
+            backend=args.backend,
+            model_id=args.model,
+            base_url=base_url,
+            api_key=api_key,
+        )
+        sbx.files.write(CONFIG_PATH, cfg)
+        log(f"      backend={args.backend} model={args.model}")
+        log(f"      baseURL={base_url}")
+
+        log("[4/7] installing opencode via curl opencode.ai/install … (~10-30s cold)")
+        rc, out, err = run_shell(
+            sbx,
+            "curl -fsSL https://opencode.ai/install | bash 2>&1",
+            timeout_s=300,
+        )
+        log(f"      install rc={rc}")
+        if out:
+            for line in out.strip().splitlines()[-8:]:
+                log(f"      │ {line}")
+        if rc != 0:
+            log("      stderr tail:")
+            for line in (err or "").strip().splitlines()[-10:]:
+                log(f"      │ {line}")
+            return 1
+
+        log("[5/7] verifying opencode binary …")
+        rc, out, err = run_shell(sbx, '$HOME/.opencode/bin/opencode --version')
+        log(f"      opencode --version rc={rc} out={(out or '').strip()[:120]}")
+        if rc != 0:
+            log(f"      stderr: {(err or '')[:400]}")
+            return 1
+
+        log(f"[6/7] starting opencode serve in bg on :{SERVE_PORT} …")
+        serve_cmd = (
+            'export PATH="$HOME/.opencode/bin:$PATH" && '
+            f"opencode serve --port {SERVE_PORT} --hostname 0.0.0.0 "
+            f"> {SERVE_LOG} 2>&1"
+        )
+        serve_bg = sbx.commands.run(serve_cmd, background=True)
+        log(f"      serve pid = {getattr(serve_bg, 'pid', '?')}")
+
+        host = sbx.get_host(SERVE_PORT)
+        public_url = f"https://{host}"
+        log(f"      public URL = {public_url}")
+
+        log("[7/7] waiting for /doc to answer (polls every 0.5s for 60s) …")
+        import httpx
+        ok = False
+        for i in range(120):
+            try:
+                r = httpx.get(f"{public_url}/doc", timeout=5)
+                if r.status_code == 200:
+                    log(f"      /doc ok (poll #{i+1}, {i*0.5:.1f}s)")
+                    ok = True
+                    break
+                elif i % 6 == 5:  # ~every 3s print progress
+                    log(f"      /doc → HTTP {r.status_code} (still trying, {i*0.5:.1f}s)")
+            except Exception as exc:
+                if i % 6 == 5:
+                    log(f"      /doc unreachable ({type(exc).__name__}, {i*0.5:.1f}s)")
+            time.sleep(0.5)
+        if not ok:
+            log("      /doc never answered — tailing serve log (last 2KB):")
+            try:
+                tail = sbx.files.read(SERVE_LOG)[-2000:]
+            except Exception as exc:
+                tail = f"(could not read log: {exc})"
+            for line in tail.splitlines()[-40:]:
+                log(f"      │ {line}")
+            return 1
+
+        print("\n" + "=" * 70)
+        print("sandbox is up — manual probe recipes:")
+        print("=" * 70)
+        print(f"curl -s {public_url}/global/health | jq .")
+        print(f"curl -s {public_url}/config | jq '.model, .provider'")
+        print()
+        print(f"SID=$(curl -s -X POST {public_url}/session \\")
+        print("    -H 'content-type: application/json' \\")
+        print("    -d '{\"title\":\"smoke\"}' | jq -r .id)")
+        print(f"curl -X POST {public_url}/session/$SID/prompt_async \\")
+        print("    -H 'content-type: application/json' \\")
+        print("    -d '{\"parts\":[{\"type\":\"text\",\"text\":\"write hello.py and run it\"}]}'")
+        print(f"curl -N {public_url}/event   # SSE stream")
+        print()
+        print(f"serve log:  sbx.files.read('{SERVE_LOG}')")
+        print(f"sandbox_id: {sbx.sandbox_id}")
+        print(f"holding for up to {args.idle_hold_s}s — Ctrl-C to close")
+        print("=" * 70 + "\n")
+
+        stopper = {"stop": False}
+        def _sigh(*_a):
+            print("\nsignal — closing sandbox")
+            stopper["stop"] = True
+        signal.signal(signal.SIGINT, _sigh)
+        signal.signal(signal.SIGTERM, _sigh)
+
+        # Periodic /doc ping so we catch opencode-serve crashes in real time.
+        # Any non-200 (incl. E2B's 502 "port not open") is a crash signal —
+        # dump serve.log and stop the hold.
+        import httpx
+        last_ok_ts = time.time()
+        deadline = time.time() + args.idle_hold_s
+        def _dump_serve_log() -> None:
+            try:
+                tail = sbx.files.read(SERVE_LOG)
+                log("    --- serve.log tail (last 4KB) ---")
+                for line in tail[-4000:].splitlines()[-60:]:
+                    log(f"    │ {line}")
+                log("    --- end serve.log ---")
+            except Exception as exc2:
+                log(f"    could not read serve.log: {exc2}")
+            # Also list workdir so we can see if the agent did anything.
+            try:
+                rc, out, err = run_shell(sbx, "ls -la /home/user/workdir 2>&1 | head -40")
+                log("    --- workdir ls ---")
+                for line in (out or err).splitlines():
+                    log(f"    │ {line}")
+            except Exception:
+                pass
+        while time.time() < deadline and not stopper["stop"]:
+            try:
+                r = httpx.get(f"{public_url}/doc", timeout=5)
+                if r.status_code == 200:
+                    last_ok_ts = time.time()
+                else:
+                    log(f"!!! /doc → HTTP {r.status_code} "
+                        f"(last ok {time.time()-last_ok_ts:.1f}s ago) — "
+                        f"opencode serve appears dead, dumping log")
+                    _dump_serve_log()
+                    break
+            except Exception as exc:
+                log(f"!!! /doc probe failed: {type(exc).__name__}: {exc} "
+                    f"(last ok {time.time()-last_ok_ts:.1f}s ago)")
+                _dump_serve_log()
+                break
+            time.sleep(10.0)
+        return 0
+
+    finally:
+        try:
+            print("killing sandbox …")
+            sbx.kill()
+        except Exception as exc:
+            print(f"  kill failed (probably already dead): {exc}")
+
+
+if __name__ == "__main__":
+    sys.exit(main())
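The curl recipes the script prints translate one-to-one to Python, e.g. (sketch; substitute the URL the script actually prints):

    import httpx

    public_url = "https://4096-<sbx>.e2b.app"  # printed by step [6/7]
    sid = httpx.post(f"{public_url}/session", json={"title": "smoke"},
                     timeout=15).json()["id"]
    httpx.post(
        f"{public_url}/session/{sid}/prompt_async",
        json={"parts": [{"type": "text", "text": "write hello.py and run it"}]},
        timeout=30,
    ).raise_for_status()
    # Transcript, in the same message+parts shape server/transcript.py consumes:
    print(httpx.get(f"{public_url}/session/{sid}/message", timeout=15).json())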
server/transcript.py ADDED
@@ -0,0 +1,237 @@
+"""Shared transcript rendering used by both UIs.
+
+Both ``local_ui.py`` (driving a raw ``opencode serve``) and the deployed
+``server/gradio_ui.py`` (driving an in-sandbox ``opencode serve`` through
+the env's MCP tools) consume the same opencode message+parts shape:
+
+    messages: [
+        {
+            "info": {id, role, sessionID, time, ...},
+            "parts": [
+                {"type": "step-start", ...},
+                {"type": "reasoning", "text": ..., "id": ...},
+                {"type": "text", "text": ..., "id": ...},
+                {"type": "tool", "tool": "...", "state": {status, input, output}, ...},
+                {"type": "step-finish", "tokens": {...}, ...},
+            ],
+        },
+        ...
+    ]
+
+or the flat SSE form:
+
+    events: [{"type": "message.part.updated", "properties": {"part": {...}}}, ...]
+
+Both reduce to an ordered list of parts keyed on ``part.id``.
+"""
+
+from __future__ import annotations
+
+import html as _html
+import json
+from typing import Any
+
+
+# ── Part collection ────────────────────────────────────────────────────────
+
+
+def collect_parts_from_events(events: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Reduce SSE ``message.part.updated`` frames to the latest snapshot per ``part.id``.
+
+    Used by ``local_ui.py`` (direct SSE consumer).
+    """
+    order: list[str] = []
+    latest: dict[str, dict[str, Any]] = {}
+    for ev in events:
+        if ev.get("type") != "message.part.updated":
+            continue
+        p = (ev.get("properties") or {}).get("part") or {}
+        pid = p.get("id")
+        if not pid:
+            continue
+        if pid not in latest:
+            order.append(pid)
+        latest[pid] = p
+    return [latest[i] for i in order]
+
+
+def collect_parts_from_messages(
+    messages: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Flatten the ``GET /session/:id/message`` shape into an ordered parts list.
+
+    Used by the deployed Gradio UI, which polls via the ``get_messages`` MCP tool.
+    Message order is preserved; within a message the server returns parts in
+    emission order, so no further sorting is needed.
+    """
+    parts: list[dict[str, Any]] = []
+    for m in messages or []:
+        if not isinstance(m, dict):
+            continue
+        for p in m.get("parts") or []:
+            if isinstance(p, dict):
+                parts.append(p)
+    return parts
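A tiny worked example of both collectors, on the shapes from the module docstring (values illustrative):

    events = [
        {"type": "message.part.updated",
         "properties": {"part": {"id": "p1", "type": "text", "text": "hi"}}},
        {"type": "message.part.updated",
         "properties": {"part": {"id": "p1", "type": "text", "text": "hi there"}}},
    ]
    assert collect_parts_from_events(events) == [
        {"id": "p1", "type": "text", "text": "hi there"}  # latest snapshot wins
    ]

    messages = [{"info": {"role": "assistant"},
                 "parts": [{"id": "p1", "type": "text", "text": "done"}]}]
    assert collect_parts_from_messages(messages) == [
        {"id": "p1", "type": "text", "text": "done"}
    ]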
+
+
+# ── Rendering ──────────────────────────────────────────────────────────────
+
+
+def _esc(s: Any) -> str:
+    return _html.escape("" if s is None else str(s))
+
+
+def _cap(s: str, n: int = 6000) -> str:
+    if len(s) <= n:
+        return s
+    return s[:n] + f"\n… ({len(s) - n} chars hidden)"
+
+
+def _todo_icon(status: str | None) -> str:
+    return {"completed": "✅", "in_progress": "🔄"}.get(status or "", "⏳")
+
+
+def fmt_tool(name: str, state: dict[str, Any], raw: dict[str, Any]) -> str:
+    """Per-tool card — mirrors opencode's own UI shapes."""
+    status = (state or {}).get("status") or "?"
+    inp = (state or {}).get("input") or raw.get("input") or {}
+    out = (state or {}).get("output") or raw.get("output") or ""
+    badge = {"completed": "ok", "error": "err", "running": "run"}.get(status, "")
+
+    if name == "read":
+        summary = f"📖 read <code>{_esc(inp.get('filePath') or inp.get('path'))}</code>"
+        body = f"<pre>{_esc(_cap(str(out)))}</pre>"
+    elif name == "write":
+        path = inp.get("filePath") or inp.get("path")
+        content = inp.get("content") or ""
+        summary = f"✍️ write <code>{_esc(path)}</code> ({len(content)} chars)"
+        body = f"<pre>{_esc(_cap(content))}</pre>"
+    elif name == "edit":
+        path = inp.get("filePath") or inp.get("path")
+        old = inp.get("oldString") or ""
+        new = inp.get("newString") or ""
+        summary = f"✏️ edit <code>{_esc(path)}</code>"
+        body = (
+            f"<div class='lbl'>- old</div><pre class='del'>{_esc(_cap(old, 3000))}</pre>"
+            f"<div class='lbl'>+ new</div><pre class='add'>{_esc(_cap(new, 3000))}</pre>"
+        )
+        if out:
+            body += f"<div class='lbl'>output</div><pre>{_esc(_cap(str(out), 2000))}</pre>"
+    elif name == "bash":
+        cmd = inp.get("command") or inp.get("cmd") or ""
+        summary = f"⚡ bash <code>{_esc(cmd[:160])}</code>"
+        body = f"<pre>{_esc(_cap(str(out)))}</pre>"
+    elif name in ("glob", "find"):
+        pattern = inp.get("pattern") or inp.get("query") or ""
+        summary = f"🔎 {name} <code>{_esc(pattern)}</code>"
+        body = f"<pre>{_esc(_cap(str(out), 4000))}</pre>"
+    elif name == "grep":
+        pattern = inp.get("pattern") or ""
+        path = inp.get("path") or ""
+        summary = f"🔎 grep <code>{_esc(pattern)}</code>" + (
+            f" in <code>{_esc(path)}</code>" if path else ""
+        )
+        body = f"<pre>{_esc(_cap(str(out), 4000))}</pre>"
+    elif name == "todowrite":
+        todos = inp.get("todos") or []
+        summary = f"📝 todowrite ({len(todos)} items)"
+        body = "<ul>" + "".join(
+            f"<li>{_todo_icon(t.get('status'))} {_esc(t.get('content'))}</li>"
+            for t in todos
+        ) + "</ul>"
+    elif name == "task":
+        desc = inp.get("description") or inp.get("prompt") or ""
+        summary = f"🧩 task — {_esc(desc[:160])}"
+        body = f"<pre>{_esc(_cap(str(out), 4000))}</pre>"
+    elif name == "webfetch":
+        summary = f"🌐 webfetch <code>{_esc(inp.get('url'))}</code>"
+        body = f"<pre>{_esc(_cap(str(out), 4000))}</pre>"
+    else:
+        summary = f"🔧 {_esc(name)}"
+        body = (
+            f"<div class='lbl'>input</div><pre>{_esc(_cap(json.dumps(inp, indent=2, default=str), 4000))}</pre>"
+            f"<div class='lbl'>output</div><pre>{_esc(_cap(str(out), 4000))}</pre>"
+        )
+    return (
+        "<details class='tool' open>"
+        f"<summary>{summary} <span class='badge {badge}'>{_esc(status)}</span></summary>"
+        f"<div class='tbody'>{body}</div>"
+        "</details>"
+    )
+
+
+def render_transcript(
+    parts: list[dict[str, Any]], errors: list[str] | None = None
+) -> str:
+    """Render a parts list as HTML cards. Emits wrapped CSS-friendly markup.
+
+    Consumers should inject the CSS from :data:`TRANSCRIPT_CSS`.
+    """
+    out: list[str] = []
+    if errors:
+        out.append(
+            "<div class='errbox'><b>⚠️ errors</b><ul>"
+            + "".join(f"<li>{_esc(e)}</li>" for e in errors[:8])
+            + "</ul></div>"
+        )
+    if not parts:
+        out.append("<div class='empty'>waiting for first part…</div>")
+        return "".join(out)
+    out.append("<div class='chat'>")
+    for p in parts:
+        t = p.get("type")
+        if t == "step-start":
+            out.append("<div class='step'>── new step ──</div>")
+        elif t == "reasoning":
+            txt = (p.get("text") or "").strip()
+            if txt:
+                out.append(
+                    "<details class='reasoning'><summary>🧠 reasoning</summary>"
+                    f"<pre>{_esc(_cap(txt, 4000))}</pre></details>"
+                )
+        elif t == "text":
+            txt = (p.get("text") or "").strip()
+            if txt:
+                out.append(f"<div class='assistant'><pre>{_esc(txt)}</pre></div>")
+        elif t == "tool":
+            out.append(fmt_tool(p.get("tool") or "?", p.get("state") or {}, p))
+        elif t == "step-finish":
+            tokens = p.get("tokens") or (p.get("state") or {}).get("tokens") or {}
+            if tokens:
+                out.append(
+                    f"<div class='stepfin'>tokens: "
+                    f"{_esc(json.dumps(tokens, default=str))}</div>"
+                )
+    out.append("</div>")
+    return "".join(out)
+
+
+TRANSCRIPT_CSS = """
+.chat { font-size:14px; }
+.assistant pre { background:#0e1013; padding:10px; border-radius:8px;
+                 white-space:pre-wrap; color:#eee; margin:6px 0; }
+.reasoning { opacity:0.8; margin:4px 0; }
+.reasoning pre { background:#0a0b0d; color:#aab; padding:8px; white-space:pre-wrap; }
+.tool { border:1px solid #2a2f3a; border-radius:8px; padding:6px 10px;
+        margin:6px 0; background:#12161c; }
+.tool summary { cursor:pointer; color:#ddd; }
+.tool code { background:#222; color:#9cf; padding:1px 4px; border-radius:3px; }
+.tbody { margin-top:6px; }
+.tbody pre { background:#0a0b0d; padding:8px; border-radius:4px;
+             white-space:pre-wrap; max-height:400px; overflow:auto;
+             font-size:12px; color:#ddd; margin:2px 0; }
+.tbody pre.add { border-left:3px solid #2e6; }
+.tbody pre.del { border-left:3px solid #e53; }
+.tbody .lbl { color:#888; font-size:11px; margin-top:6px; }
+.badge { padding:1px 6px; border-radius:8px; font-size:11px;
+         background:#333; color:#ddd; }
+.badge.ok { background:#1f6f43; color:white; }
+.badge.err { background:#7a1e1e; color:white; }
+.badge.run { background:#7a5c1e; color:white; }
+.step { color:#555; text-align:center; margin:10px 0; font-size:11px; }
+.stepfin { color:#666; font-size:11px; margin:4px 0 12px; }
+.empty { color:#666; font-style:italic; padding:12px; }
+.errbox { background:#2a1414; border:1px solid #7a1e1e; border-radius:6px;
+          padding:6px 10px; margin:6px 0; color:#f88; font-size:13px; }
+.errbox ul { margin:2px 0 0 18px; }
+"""