modelbuilderhq committed on
Commit
ff293b1
·
verified ·
1 Parent(s): fef31e6

Upload folder using huggingface_hub

Browse files
Files changed (50) hide show
  1. Dockerfile +83 -0
  2. README.md +303 -7
  3. __init__.py +20 -0
  4. client.py +64 -0
  5. conftest.py +12 -0
  6. graders.py +121 -0
  7. inference.py +265 -0
  8. models.py +204 -0
  9. openenv.yaml +32 -0
  10. openenv_ghostexec.egg-info/PKG-INFO +15 -0
  11. openenv_ghostexec.egg-info/SOURCES.txt +45 -0
  12. openenv_ghostexec.egg-info/dependency_links.txt +1 -0
  13. openenv_ghostexec.egg-info/entry_points.txt +2 -0
  14. openenv_ghostexec.egg-info/requires.txt +13 -0
  15. openenv_ghostexec.egg-info/top_level.txt +1 -0
  16. outputs/logs/api_dead_live_500.jsonl +500 -0
  17. outputs/logs/episode_rewards.jsonl +0 -0
  18. outputs/training/_integration_ckpt/run_summary.json +28 -0
  19. outputs/training/checkpoints/run_summary.json +28 -0
  20. outputs/training/episode_returns.jsonl +10 -0
  21. outputs/training/smoke/checkpoints/run_summary.json +28 -0
  22. outputs/training/smoke/reinforce_returns.jsonl +48 -0
  23. outputs/training/test_returns.jsonl +25 -0
  24. pyproject.toml +61 -0
  25. scenarios/dinner_disaster.json +107 -0
  26. scenarios/monday_morning.json +257 -0
  27. scenarios/phase2_core.json +83 -0
  28. scenarios/schema_drift_test.json +27 -0
  29. scenarios/vip_meltdown.json +63 -0
  30. scenarios/vip_meltdown_drift.json +25 -0
  31. scripts/__init__.py +1 -0
  32. scripts/http_endpoint_smoke.py +184 -0
  33. scripts/run_live_api_dead_500.py +196 -0
  34. server/__init__.py +11 -0
  35. server/app.py +169 -0
  36. server/ghostexec_environment.py +706 -0
  37. server/requirements.txt +6 -0
  38. server/reward.py +350 -0
  39. tests/test_api_reward_dead_500.py +150 -0
  40. tests/test_complete_integration.py +235 -0
  41. tests/test_docker_build.py +60 -0
  42. tests/test_env.py +48 -0
  43. tests/test_live_server_exhaustive.py +287 -0
  44. tests/test_phase1.py +42 -0
  45. tests/test_phase2.py +77 -0
  46. tests/test_phase3.py +153 -0
  47. tests/test_phase4.py +206 -0
  48. tests/test_reward_dead_suite.py +319 -0
  49. uv.lock +0 -0
  50. validate-submission.sh +163 -0
Dockerfile ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
#   - In-repo environments (with local OpenEnv sources)
#   - Standalone environments (with openenv from PyPI/Git)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Ensure git is available (required for installing dependencies from VCS).
# curl and ca-certificates are installed as well because the uv bootstrap step
# below downloads its installer with curl over HTTPS.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git curl ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=ghostexec

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
# First pass skips the project itself so the dependency layer is installed separately.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

# Second pass installs the project into the same virtual environment.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the virtual environment from builder
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# Mount Gradio OpenEnv UI at /web (matches HF Space README expectations)
ENV ENABLE_WEB_INTERFACE=true

# Health check
# NOTE(review): relies on curl being present in the runtime base image — confirm.
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD sh -c 'curl -f "http://localhost:${PORT:-7860}/health" || exit 1'

# Same entrypoint as local `uv run server` (console script from the project venv).
# `exec` replaces the wrapper shell with the server process so that container
# stop signals reach the server directly instead of the intermediate shell.
WORKDIR /app/env
CMD ["/bin/sh", "-lc", "exec /app/.venv/bin/server --port ${PORT:-7860}"]
README.md CHANGED
@@ -1,12 +1,308 @@
1
  ---
2
- title: Ghostexec
3
- emoji: 🌖
4
- colorFrom: blue
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
- short_description: OpenEnv RL environment for executive chief-of-staff decision
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Ghostexec Environment Server
3
+ emoji: 📢
4
+ colorFrom: pink
5
+ colorTo: yellow
6
  sdk: docker
7
  pinned: false
8
+ app_port: 7860
9
+ base_path: /web
10
+ tags:
11
+ - openenv
12
  ---
13
 
14
+ # Ghostexec
15
+
16
+ **Ghostexec** is an [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible environment that simulates a busy executive’s world: inbox, calendar, contacts, tasks, and stakeholder moods. The agent chooses **structured actions** (reply, reschedule, delegate, …); the server returns a **plain-text briefing** as the main observation and a **scalar reward** shaped around conflict, relationships, and task progress. Scenario data lives in `scenarios/*.json` — nothing is hardcoded in Python for world content.
17
+
18
+ **Manifest:** `openenv.yaml` (name **`ghostexec`**, HF Space identifier).
19
+ **Package:** `openenv-ghostexec` in `pyproject.toml` (import as `ghostexec`).
20
+
21
+ ---
22
+
23
+ ## Deliverables
24
+
25
+ | Deliverable | URL |
26
+ |-------------|-----|
27
+ | Public HF Space (required) | `TODO: https://huggingface.co/spaces/<org>/ghostexec` |
28
+ | Write-up / blog (HF post preferred) | `TODO: https://huggingface.co/blog/...` |
29
+ | Short demo video (&lt;2 min) | `TODO: https://youtube.com/...` |
30
+
31
+ Fill these URLs before submission freeze so reviewers can verify everything from one place.
32
+
33
+ ---
34
+
35
+ ## OpenEnv Hackathon alignment (themes + submission checklist)
36
+
37
+ **Theme fit (examples, not exhaustive):** Ghostexec targets **Theme 3.2 — Personalized tasks** (executive-style inbox, calendar, conflicts, delegation via structured actions). **Theme 4** is partially supported via curriculum + perturb (`GHOSTEXEC_CURRICULUM`, `GHOSTEXEC_PERTURB`) and diverse scenarios under `scenarios/`.
38
+
39
+ **Minimum submission checklist (fill before freeze):**
40
+
41
+ | Item | Status |
42
+ |------|--------|
43
+ | OpenEnv-based env + `openenv.yaml` | Done in-repo (`openenv-core[core]>=0.2.3` in `pyproject.toml`; aligns with current PyPI release line). |
44
+ | Short write-up or &lt;2 min video | **You:** publish and paste links in [Deliverables](#deliverables). |
45
+ | Public HF Space URL | **You:** `openenv push` and paste the URL in [Deliverables](#deliverables). |
46
+
47
+ ---
48
+
49
+ ## Design narrative
50
+
51
+ Ghostexec is intentionally built as an **AI Chief of Staff** environment, not a grid-world clone: the model must triage inbox, calendar, stakeholder mood, and task deadlines under conflict pressure while taking only legal structured actions.
52
+
53
+ - **Environment Innovation (40%)** — scenario-driven executive operations with competing priorities, conflict queues, and relationship-sensitive outcomes in `scenarios/*.json` + `server/ghostexec_environment.py`.
54
+ - **Storytelling & Presentation (30%)** — each scenario encodes a narrative arc (VIP escalations, family/professional collisions, deadline cascades) so policy behavior reads like realistic assistant decisions rather than abstract moves.
55
+ - **Showing Improvement in Rewards (20%)** — environment reward remains deterministic, inspectable, and traceable through metadata + episode logs under `outputs/logs/`.
56
+ - **Reward Quality (10%)** — fixed weighted core signal (0.35 conflict / 0.35 relationship / 0.30 task), bounded shaping terms, explicit invalid-action handling, and do_nothing penalties.
57
+
58
+ This framing gives judges a clear throughline: **realistic executive chaos -> constrained legal actions -> measurable policy improvement on held-out scenarios**.
59
+
60
+ ---
61
+
62
+ ## Features
63
+
64
+ - **Legal action set** — `reply_email`, `archive_email`, `reschedule_meeting`, `cancel_meeting`, `complete_task`, `delegate_task`, `send_message`, `do_nothing` (see `models.py`).
65
+ - **Human-readable observations** — `GhostexecObservation.echoed_message` is the full briefing text for the model (not raw JSON).
66
+ - **Invalid actions** — Handled in-process: structured metadata (e.g. `step_ok`), no server crash.
67
+ - **Reward** — Weighted blend of conflict, relationship, and task signals (see [Reward](#reward)); per-step logging under `outputs/logs/` (gitignored).
68
+ - **HTTP + WebSocket** — FastAPI app in `server/app.py`; `GhostexecEnv` uses WebSockets for persistent episodes.
69
+
70
+ ---
71
+
72
+ ## Quick start (Python client)
73
+
74
+ From the repo root (`ghostexec/` — where `pyproject.toml` lives):
75
+
76
+ ```bash
77
+ uv sync
78
+ uv run server --port 8000
79
+ ```
80
+
81
+ In another terminal or notebook:
82
+
83
+ ```python
84
+ from ghostexec import GhostexecAction, GhostexecEnv
85
+
86
+ with GhostexecEnv(base_url="http://127.0.0.1:8000") as env:
87
+ out = env.reset()
88
+ print(out.observation.echoed_message[:500], "…") # plain-text briefing
89
+
90
+ step = env.step(
91
+ GhostexecAction(
92
+ action_type="reply_email",
93
+ email_id="e01",
94
+ message_body=(
95
+ "Marcus — acknowledged. Revised figures and short rationale "
96
+ "before noon. — Exec"
97
+ ),
98
+ )
99
+ )
100
+ print("reward:", step.reward)
101
+ print("metadata keys:", sorted((step.observation.metadata or {}).keys()))
102
+ ```
103
+
104
+ **Docker image** (optional): if your OpenEnv client supports it, you can point `GhostexecEnv` at a container built from the root `Dockerfile`. Build from repo root:
105
+
106
+ ```bash
107
+ docker build -t ghostexec-env:latest .
108
+ ```
109
+
110
+ ---
111
+
112
+ ## Actions and fields
113
+
114
+ `GhostexecAction` (`models.py`) includes:
115
+
116
+ | `action_type` | Typical fields used |
117
+ |------------------------|----------------------|
118
+ | `reply_email` | `email_id`, `message_body` |
119
+ | `archive_email` | `email_id` |
120
+ | `reschedule_meeting` | `meeting_id`, `new_time`, `reason` |
121
+ | `cancel_meeting` | `meeting_id`, `reason` |
122
+ | `complete_task` | `task_id` |
123
+ | `delegate_task` | `task_id`, `contact_name` |
124
+ | `send_message` | `contact_name`, `message` (channel text) |
125
+ | `do_nothing` | — (intentionally weak / penalised path) |
126
+
127
+ Unknown or malformed HTTP payloads deserialize safely to `do_nothing`-style defaults where applicable so older clients do not crash.
128
+
129
+ ---
130
+
131
+ ## Observation
132
+
133
+ `GhostexecObservation`:
134
+
135
+ - **`echoed_message`** — Full briefing (emails, conflicts, contacts, tasks, stress, steps remaining).
136
+ - **`message_length`** — Length of `echoed_message` for quick checks.
137
+ - **`reward`**, **`done`**, **`metadata`** — Step outcome; metadata carries flags such as `step_ok`, reward breakdown fields, and ids for debugging.
138
+
139
+ ---
140
+
141
+ ## Reward
142
+
143
+ Phase-4 scoring (`server/reward.py`) combines three channels with **fixed weights**:
144
+
145
+ \[
146
+ \text{weighted base} = 0.35 \cdot \text{conflict} + 0.35 \cdot \text{relationship} + 0.30 \cdot \text{task}
147
+ \]
148
+
149
+ Then applies output scaling, invalid-step adjustments, bonuses/penalties, and a floor for `do_nothing`. Full component values are available on `RewardBreakdown` and are mirrored into observation metadata where configured. **Episode reward traces** append to `outputs/logs/episode_rewards.jsonl` (directory gitignored).
150
+
151
+ **Reward-engineering provenance.** The design follows the reward-shaping playbook surveyed in *Comprehensive Overview of Reward Engineering and Shaping in Advancing Reinforcement Learning Applications* ([arXiv:2408.10215](https://arxiv.org/abs/2408.10215)): dense per-step shaping around proxy signals (conflict / relationship / task) instead of a single sparse end-of-episode reward, fixed weights to keep channel trade-offs inspectable, and bounded per-step magnitudes to resist hacking.
152
+
153
+ ---
154
+
155
+ ## HTTP vs WebSocket (episode state)
156
+
157
+ - **HTTP** `POST /reset` and `POST /step` often bind to **short-lived** environment instances depending on deployment; consecutive HTTP calls may not share one in-memory episode.
158
+ - **Ghostexec** still applies your action against a scenario-primed instance so a lone `POST /step` can return a meaningful reward and metadata.
159
+ - **WebSocket `/ws`** — Use this (or `GhostexecEnv(base_url=...)`, which speaks WebSocket) for **multi-step episodes** on the same session.
160
+
161
+ Endpoints (typical OpenEnv layout): **`/web`**, **`/docs`**, **`/health`**, **`/ws`**.
162
+
163
+ ---
164
+
165
+ ## Running and testing locally
166
+
167
+ ```bash
168
+ # Dev server (package layout)
169
+ uv run uvicorn ghostexec.server.app:app --reload --host 0.0.0.0 --port 8000
170
+
171
+ # Or console entrypoint (matches Dockerfile)
172
+ uv run server --port 8000
173
+ ```
174
+
175
+ **Smoke script** (HTTP):
176
+
177
+ ```bash
178
+ uv run python scripts/http_endpoint_smoke.py --local
179
+ uv run python scripts/http_endpoint_smoke.py --url http://127.0.0.1:8000
180
+ uv run python scripts/http_endpoint_smoke.py --print-curl
181
+ ```
182
+
183
+ **Tests:**
184
+
185
+ ```bash
186
+ uv run pytest tests/ -q
187
+ ```
188
+
189
+ Opt-in Docker build smoke (Phase 1 gate):
190
+
191
+ ```bash
192
+ GHOSTEXEC_RUN_DOCKER_BUILD=1 uv run pytest tests/test_docker_build.py -q
193
+ ```
194
+
195
+ With the server already on port 8000:
196
+
197
+ ```bash
198
+ uv run pytest tests/test_live_server_exhaustive.py -v --tb=short
199
+ ```
200
+
201
+ Override live URL (Windows PowerShell example):
202
+
203
+ ```powershell
204
+ $env:GHOSTEXEC_LIVE_BASE_URL = "http://127.0.0.1:9000"
205
+ uv run pytest tests/test_live_server_exhaustive.py -q
206
+ ```
207
+
208
+ Optional real WebSocket client check:
209
+
210
+ ```bash
211
+ # Terminal 1
212
+ uv run server --port 8000
213
+ # Terminal 2
214
+ export GHOSTEXEC_WS_BASE_URL=http://127.0.0.1:8000
215
+ uv run pytest tests/test_complete_integration.py::test_ghostexec_env_client_against_live_url_if_set -q
216
+ ```
217
+
218
+ ---
219
+
220
+ ## Hugging Face Spaces
221
+
222
+ Full OpenEnv CLI flow from this directory (matches steps 5–8 of the [Packaging & Deploying guide](https://meta-pytorch.org/OpenEnv/auto_getting_started/environment-builder.html)):
223
+
224
+ ```bash
225
+ openenv serve # local dev server on :8000
226
+ openenv build # build the Docker image
227
+ openenv validate --verbose # structure + Dockerfile + entrypoint checks
228
+ openenv push # deploy to HF Spaces
229
+ # openenv push --repo-id your-username/ghostexec
230
+ ```
231
+
232
+ Use a **public** Space for the default hackathon flow unless you intentionally need a private Space. Authenticate with Hugging Face first (`huggingface-cli login` or equivalent).
233
+
234
+ ---
235
+
236
+ ## Scenarios
237
+
238
+ | File | Role |
239
+ |------|------|
240
+ | `scenarios/phase2_core.json` | Default dense inbox/calendar/tasks fixture |
241
+ | `scenarios/monday_morning.json`, `dinner_disaster.json`, `vip_meltdown.json` | Narrative demos |
242
+ | `scenarios/vip_meltdown_drift.json` | Mood / escalation drift |
243
+ | `scenarios/schema_drift_test.json` | Drift-event harness |
244
+
245
+ ---
246
+
247
+ ## Concurrent WebSocket sessions
248
+
249
+ `server/app.py` passes **`GhostexecEnvironment`** (the class) into `create_app` with `max_concurrent_envs=1` by default. Increase `max_concurrent_envs` if you need multiple simultaneous WebSocket clients.
250
+
251
+ ---
252
+
253
+ ## Project layout
254
+
255
+ ```
256
+ ghostexec/
257
+ ├── openenv.yaml # OpenEnv name, version, description
258
+ ├── pyproject.toml # Package metadata + optional extras
259
+ ├── uv.lock
260
+ ├── models.py # World + GhostexecAction / GhostexecObservation
261
+ ├── client.py # GhostexecEnv (WebSocket client)
262
+ ├── scenarios/ # World JSON (source of truth for episodes)
263
+ ├── scripts/ # http_endpoint_smoke.py
264
+ ├── tests/
265
+ └── server/
266
+ ├── app.py # FastAPI + create_app
267
+ ├── ghostexec_environment.py
268
+ ├── reward.py
269
+ └── Dockerfile
270
+ ```
271
+
272
+ ---
273
+
274
+ ## Resources & references
275
+
276
+ Ghostexec is built against the official Meta PyTorch OpenEnv stack. Every design choice below is traceable to one of these sources.
277
+
278
+ **OpenEnv core.** The Gymnasium-style `reset()` / `step()` / `state` interface in `server/ghostexec_environment.py`, the `EnvClient` subclass in `client.py`, and the `create_app(...)` wiring in `server/app.py` follow the [Packaging & Deploying guide](https://meta-pytorch.org/OpenEnv/auto_getting_started/environment-builder.html) exactly.
279
+
280
+ - Core repo: [meta-pytorch/OpenEnv](https://github.com/meta-pytorch/OpenEnv)
281
+ - Docs: [meta-pytorch.org/OpenEnv](https://meta-pytorch.org/OpenEnv/)
282
+
283
+ **OpenEnv Hub (Hugging Face).** Target deployment for `openenv push`. The Space metadata at the top of this README + `openenv.yaml` are the knobs HF Spaces reads.
284
+
285
+ - Environments: [huggingface.co/openenv](https://huggingface.co/openenv)
286
+ - Spaces: [huggingface.co/openenv/spaces](https://huggingface.co/openenv/spaces)
287
+
288
+ **Tutorials.** General OpenEnv environment patterns are documented in the official tutorial pages and examples.
289
+
290
+ - All tutorials: [OpenEnv/tutorial](https://github.com/meta-pytorch/OpenEnv/tree/main/tutorial)
291
+ - Environment examples: [OpenEnv/envs](https://github.com/meta-pytorch/OpenEnv/tree/main/envs)
292
+
293
+ **YouTube — Building RL environments.** Talks from Meta / OpenEnv contributors that informed the scenario-driven reset, WebSocket session model, and reward breakdown used here:
294
+
295
+ - [Building RL Environments with OpenEnv](https://www.youtube.com/watch?v=0airz7BhBiA)
296
+ - [OpenEnv Deep Dive](https://www.youtube.com/watch?v=ap4q4sAK4OY)
297
+ - [Agentic RL Environments](https://www.youtube.com/watch?v=Jew4lhAiqnw)
298
+ - [OpenEnv Livestream (4-hour walkthrough)](https://www.youtube.com/live/kkCNMz0Ptd8)
299
+
300
+ **Reward-engineering papers.** See [Reward](#reward) for how the paper below informs the design of `server/reward.py`.
301
+
302
+ - Jnadi, A. (2024). *Comprehensive Overview of Reward Engineering and Shaping in Advancing Reinforcement Learning Applications*. [arXiv:2408.10215](https://arxiv.org/abs/2408.10215). Informs the dense per-step conflict / relationship / task shaping and the bounded-magnitude design.
303
+
304
+ ---
305
+
306
+ ## License
307
+
308
+ BSD-style — see the license notice at the top of each source file (Meta / OpenEnv lineage).
__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Ghostexec Environment."""
8
+
9
+ from .models import GhostexecAction, GhostexecObservation
10
+
11
+ # Importing ghostexec.models in notebooks should not require websocket client deps.
12
+ # Keep client import optional so package imports survive OpenEnv layout differences.
13
+ try:
14
+ from .client import GhostexecEnv
15
+ except Exception: # pragma: no cover - import-compat shim
16
+ GhostexecEnv = None # type: ignore[assignment]
17
+
18
+ __all__ = ["GhostexecAction", "GhostexecObservation"]
19
+ if GhostexecEnv is not None:
20
+ __all__.append("GhostexecEnv")
client.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Ghostexec Environment Client."""
8
+
9
+ from typing import Any, Dict
10
+
11
+ try:
12
+ # OpenEnv newer layout.
13
+ from openenv.client import EnvClient
14
+ except ImportError:
15
+ try:
16
+ # Some builds expose the class one level deeper.
17
+ from openenv.client.client import EnvClient
18
+ except ImportError:
19
+ # Backward compatibility with older OpenEnv versions.
20
+ from openenv.core import EnvClient
21
+ from openenv.core.client_types import StepResult
22
+ from openenv.core.env_server.types import State
23
+
24
+ from .models import GhostexecAction, GhostexecObservation
25
+
26
+
27
class GhostexecEnv(EnvClient[GhostexecAction, GhostexecObservation, State]):
    """
    Client for talking to a Ghostexec environment server.

    The connection to the server is a persistent WebSocket, which keeps
    multi-step interactions efficient and low-latency. Every client instance
    gets its own dedicated environment session on the server.
    """

    def _step_payload(self, action: GhostexecAction) -> Dict[str, Any]:
        """Serialise ``action`` to a JSON-ready dict, omitting empty ``metadata``."""
        body = action.model_dump(mode="json")
        if body.get("metadata"):
            return body
        # Falsy metadata (missing, None, or empty) is dropped from the payload.
        body.pop("metadata", None)
        return body

    def _parse_result(self, payload: Dict) -> StepResult[GhostexecObservation]:
        """Build a ``StepResult`` from a server response dict.

        ``reward`` and ``done`` are read from the top level of ``payload``;
        the briefing text, its length and metadata come from the nested
        ``observation`` dict. Missing keys fall back to neutral defaults.
        """
        raw_obs = payload.get("observation", {})
        parsed_obs = GhostexecObservation(
            echoed_message=raw_obs.get("echoed_message", ""),
            message_length=raw_obs.get("message_length", 0),
            done=payload.get("done", False),
            reward=payload.get("reward"),
            metadata=raw_obs.get("metadata", {}),
        )
        result = StepResult(
            observation=parsed_obs,
            reward=payload.get("reward"),
            done=payload.get("done", False),
        )
        return result

    def _parse_state(self, payload: Dict) -> State:
        """Build a ``State`` from ``episode_id`` and ``step_count`` (default 0)."""
        episode = payload.get("episode_id")
        steps_taken = payload.get("step_count", 0)
        return State(episode_id=episode, step_count=steps_taken)
conftest.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # Put repo root on sys.path before test collection (supports `uv run pytest` without editable install).
4
+
5
+ from __future__ import annotations
6
+
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ _ROOT = Path(__file__).resolve().parent
11
+ if str(_ROOT) not in sys.path:
12
+ sys.path.insert(0, str(_ROOT))
graders.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Public trajectory graders for OpenEnv Phase 2 / HF deep validation.
3
+
4
+ These are **episode-level** scores (strictly inside (0, 1)), separate from per-step
5
+ rewards in `server/reward.py`. The hackathon validator reads `openenv.yaml`
6
+ `tasks[].grader` and calls these functions with trajectory dicts.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from typing import Iterable, List
11
+
12
+ STRICT_MIN = 0.01
13
+ STRICT_MAX = 0.99
14
+
15
+
16
+ def _bounded(value: float) -> float:
17
+ return min(max(round(float(value), 4), STRICT_MIN), STRICT_MAX)
18
+
19
+
20
+ def _as_reward_list(trajectory: dict | None) -> List[float]:
21
+ payload = trajectory or {}
22
+ rewards = payload.get("rewards")
23
+ if isinstance(rewards, list) and rewards:
24
+ return [float(r) for r in rewards]
25
+ if "score" in payload:
26
+ return [float(payload["score"])]
27
+ reward = payload.get("reward")
28
+ if isinstance(reward, dict) and "total" in reward:
29
+ return [float(reward["total"])]
30
+ if reward is not None:
31
+ return [float(reward)]
32
+ return []
33
+
34
+
35
+ def _profile(reward: float) -> str:
36
+ if reward <= 0.05:
37
+ return "unsafe_miss"
38
+ if reward <= 0.20:
39
+ return "bad_call"
40
+ if reward < 0.50:
41
+ return "weak"
42
+ if reward < 0.80:
43
+ return "workable"
44
+ if reward < 0.95:
45
+ return "strong"
46
+ return "expert"
47
+
48
+
49
def _score_episode(
    rewards: List[float],
    *,
    miss_cost: float,
    overcall_cost: float,
    stability_gain: float,
    expertise_gain: float,
) -> float:
    """Turn per-step rewards into one bounded episode score.

    The score is the mean reward, minus capped penalties for low-quality
    steps, plus bonuses when enough steps are high quality, passed through
    ``_bounded``. An empty reward list scores ``_bounded(0.5)``.

    Args:
        rewards: Per-step rewards for the episode.
        miss_cost: Penalty per ``unsafe_miss`` step (total capped at 0.35).
        overcall_cost: Penalty per ``bad_call`` step (total capped at 0.15).
        stability_gain: Bonus when at least 80% of steps are strong or expert.
        expertise_gain: Bonus when at least 60% of steps are expert.
    """
    if not rewards:
        return _bounded(0.5)

    n = len(rewards)
    mean_r = sum(rewards) / n

    # Tally how many steps landed in each quality band.
    tally: dict = {}
    for step_reward in rewards:
        band = _profile(step_reward)
        tally[band] = tally.get(band, 0) + 1

    expert = tally.get("expert", 0)
    strong = tally.get("strong", 0) + expert

    # Each penalty channel is capped independently before being summed.
    miss_penalty = min(tally.get("unsafe_miss", 0) * miss_cost, 0.35)
    bad_penalty = min(tally.get("bad_call", 0) * overcall_cost, 0.15)
    weak_penalty = min(tally.get("weak", 0) * 0.015, 0.06)
    downward = miss_penalty + bad_penalty + weak_penalty

    stability_bonus = stability_gain if strong / n >= 0.80 else 0.0
    expertise_bonus = expertise_gain if expert / n >= 0.60 else 0.0
    upward = stability_bonus + expertise_bonus

    return _bounded(mean_r - downward + upward)
80
+
81
+
82
def phase2_core_grader(trajectory: dict | None = None) -> float:
    """Easy tier — dense default inbox (scenarios/phase2_core.json)."""
    step_rewards = _as_reward_list(trajectory)
    tuning = {
        "miss_cost": 0.12,
        "overcall_cost": 0.03,
        "stability_gain": 0.05,
        "expertise_gain": 0.01,
    }
    return _score_episode(step_rewards, **tuning)
91
+
92
+
93
def monday_morning_grader(trajectory: dict | None = None) -> float:
    """Medium tier — stacked Monday conflicts (scenarios/monday_morning.json)."""
    step_rewards = _as_reward_list(trajectory)
    tuning = {
        "miss_cost": 0.09,
        "overcall_cost": 0.04,
        "stability_gain": 0.03,
        "expertise_gain": 0.02,
    }
    return _score_episode(step_rewards, **tuning)
102
+
103
+
104
def dinner_disaster_grader(trajectory: dict | None = None) -> float:
    """Hard tier — personal/professional collision (scenarios/dinner_disaster.json)."""
    step_rewards = _as_reward_list(trajectory)
    tuning = {
        "miss_cost": 0.07,
        "overcall_cost": 0.03,
        "stability_gain": 0.02,
        "expertise_gain": 0.04,
    }
    return _score_episode(step_rewards, **tuning)
113
+
114
+
115
+ __all__ = [
116
+ "phase2_core_grader",
117
+ "monday_morning_grader",
118
+ "dinner_disaster_grader",
119
+ "STRICT_MIN",
120
+ "STRICT_MAX",
121
+ ]
inference.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Baseline runner for the Ghostexec submission.
3
+
4
+ This script queries a chat model through the OpenAI client, sends its decision
5
+ to the environment server, and prints machine-readable lines expected by simple
6
+ evaluators/log parsers.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import os
14
+ from typing import Any, Iterable
15
+
16
+ import requests
17
+ from pydantic import ValidationError
18
+
19
+ try:
20
+ from .graders import dinner_disaster_grader, monday_morning_grader, phase2_core_grader
21
+ from .models import GhostexecAction
22
+ except ImportError:
23
+ from graders import dinner_disaster_grader, monday_morning_grader, phase2_core_grader
24
+ from models import GhostexecAction
25
+
26
+
27
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
28
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
29
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
30
+ ENV_URL = os.getenv("ENV_URL", "http://localhost:7860").rstrip("/")
31
+ TASK_OVERRIDE = os.getenv("TASK_NAME", "").strip()
32
+ BENCHMARK = "ghostexec"
33
+
34
+ TASK_SETS: dict[str, tuple[str, ...]] = {
35
+ "easy": ("phase2_core",),
36
+ "medium": ("monday_morning",),
37
+ "hard": ("dinner_disaster",),
38
+ "all": ("phase2_core", "monday_morning", "dinner_disaster"),
39
+ }
40
+
41
+ TASK_TO_GRADER = {
42
+ "phase2_core": phase2_core_grader,
43
+ "monday_morning": monday_morning_grader,
44
+ "dinner_disaster": dinner_disaster_grader,
45
+ }
46
+
47
+ SYSTEM_MESSAGE = """
48
+ You are acting as an AI Chief-of-Staff assistant in Ghostexec.
49
+
50
+ You must output exactly one JSON object that matches GhostexecAction.
51
+
52
+ Allowed action_type values:
53
+ - reply_email
54
+ - archive_email
55
+ - reschedule_meeting
56
+ - cancel_meeting
57
+ - complete_task
58
+ - delegate_task
59
+ - send_message
60
+ - do_nothing
61
+
62
+ Allowed keys:
63
+ - action_type
64
+ - email_id
65
+ - message_body
66
+ - meeting_id
67
+ - new_time
68
+ - reason
69
+ - task_id
70
+ - contact_name
71
+ - message
72
+
73
+ Rules:
74
+ - Output valid JSON only (no markdown, no prose).
75
+ - Prefer high-impact conflict-reducing actions over do_nothing.
76
+ - Only reference ids/entities that appear in the briefing.
77
+ - If unsure, output {"action_type":"do_nothing"}.
78
+ """.strip()
79
+
80
+
81
def emit_start(task_name: str) -> None:
    """Print the ``[START]`` log line for ``task_name`` and flush stdout."""
    header = f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}"
    print(header, flush=True)
83
+
84
+
85
def emit_step(step_no: int, action_text: str, reward: float, done: bool, error: str | None) -> None:
    """Print one ``[STEP]`` log line and flush stdout.

    ``reward`` is shown with two decimals, ``done`` as its lower-cased string
    form, and a falsy ``error`` as the literal ``null``.
    """
    fields = (
        f"step={step_no}",
        f"action={action_text}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    )
    print("[STEP] " + " ".join(fields), flush=True)
92
+
93
+
94
def emit_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Print the closing ``[END]`` log line and flush stdout.

    ``score`` is shown with six decimals; ``rewards`` is a comma-separated
    list with two decimals each (empty when there are no rewards).
    """
    formatted_rewards = [f"{value:.2f}" for value in rewards]
    reward_text = ",".join(formatted_rewards)
    success_text = str(success).lower()
    summary = f"[END] success={success_text} steps={steps} score={score:.6f} rewards={reward_text}"
    print(summary, flush=True)
101
+
102
+
103
def choose_tasks(selection: str) -> Iterable[str]:
    """Return the task names to run.

    A non-empty ``TASK_OVERRIDE`` wins and yields a one-element tuple;
    otherwise the names come from ``TASK_SETS[selection]``.
    """
    return (TASK_OVERRIDE,) if TASK_OVERRIDE else TASK_SETS[selection]
107
+
108
+
109
def client() -> Any:
    """Build the OpenAI client used for model calls.

    Returns:
        An ``OpenAI`` instance configured with ``API_BASE_URL`` and ``HF_TOKEN``.

    Raises:
        EnvironmentError: if ``HF_TOKEN`` is empty or unset.
    """
    if HF_TOKEN:
        # Imported lazily so the module can be loaded without the openai package.
        from openai import OpenAI

        return OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    raise EnvironmentError("HF_TOKEN or API_KEY must be set before running inference.py")
115
+
116
+
117
def fetch_reset(task_name: str) -> dict[str, Any]:
    """POST to ``{ENV_URL}/reset`` for ``task_name`` and return the decoded JSON reply.

    The request uses a 30-second timeout; a non-success HTTP status is raised
    by ``raise_for_status()``.
    """
    reset_url = f"{ENV_URL}/reset"
    request_body = {"task_id": task_name}
    reply = requests.post(reset_url, json=request_body, timeout=30)
    reply.raise_for_status()
    return reply.json()
125
+
126
+
127
def submit_action(action: GhostexecAction) -> dict[str, Any]:
    """POST ``action`` to ``{ENV_URL}/step`` and return the decoded JSON reply.

    The action is serialised with ``model_dump()`` under the ``"action"`` key.
    The request uses a 30-second timeout; a non-success HTTP status is raised
    by ``raise_for_status()``.
    """
    step_url = f"{ENV_URL}/step"
    request_body = {"action": action.model_dump()}
    reply = requests.post(step_url, json=request_body, timeout=30)
    reply.raise_for_status()
    return reply.json()
135
+
136
+
137
+ def _extract_json_object(text: str) -> str:
138
+ s = text.strip()
139
+ if s.startswith("```"):
140
+ # tolerate fenced output from weak model instruction following
141
+ s = s.strip("`")
142
+ if "\n" in s:
143
+ s = s.split("\n", 1)[1]
144
+ start = s.find("{")
145
+ end = s.rfind("}")
146
+ if start == -1 or end == -1 or end <= start:
147
+ raise json.JSONDecodeError("No JSON object found", s, 0)
148
+ return s[start : end + 1]
149
+
150
+
151
def prompt_for_case(observation: dict[str, Any]) -> str:
    """Build the user prompt for one step.

    The observation is embedded as indented, ASCII-only JSON between the
    instruction paragraphs; sections are separated by blank lines.
    """
    rendered_observation = json.dumps(observation, ensure_ascii=True, indent=2)
    sections = [
        "Take one best next action for the Ghostexec environment.",
        "Return one final structured GhostexecAction JSON object.",
        rendered_observation,
        "Choose the action that most reduces conflicts, protects relationships, "
        "and advances urgent tasks.",
    ]
    return "\n\n".join(sections)
159
+
160
+
161
def ask_model(llm: Any, observation: dict[str, Any]) -> GhostexecAction:
    """Ask the model for the next action and parse its reply.

    Args:
        llm: Client exposing ``chat.completions.create``.
        observation: Current observation, embedded into the user prompt.

    Returns:
        The ``GhostexecAction`` built from the JSON object in the reply.

    Raises:
        json.JSONDecodeError: if the reply holds no parseable JSON object.
        ValidationError: presumably raised by ``GhostexecAction`` when the
            payload does not fit the schema (run_one_task catches it).
    """
    conversation = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": prompt_for_case(observation)},
    ]
    response = llm.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.0,
        max_tokens=260,
        stream=False,
    )
    # A missing message content is treated as an empty reply.
    raw_reply = response.choices[0].message.content or ""
    action_fields = json.loads(_extract_json_object(raw_reply.strip()))
    return GhostexecAction(**action_fields)
175
+
176
+
177
def compact_action(action: GhostexecAction) -> str:
    """Return a short log label for ``action``.

    The label is ``action_type`` followed by ``/`` and the first non-empty
    value among email id, meeting id, task id and contact name (in that
    order); with no such value, the bare ``action_type`` is returned.
    """
    kind = action.action_type
    identifiers = (action.email_id, action.meeting_id, action.task_id, action.contact_name)
    target = next((ident for ident in identifiers if ident), None)
    if target is None:
        return kind
    return f"{kind}/{target}"
183
+
184
+
185
+ def _extract_reward(payload: dict[str, Any]) -> float:
186
+ reward_payload = payload.get("reward")
187
+ if isinstance(reward_payload, dict):
188
+ return float(reward_payload.get("total", 0.0))
189
+ if reward_payload is not None:
190
+ return float(reward_payload)
191
+ obs = payload.get("observation")
192
+ if isinstance(obs, dict) and obs.get("reward") is not None:
193
+ return float(obs["reward"])
194
+ return 0.0
195
+
196
+
197
def final_score(task_name: str, rewards: list[float]) -> float:
    """Compute the episode score for ``task_name``.

    When a grader is registered in ``TASK_TO_GRADER`` it is called with
    ``{"rewards": rewards}`` and its result is converted to float. Otherwise
    the mean reward (0.0 for an empty list) is rounded to four decimals and
    clamped to the range [0.01, 0.99].
    """
    grader = TASK_TO_GRADER.get(task_name)
    if grader is not None:
        return float(grader({"rewards": rewards}))
    mean_reward = sum(rewards) / len(rewards) if rewards else 0.0
    clamped_low = max(round(mean_reward, 4), 0.01)
    return min(clamped_low, 0.99)
203
+
204
+
205
def run_one_task(llm: Any, task_name: str) -> None:
    """Run one episode of ``task_name`` and print its START/STEP/END log lines.

    The environment is reset, then the model is queried and its action
    submitted until the environment reports ``done``. The run is scored with
    ``final_score`` and counts as a success at a score of 0.60 or more.

    Any exception ends the run: rewards are replaced by a single 0.0, the step
    count is set to 1, and one error step is logged as ``parse_error`` (JSON
    decoding), ``schema_error`` (validation) or ``error`` with the exception
    text. The END line is always printed.
    """
    rewards: list[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    emit_start(task_name)

    try:
        result = fetch_reset(task_name)
        done = bool(result.get("done", False))

        while not done:
            # Responses may nest the observation or be the observation itself.
            observation = result.get("observation", result)
            model_input = observation if isinstance(observation, dict) else result
            action = ask_model(llm, model_input)
            action_text = compact_action(action)

            result = submit_action(action)
            reward = _extract_reward(result)
            done = bool(result.get("done", False))

            rewards.append(reward)
            steps_taken += 1
            emit_step(steps_taken, action_text, reward, done, None)

        score = final_score(task_name, rewards)
        success = score >= 0.60

    except Exception as exc:
        # Classify the failure for the log; everything else is reported verbatim.
        if isinstance(exc, json.JSONDecodeError):
            label = detail = "parse_error"
        elif isinstance(exc, ValidationError):
            label = detail = "schema_error"
        else:
            label, detail = "error", str(exc)
        rewards = [0.0]
        steps_taken = 1
        emit_step(1, label, 0.0, True, detail)
    finally:
        emit_end(success, steps_taken, score, rewards or [0.0])
247
+
248
+
249
def main() -> None:
    """Command-line entry point: run the baseline agent over the selected tasks.

    ``--difficulty`` (easy, medium, hard or all; default all) selects the task
    subset passed to ``choose_tasks``. One model client is created and reused
    for every task.
    """
    cli = argparse.ArgumentParser(description="Run the Ghostexec baseline agent")
    cli.add_argument(
        "--difficulty",
        choices=["easy", "medium", "hard", "all"],
        default="all",
        help="Which task subset to run",
    )
    options = cli.parse_args()

    llm = client()
    for task_name in choose_tasks(options.difficulty):
        run_one_task(llm, task_name)


if __name__ == "__main__":
    main()
models.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Data models for GhostExec — all world and API types live here."""
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, Literal
12
+
13
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
14
+
15
# Use the OpenEnv base types when they can be imported; on any failure fall
# back to plain pydantic BaseModel so this module still loads.
try:
    from openenv.core.env_server.types import (
        Action as _OpenEnvAction,
        Observation as _OpenEnvObservation,
    )
except Exception:
    _OpenEnvAction = BaseModel  # type: ignore[assignment]
    _OpenEnvObservation = BaseModel  # type: ignore[assignment]
21
+
22
+
23
def _is_pydantic_model_class(cls: object) -> bool:
    """Return True only when ``cls`` is a class that subclasses pydantic ``BaseModel``.

    Non-class objects, and classes for which ``issubclass`` raises
    ``TypeError``, yield False.
    """
    if not isinstance(cls, type):
        return False
    try:
        return issubclass(cls, BaseModel)
    except TypeError:
        return False
28
+
29
+
30
# Some OpenEnv builds expose dataclass-style Action/Observation that do not accept
# additional keyword fields, which breaks GhostexecAction/GhostexecObservation
# construction in Colab. Fall back to BaseModel in that case.
if _is_pydantic_model_class(_OpenEnvAction):
    ActionBase = _OpenEnvAction
else:
    ActionBase = BaseModel

if _is_pydantic_model_class(_OpenEnvObservation):
    ObservationBase = _OpenEnvObservation
else:
    ObservationBase = BaseModel
37
+
38
# --- Aliases for scenario / world strings ---

# Email fields.
EmailPriority = Literal["critical", "high", "normal", "low"]
SenderRelationship = Literal["VIP", "personal", "professional", "unknown"]

# Contact fields.
ContactRelationship = Literal[
    "board_member",
    "spouse",
    "investor",
    "direct_report",
    "client",
    "friend",
    "team_member",
]
CommPreference = Literal["email", "text", "call"]
Mood = Literal["happy", "neutral", "annoyed", "angry", "furious"]

# Task fields.
TaskStatus = Literal["pending", "in-progress", "done", "overdue"]
Effort = Literal["low", "medium", "high"]

# Meeting fields.
MeetingPriority = Literal["critical", "high", "normal", "low"]

# Every action_type value accepted by GhostexecAction.
GhostexecActionType = Literal[
    "reply_email",
    "archive_email",
    "reschedule_meeting",
    "cancel_meeting",
    "complete_task",
    "delegate_task",
    "send_message",
    "do_nothing",
]
67
+
68
+
69
class Email(BaseModel):
    """One message in the simulated inbox.

    Fields not declared here are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Identity and content (all required).
    id: str
    sender: str
    subject: str
    body: str

    # Handling state; both flags start out False.
    read: bool = False
    replied: bool = False

    # Required classification of the message and of its sender.
    priority: EmailPriority
    sender_relationship: SenderRelationship
82
+
83
+
84
class Meeting(BaseModel):
    """One block on the calendar.

    Fields not declared here are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    id: str
    title: str
    # Required start time and a duration of at least one minute.
    start: str = Field(..., description="ISO 8601 start datetime")
    duration_minutes: int = Field(..., ge=1)
    # Optional details; each instance gets its own empty attendee list.
    attendees: list[str] = Field(default_factory=list)
    location: str = ""
    priority: MeetingPriority = "normal"
    cancelled: bool = False
97
+
98
+
99
class Contact(BaseModel):
    """A stakeholder in the executive's network.

    Fields not declared here are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    name: str
    relationship_type: ContactRelationship
    communication_preference: CommPreference
    # Required integer from 1 to 5 inclusive.
    importance: int = Field(..., ge=1, le=5)
    mood: Mood = "neutral"
109
+
110
+
111
class Task(BaseModel):
    """One to-do item.

    Fields not declared here are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    id: str
    description: str
    deadline: str = Field(..., description="ISO 8601 deadline")
    owner: str
    # Progress and sizing; new tasks default to pending / medium effort.
    status: TaskStatus = "pending"
    effort: Effort = "medium"
    # None until a delegate is recorded.
    delegated_to: str | None = None
123
+
124
+
125
class WorldState(BaseModel):
    """Complete simulated world, serialisable to JSON.

    Fields not declared here are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Clock and global indicators. Stress is an integer from 0 to 100.
    simulation_time: str = Field(..., description="Current simulated instant, ISO 8601")
    stress: int = Field(default=0, ge=0, le=100)
    active_conflicts: list[str] = Field(default_factory=list)
    action_log: list[str] = Field(default_factory=list)

    # Episode lifecycle. The step cap defaults to 48 and must lie in 1..10000.
    episode_active: bool = True
    episode_end_reason: str | None = None
    max_episode_steps: int = Field(default=48, ge=1, le=10_000)

    # World entities; each collection defaults to its own empty list.
    emails: list[Email] = Field(default_factory=list)
    meetings: list[Meeting] = Field(default_factory=list)
    contacts: list[Contact] = Field(default_factory=list)
    tasks: list[Task] = Field(default_factory=list)
141
+
142
+
143
class GhostexecAction(ActionBase):
    """
    Legal agent actions (Phase 3).

    Every field has a default, so a payload only needs the keys relevant to
    its ``action_type``. A dict payload that omits ``action_type`` is filled in
    with ``do_nothing`` before validation, so older clients do not crash
    deserialization.
    """

    action_type: GhostexecActionType = Field(
        default="do_nothing",
        description="Which legal action to execute this step",
    )

    # Optional arguments; all default to the empty string.
    email_id: str = ""
    message_body: str = ""
    meeting_id: str = ""
    new_time: str = ""
    reason: str = ""
    task_id: str = ""
    contact_name: str = ""
    message: str = Field(default="", description="Optional note for action_log (legacy / debug)")

    @model_validator(mode="before")
    @classmethod
    def _default_action_type(cls, data: Any) -> Any:
        """Insert ``action_type="do_nothing"`` into dict input that lacks the key."""
        if not isinstance(data, dict) or "action_type" in data:
            return data
        # Build a new dict rather than mutating the caller's payload.
        return {**data, "action_type": "do_nothing"}
168
+
169
+
170
class GhostexecObservation(ObservationBase):
    """
    Observation returned to the agent.

    The primary LLM-facing field is `echoed_message`: the full plain-text
    briefing (Phase 3).
    """

    # Keep these fields explicit for compatibility with OpenEnv builds where
    # Observation is not a pydantic base carrying done/reward/metadata.
    done: bool = False
    reward: float | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    # Briefing text plus its recorded length.
    echoed_message: str = Field(
        default="",
        description="Human-readable briefing text for the LLM (not JSON)",
    )
    message_length: int = Field(default=0, description="Byte length of echoed_message for quick checks")
186
+
187
+
188
class RewardBreakdown(BaseModel):
    """Phase 4 reward components (logged and exposed in observation metadata).

    Every component is a float; all default to 0.0 except ``output_scale``,
    which defaults to 1.0. Fields not declared here are rejected
    (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    # Conflict-related components.
    conflict_raw: float = 0.0
    critical_queue_bonus: float = 0.0
    conflict: float = 0.0

    # Relationship and task components.
    relationship: float = 0.0
    task: float = 0.0

    # Combination and scaling.
    weighted_base: float = 0.0
    output_scale: float = 1.0

    # Adjustments, bonuses and penalties.
    invalid_step_adjustment: float = 0.0
    episode_completion_bonus: float = 0.0
    catastrophic_penalty: float = 0.0
    do_nothing_floor: float = 0.0

    # Final reward value.
    final: float = 0.0
openenv.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: ghostexec
3
+ version: "0.1.0"
4
+ description: "GhostExec — RL training environment for personal and executive task conflict resolution (The AI Chief of Staff)."
5
+ type: space
6
+ runtime: fastapi
7
+ app: server.app:app
8
+ port: 8000
9
+
10
+ difficulties: [easy, medium, hard]
11
+ max_steps: 20
12
+
13
+ tasks:
14
+ - id: phase2_core
15
+ difficulty: easy
16
+ description: >
17
+ Default dense inbox/calendar fixture (scenarios/phase2_core.json).
18
+ Stress-test triage, VIP queues, and calendar relief.
19
+ grader: graders.phase2_core_grader
20
+
21
+ - id: monday_morning
22
+ difficulty: medium
23
+ description: >
24
+ Monday morning rush with stacked conflicts (scenarios/monday_morning.json).
25
+ grader: graders.monday_morning_grader
26
+
27
+ - id: dinner_disaster
28
+ difficulty: hard
29
+ description: >
30
+ Personal/professional collision with escalation risk
31
+ (scenarios/dinner_disaster.json).
32
+ grader: graders.dinner_disaster_grader
openenv_ghostexec.egg-info/PKG-INFO ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: openenv-ghostexec
3
+ Version: 0.1.0
4
+ Summary: Ghostexec environment for OpenEnv
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: openenv-core[core]>=0.2.3
7
+ Provides-Extra: dev
8
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
9
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
10
+ Requires-Dist: pyyaml>=6.0.0; extra == "dev"
11
+ Requires-Dist: matplotlib>=3.8.0; extra == "dev"
12
+ Provides-Extra: constrained
13
+ Requires-Dist: lm-format-enforcer>=0.10; extra == "constrained"
14
+ Provides-Extra: constrained-outlines
15
+ Requires-Dist: outlines>=0.1; extra == "constrained-outlines"
openenv_ghostexec.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ __init__.py
3
+ client.py
4
+ conftest.py
5
+ graders.py
6
+ models.py
7
+ pyproject.toml
8
+ ./__init__.py
9
+ ./client.py
10
+ ./conftest.py
11
+ ./graders.py
12
+ ./models.py
13
+ ./scenarios/dinner_disaster.json
14
+ ./scenarios/monday_morning.json
15
+ ./scenarios/phase2_core.json
16
+ ./scenarios/schema_drift_test.json
17
+ ./scenarios/vip_meltdown.json
18
+ ./scenarios/vip_meltdown_drift.json
19
+ openenv_ghostexec.egg-info/PKG-INFO
20
+ openenv_ghostexec.egg-info/SOURCES.txt
21
+ openenv_ghostexec.egg-info/dependency_links.txt
22
+ openenv_ghostexec.egg-info/entry_points.txt
23
+ openenv_ghostexec.egg-info/requires.txt
24
+ openenv_ghostexec.egg-info/top_level.txt
25
+ scenarios/dinner_disaster.json
26
+ scenarios/monday_morning.json
27
+ scenarios/phase2_core.json
28
+ scenarios/schema_drift_test.json
29
+ scenarios/vip_meltdown.json
30
+ scenarios/vip_meltdown_drift.json
31
+ server/__init__.py
32
+ server/app.py
33
+ server/ghostexec_environment.py
34
+ server/reward.py
35
+ tests/test_api_reward_dead_500.py
36
+ tests/test_complete_integration.py
37
+ tests/test_docker_build.py
38
+ tests/test_env.py
39
+ tests/test_live_server_exhaustive.py
40
+ tests/test_phase1.py
41
+ tests/test_phase2.py
42
+ tests/test_phase3.py
43
+ tests/test_phase4.py
44
+ tests/test_reward_dead_suite.py
45
+ tests/test_submission_plots_committed.py
openenv_ghostexec.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
openenv_ghostexec.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ server = ghostexec.server.app:main
openenv_ghostexec.egg-info/requires.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.3
2
+
3
+ [constrained]
4
+ lm-format-enforcer>=0.10
5
+
6
+ [constrained-outlines]
7
+ outlines>=0.1
8
+
9
+ [dev]
10
+ pytest>=8.0.0
11
+ pytest-cov>=4.0.0
12
+ pyyaml>=6.0.0
13
+ matplotlib>=3.8.0
openenv_ghostexec.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ghostexec
outputs/logs/api_dead_live_500.jsonl ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"idx": 0, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
2
+ {"idx": 1, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
3
+ {"idx": 2, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
4
+ {"idx": 3, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
5
+ {"idx": 4, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
6
+ {"idx": 5, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
7
+ {"idx": 6, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
8
+ {"idx": 7, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
9
+ {"idx": 8, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
10
+ {"idx": 9, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
11
+ {"idx": 10, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
12
+ {"idx": 11, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
13
+ {"idx": 12, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
14
+ {"idx": 13, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
15
+ {"idx": 14, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
16
+ {"idx": 15, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
17
+ {"idx": 16, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
18
+ {"idx": 17, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
19
+ {"idx": 18, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
20
+ {"idx": 19, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
21
+ {"idx": 20, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
22
+ {"idx": 21, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
23
+ {"idx": 22, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
24
+ {"idx": 23, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
25
+ {"idx": 24, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
26
+ {"idx": 25, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
27
+ {"idx": 26, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
28
+ {"idx": 27, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
29
+ {"idx": 28, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
30
+ {"idx": 29, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
31
+ {"idx": 30, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
32
+ {"idx": 31, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
33
+ {"idx": 32, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
34
+ {"idx": 33, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
35
+ {"idx": 34, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
36
+ {"idx": 35, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
37
+ {"idx": 36, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
38
+ {"idx": 37, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
39
+ {"idx": 38, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
40
+ {"idx": 39, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
41
+ {"idx": 40, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
42
+ {"idx": 41, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
43
+ {"idx": 42, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
44
+ {"idx": 43, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
45
+ {"idx": 44, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
46
+ {"idx": 45, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
47
+ {"idx": 46, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
48
+ {"idx": 47, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
49
+ {"idx": 48, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
50
+ {"idx": 49, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
51
+ {"idx": 50, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
52
+ {"idx": 51, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
53
+ {"idx": 52, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
54
+ {"idx": 53, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
55
+ {"idx": 54, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
56
+ {"idx": 55, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
57
+ {"idx": 56, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
58
+ {"idx": 57, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
59
+ {"idx": 58, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
60
+ {"idx": 59, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
61
+ {"idx": 60, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
62
+ {"idx": 61, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
63
+ {"idx": 62, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
64
+ {"idx": 63, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
65
+ {"idx": 64, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
66
+ {"idx": 65, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
67
+ {"idx": 66, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
68
+ {"idx": 67, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
69
+ {"idx": 68, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
70
+ {"idx": 69, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
71
+ {"idx": 70, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
72
+ {"idx": 71, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
73
+ {"idx": 72, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
74
+ {"idx": 73, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
75
+ {"idx": 74, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
76
+ {"idx": 75, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
77
+ {"idx": 76, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
78
+ {"idx": 77, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
79
+ {"idx": 78, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
80
+ {"idx": 79, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
81
+ {"idx": 80, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
82
+ {"idx": 81, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
83
+ {"idx": 82, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
84
+ {"idx": 83, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
85
+ {"idx": 84, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
86
+ {"idx": 85, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
87
+ {"idx": 86, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
88
+ {"idx": 87, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
89
+ {"idx": 88, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
90
+ {"idx": 89, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
91
+ {"idx": 90, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
92
+ {"idx": 91, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
93
+ {"idx": 92, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
94
+ {"idx": 93, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
95
+ {"idx": 94, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
96
+ {"idx": 95, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
97
+ {"idx": 96, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
98
+ {"idx": 97, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
99
+ {"idx": 98, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
100
+ {"idx": 99, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
101
+ {"idx": 100, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
102
+ {"idx": 101, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
103
+ {"idx": 102, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
104
+ {"idx": 103, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
105
+ {"idx": 104, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
106
+ {"idx": 105, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
107
+ {"idx": 106, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
108
+ {"idx": 107, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
109
+ {"idx": 108, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
110
+ {"idx": 109, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
111
+ {"idx": 110, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
112
+ {"idx": 111, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
113
+ {"idx": 112, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
114
+ {"idx": 113, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
115
+ {"idx": 114, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
116
+ {"idx": 115, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
117
+ {"idx": 116, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
118
+ {"idx": 117, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
119
+ {"idx": 118, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
120
+ {"idx": 119, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
121
+ {"idx": 120, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
122
+ {"idx": 121, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
123
+ {"idx": 122, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
124
+ {"idx": 123, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
125
+ {"idx": 124, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
126
+ {"idx": 125, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
127
+ {"idx": 126, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
128
+ {"idx": 127, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
129
+ {"idx": 128, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
130
+ {"idx": 129, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
131
+ {"idx": 130, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
132
+ {"idx": 131, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
133
+ {"idx": 132, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
134
+ {"idx": 133, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
135
+ {"idx": 134, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
136
+ {"idx": 135, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
137
+ {"idx": 136, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
138
+ {"idx": 137, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
139
+ {"idx": 138, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
140
+ {"idx": 139, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
141
+ {"idx": 140, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
142
+ {"idx": 141, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
143
+ {"idx": 142, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
144
+ {"idx": 143, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
145
+ {"idx": 144, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
146
+ {"idx": 145, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
147
+ {"idx": 146, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
148
+ {"idx": 147, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
149
+ {"idx": 148, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
150
+ {"idx": 149, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
151
+ {"idx": 150, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
152
+ {"idx": 151, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
153
+ {"idx": 152, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
154
+ {"idx": 153, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
155
+ {"idx": 154, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
156
+ {"idx": 155, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
157
+ {"idx": 156, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
158
+ {"idx": 157, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
159
+ {"idx": 158, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
160
+ {"idx": 159, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
161
+ {"idx": 160, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
162
+ {"idx": 161, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
163
+ {"idx": 162, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
164
+ {"idx": 163, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
165
+ {"idx": 164, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
166
+ {"idx": 165, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
167
+ {"idx": 166, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
168
+ {"idx": 167, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
169
+ {"idx": 168, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
170
+ {"idx": 169, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
171
+ {"idx": 170, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
172
+ {"idx": 171, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
173
+ {"idx": 172, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
174
+ {"idx": 173, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
175
+ {"idx": 174, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
176
+ {"idx": 175, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
177
+ {"idx": 176, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
178
+ {"idx": 177, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
179
+ {"idx": 178, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
180
+ {"idx": 179, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
181
+ {"idx": 180, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
182
+ {"idx": 181, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
183
+ {"idx": 182, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
184
+ {"idx": 183, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
185
+ {"idx": 184, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
186
+ {"idx": 185, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
187
+ {"idx": 186, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
188
+ {"idx": 187, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
189
+ {"idx": 188, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
190
+ {"idx": 189, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
191
+ {"idx": 190, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
192
+ {"idx": 191, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
193
+ {"idx": 192, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
194
+ {"idx": 193, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
195
+ {"idx": 194, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
196
+ {"idx": 195, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
197
+ {"idx": 196, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
198
+ {"idx": 197, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
199
+ {"idx": 198, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
200
+ {"idx": 199, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
201
+ {"idx": 200, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
202
+ {"idx": 201, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
203
+ {"idx": 202, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
204
+ {"idx": 203, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
205
+ {"idx": 204, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
206
+ {"idx": 205, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
207
+ {"idx": 206, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
208
+ {"idx": 207, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
209
+ {"idx": 208, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
210
+ {"idx": 209, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
211
+ {"idx": 210, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
212
+ {"idx": 211, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
213
+ {"idx": 212, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
214
+ {"idx": 213, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
215
+ {"idx": 214, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
216
+ {"idx": 215, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
217
+ {"idx": 216, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
218
+ {"idx": 217, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
219
+ {"idx": 218, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
220
+ {"idx": 219, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
221
+ {"idx": 220, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
222
+ {"idx": 221, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
223
+ {"idx": 222, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
224
+ {"idx": 223, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
225
+ {"idx": 224, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
226
+ {"idx": 225, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
227
+ {"idx": 226, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
228
+ {"idx": 227, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
229
+ {"idx": 228, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
230
+ {"idx": 229, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
231
+ {"idx": 230, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
232
+ {"idx": 231, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
233
+ {"idx": 232, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
234
+ {"idx": 233, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
235
+ {"idx": 234, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
236
+ {"idx": 235, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
237
+ {"idx": 236, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
238
+ {"idx": 237, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
239
+ {"idx": 238, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
240
+ {"idx": 239, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
241
+ {"idx": 240, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
242
+ {"idx": 241, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
243
+ {"idx": 242, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
244
+ {"idx": 243, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
245
+ {"idx": 244, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
246
+ {"idx": 245, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
247
+ {"idx": 246, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
248
+ {"idx": 247, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
249
+ {"idx": 248, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
250
+ {"idx": 249, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
251
+ {"idx": 250, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
252
+ {"idx": 251, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
253
+ {"idx": 252, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
254
+ {"idx": 253, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
255
+ {"idx": 254, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
256
+ {"idx": 255, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
257
+ {"idx": 256, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
258
+ {"idx": 257, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
259
+ {"idx": 258, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
260
+ {"idx": 259, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
261
+ {"idx": 260, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
262
+ {"idx": 261, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
263
+ {"idx": 262, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
264
+ {"idx": 263, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
265
+ {"idx": 264, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
266
+ {"idx": 265, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
267
+ {"idx": 266, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
268
+ {"idx": 267, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
269
+ {"idx": 268, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
270
+ {"idx": 269, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
271
+ {"idx": 270, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
272
+ {"idx": 271, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
273
+ {"idx": 272, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
274
+ {"idx": 273, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
275
+ {"idx": 274, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
276
+ {"idx": 275, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
277
+ {"idx": 276, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
278
+ {"idx": 277, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
279
+ {"idx": 278, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
280
+ {"idx": 279, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
281
+ {"idx": 280, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
282
+ {"idx": 281, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
283
+ {"idx": 282, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
284
+ {"idx": 283, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
285
+ {"idx": 284, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
286
+ {"idx": 285, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
287
+ {"idx": 286, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
288
+ {"idx": 287, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
289
+ {"idx": 288, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
290
+ {"idx": 289, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
291
+ {"idx": 290, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
292
+ {"idx": 291, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
293
+ {"idx": 292, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
294
+ {"idx": 293, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
295
+ {"idx": 294, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
296
+ {"idx": 295, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
297
+ {"idx": 296, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
298
+ {"idx": 297, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
299
+ {"idx": 298, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
300
+ {"idx": 299, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
301
+ {"idx": 300, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
302
+ {"idx": 301, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
303
+ {"idx": 302, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
304
+ {"idx": 303, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
305
+ {"idx": 304, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
306
+ {"idx": 305, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
307
+ {"idx": 306, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
308
+ {"idx": 307, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
309
+ {"idx": 308, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
310
+ {"idx": 309, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
311
+ {"idx": 310, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
312
+ {"idx": 311, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
313
+ {"idx": 312, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
314
+ {"idx": 313, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
315
+ {"idx": 314, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
316
+ {"idx": 315, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
317
+ {"idx": 316, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
318
+ {"idx": 317, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
319
+ {"idx": 318, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
320
+ {"idx": 319, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
321
+ {"idx": 320, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
322
+ {"idx": 321, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
323
+ {"idx": 322, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
324
+ {"idx": 323, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
325
+ {"idx": 324, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
326
+ {"idx": 325, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
327
+ {"idx": 326, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
328
+ {"idx": 327, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
329
+ {"idx": 328, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
330
+ {"idx": 329, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
331
+ {"idx": 330, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
332
+ {"idx": 331, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
333
+ {"idx": 332, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
334
+ {"idx": 333, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
335
+ {"idx": 334, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
336
+ {"idx": 335, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
337
+ {"idx": 336, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
338
+ {"idx": 337, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
339
+ {"idx": 338, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
340
+ {"idx": 339, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
341
+ {"idx": 340, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
342
+ {"idx": 341, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
343
+ {"idx": 342, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
344
+ {"idx": 343, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
345
+ {"idx": 344, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
346
+ {"idx": 345, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
347
+ {"idx": 346, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
348
+ {"idx": 347, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
349
+ {"idx": 348, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
350
+ {"idx": 349, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
351
+ {"idx": 350, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
352
+ {"idx": 351, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
353
+ {"idx": 352, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
354
+ {"idx": 353, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
355
+ {"idx": 354, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
356
+ {"idx": 355, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
357
+ {"idx": 356, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
358
+ {"idx": 357, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
359
+ {"idx": 358, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
360
+ {"idx": 359, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
361
+ {"idx": 360, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
362
+ {"idx": 361, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
363
+ {"idx": 362, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
364
+ {"idx": 363, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
365
+ {"idx": 364, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
366
+ {"idx": 365, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
367
+ {"idx": 366, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
368
+ {"idx": 367, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
369
+ {"idx": 368, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
370
+ {"idx": 369, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
371
+ {"idx": 370, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
372
+ {"idx": 371, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
373
+ {"idx": 372, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
374
+ {"idx": 373, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
375
+ {"idx": 374, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
376
+ {"idx": 375, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
377
+ {"idx": 376, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
378
+ {"idx": 377, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
379
+ {"idx": 378, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
380
+ {"idx": 379, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
381
+ {"idx": 380, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
382
+ {"idx": 381, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
383
+ {"idx": 382, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
384
+ {"idx": 383, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
385
+ {"idx": 384, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
386
+ {"idx": 385, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
387
+ {"idx": 386, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
388
+ {"idx": 387, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
389
+ {"idx": 388, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
390
+ {"idx": 389, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
391
+ {"idx": 390, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
392
+ {"idx": 391, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
393
+ {"idx": 392, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
394
+ {"idx": 393, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
395
+ {"idx": 394, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
396
+ {"idx": 395, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
397
+ {"idx": 396, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
398
+ {"idx": 397, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
399
+ {"idx": 398, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
400
+ {"idx": 399, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
401
+ {"idx": 400, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
402
+ {"idx": 401, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
403
+ {"idx": 402, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
404
+ {"idx": 403, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
405
+ {"idx": 404, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
406
+ {"idx": 405, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
407
+ {"idx": 406, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
408
+ {"idx": 407, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
409
+ {"idx": 408, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
410
+ {"idx": 409, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
411
+ {"idx": 410, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
412
+ {"idx": 411, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
413
+ {"idx": 412, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
414
+ {"idx": 413, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
415
+ {"idx": 414, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
416
+ {"idx": 415, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
417
+ {"idx": 416, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
418
+ {"idx": 417, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
419
+ {"idx": 418, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
420
+ {"idx": 419, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
421
+ {"idx": 420, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
422
+ {"idx": 421, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
423
+ {"idx": 422, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
424
+ {"idx": 423, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
425
+ {"idx": 424, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
426
+ {"idx": 425, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
427
+ {"idx": 426, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
428
+ {"idx": 427, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
429
+ {"idx": 428, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
430
+ {"idx": 429, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
431
+ {"idx": 430, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
432
+ {"idx": 431, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
433
+ {"idx": 432, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
434
+ {"idx": 433, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
435
+ {"idx": 434, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
436
+ {"idx": 435, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
437
+ {"idx": 436, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
438
+ {"idx": 437, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
439
+ {"idx": 438, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
440
+ {"idx": 439, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
441
+ {"idx": 440, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
442
+ {"idx": 441, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
443
+ {"idx": 442, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
444
+ {"idx": 443, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
445
+ {"idx": 444, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
446
+ {"idx": 445, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
447
+ {"idx": 446, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
448
+ {"idx": 447, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
449
+ {"idx": 448, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
450
+ {"idx": 449, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
451
+ {"idx": 450, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
452
+ {"idx": 451, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
453
+ {"idx": 452, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
454
+ {"idx": 453, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
455
+ {"idx": 454, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
456
+ {"idx": 455, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
457
+ {"idx": 456, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
458
+ {"idx": 457, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
459
+ {"idx": 458, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
460
+ {"idx": 459, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
461
+ {"idx": 460, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
462
+ {"idx": 461, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
463
+ {"idx": 462, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
464
+ {"idx": 463, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
465
+ {"idx": 464, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
466
+ {"idx": 465, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
467
+ {"idx": 466, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
468
+ {"idx": 467, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
469
+ {"idx": 468, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
470
+ {"idx": 469, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
471
+ {"idx": 470, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
472
+ {"idx": 471, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
473
+ {"idx": 472, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
474
+ {"idx": 473, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
475
+ {"idx": 474, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
476
+ {"idx": 475, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
477
+ {"idx": 476, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
478
+ {"idx": 477, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
479
+ {"idx": 478, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
480
+ {"idx": 479, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
481
+ {"idx": 480, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
482
+ {"idx": 481, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
483
+ {"idx": 482, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
484
+ {"idx": 483, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
485
+ {"idx": 484, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
486
+ {"idx": 485, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
487
+ {"idx": 486, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
488
+ {"idx": 487, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
489
+ {"idx": 488, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
490
+ {"idx": 489, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
491
+ {"idx": 490, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
492
+ {"idx": 491, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
493
+ {"idx": 492, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
494
+ {"idx": 493, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
495
+ {"idx": 494, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
496
+ {"idx": 495, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
497
+ {"idx": 496, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
498
+ {"idx": 497, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
499
+ {"idx": 498, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
500
+ {"idx": 499, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
outputs/logs/episode_rewards.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
outputs/training/_integration_ckpt/run_summary.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "episodes": 5,
3
+ "log_path": "D:\\Scalar Final\\Final\\ghostexec\\outputs\\training\\_integration_train_smoke.jsonl",
4
+ "first_episode_first_action": {
5
+ "metadata": {},
6
+ "action_type": "reply_email",
7
+ "email_id": "e01",
8
+ "message_body": "On it \u2014 drafting a response and owners now.",
9
+ "meeting_id": "",
10
+ "new_time": "",
11
+ "reason": "",
12
+ "task_id": "",
13
+ "contact_name": "",
14
+ "message": ""
15
+ },
16
+ "last_episode_first_action": {
17
+ "metadata": {},
18
+ "action_type": "reply_email",
19
+ "email_id": "e01",
20
+ "message_body": "On it \u2014 drafting a response and owners now.",
21
+ "meeting_id": "",
22
+ "new_time": "",
23
+ "reason": "",
24
+ "task_id": "",
25
+ "contact_name": "",
26
+ "message": ""
27
+ }
28
+ }
outputs/training/checkpoints/run_summary.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "episodes": 5,
3
+ "log_path": "D:\\Scalar Final\\Final\\ghostexec\\outputs\\training\\episode_returns.jsonl",
4
+ "first_episode_first_action": {
5
+ "metadata": {},
6
+ "action_type": "reply_email",
7
+ "email_id": "e01",
8
+ "message_body": "On it \u2014 drafting a response and owners now.",
9
+ "meeting_id": "",
10
+ "new_time": "",
11
+ "reason": "",
12
+ "task_id": "",
13
+ "contact_name": "",
14
+ "message": ""
15
+ },
16
+ "last_episode_first_action": {
17
+ "metadata": {},
18
+ "action_type": "reply_email",
19
+ "email_id": "e01",
20
+ "message_body": "On it \u2014 drafting a response and owners now.",
21
+ "meeting_id": "",
22
+ "new_time": "",
23
+ "reason": "",
24
+ "task_id": "",
25
+ "contact_name": "",
26
+ "message": ""
27
+ }
28
+ }
outputs/training/episode_returns.jsonl ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
2
+ {"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
3
+ {"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
4
+ {"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
5
+ {"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
6
+ {"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
7
+ {"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
8
+ {"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
9
+ {"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
10
+ {"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
outputs/training/smoke/checkpoints/run_summary.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "episodes": 48,
3
+ "log_path": "D:\\Scalar Final\\Final\\ghostexec\\outputs\\training\\smoke\\reinforce_returns.jsonl",
4
+ "first_episode_first_action": {
5
+ "metadata": {},
6
+ "action_type": "reply_email",
7
+ "email_id": "e01",
8
+ "message_body": "Acknowledged \u2014 working the thread now.",
9
+ "meeting_id": "",
10
+ "new_time": "",
11
+ "reason": "",
12
+ "task_id": "",
13
+ "contact_name": "",
14
+ "message": ""
15
+ },
16
+ "last_episode_first_action": {
17
+ "metadata": {},
18
+ "action_type": "archive_email",
19
+ "email_id": "e06",
20
+ "message_body": "",
21
+ "meeting_id": "",
22
+ "new_time": "",
23
+ "reason": "",
24
+ "task_id": "",
25
+ "contact_name": "",
26
+ "message": ""
27
+ }
28
+ }
outputs/training/smoke/reinforce_returns.jsonl ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.391999999999998, "length": 14, "mean_step_reward": -0.5279999999999998}
2
+ {"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.297919999999997, "length": 14, "mean_step_reward": -0.5212799999999997}
3
+ {"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -9.334079999999998, "length": 14, "mean_step_reward": -0.6667199999999999}
4
+ {"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.641919999999997, "length": 14, "mean_step_reward": -0.6172799999999998}
5
+ {"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.924959999999997, "length": 14, "mean_step_reward": -0.4946399999999998}
6
+ {"episode": 5, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.465919999999997, "length": 14, "mean_step_reward": -0.5332799999999998}
7
+ {"episode": 6, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
8
+ {"episode": 7, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.633919999999996, "length": 14, "mean_step_reward": -0.5452799999999998}
9
+ {"episode": 8, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.998879999999997, "length": 14, "mean_step_reward": -0.4999199999999998}
10
+ {"episode": 9, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
11
+ {"episode": 10, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.998879999999997, "length": 14, "mean_step_reward": -0.4999199999999998}
12
+ {"episode": 11, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.428959999999997, "length": 14, "mean_step_reward": -0.5306399999999998}
13
+ {"episode": 12, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
14
+ {"episode": 13, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
15
+ {"episode": 14, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.297919999999998, "length": 14, "mean_step_reward": -0.5212799999999999}
16
+ {"episode": 15, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.756959999999998, "length": 14, "mean_step_reward": -0.48263999999999985}
17
+ {"episode": 16, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.670879999999997, "length": 14, "mean_step_reward": -0.5479199999999997}
18
+ {"episode": 17, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.174879999999996, "length": 14, "mean_step_reward": -0.5839199999999998}
19
+ {"episode": 18, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
20
+ {"episode": 19, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
21
+ {"episode": 20, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8678399999999975, "length": 14, "mean_step_reward": -0.49055999999999983}
22
+ {"episode": 21, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.551999999999998, "length": 14, "mean_step_reward": -0.46799999999999986}
23
+ {"episode": 22, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
24
+ {"episode": 23, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
25
+ {"episode": 24, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.5889599999999975, "length": 14, "mean_step_reward": -0.47063999999999984}
26
+ {"episode": 25, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
27
+ {"episode": 26, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999997, "length": 14, "mean_step_reward": -0.4852799999999998}
28
+ {"episode": 27, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.5889599999999975, "length": 14, "mean_step_reward": -0.47063999999999984}
29
+ {"episode": 28, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
30
+ {"episode": 29, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
31
+ {"episode": 30, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.924959999999998, "length": 14, "mean_step_reward": -0.49463999999999986}
32
+ {"episode": 31, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
33
+ {"episode": 32, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999997, "length": 14, "mean_step_reward": -0.4852799999999998}
34
+ {"episode": 33, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.830879999999997, "length": 14, "mean_step_reward": -0.4879199999999998}
35
+ {"episode": 34, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
36
+ {"episode": 35, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.559999999999997, "length": 14, "mean_step_reward": -0.5399999999999998}
37
+ {"episode": 36, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.334879999999997, "length": 14, "mean_step_reward": -0.5239199999999998}
38
+ {"episode": 37, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
39
+ {"episode": 38, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.924959999999997, "length": 14, "mean_step_reward": -0.4946399999999998}
40
+ {"episode": 39, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
41
+ {"episode": 40, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
42
+ {"episode": 41, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
43
+ {"episode": 42, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
44
+ {"episode": 43, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8678399999999975, "length": 14, "mean_step_reward": -0.49055999999999983}
45
+ {"episode": 44, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
46
+ {"episode": 45, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.830879999999997, "length": 14, "mean_step_reward": -0.4879199999999998}
47
+ {"episode": 46, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.551999999999998, "length": 14, "mean_step_reward": -0.46799999999999986}
48
+ {"episode": 47, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
outputs/training/test_returns.jsonl ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
2
+ {"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.8019199999999955, "length": 14, "mean_step_reward": -0.5572799999999997}
3
+ {"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.641919999999999, "length": 14, "mean_step_reward": -0.6172799999999999}
4
+ {"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.801919999999996, "length": 14, "mean_step_reward": -0.5572799999999998}
5
+ {"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.735999999999997, "length": 14, "mean_step_reward": -0.6239999999999998}
6
+ {"episode": 5, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
7
+ {"episode": 6, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.137919999999998, "length": 14, "mean_step_reward": -0.5812799999999998}
8
+ {"episode": 7, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.428959999999997, "length": 14, "mean_step_reward": -0.5306399999999998}
9
+ {"episode": 8, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.830879999999997, "length": 14, "mean_step_reward": -0.4879199999999998}
10
+ {"episode": 9, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.334879999999997, "length": 14, "mean_step_reward": -0.5239199999999998}
11
+ {"episode": 10, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.588959999999998, "length": 14, "mean_step_reward": -0.4706399999999999}
12
+ {"episode": 11, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
13
+ {"episode": 12, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999997, "length": 14, "mean_step_reward": -0.4852799999999998}
14
+ {"episode": 13, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
15
+ {"episode": 14, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999998, "length": 14, "mean_step_reward": -0.4852799999999999}
16
+ {"episode": 15, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
17
+ {"episode": 16, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.187039999999998, "length": 14, "mean_step_reward": -0.5133599999999998}
18
+ {"episode": 17, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.719999999999998, "length": 14, "mean_step_reward": -0.47999999999999987}
19
+ {"episode": 18, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.719999999999998, "length": 14, "mean_step_reward": -0.47999999999999987}
20
+ {"episode": 19, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8678399999999975, "length": 14, "mean_step_reward": -0.49055999999999983}
21
+ {"episode": 20, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8510399999999985, "length": 14, "mean_step_reward": -0.4893599999999999}
22
+ {"episode": 21, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.719999999999998, "length": 14, "mean_step_reward": -0.47999999999999987}
23
+ {"episode": 22, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.092959999999998, "length": 14, "mean_step_reward": -0.5066399999999999}
24
+ {"episode": 23, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.887999999999998, "length": 14, "mean_step_reward": -0.4919999999999999}
25
+ {"episode": 24, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
pyproject.toml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-ghostexec"
13
+ version = "0.1.0"
14
+ description = "Ghostexec environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.3",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ # Examples:
24
+ # "numpy>=1.19.0",
25
+ # "torch>=2.0.0",
26
+ # "gymnasium>=0.29.0",
27
+ # "openspiel>=1.0.0",
28
+ # "smolagents>=1.22.0,<2",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest>=8.0.0",
34
+ "pytest-cov>=4.0.0",
35
+ "pyyaml>=6.0.0",
36
+ "matplotlib>=3.8.0",
37
+ ]
38
+ # Optional JSON-schema-constrained decoding backends (pick one).
39
+ constrained = [
40
+ "lm-format-enforcer>=0.10",
41
+ ]
42
+ constrained-outlines = [
43
+ "outlines>=0.1",
44
+ ]
45
+
46
+ [project.scripts]
47
+ # Server entry point - enables running via: uv run --project . server
48
+ # or: python -m ghostexec.server.app
49
+ server = "ghostexec.server.app:main"
50
+
51
+ [tool.setuptools]
52
+ include-package-data = true
53
+ packages = ["ghostexec", "ghostexec.server"]
54
+ package-dir = { "ghostexec" = ".", "ghostexec.server" = "server" }
55
+
56
+ [tool.setuptools.package-data]
57
+ ghostexec = ["scenarios/*.json"]
58
+
59
+ [tool.pytest.ini_options]
60
+ testpaths = ["tests"]
61
+ pythonpath = ["."]
scenarios/dinner_disaster.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "simulation_time": "2026-04-21T18:45:00",
3
+ "stress": 61,
4
+ "active_conflicts": [],
5
+ "action_log": [
6
+ "Client call ran long; dinner reservation at 19:00."
7
+ ],
8
+ "episode_active": true,
9
+ "episode_end_reason": null,
10
+ "max_episode_steps": 40,
11
+ "emails": [
12
+ {
13
+ "id": "d1",
14
+ "sender": "Sarah Chen",
15
+ "subject": "Dinner \u2014 I am at the restaurant",
16
+ "body": "We have held the table until 7:15. Please leave the call.",
17
+ "read": false,
18
+ "replied": false,
19
+ "priority": "high",
20
+ "sender_relationship": "personal"
21
+ },
22
+ {
23
+ "id": "d2",
24
+ "sender": "Taylor Brooks",
25
+ "subject": "Need sign-off tonight",
26
+ "body": "Board-adjacent ask: one paragraph on risk posture before morning.",
27
+ "read": false,
28
+ "replied": false,
29
+ "priority": "critical",
30
+ "sender_relationship": "VIP"
31
+ }
32
+ ],
33
+ "meetings": [
34
+ {
35
+ "id": "dc1",
36
+ "title": "Client escalation call",
37
+ "start": "2026-04-21T17:30:00",
38
+ "duration_minutes": 90,
39
+ "attendees": [
40
+ "David Okonkwo"
41
+ ],
42
+ "location": "virtual",
43
+ "priority": "high",
44
+ "cancelled": false
45
+ },
46
+ {
47
+ "id": "dc2",
48
+ "title": "Dinner reservation window",
49
+ "start": "2026-04-21T19:00:00",
50
+ "duration_minutes": 120,
51
+ "attendees": [
52
+ "Sarah Chen"
53
+ ],
54
+ "location": "Osteria",
55
+ "priority": "normal",
56
+ "cancelled": false
57
+ }
58
+ ],
59
+ "contacts": [
60
+ {
61
+ "name": "Sarah Chen",
62
+ "relationship_type": "spouse",
63
+ "communication_preference": "text",
64
+ "importance": 5,
65
+ "mood": "annoyed"
66
+ },
67
+ {
68
+ "name": "Taylor Brooks",
69
+ "relationship_type": "investor",
70
+ "communication_preference": "call",
71
+ "importance": 4,
72
+ "mood": "neutral"
73
+ },
74
+ {
75
+ "name": "David Okonkwo",
76
+ "relationship_type": "client",
77
+ "communication_preference": "email",
78
+ "importance": 4,
79
+ "mood": "angry"
80
+ },
81
+ {
82
+ "name": "Jordan Lee",
83
+ "relationship_type": "direct_report",
84
+ "communication_preference": "call",
85
+ "importance": 3,
86
+ "mood": "neutral"
87
+ }
88
+ ],
89
+ "tasks": [
90
+ {
91
+ "id": "dt1",
92
+ "description": "Text Sarah ETA for dinner",
93
+ "deadline": "2026-04-21T18:50:00",
94
+ "owner": "Self",
95
+ "status": "pending",
96
+ "effort": "low"
97
+ },
98
+ {
99
+ "id": "dt2",
100
+ "description": "Send Taylor the risk paragraph",
101
+ "deadline": "2026-04-21T23:00:00",
102
+ "owner": "Self",
103
+ "status": "pending",
104
+ "effort": "medium"
105
+ }
106
+ ]
107
+ }
scenarios/monday_morning.json ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "simulation_time": "2026-04-22T07:00:00",
3
+ "stress": 84,
4
+ "active_conflicts": [],
5
+ "action_log": [
6
+ "Monday: board + investor travel + home commitments collide."
7
+ ],
8
+ "episode_active": true,
9
+ "episode_end_reason": null,
10
+ "max_episode_steps": 56,
11
+ "emails": [
12
+ {
13
+ "id": "mb1",
14
+ "sender": "Marcus Webb",
15
+ "subject": "Board deck \u2014 need numbers before market open",
16
+ "body": "We are missing revised Q3 figures. This is blocking the packet.",
17
+ "read": false,
18
+ "replied": false,
19
+ "priority": "critical",
20
+ "sender_relationship": "VIP"
21
+ },
22
+ {
23
+ "id": "mb2",
24
+ "sender": "Marcus Webb",
25
+ "subject": "Re: investor flight \u2014 your call",
26
+ "body": "Taylor is airborne in 4 hours. We need a decision on the alternate slot.",
27
+ "read": false,
28
+ "replied": false,
29
+ "priority": "critical",
30
+ "sender_relationship": "VIP"
31
+ },
32
+ {
33
+ "id": "mb3",
34
+ "sender": "Marcus Webb",
35
+ "subject": "Confidential \u2014 board sentiment",
36
+ "body": "Side channel: two directors are rattled. Your visible handling today matters.",
37
+ "read": false,
38
+ "replied": false,
39
+ "priority": "critical",
40
+ "sender_relationship": "VIP"
41
+ },
42
+ {
43
+ "id": "sp1",
44
+ "sender": "Sarah Chen",
45
+ "subject": "Dinner 7pm \u2014 please confirm",
46
+ "body": "Reservation is 7pm. If you are going to be late again, tell me now.",
47
+ "read": false,
48
+ "replied": false,
49
+ "priority": "high",
50
+ "sender_relationship": "personal"
51
+ },
52
+ {
53
+ "id": "fl1",
54
+ "sender": "Alex Rivera",
55
+ "subject": "Flight cancelled \u2014 investor dinner at risk",
56
+ "body": "Airline put me on a 6am tomorrow. Investor dinner prep is slipping.",
57
+ "read": false,
58
+ "replied": false,
59
+ "priority": "high",
60
+ "sender_relationship": "professional"
61
+ }
62
+ ],
63
+ "meetings": [
64
+ {
65
+ "id": "mx1",
66
+ "title": "Commitment block 1",
67
+ "start": "2026-04-22T07:00:00",
68
+ "duration_minutes": 60,
69
+ "attendees": [
70
+ "Marcus Webb"
71
+ ],
72
+ "location": "virtual",
73
+ "priority": "critical",
74
+ "cancelled": false
75
+ },
76
+ {
77
+ "id": "mx2",
78
+ "title": "Commitment block 2",
79
+ "start": "2026-04-22T07:00:00",
80
+ "duration_minutes": 60,
81
+ "attendees": [
82
+ "Taylor Brooks"
83
+ ],
84
+ "location": "virtual",
85
+ "priority": "critical",
86
+ "cancelled": false
87
+ },
88
+ {
89
+ "id": "mx3",
90
+ "title": "Commitment block 3",
91
+ "start": "2026-04-22T07:00:00",
92
+ "duration_minutes": 60,
93
+ "attendees": [
94
+ "Marcus Webb"
95
+ ],
96
+ "location": "virtual",
97
+ "priority": "high",
98
+ "cancelled": false
99
+ },
100
+ {
101
+ "id": "mx4",
102
+ "title": "Commitment block 4",
103
+ "start": "2026-04-22T07:00:00",
104
+ "duration_minutes": 60,
105
+ "attendees": [
106
+ "Taylor Brooks"
107
+ ],
108
+ "location": "virtual",
109
+ "priority": "high",
110
+ "cancelled": false
111
+ },
112
+ {
113
+ "id": "mx5",
114
+ "title": "Commitment block 5",
115
+ "start": "2026-04-22T07:00:00",
116
+ "duration_minutes": 60,
117
+ "attendees": [
118
+ "Marcus Webb"
119
+ ],
120
+ "location": "virtual",
121
+ "priority": "high",
122
+ "cancelled": false
123
+ },
124
+ {
125
+ "id": "mx6",
126
+ "title": "Commitment block 6",
127
+ "start": "2026-04-22T07:00:00",
128
+ "duration_minutes": 60,
129
+ "attendees": [
130
+ "Taylor Brooks"
131
+ ],
132
+ "location": "virtual",
133
+ "priority": "high",
134
+ "cancelled": false
135
+ },
136
+ {
137
+ "id": "m_pm",
138
+ "title": "Afternoon sync",
139
+ "start": "2026-04-22T15:00:00",
140
+ "duration_minutes": 30,
141
+ "attendees": [
142
+ "Jordan Lee"
143
+ ],
144
+ "location": "virtual",
145
+ "priority": "normal",
146
+ "cancelled": false
147
+ }
148
+ ],
149
+ "contacts": [
150
+ {
151
+ "name": "Marcus Webb",
152
+ "relationship_type": "board_member",
153
+ "communication_preference": "email",
154
+ "importance": 5,
155
+ "mood": "angry"
156
+ },
157
+ {
158
+ "name": "Sarah Chen",
159
+ "relationship_type": "spouse",
160
+ "communication_preference": "text",
161
+ "importance": 5,
162
+ "mood": "annoyed"
163
+ },
164
+ {
165
+ "name": "Taylor Brooks",
166
+ "relationship_type": "investor",
167
+ "communication_preference": "call",
168
+ "importance": 4,
169
+ "mood": "neutral"
170
+ },
171
+ {
172
+ "name": "Alex Rivera",
173
+ "relationship_type": "direct_report",
174
+ "communication_preference": "text",
175
+ "importance": 3,
176
+ "mood": "annoyed"
177
+ },
178
+ {
179
+ "name": "Jordan Lee",
180
+ "relationship_type": "direct_report",
181
+ "communication_preference": "call",
182
+ "importance": 3,
183
+ "mood": "happy"
184
+ },
185
+ {
186
+ "name": "Priya Sharma",
187
+ "relationship_type": "investor",
188
+ "communication_preference": "email",
189
+ "importance": 5,
190
+ "mood": "annoyed"
191
+ },
192
+ {
193
+ "name": "Elena Vogt",
194
+ "relationship_type": "team_member",
195
+ "communication_preference": "email",
196
+ "importance": 3,
197
+ "mood": "neutral"
198
+ },
199
+ {
200
+ "name": "David Okonkwo",
201
+ "relationship_type": "client",
202
+ "communication_preference": "email",
203
+ "importance": 4,
204
+ "mood": "neutral"
205
+ }
206
+ ],
207
+ "tasks": [
208
+ {
209
+ "id": "ov1",
210
+ "description": "Finalize board packet figures",
211
+ "deadline": "2026-04-22T06:00:00",
212
+ "owner": "Self",
213
+ "status": "pending",
214
+ "effort": "high"
215
+ },
216
+ {
217
+ "id": "ov2",
218
+ "description": "Callback legal on redlines",
219
+ "deadline": "2026-04-22T05:30:00",
220
+ "owner": "Self",
221
+ "status": "pending",
222
+ "effort": "medium"
223
+ },
224
+ {
225
+ "id": "ov3",
226
+ "description": "Approve investor comms draft",
227
+ "deadline": "2026-04-22T06:15:00",
228
+ "owner": "Self",
229
+ "status": "pending",
230
+ "effort": "high"
231
+ },
232
+ {
233
+ "id": "ov4",
234
+ "description": "Expense report sign-off",
235
+ "deadline": "2026-04-22T04:00:00",
236
+ "owner": "Self",
237
+ "status": "overdue",
238
+ "effort": "low"
239
+ },
240
+ {
241
+ "id": "ov5",
242
+ "description": "Brief EA on calendar triage",
243
+ "deadline": "2026-04-22T05:00:00",
244
+ "owner": "Self",
245
+ "status": "overdue",
246
+ "effort": "low"
247
+ },
248
+ {
249
+ "id": "fu1",
250
+ "description": "Team social RSVP",
251
+ "deadline": "2026-04-25T12:00:00",
252
+ "owner": "Self",
253
+ "status": "pending",
254
+ "effort": "low"
255
+ }
256
+ ]
257
+ }
scenarios/phase2_core.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "simulation_time": "2026-04-21T08:00:00",
3
+ "stress": 52,
4
+ "active_conflicts": [],
5
+ "action_log": [],
6
+ "episode_active": true,
7
+ "episode_end_reason": null,
8
+ "emails": [
9
+ {"id": "e01", "sender": "Marcus Webb", "subject": "RE: Q3 Numbers — we need to talk", "body": "Board expects revised figures before noon. This is urgent.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
10
+ {"id": "e02", "sender": "Sarah Chen", "subject": "Dinner tonight", "body": "Reservation at 7pm — please confirm you will make it.", "read": false, "replied": false, "priority": "high", "sender_relationship": "personal"},
11
+ {"id": "e03", "sender": "Priya Sharma", "subject": "Still waiting on the deck", "body": "Investor read-through is tomorrow. Where is the version you promised?", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
12
+ {"id": "e04", "sender": "Legal", "subject": "Contract redlines due", "body": "Please return comments on the MSA by EOD.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
13
+ {"id": "e05", "sender": "Jordan Lee", "subject": "Quick question on roadmap", "body": "Can we grab 10 minutes before standup?", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
14
+ {"id": "e06", "sender": "Alex Rivera", "subject": "Flight cancelled — options?", "body": "Airline moved me to a 6am tomorrow. Need guidance.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
15
+ {"id": "e07", "sender": "HR Benefits", "subject": "Open enrollment reminder", "body": "Friendly reminder window closes Friday.", "read": true, "replied": false, "priority": "low", "sender_relationship": "professional"},
16
+ {"id": "e08", "sender": "David Okonkwo", "subject": "Angry about last meeting", "body": "We were not heard. Expect a follow-up call.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
17
+ {"id": "e09", "sender": "Newsletter", "subject": "Your weekly digest", "body": "Top stories in tech leadership.", "read": false, "replied": false, "priority": "low", "sender_relationship": "unknown"},
18
+ {"id": "e10", "sender": "Elena Vogt", "subject": "Board prep materials", "body": "Slides uploaded to the secure folder.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
19
+ {"id": "e11", "sender": "Chris Park", "subject": "Lunch?", "body": "Want to grab something casual near the office?", "read": false, "replied": false, "priority": "low", "sender_relationship": "personal"},
20
+ {"id": "e12", "sender": "Morgan Blake", "subject": "RE: Budget variance", "body": "Finance needs sign-off today or we slip the quarter.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
21
+ {"id": "e13", "sender": "IT Security", "subject": "Password rotation", "body": "Your account expires in 48 hours.", "read": true, "replied": true, "priority": "normal", "sender_relationship": "professional"},
22
+ {"id": "e14", "sender": "Jamie Liu", "subject": "Sprint demo feedback", "body": "Mostly positive — a few UX nits to track.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
23
+ {"id": "e15", "sender": "Taylor Brooks", "subject": "Investor dinner follow-up", "body": "Thanks for last night — next steps attached.", "read": false, "replied": false, "priority": "high", "sender_relationship": "VIP"},
24
+ {"id": "e16", "sender": "Operations", "subject": "Incident report #4421", "body": "Minor outage resolved; postmortem scheduled.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
25
+ {"id": "e17", "sender": "Riley Santos", "subject": "Can you approve PTO?", "body": "Team coverage looks fine for next week.", "read": false, "replied": false, "priority": "low", "sender_relationship": "professional"},
26
+ {"id": "e18", "sender": "Noah Patel", "subject": "Vendor pricing", "body": "They moved numbers again — need a decision.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
27
+ {"id": "e19", "sender": "Calendar Bot", "subject": "You have 12 conflicts today", "body": "Automated summary of overlapping meetings.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "unknown"},
28
+ {"id": "e20", "sender": "Casey Nguyen", "subject": "Design review moved", "body": "We shifted to 3pm — hope that works.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
29
+ {"id": "e21", "sender": "Samira Haddad", "subject": "Formal complaint received", "body": "Please acknowledge receipt per policy.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "professional"},
30
+ {"id": "e22", "sender": "Spouse", "subject": "Kids pickup", "body": "I have a dentist appointment — can you cover 4pm?", "read": false, "replied": false, "priority": "high", "sender_relationship": "personal"},
31
+ {"id": "e23", "sender": "Marketing", "subject": "Launch blog draft", "body": "Casual tone OK? LMK by 2pm.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
32
+ {"id": "e24", "sender": "Vikram Singh", "subject": "Partnership term sheet", "body": "Legal asked for your eyes on section 4 only.", "read": false, "replied": false, "priority": "high", "sender_relationship": "VIP"},
33
+ {"id": "e25", "sender": "Facilities", "subject": "Office move checklist", "body": "Please label your boxes by Friday.", "read": true, "replied": false, "priority": "low", "sender_relationship": "professional"},
34
+ {"id": "e26", "sender": "Quinn Murphy", "subject": "Sorry for the tone earlier", "body": "Rough morning — can we reset?", "read": false, "replied": false, "priority": "normal", "sender_relationship": "personal"},
35
+ {"id": "e27", "sender": "Board Secretary", "subject": "Confidential — agenda", "body": "Materials under embargo until 5pm.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
36
+ {"id": "e28", "sender": "Recruiting", "subject": "VP Eng loop feedback", "body": "Candidate availability Thursday.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
37
+ {"id": "e29", "sender": "Avery Cole", "subject": "Weekend golf?", "body": "Totally casual — no pressure.", "read": false, "replied": false, "priority": "low", "sender_relationship": "personal"},
38
+ {"id": "e30", "sender": "Support", "subject": "Ticket #9982 closed", "body": "Your laptop swap is complete.", "read": true, "replied": false, "priority": "low", "sender_relationship": "unknown"}
39
+ ],
40
+ "meetings": [
41
+ {"id": "m01", "title": "Board Call", "start": "2026-04-21T09:00:00", "duration_minutes": 60, "attendees": ["Marcus Webb", "Elena Vogt"], "location": "virtual", "priority": "critical", "cancelled": false},
42
+ {"id": "m02", "title": "Client Demo", "start": "2026-04-21T09:00:00", "duration_minutes": 60, "attendees": ["David Okonkwo"], "location": "virtual", "priority": "high", "cancelled": false},
43
+ {"id": "m03", "title": "Coffee with Jordan", "start": "2026-04-21T10:30:00", "duration_minutes": 30, "attendees": ["Jordan Lee"], "location": "Cafe North", "priority": "low", "cancelled": false},
44
+ {"id": "m04", "title": "Team Standup", "start": "2026-04-21T11:00:00", "duration_minutes": 30, "attendees": ["Jordan Lee", "Jamie Liu"], "location": "virtual", "priority": "normal", "cancelled": false},
45
+ {"id": "m05", "title": "Lunch with Priya", "start": "2026-04-21T11:00:00", "duration_minutes": 90, "attendees": ["Priya Sharma"], "location": "Osteria", "priority": "high", "cancelled": false},
46
+ {"id": "m06", "title": "1:1 Avery", "start": "2026-04-21T13:00:00", "duration_minutes": 60, "attendees": ["Avery Cole"], "location": "Office 12B", "priority": "normal", "cancelled": false},
47
+ {"id": "m07", "title": "Investor Update", "start": "2026-04-21T14:00:00", "duration_minutes": 60, "attendees": ["Taylor Brooks"], "location": "virtual", "priority": "critical", "cancelled": false},
48
+ {"id": "m08", "title": "Legal Review", "start": "2026-04-21T14:00:00", "duration_minutes": 60, "attendees": ["Legal"], "location": "virtual", "priority": "high", "cancelled": false},
49
+ {"id": "m09", "title": "Sales QBR", "start": "2026-04-21T15:00:00", "duration_minutes": 60, "attendees": ["Noah Patel"], "location": "virtual", "priority": "normal", "cancelled": false},
50
+ {"id": "m10", "title": "Ops Incident Review", "start": "2026-04-21T15:30:00", "duration_minutes": 60, "attendees": ["Operations"], "location": "virtual", "priority": "high", "cancelled": false}
51
+ ],
52
+ "contacts": [
53
+ {"name": "Marcus Webb", "relationship_type": "board_member", "communication_preference": "email", "importance": 5, "mood": "angry"},
54
+ {"name": "Sarah Chen", "relationship_type": "spouse", "communication_preference": "text", "importance": 5, "mood": "neutral"},
55
+ {"name": "Priya Sharma", "relationship_type": "investor", "communication_preference": "email", "importance": 5, "mood": "annoyed"},
56
+ {"name": "Jordan Lee", "relationship_type": "direct_report", "communication_preference": "call", "importance": 3, "mood": "happy"},
57
+ {"name": "David Okonkwo", "relationship_type": "client", "communication_preference": "email", "importance": 4, "mood": "angry"},
58
+ {"name": "Elena Vogt", "relationship_type": "team_member", "communication_preference": "email", "importance": 3, "mood": "neutral"},
59
+ {"name": "Taylor Brooks", "relationship_type": "investor", "communication_preference": "call", "importance": 4, "mood": "neutral"},
60
+ {"name": "Alex Rivera", "relationship_type": "direct_report", "communication_preference": "text", "importance": 2, "mood": "neutral"},
61
+ {"name": "Jamie Liu", "relationship_type": "team_member", "communication_preference": "email", "importance": 2, "mood": "happy"},
62
+ {"name": "Chris Park", "relationship_type": "friend", "communication_preference": "text", "importance": 2, "mood": "happy"},
63
+ {"name": "Morgan Blake", "relationship_type": "board_member", "communication_preference": "email", "importance": 5, "mood": "neutral"},
64
+ {"name": "Riley Santos", "relationship_type": "direct_report", "communication_preference": "email", "importance": 3, "mood": "neutral"},
65
+ {"name": "Noah Patel", "relationship_type": "client", "communication_preference": "email", "importance": 4, "mood": "annoyed"},
66
+ {"name": "Casey Nguyen", "relationship_type": "team_member", "communication_preference": "email", "importance": 2, "mood": "neutral"},
67
+ {"name": "Vikram Singh", "relationship_type": "investor", "communication_preference": "email", "importance": 4, "mood": "neutral"}
68
+ ],
69
+ "tasks": [
70
+ {"id": "t01", "description": "Send Q3 deck to Marcus", "deadline": "2026-04-21T09:30:00", "owner": "Marcus Webb", "status": "pending", "effort": "high"},
71
+ {"id": "t02", "description": "Confirm dinner reservation", "deadline": "2026-04-21T10:00:00", "owner": "Sarah Chen", "status": "pending", "effort": "low"},
72
+ {"id": "t03", "description": "Rebook investor flight", "deadline": "2026-04-21T11:00:00", "owner": "Alex Rivera", "status": "pending", "effort": "medium"},
73
+ {"id": "t04", "description": "Legal MSA redlines", "deadline": "2026-04-21T17:00:00", "owner": "Legal", "status": "in-progress", "effort": "high"},
74
+ {"id": "t05", "description": "Approve vendor SOW", "deadline": "2026-04-22T12:00:00", "owner": "Noah Patel", "status": "pending", "effort": "medium"},
75
+ {"id": "t06", "description": "Prep board talking points", "deadline": "2026-04-21T08:30:00", "owner": "Self", "status": "pending", "effort": "high"},
76
+ {"id": "t07", "description": "Expense report Q1", "deadline": "2026-04-25T23:59:59", "owner": "Finance", "status": "pending", "effort": "low"},
77
+ {"id": "t08", "description": "Callback David Okonkwo", "deadline": "2026-04-21T12:00:00", "owner": "Self", "status": "pending", "effort": "low"},
78
+ {"id": "t09", "description": "Review design mocks", "deadline": "2026-04-21T15:00:00", "owner": "Casey Nguyen", "status": "done", "effort": "medium"},
79
+ {"id": "t10", "description": "Submit benefits election", "deadline": "2026-04-28T23:59:59", "owner": "HR", "status": "pending", "effort": "low"},
80
+ {"id": "t11", "description": "Brief PR on launch timing", "deadline": "2026-04-21T14:00:00", "owner": "Marketing", "status": "pending", "effort": "medium"},
81
+ {"id": "t12", "description": "Sign birthday card for Avery", "deadline": "2026-04-21T16:00:00", "owner": "Jamie Liu", "status": "pending", "effort": "low"}
82
+ ]
83
+ }
scenarios/schema_drift_test.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "description": "Patronus-style schema drift bundle (three mid-episode rule changes).",
3
+ "events": [
4
+ {
5
+ "after_step": 1,
6
+ "shift_all_meetings_hours": 1,
7
+ "comment": "Scenario 1: calendar system shifts all local meeting times by +1 hour."
8
+ },
9
+ {
10
+ "after_step": 2,
11
+ "set_contact_preference": {
12
+ "name": "Sarah Chen",
13
+ "communication_preference": "text"
14
+ },
15
+ "comment": "Scenario 2: VIP/personal contact switches to text-only preference."
16
+ },
17
+ {
18
+ "after_step": 3,
19
+ "set_task_deadline": {
20
+ "task_id": "t02",
21
+ "deadline": "2026-04-21T07:00:00"
22
+ },
23
+ "suppress_reply_relationship_for_senders": ["Marcus Webb"],
24
+ "comment": "Scenario 3: task deadline moved earlier; Marcus email replies yield zero relationship score."
25
+ }
26
+ ]
27
+ }
scenarios/vip_meltdown.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "simulation_time": "2026-04-21T09:00:00",
3
+ "stress": 44,
4
+ "active_conflicts": [],
5
+ "action_log": [
6
+ "VIP meltdown demo: external pressure escalates if ignored."
7
+ ],
8
+ "episode_active": true,
9
+ "episode_end_reason": null,
10
+ "max_episode_steps": 24,
11
+ "emails": [
12
+ {
13
+ "id": "v1",
14
+ "sender": "Taylor Brooks",
15
+ "subject": "We need alignment now",
16
+ "body": "Neutral opening: waiting on your stance before we brief others.",
17
+ "read": false,
18
+ "replied": false,
19
+ "priority": "critical",
20
+ "sender_relationship": "VIP"
21
+ }
22
+ ],
23
+ "meetings": [
24
+ {
25
+ "id": "vm1",
26
+ "title": "Investor sync",
27
+ "start": "2026-04-21T10:00:00",
28
+ "duration_minutes": 45,
29
+ "attendees": [
30
+ "Taylor Brooks"
31
+ ],
32
+ "location": "virtual",
33
+ "priority": "high",
34
+ "cancelled": false
35
+ }
36
+ ],
37
+ "contacts": [
38
+ {
39
+ "name": "Taylor Brooks",
40
+ "relationship_type": "investor",
41
+ "communication_preference": "call",
42
+ "importance": 4,
43
+ "mood": "neutral"
44
+ },
45
+ {
46
+ "name": "Jordan Lee",
47
+ "relationship_type": "direct_report",
48
+ "communication_preference": "email",
49
+ "importance": 3,
50
+ "mood": "happy"
51
+ }
52
+ ],
53
+ "tasks": [
54
+ {
55
+ "id": "vt1",
56
+ "description": "Prep one-pager for Taylor",
57
+ "deadline": "2026-04-21T11:00:00",
58
+ "owner": "Self",
59
+ "status": "pending",
60
+ "effort": "medium"
61
+ }
62
+ ]
63
+ }
scenarios/vip_meltdown_drift.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "events": [
3
+ {
4
+ "after_step": 1,
5
+ "set_contact_mood": {
6
+ "name": "Taylor Brooks",
7
+ "mood": "annoyed"
8
+ }
9
+ },
10
+ {
11
+ "after_step": 2,
12
+ "set_contact_mood": {
13
+ "name": "Taylor Brooks",
14
+ "mood": "angry"
15
+ }
16
+ },
17
+ {
18
+ "after_step": 3,
19
+ "set_contact_mood": {
20
+ "name": "Taylor Brooks",
21
+ "mood": "furious"
22
+ }
23
+ }
24
+ ]
25
+ }
scripts/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Makes ``scripts.*`` importable when repo root is on PYTHONPATH (pytest).
scripts/http_endpoint_smoke.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ #
4
+ # CLI: hit GhostExec HTTP endpoints (live URL or --local in-process app).
5
+ #
6
+ # uv run python scripts/http_endpoint_smoke.py --local
7
+ # uv run python scripts/http_endpoint_smoke.py --url http://127.0.0.1:8000
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import sys
14
+ import urllib.error
15
+ import urllib.request
16
+ from typing import Any
17
+ from urllib.parse import urljoin
18
+
19
+ ROOT = __import__("pathlib").Path(__file__).resolve().parents[1]
20
+ if str(ROOT) not in sys.path:
21
+ sys.path.insert(0, str(ROOT))
22
+
23
+
24
+ def _print_curl(base: str) -> None:
25
+ print("# --- copy/paste (bash) ---")
26
+ for method, path in [
27
+ ("GET", "/health"),
28
+ ("GET", "/metadata"),
29
+ ("GET", "/state"),
30
+ ("GET", "/schema"),
31
+ ("GET", "/openapi.json"),
32
+ ]:
33
+ print(f"curl -sS -X {method} '{base.rstrip('/')}{path}' | head -c 200 && echo")
34
+ print(
35
+ "curl -sS -X POST '{base}/reset' -H 'Content-Type: application/json' -d '{{}}' | head -c 300 && echo".format(
36
+ base=base.rstrip("/")
37
+ )
38
+ )
39
+ print(
40
+ "curl -sS -X POST '{base}/step' -H 'Content-Type: application/json' "
41
+ "-d '{{\"action\":{{\"action_type\":\"do_nothing\"}}}}' | head -c 300 && echo".format(base=base.rstrip("/"))
42
+ )
43
+ print(
44
+ "# Note: HTTP uses a new env per request — not one multi-step episode; use WebSocket /ws for that."
45
+ )
46
+
47
+
48
class LiveClient:
    """Minimal ``urllib``-based client for a running GhostExec server."""

    def __init__(self, base: str) -> None:
        # Keep the base URL without trailing slashes; request() adds exactly one.
        self.base = base.rstrip("/")

    def request(
        self,
        method: str,
        path: str,
        *,
        data: bytes | None = None,
        headers: dict[str, str] | None = None,
    ) -> tuple[int, str]:
        """Send one HTTP request and return ``(status_code, body_text)``.

        Args:
            method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
            path: Endpoint path; leading slashes are ignored when joining.
            data: Optional raw request body.
            headers: Optional request headers.

        HTTP error statuses are returned as ``(code, body)`` rather than
        raised. Other failures (for example an unreachable host) propagate.
        The body is decoded with undecodable bytes replaced.
        """
        target = urljoin(f"{self.base}/", path.lstrip("/"))
        outgoing = urllib.request.Request(
            target,
            data=data,
            headers=headers if headers else {},
            method=method,
        )
        try:
            with urllib.request.urlopen(outgoing, timeout=20) as response:
                status = response.status
                text = response.read().decode(errors="replace")
                return status, text
        except urllib.error.HTTPError as http_err:
            # 4xx/5xx still carry a readable body; surface it to the caller.
            return http_err.code, http_err.read().decode(errors="replace")
67
+
68
+
69
class LocalClient:
    """In-process client that drives the GhostExec FastAPI app via TestClient."""

    def __init__(self) -> None:
        # These imports run at construction time rather than at module import.
        from fastapi.testclient import TestClient

        from ghostexec.server.app import app

        self._client = TestClient(app, raise_server_exceptions=True)

    def request(
        self,
        method: str,
        path: str,
        *,
        data: bytes | None = None,
        headers: dict[str, str] | None = None,
    ) -> tuple[int, str]:
        """Send one request through the test client.

        Args:
            method: HTTP verb.
            path: Endpoint path, passed to the test client unchanged.
            data: Optional raw body; forwarded as ``content`` only when given.
            headers: Optional headers; an empty dict is sent when omitted.

        Returns:
            ``(status_code, response_text)``.
        """
        call_kwargs: dict[str, Any] = {"headers": headers or {}}
        if data is not None:
            call_kwargs["content"] = data
        response = self._client.request(method, path, **call_kwargs)
        return response.status_code, response.text
92
+
93
+
94
def main() -> int:
    """Run the GhostExec HTTP endpoint smoke checks from the command line.

    Flags:
        --url: base URL of a live server (ignored with ``--local``).
        --local: use the in-process FastAPI test client instead of a server.
        --print-curl: print example curl commands and return 0 immediately.

    Checks, in order: GET on the read-only endpoints (any 2xx passes),
    POST /reset (200), POST /step with ``do_nothing`` (200), POST /mcp
    ``tools/list`` (200), and GET /reset (must be 405).

    Returns:
        0 when every check passes.

    Raises:
        SystemExit: with code 1 on the first failed check. A server that
            cannot be reached, or a /reset body that is not JSON, is reported
            as a failed check instead of surfacing as a raw traceback.
    """
    p = argparse.ArgumentParser(description="GhostExec HTTP endpoint smoke (CLI).")
    p.add_argument(
        "--url",
        default="http://127.0.0.1:8000",
        help="Live server base URL (ignored with --local).",
    )
    p.add_argument(
        "--local",
        action="store_true",
        help="Use in-process FastAPI TestClient (no server required).",
    )
    p.add_argument(
        "--print-curl",
        action="store_true",
        help="Print example curl commands and exit 0.",
    )
    args = p.parse_args()

    if args.print_curl:
        _print_curl(args.url)
        return 0

    client: LiveClient | LocalClient
    label: str
    if args.local:
        client = LocalClient()
        label = "local TestClient"
    else:
        client = LiveClient(args.url)
        label = args.url

    def report(ok: bool, what: str) -> None:
        # Single place for the "[OK]/[FAIL] ..." line and the exit-on-failure rule.
        print(f"[{'OK' if ok else 'FAIL'}] {what}")
        if not ok:
            raise SystemExit(1)

    def run_checks() -> None:
        for path in (
            "/health",
            "/metadata",
            "/state",
            "/schema",
            "/openapi.json",
            "/docs",
            "/redoc",
        ):
            code, body = client.request("GET", path)
            report(200 <= code < 300, f"GET {path} -> HTTP {code} (body ~{len(body)} chars)")

        hdrs = {"Content-Type": "application/json"}
        code, txt = client.request("POST", "/reset", data=json.dumps({}).encode(), headers=hdrs)
        report(code == 200, f"POST /reset -> HTTP {code}")
        try:
            parsed = json.loads(txt)
        except json.JSONDecodeError:
            parsed = None
            report(False, "POST /reset returned a body that is not valid JSON")
        observation = parsed.get("observation") if isinstance(parsed, dict) else None
        if not isinstance(observation, dict):
            observation = {}
        # `or ""` covers an explicit JSON null, which a .get() default does not.
        em = str(observation.get("echoed_message") or "")[:50]
        print(f" briefing prefix: {em!r}")

        step_payload = json.dumps({"action": {"action_type": "do_nothing"}}).encode()
        code2, _ = client.request("POST", "/step", data=step_payload, headers=hdrs)
        report(code2 == 200, f"POST /step do_nothing -> HTTP {code2}")

        print(
            "\nNote: OpenEnv HTTP may use a new env per request, so separate POSTs do not advance "
            "one long episode; each POST /step runs a single action on a fresh instance. "
            "Multi-step learning on one episode: WebSocket /ws (see ghostexec/README.md)."
        )

        mcp_payload = json.dumps(
            {"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}}
        ).encode()
        code3, _ = client.request("POST", "/mcp", data=mcp_payload, headers=hdrs)
        report(code3 == 200, f"POST /mcp tools/list -> HTTP {code3}")

        # /reset is POST-only, so a GET must be rejected with 405.
        code4, _ = client.request("GET", "/reset")
        report(code4 == 405, f"GET /reset (expect 405) -> HTTP {code4}")

    print(f"GhostExec HTTP smoke ({label})\n")

    try:
        run_checks()
    except (urllib.error.URLError, TimeoutError) as exc:
        # Connection refused, DNS failure, or timeout: not an HTTP status, so
        # LiveClient.request does not turn it into a (code, body) pair.
        print(f"[FAIL] cannot reach {label}: {getattr(exc, 'reason', exc)}")
        raise SystemExit(1) from exc

    print("\nAll checks passed.")
    return 0
181
+
182
+
183
+ if __name__ == "__main__":
184
+ raise SystemExit(main())
scripts/run_live_api_dead_500.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Run 500+ LIVE HTTP API reward dead-tests against a running GhostExec server.
2
+
3
+ Usage:
4
+ uv run python scripts/run_live_api_dead_500.py --url http://127.0.0.1:8002 --cases 500
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import json
11
+ from pathlib import Path
12
+ from typing import Any
13
+ from urllib.parse import urljoin
14
+ import urllib.error
15
+ import urllib.request
16
+
17
+ W_CONFLICT = 0.35
18
+ W_REL = 0.35
19
+ W_TASK = 0.30
20
+ OUTPUT_SCALE = 0.48
21
+
22
+
23
+ def _request(
24
+ base_url: str,
25
+ method: str,
26
+ path: str,
27
+ *,
28
+ body: dict[str, Any] | None = None,
29
+ timeout: float = 20.0,
30
+ ) -> tuple[int, str]:
31
+ data = None
32
+ headers = {"Accept": "application/json"}
33
+ if body is not None:
34
+ data = json.dumps(body).encode()
35
+ headers["Content-Type"] = "application/json"
36
+ req = urllib.request.Request(
37
+ urljoin(base_url.rstrip("/") + "/", path.lstrip("/")),
38
+ data=data,
39
+ headers=headers,
40
+ method=method,
41
+ )
42
+ try:
43
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
44
+ return resp.status, resp.read().decode(errors="replace")
45
+ except urllib.error.HTTPError as e:
46
+ return e.code, e.read().decode(errors="replace")
47
+
48
+
49
+ def _step_payload_for(i: int) -> dict[str, Any]:
50
+ templates: list[dict[str, Any]] = [
51
+ {"action": {"action_type": "do_nothing"}},
52
+ {"action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}},
53
+ {"action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}},
54
+ {"action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}},
55
+ {"action": {"action_type": "archive_email", "email_id": "e09"}},
56
+ {"action": {"action_type": "archive_email", "email_id": "bad_id"}},
57
+ {
58
+ "action": {
59
+ "action_type": "reschedule_meeting",
60
+ "meeting_id": "m02",
61
+ "new_time": "2026-04-21T18:00:00",
62
+ }
63
+ },
64
+ {
65
+ "action": {
66
+ "action_type": "reschedule_meeting",
67
+ "meeting_id": "m03",
68
+ "new_time": "2026-04-21T09:30:00", # overlap -> invalid semantic
69
+ }
70
+ },
71
+ {"action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}},
72
+ {"action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}},
73
+ {"action": {"action_type": "complete_task", "task_id": "t07"}},
74
+ {"action": {"action_type": "complete_task", "task_id": "t09"}}, # already done
75
+ {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}},
76
+ {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}},
77
+ {
78
+ "action": {
79
+ "action_type": "send_message",
80
+ "contact_name": "Jamie Liu",
81
+ "message_body": "Quick sync please.",
82
+ }
83
+ },
84
+ {
85
+ "action": {
86
+ "action_type": "send_message",
87
+ "contact_name": "Nobody",
88
+ "message_body": "hello",
89
+ }
90
+ },
91
+ ]
92
+ return templates[i % len(templates)]
93
+
94
+
95
def _assert_api_surface(base_url: str) -> None:
    """Verify the server's basic HTTP surface before running the test cases.

    Checks that the read-only endpoints answer 200, that GET on the POST-only
    /reset and /step endpoints answers 405, that an unknown path answers 404,
    and that POST /mcp ``tools/list`` answers 200.

    Args:
        base_url: Base URL of the running server.

    Raises:
        AssertionError: on the first unexpected status code. The error is
            raised explicitly rather than via ``assert`` so the checks still
            run under ``python -O``, which strips assert statements.
    """

    def expect(method: str, path: str, want: int, message: str | None = None, **kwargs: Any) -> None:
        code = _request(base_url, method, path, **kwargs)[0]
        if code != want:
            raise AssertionError(message.format(code=code) if message else f"{method} {path} -> {code} (expected {want})")

    for path in ("/health", "/metadata", "/state", "/schema", "/openapi.json", "/docs", "/redoc"):
        expect("GET", path, 200, message=f"{path} -> {{code}}")
    expect("GET", "/reset", 405)
    expect("GET", "/step", 405)
    expect("GET", "/this-path-should-not-exist-ghostexec", 404)
    expect("POST", "/mcp", 200, body={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}})
103
+
104
+
105
def main() -> int:
    """Run the live /reset + /step reward checks and write a JSONL report.

    Flags:
        --url: base URL of the running server.
        --cases: number of /reset+/step cases (values below 1 are raised to 1).

    For each case the server is reset, one action from ``_step_payload_for``
    is stepped, and the returned ``reward_breakdown`` is checked for internal
    consistency: reward equals ``final``, ``weighted_base`` matches the
    weighted sum of the conflict/relationship/task terms, ``final`` equals the
    sum of its components, and the do-nothing and invalid-step adjustments
    have the expected values. One record per case is written to
    ``outputs/logs/api_dead_live_<cases>.jsonl``.

    Returns:
        0 when every case passes, 1 otherwise.

    Raises:
        AssertionError: if the preliminary API-surface check fails.
    """
    import math  # needed only for the tolerance comparisons below

    p = argparse.ArgumentParser(description="Run live 500+ reward dead-tests.")
    p.add_argument("--url", default="http://127.0.0.1:8002", help="Base server URL")
    p.add_argument("--cases", type=int, default=500, help="Number of /reset+/step cases")
    args = p.parse_args()

    base_url = args.url.rstrip("/")
    cases = max(1, args.cases)

    def require(ok: bool, message: str) -> None:
        # Explicit raise instead of `assert`, so checks survive `python -O`.
        if not ok:
            raise AssertionError(message)

    def same(a: float, b: float) -> bool:
        # The expected values are recomputed here in floating point; exact ==
        # would fail on any rounding or summation-order difference.
        return math.isclose(a, b, rel_tol=1e-9, abs_tol=1e-9)

    _assert_api_surface(base_url)

    out_dir = Path("outputs") / "logs"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"api_dead_live_{cases}.jsonl"

    passed = 0
    failed = 0
    failures: list[str] = []

    with out_path.open("w", encoding="utf-8") as f:
        for idx in range(cases):
            rec: dict[str, Any] = {"idx": idx, "ok": False, "error": None}
            try:
                rc, _ = _request(
                    base_url,
                    "POST",
                    "/reset",
                    body={"episode_id": f"live-dead-{idx:04d}", "seed": 42},
                )
                require(rc == 200, f"reset status {rc}")

                payload = _step_payload_for(idx)
                rec["action"] = payload["action"]
                sc, sb = _request(base_url, "POST", "/step", body=payload)
                require(sc == 200, f"step status {sc}")
                body = json.loads(sb)

                obs = body["observation"]
                meta = obs.get("metadata") or {}
                bd = meta.get("reward_breakdown") or {}

                reward = float(body["reward"])
                final = float(bd["final"])
                require(same(reward, final), "reward != breakdown.final")

                c = float(bd.get("conflict", 0.0))
                r = float(bd.get("relationship", 0.0))
                t = float(bd.get("task", 0.0))
                expected_weighted = OUTPUT_SCALE * (W_CONFLICT * c + W_REL * r + W_TASK * t)
                require(same(float(bd["weighted_base"]), expected_weighted), "weighted_base mismatch")

                expected_final = (
                    float(bd.get("weighted_base", 0.0))
                    + float(bd.get("invalid_step_adjustment", 0.0))
                    + float(bd.get("episode_completion_bonus", 0.0))
                    + float(bd.get("catastrophic_penalty", 0.0))
                    + float(bd.get("do_nothing_floor", 0.0))
                )
                require(same(final, expected_final), "final aggregation mismatch")

                if payload["action"]["action_type"] == "do_nothing":
                    require(
                        same(float(bd.get("do_nothing_floor", 0.0)), -0.15),
                        "do_nothing floor mismatch",
                    )
                    require(reward < 0, "do_nothing should be negative")

                if meta.get("step_ok") is False:
                    require(
                        same(float(bd.get("invalid_step_adjustment", 0.0)), -0.25),
                        "invalid penalty mismatch",
                    )

                rec["ok"] = True
                rec["reward"] = reward
                rec["step_ok"] = meta.get("step_ok")
                passed += 1
            except Exception as e:  # noqa: BLE001
                # A failing case is recorded and the run continues.
                rec["ok"] = False
                rec["error"] = str(e)
                failed += 1
                if len(failures) < 10:
                    failures.append(f"idx={idx}: {e}")
            finally:
                # Exactly one JSONL record per case, pass or fail.
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"Live API dead-test complete: passed={passed} failed={failed} total={cases}")
    print(f"Report: {out_path}")
    if failures:
        print("First failures:")
        for row in failures:
            print(" -", row)
    return 0 if failed == 0 else 1
192
+
193
+
194
+ if __name__ == "__main__":
195
+ raise SystemExit(main())
196
+
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Ghostexec environment server components."""
8
+
9
+ from .ghostexec_environment import GhostexecEnvironment
10
+
11
+ __all__ = ["GhostexecEnvironment"]
server/app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ FastAPI application for the Ghostexec Environment.
9
+
10
+ This module creates an HTTP server that exposes the GhostexecEnvironment
11
+ over HTTP and WebSocket endpoints, compatible with EnvClient.
12
+
13
+ Endpoints:
14
+ - POST /reset: Reset the environment
15
+ - POST /step: Execute an action
16
+ - GET /state: Get current environment state
17
+ - GET /schema: Get action/observation schemas
18
+ - WS /ws: WebSocket endpoint for persistent sessions
19
+
20
+ Usage:
21
+ # Development (with auto-reload):
22
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
+
24
+ # Production:
25
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
+
27
+ # Or run directly:
28
+ python -m server.app
29
+ """
30
+
31
+ try:
32
+ import openenv.core.env_server.http_server as _openenv_http
33
+ except Exception as e: # pragma: no cover
34
+ raise ImportError(
35
+ "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
36
+ ) from e
37
+
38
+ # OpenEnv's serialize_observation drops `metadata` from the JSON body; Ghostexec
39
+ # trainers and live tests rely on step_ok / ids inside observation.metadata.
40
+ _orig_serialize_observation = _openenv_http.serialize_observation
41
+
42
+
43
def _ghostexec_serialize_observation(observation):  # type: ignore[no-untyped-def]
    """Serialize ``observation`` via OpenEnv, then re-attach its ``metadata``.

    Delegates to the original OpenEnv serializer and, when the resulting
    payload carries a dict under ``"observation"``, stores a JSON-safe copy of
    ``observation.metadata`` (or an empty dict when it is missing/falsy) under
    ``payload["observation"]["metadata"]``.
    """
    serialized = _orig_serialize_observation(observation)
    body = serialized.get("observation")
    if not isinstance(body, dict):
        # Nothing to attach the metadata to; hand back the payload untouched.
        return serialized
    raw_meta = getattr(observation, "metadata", None)
    body["metadata"] = _openenv_http._make_json_serializable(raw_meta if raw_meta else {})
    return serialized
50
+
51
+
52
+ _openenv_http.serialize_observation = _ghostexec_serialize_observation
53
+
54
+ from openenv.core.env_server.http_server import create_app # noqa: E402
55
+
56
+ try:
57
+ # Editable / normal install (package name `ghostexec`).
58
+ from ghostexec.models import GhostexecAction, GhostexecObservation
59
+ from ghostexec.server.ghostexec_environment import GhostexecEnvironment
60
+ except ImportError:
61
+ # Plain `uvicorn server.app:app` from repo root: top-level `models` + `server` package.
62
+ from models import GhostexecAction, GhostexecObservation
63
+ from server.ghostexec_environment import GhostexecEnvironment
64
+
65
+
66
+ # Create the app with web interface and README integration
67
+ app = create_app(
68
+ GhostexecEnvironment,
69
+ GhostexecAction,
70
+ GhostexecObservation,
71
+ env_name="ghostexec",
72
+ max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
73
+ )
74
+
75
+
76
+ def _patch_openapi_ghostexec_examples(schema: dict) -> None:
77
+ """Replace OpenEnv's generic observation examples with GhostExec's plain-text briefing shape."""
78
+ briefing = (
79
+ "=== GHOSTEXEC BRIEFING — Tue 21 Apr 2026 08:00 ===\n\n"
80
+ "UNREAD EMAILS (…): …\n\n"
81
+ "CALENDAR CONFLICTS IN NEXT 4 HOURS: …\n\n"
82
+ "CONTACTS TO WATCH: …\n\n"
83
+ "OVERDUE OR DUE-SOON TASKS: …\n\n"
84
+ "EXEC STRESS LEVEL: 52/100\n"
85
+ "STEPS REMAINING: 48"
86
+ )
87
+ obs = {"echoed_message": briefing, "message_length": len(briefing)}
88
+ reset_ex = {"observation": obs, "reward": 0.0, "done": False}
89
+ step_ex = {"observation": obs, "reward": -0.42, "done": False}
90
+ for path, example in (("/reset", reset_ex), ("/step", step_ex)):
91
+ try:
92
+ cell = (
93
+ schema["paths"][path]["post"]["responses"]["200"]["content"]["application/json"]
94
+ )
95
+ if isinstance(cell, dict):
96
+ cell["example"] = example
97
+ except KeyError:
98
+ continue
99
+
100
+
101
+ _OPENAPI_HTTP_EPISODE_SENTINEL = "Ghostexec / OpenEnv HTTP"
102
+
103
+ _OPENAPI_HTTP_EPISODE_NOTE = f"""
104
+ ---
105
+ ## {_OPENAPI_HTTP_EPISODE_SENTINEL}
106
+
107
+ Each `POST /reset` and `POST /step` may run on a **new** environment instance, so
108
+ separate HTTP requests do **not** share one in-memory episode across calls. A lone
109
+ `POST /step` still applies your action once (after internal scenario load). For
110
+ **many steps on the same episode**, use **WebSocket `/ws`**: open a connection,
111
+ reset once, then send many step messages on that same socket. See **ghostexec/README.md**
112
+ for details.
113
+ """
114
+
115
+
116
+ def _patch_openapi_ghostexec_http_note(schema: dict) -> None:
117
+ """Document HTTP statelessness vs /ws so Swagger and OpenAPI clients see it."""
118
+ try:
119
+ info = schema.get("info")
120
+ if not isinstance(info, dict):
121
+ return
122
+ desc = info.get("description") or ""
123
+ if _OPENAPI_HTTP_EPISODE_SENTINEL in desc:
124
+ return
125
+ info["description"] = desc + _OPENAPI_HTTP_EPISODE_NOTE
126
+ except (TypeError, KeyError):
127
+ return
128
+
129
+
130
+ _fastapi_openapi = type(app).openapi.__get__(app, type(app))
131
+
132
+
133
def _ghostexec_openapi() -> dict:
    """Return the app's OpenAPI schema, applying the Ghostexec patches on first build.

    A schema already cached on ``app.openapi_schema`` is returned as-is.
    Otherwise the bound original ``openapi`` method is invoked, the resulting
    ``app.openapi_schema`` is patched with the example payloads and the HTTP
    episode note, and then returned.
    """
    if app.openapi_schema is not None:
        return app.openapi_schema  # type: ignore[return-value]
    _fastapi_openapi()
    for apply_patch in (_patch_openapi_ghostexec_examples, _patch_openapi_ghostexec_http_note):
        apply_patch(app.openapi_schema)
    return app.openapi_schema  # type: ignore[return-value]
139
+
140
+
141
+ app.openapi = _ghostexec_openapi # type: ignore[method-assign]
142
+
143
+
144
def main() -> None:
    """
    Command-line entry point: parse ``--host`` / ``--port`` and serve ``app`` with uvicorn.

    Lets the server be started without Docker, for example:
        uv run --project . server
        uv run --project . server --port 8001
        python -m ghostexec.server.app

    For production deployments, consider invoking uvicorn directly with
    multiple workers:
        uvicorn ghostexec.server.app:app --workers 4
    """
    import argparse

    import uvicorn

    cli = argparse.ArgumentParser(description="GhostExec OpenEnv HTTP server")
    option_specs = (
        ("--host", str, "0.0.0.0", "Bind address"),
        ("--port", int, 8000, "Listen port"),
    )
    for flag, kind, fallback, help_text in option_specs:
        cli.add_argument(flag, type=kind, default=fallback, help=help_text)
    opts = cli.parse_args()
    uvicorn.run(app, host=opts.host, port=opts.port)
166
+
167
+
168
+ if __name__ == '__main__':
169
+ main()
server/ghostexec_environment.py ADDED
@@ -0,0 +1,706 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ GhostExec simulated world, agent step (Phases 2–3), and reward (Phase 4).
9
+
10
+ Scenario payloads load from scenarios/*.json. Observations are plain-text briefings.
11
+ Invalid actions return a structured error in observation metadata without raising.
12
+ Rewards aggregate conflict / relationship / task scores and log each step to outputs/logs/.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ from datetime import datetime, timedelta, timezone
19
+ from pathlib import Path
20
+ from typing import Any
21
+ from uuid import uuid4
22
+
23
+ from openenv.core.env_server.interfaces import Environment
24
+ from openenv.core.env_server.types import State
25
+
26
+ try:
27
+ from ..models import (
28
+ Contact,
29
+ Email,
30
+ GhostexecAction,
31
+ GhostexecObservation,
32
+ Meeting,
33
+ Mood,
34
+ RewardBreakdown,
35
+ Task,
36
+ TaskStatus,
37
+ WorldState,
38
+ )
39
+ except ImportError:
40
+ from models import (
41
+ Contact,
42
+ Email,
43
+ GhostexecAction,
44
+ GhostexecObservation,
45
+ Meeting,
46
+ Mood,
47
+ RewardBreakdown,
48
+ Task,
49
+ TaskStatus,
50
+ WorldState,
51
+ )
52
+
53
+ try:
54
+ from . import reward as _reward
55
+ except ImportError:
56
+ try:
57
+ from server import reward as _reward
58
+ except ImportError:
59
+ import reward as _reward # type: ignore[no-redef]
60
+
61
+ _PRIORITY_RANK: dict[str, int] = {"critical": 0, "high": 1, "normal": 2, "low": 3}
62
+ _REL_DISPLAY: dict[str, str] = {
63
+ "board_member": "Board",
64
+ "spouse": "Spouse",
65
+ "investor": "Investor",
66
+ "direct_report": "Direct report",
67
+ "client": "Client",
68
+ "friend": "Friend",
69
+ "team_member": "Team",
70
+ }
71
+
72
+ _INVALID_ACTION_REWARD = -0.25
73
+ _DEFAULT_STEP_REWARD = 0.0
74
+
75
+
76
+ def _default_scenario_path() -> Path:
77
+ return Path(__file__).resolve().parent.parent / "scenarios" / "phase2_core.json"
78
+
79
+
80
+ def _parse_dt(value: str) -> datetime:
81
+ if value.endswith("Z"):
82
+ return datetime.fromisoformat(value[:-1]).replace(tzinfo=timezone.utc)
83
+ dt = datetime.fromisoformat(value)
84
+ if dt.tzinfo is None:
85
+ return dt.replace(tzinfo=timezone.utc)
86
+ return dt
87
+
88
+
89
def _meeting_end(m: Meeting) -> datetime:
    """Return the meeting's end instant: parsed ``m.start`` plus ``m.duration_minutes`` minutes."""
    length = timedelta(minutes=m.duration_minutes)
    return _parse_dt(m.start) + length
92
+
93
+
94
+ def _windows_overlap(a_start: datetime, a_end: datetime, b_start: datetime, b_end: datetime) -> bool:
95
+ return a_start < b_end and b_start < a_end
96
+
97
+
98
class GhostexecEnvironment(Environment):
    """Inbox, calendar, contacts, tasks, actions, briefings, and Phase 4 rewards.

    The environment loads a ``WorldState`` from a scenario JSON file on
    ``reset()`` and mutates it through ``step()``. Each step:

    1. optionally applies scripted "schema drift" events for that step number,
    2. applies the agent's action (invalid actions are recorded as errors, not raised),
    3. rebuilds the conflict list / stress value,
    4. computes a ``RewardBreakdown`` and appends one JSON line to the reward log,
    5. returns a plain-text briefing plus compact metadata.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(
        self,
        scenario_path: str | Path | None = None,
        schema_drift_events_path: str | Path | None = None,
    ) -> None:
        """Configure paths and per-episode bookkeeping; the world itself is loaded by ``reset()``.

        Args:
            scenario_path: Scenario JSON to load; defaults to ``_default_scenario_path()``.
            schema_drift_events_path: Optional JSON file whose ``"events"`` list is
                applied during the episode. Ignored when the file does not exist.
        """
        self._scenario_path = Path(scenario_path) if scenario_path else _default_scenario_path()
        self._drift_events_path = (
            Path(schema_drift_events_path) if schema_drift_events_path is not None else None
        )
        self._drift_events: list[dict[str, Any]] = []
        if self._drift_events_path and self._drift_events_path.is_file():
            drift_raw = json.loads(self._drift_events_path.read_text(encoding="utf-8"))
            self._drift_events = list(drift_raw.get("events", []))
        # Sender names whose replies should earn no relationship score (set by drift events).
        self._reply_relationship_suppressed: set[str] = set()
        self._reward_log_path = (
            Path(__file__).resolve().parent.parent / "outputs" / "logs" / "episode_rewards.jsonl"
        )
        self._world: WorldState | None = None
        # Stress value read from the scenario; conflict bumps are added on top of it.
        self._base_stress: int = 0
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._last_step_ok: bool = True
        self._last_step_error: str | None = None
        self._last_step_detail: str = ""
        self._last_reward_breakdown: RewardBreakdown | None = None

    # --- lifecycle ---

    def reset(self) -> GhostexecObservation:  # type: ignore[override]
        """Reload the scenario, start a fresh episode, and return the opening briefing."""
        self._world = self.load_world_from_json(self._scenario_path)
        self._base_stress = self._world.stress
        self._rebuild_conflict_list()
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._last_step_ok = True
        self._last_step_error = None
        self._last_step_detail = "Episode started."
        self._reply_relationship_suppressed.clear()
        self._last_reward_breakdown = None
        self._ensure_reward_log_dir()
        briefing = self.build_briefing_text()
        return self._observation_from_briefing(
            briefing,
            reward=_DEFAULT_STEP_REWARD,
            done=False,
            reward_breakdown=None,
        )

    def step(self, action: GhostexecAction) -> GhostexecObservation:  # type: ignore[override]
        """Apply one agent action and return the resulting observation.

        If the episode already finished, the step is rejected with the invalid
        action penalty and ``done=True``. Otherwise the step counter advances,
        drift events fire, the action is applied, and the reward is computed
        from a snapshot taken just before the action was applied.
        """
        if self._world is None:
            # OpenEnv HTTP uses a new env per request; prime the world so this step still
            # runs the requested action (invalid actions get step_ok False, rewards apply).
            self.reset()

        assert self._world is not None
        if not self._world.episode_active:
            self._last_step_ok = False
            self._last_step_error = "Episode is already finished."
            bd = RewardBreakdown(
                final=_INVALID_ACTION_REWARD,
                invalid_step_adjustment=_INVALID_ACTION_REWARD,
            )
            self._last_reward_breakdown = bd
            return self._observation_from_briefing(
                self.build_briefing_text(),
                reward=bd.final,
                done=True,
                reward_breakdown=bd,
            )

        self._state.step_count += 1
        self._maybe_apply_schema_drift_events()

        if action.message.strip():
            self._world.action_log.append(f"note: {action.message.strip()}")

        # Snapshot taken after drift/notes, so the reward delta reflects only the action.
        before = self.world.model_copy(deep=True)
        action_ok = self._apply_action(action)
        self._rebuild_conflict_list()

        episode_done = False
        if self._state.step_count >= self._world.max_episode_steps:
            episode_done = True
            self._world.episode_active = False
            self._world.episode_end_reason = self._world.episode_end_reason or "step_limit"

        breakdown = _reward.compute_step_reward(
            before,
            self.world,
            action,
            action_ok=action_ok,
            episode_done=episode_done,
            relationship_suppressed_for_email_to=frozenset(self._reply_relationship_suppressed),
        )
        self._last_reward_breakdown = breakdown
        self._append_reward_log(breakdown, episode_done, action)

        briefing = self.build_briefing_text()
        return self._observation_from_briefing(
            briefing,
            reward=breakdown.final,
            done=episode_done,
            reward_breakdown=breakdown,
        )

    @property
    def state(self) -> State:
        """Current OpenEnv ``State`` (episode id and step count)."""
        return self._state

    @property
    def world(self) -> WorldState:
        """The loaded world; raises ``RuntimeError`` if ``reset()`` has not run yet."""
        if self._world is None:
            raise RuntimeError("World not initialised; call reset() first.")
        return self._world

    # --- Phase 3 briefing (plain text for LLM) ---

    def build_briefing_text(self) -> str:
        """Render the current world as the plain-text briefing shown to the agent.

        Sections: up to 20 unread emails (by priority), calendar clashes that
        intersect the next 4 hours, the 5 most important contacts, up to 15
        overdue or due-within-24h tasks, stress, remaining steps, and the
        outcome of the last action.
        """
        w = self.world
        now = _parse_dt(w.simulation_time)
        header = now.strftime("=== GHOSTEXEC BRIEFING — %a %d %b %Y %H:%M ===")

        unread = self.get_unread_emails_sorted()
        email_lines = [
            f"- [{e.priority.upper()}] From: {e.sender} ({_REL_DISPLAY.get(e.sender_relationship, e.sender_relationship)}) — "
            f'"{e.subject}"\n Preview: {(e.body[:100] + ("…" if len(e.body) > 100 else "")).replace(chr(10), " ")}'
            for e in unread[:20]
        ]
        email_block = "\n".join(email_lines) if email_lines else "(none)"

        horizon = now + timedelta(hours=4)
        conflict_lines: list[str] = []
        for row in self.detect_meeting_conflicts():
            o0 = _parse_dt(row["overlap_start"])
            o1 = _parse_dt(row["overlap_end"])
            # Skip overlaps that already ended or that start beyond the 4-hour horizon.
            if o1 <= now or o0 >= horizon:
                continue
            ma = self._meeting_by_id(row["meeting_a"])
            mb = self._meeting_by_id(row["meeting_b"])
            if not ma or not mb or ma.cancelled or mb.cancelled:
                continue
            conflict_lines.append(
                f"- {_fmt_meeting_line(ma)} CLASHES WITH -> {_fmt_meeting_line(mb)}"
            )
        conflict_block = "\n".join(conflict_lines) if conflict_lines else "(none in next 4 hours)"

        top_contacts = sorted(w.contacts, key=lambda c: (-c.importance, c.name))[:5]
        contact_lines = [
            f"- {c.name}: {c.mood.upper()} — {_REL_DISPLAY.get(c.relationship_type, c.relationship_type)}; "
            f"prefers {c.communication_preference}"
            for c in top_contacts
        ]
        contact_block = "\n".join(contact_lines) if contact_lines else "(none)"

        soon = now + timedelta(hours=24)
        task_lines: list[str] = []
        for t in w.tasks:
            if t.status == "done":
                continue
            dl = _parse_dt(t.deadline)
            if dl < now or (now <= dl <= soon):
                flag = "OVERDUE" if dl < now else "due soon"
                task_lines.append(f"- [{flag}] {t.description} (deadline {t.deadline}, owner {t.owner})")
        task_block = "\n".join(task_lines[:15]) if task_lines else "(none)"

        remaining = max(0, w.max_episode_steps - self._state.step_count)

        parts = [
            header,
            "",
            f"UNREAD EMAILS ({len(unread)} unread):",
            email_block,
            "",
            "CALENDAR CONFLICTS IN NEXT 4 HOURS:",
            conflict_block,
            "",
            "CONTACTS TO WATCH (top 5 by importance):",
            contact_block,
            "",
            "OVERDUE OR DUE-SOON TASKS (next 24h window):",
            task_block,
            "",
            f"EXEC STRESS LEVEL: {w.stress}/100",
            f"STEPS REMAINING: {remaining}",
        ]
        if self._last_step_error:
            parts += ["", f"LAST ACTION: ERROR — {self._last_step_error}"]
        elif self._last_step_detail:
            parts += ["", f"LAST ACTION: OK — {self._last_step_detail}"]

        return "\n".join(parts)

    def _meeting_by_id(self, mid: str) -> Meeting | None:
        """Return the meeting with id ``mid`` (cancelled or not), or ``None``."""
        for m in self.world.meetings:
            if m.id == mid:
                return m
        return None

    # --- scenario IO ---

    @staticmethod
    def load_world_from_json(path: str | Path) -> WorldState:
        """Read a UTF-8 scenario file and validate it into a ``WorldState``."""
        raw = Path(path).read_text(encoding="utf-8")
        data = json.loads(raw)
        return WorldState.model_validate(data)

    @staticmethod
    def world_to_json(world: WorldState) -> str:
        """Serialize ``world`` to a JSON string."""
        return world.model_dump_json()

    @staticmethod
    def world_from_json(blob: str) -> WorldState:
        """Rebuild a ``WorldState`` from a JSON string."""
        return WorldState.model_validate_json(blob)

    # --- inbox ---

    def get_unread_emails_sorted(self) -> list[Email]:
        """Unread emails ordered by priority rank (critical first), then by id."""
        w = self.world
        unread = [e for e in w.emails if not e.read]
        return sorted(
            unread,
            key=lambda e: (_PRIORITY_RANK.get(e.priority, 99), e.id),
        )

    def mark_email_read(self, email_id: str) -> bool:
        """Mark the email as read; return False when the id is unknown."""
        for i, e in enumerate(self.world.emails):
            if e.id == email_id:
                self.world.emails[i] = e.model_copy(update={"read": True})
                return True
        return False

    def mark_email_replied(self, email_id: str) -> bool:
        """Mark the email as read and replied; return False when the id is unknown."""
        for i, e in enumerate(self.world.emails):
            if e.id == email_id:
                self.world.emails[i] = e.model_copy(update={"read": True, "replied": True})
                return True
        return False

    # --- calendar ---

    def detect_meeting_conflicts(self) -> list[dict[str, Any]]:
        """Return one row per overlapping pair of non-cancelled meetings.

        Each row holds ``meeting_a``, ``meeting_b`` and the ISO-formatted
        ``overlap_start`` / ``overlap_end`` of the shared window.
        """
        active = [m for m in self.world.meetings if not m.cancelled]
        out: list[dict[str, Any]] = []
        for i, a in enumerate(active):
            a_start = _parse_dt(a.start)
            a_end = _meeting_end(a)
            for b in active[i + 1 :]:
                b_start = _parse_dt(b.start)
                b_end = _meeting_end(b)
                if _windows_overlap(a_start, a_end, b_start, b_end):
                    overlap_start = max(a_start, b_start)
                    overlap_end = min(a_end, b_end)
                    out.append(
                        {
                            "meeting_a": a.id,
                            "meeting_b": b.id,
                            "overlap_start": overlap_start.isoformat(),
                            "overlap_end": overlap_end.isoformat(),
                        }
                    )
        return out

    def _reschedule_causes_overlap(self, meeting_id: str, new_start_iso: str) -> bool:
        """True if moving ``meeting_id`` to ``new_start_iso`` would clash with another active meeting.

        An unknown ``meeting_id`` is also reported as True (i.e. "not allowed").
        Raises ``ValueError`` when ``new_start_iso`` is not parseable.
        """
        idx = next((i for i, m in enumerate(self.world.meetings) if m.id == meeting_id), None)
        if idx is None:
            return True
        cand = self.world.meetings[idx].model_copy(update={"start": new_start_iso})
        c_start = _parse_dt(cand.start)
        c_end = _meeting_end(cand)
        for m in self.world.meetings:
            if m.cancelled or m.id == meeting_id:
                continue
            if _windows_overlap(c_start, c_end, _parse_dt(m.start), _meeting_end(m)):
                return True
        return False

    def reschedule_meeting(self, meeting_id: str, new_start_iso: str) -> bool:
        """Move a non-cancelled meeting to ``new_start_iso``; False if not found or cancelled."""
        for i, m in enumerate(self.world.meetings):
            if m.id == meeting_id and not m.cancelled:
                self.world.meetings[i] = m.model_copy(update={"start": new_start_iso})
                self._rebuild_conflict_list()
                return True
        return False

    def cancel_meeting(self, meeting_id: str) -> bool:
        """Flag the meeting as cancelled; False when the id is unknown."""
        for i, m in enumerate(self.world.meetings):
            if m.id == meeting_id:
                self.world.meetings[i] = m.model_copy(update={"cancelled": True})
                self._rebuild_conflict_list()
                return True
        return False

    def add_meeting(self, meeting: Meeting) -> None:
        """Append ``meeting`` to the calendar and refresh conflicts/stress."""
        self.world.meetings.append(meeting)
        self._rebuild_conflict_list()

    # --- contacts ---

    def get_contact(self, name: str) -> Contact | None:
        """Return the contact whose name matches exactly, or ``None``."""
        for c in self.world.contacts:
            if c.name == name:
                return c
        return None

    def update_contact_mood(self, name: str, mood: Mood) -> bool:
        """Set the named contact's mood; False when no such contact exists."""
        for i, c in enumerate(self.world.contacts):
            if c.name == name:
                self.world.contacts[i] = c.model_copy(update={"mood": mood})
                return True
        return False

    # --- tasks ---

    def update_task_status(self, task_id: str, status: TaskStatus) -> bool:
        """Set the task's status; False when the id is unknown."""
        for i, t in enumerate(self.world.tasks):
            if t.id == task_id:
                self.world.tasks[i] = t.model_copy(update={"status": status})
                return True
        return False

    def overdue_tasks_at(self, simulation_iso: str) -> list[Task]:
        """Tasks that are not done and whose deadline lies before ``simulation_iso``."""
        now = _parse_dt(simulation_iso)
        out: list[Task] = []
        for t in self.world.tasks:
            if t.status in ("done",):
                continue
            if _parse_dt(t.deadline) < now:
                out.append(t)
        return out

    def set_simulation_time(self, simulation_iso: str) -> None:
        """Move the simulated clock, then refresh overdue flags and conflicts."""
        self.world.simulation_time = simulation_iso
        self._reapply_task_overdue_flags()
        self._rebuild_conflict_list()

    # --- Phase 3 action execution ---

    def _apply_action(self, action: GhostexecAction) -> bool:
        """Validate and execute ``action``; return True on success.

        On failure nothing is raised: the reason is stored via ``_fail`` (which
        sets ``_last_step_ok``/``_last_step_error`` and logs it) and False is
        returned.
        """
        self._last_step_ok = True
        self._last_step_error = None
        self._last_step_detail = ""
        at = action.action_type

        if at == "do_nothing":
            self._last_step_detail = "No action taken."
            return True

        if at == "reply_email":
            if not action.email_id:
                return self._fail("reply_email requires email_id")
            if not any(e.id == action.email_id for e in self.world.emails):
                return self._fail(f"Unknown email_id {action.email_id!r}")
            if not action.message_body.strip():
                return self._fail("reply_email requires non-empty message_body")
            self.mark_email_replied(action.email_id)
            self._last_step_detail = f"Replied to email {action.email_id}."
            return True

        if at == "archive_email":
            if not action.email_id:
                return self._fail("archive_email requires email_id")
            if not self.mark_email_read(action.email_id):
                return self._fail(f"Unknown email_id {action.email_id!r}")
            self._last_step_detail = f"Archived (read) email {action.email_id}."
            return True

        if at == "reschedule_meeting":
            if not action.meeting_id or not action.new_time:
                return self._fail("reschedule_meeting requires meeting_id and new_time")
            if not any(m.id == action.meeting_id for m in self.world.meetings):
                return self._fail(f"Unknown meeting_id {action.meeting_id!r}")
            # A malformed timestamp must be reported as an invalid action, not
            # escape as an exception from step(): every later parse of the
            # meeting's start would otherwise blow up as well.
            try:
                _parse_dt(action.new_time)
            except (TypeError, ValueError):
                return self._fail(
                    f"Invalid new_time {action.new_time!r}: expected an ISO-8601 datetime"
                )
            if self._reschedule_causes_overlap(action.meeting_id, action.new_time):
                return self._fail("Target time overlaps another active meeting.")
            if not self.reschedule_meeting(action.meeting_id, action.new_time):
                return self._fail("Could not reschedule meeting.")
            self._last_step_detail = f"Rescheduled {action.meeting_id} to {action.new_time}."
            return True

        if at == "cancel_meeting":
            if not action.meeting_id:
                return self._fail("cancel_meeting requires meeting_id")
            if not any(m.id == action.meeting_id for m in self.world.meetings):
                return self._fail(f"Unknown meeting_id {action.meeting_id!r}")
            if not self.cancel_meeting(action.meeting_id):
                return self._fail("Could not cancel meeting.")
            reason = action.reason.strip() or "(no reason given)"
            self._world.action_log.append(f"cancelled {action.meeting_id}: {reason}")
            self._last_step_detail = f"Cancelled meeting {action.meeting_id}."
            return True

        if at == "complete_task":
            if not action.task_id:
                return self._fail("complete_task requires task_id")
            t = next((x for x in self.world.tasks if x.id == action.task_id), None)
            if not t:
                return self._fail(f"Unknown task_id {action.task_id!r}")
            if t.status == "done":
                return self._fail("Task is already done.")
            self.update_task_status(action.task_id, "done")
            self._last_step_detail = f"Completed task {action.task_id}."
            return True

        if at == "delegate_task":
            if not action.task_id or not action.contact_name.strip():
                return self._fail("delegate_task requires task_id and contact_name")
            if not any(t.id == action.task_id for t in self.world.tasks):
                return self._fail(f"Unknown task_id {action.task_id!r}")
            if not self.get_contact(action.contact_name.strip()):
                return self._fail(f"Unknown contact {action.contact_name.strip()!r}")
            # NOTE(review): a task that is already "done" is moved back to
            # "in-progress" here — confirm whether that re-opening is intended.
            for i, t in enumerate(self.world.tasks):
                if t.id == action.task_id:
                    self.world.tasks[i] = t.model_copy(
                        update={
                            "delegated_to": action.contact_name.strip(),
                            "status": "in-progress",
                        }
                    )
                    break
            self._last_step_detail = f"Delegated {action.task_id} to {action.contact_name.strip()}."
            return True

        if at == "send_message":
            name = action.contact_name.strip()
            if not name:
                return self._fail("send_message requires contact_name")
            if not self.get_contact(name):
                return self._fail(f"Unknown contact {name!r}")
            if not action.message_body.strip():
                return self._fail("send_message requires non-empty message_body")
            # Only the first 500 characters of the body are kept in the log.
            self._world.action_log.append(f"message to {name}: {action.message_body.strip()[:500]}")
            self._last_step_detail = f"Message sent to {name}."
            return True

        return self._fail(f"Unsupported action_type {at!r}")

    def _fail(self, msg: str) -> bool:
        """Record ``msg`` as the last step error, log it, and return False."""
        self._last_step_ok = False
        self._last_step_error = msg
        self._last_step_detail = ""
        self._world.action_log.append(f"error: {msg}")
        return False

    def _ensure_reward_log_dir(self) -> None:
        """Create the reward log's parent directory if it does not exist."""
        self._reward_log_path.parent.mkdir(parents=True, exist_ok=True)

    def _append_reward_log(
        self,
        breakdown: RewardBreakdown,
        episode_done: bool,
        action: GhostexecAction,
    ) -> None:
        """Append one JSON line describing this step's reward to the reward log file."""
        self._ensure_reward_log_dir()
        w = self.world
        crit_open = sum(1 for e in w.emails if e.priority == "critical" and not e.replied)
        overdue_n = len(self.overdue_tasks_at(w.simulation_time))
        line = {
            "episode_id": self._state.episode_id,
            "step": self._state.step_count,
            "action_type": action.action_type,
            "step_ok": self._last_step_ok,
            "reward": breakdown.final,
            "conflict_raw": breakdown.conflict_raw,
            "critical_queue_bonus": breakdown.critical_queue_bonus,
            "conflict": breakdown.conflict,
            "relationship": breakdown.relationship,
            "task": breakdown.task,
            "weighted_base": breakdown.weighted_base,
            "output_scale": breakdown.output_scale,
            "invalid_step_adjustment": breakdown.invalid_step_adjustment,
            "episode_completion_bonus": breakdown.episode_completion_bonus,
            "catastrophic_penalty": breakdown.catastrophic_penalty,
            "episode_done": episode_done,
            "calendar_overlap_pairs": len(self.detect_meeting_conflicts()),
            "critical_unreplied": crit_open,
            "overdue_tasks": overdue_n,
        }
        with self._reward_log_path.open("a", encoding="utf-8") as fh:
            fh.write(json.dumps(line) + "\n")

    def _maybe_apply_schema_drift_events(self) -> None:
        """Apply every drift event whose ``after_step`` equals the current step count.

        Supported event keys: ``shift_all_meetings_hours``,
        ``set_contact_preference``, ``set_task_deadline``,
        ``suppress_reply_relationship_for_senders`` and ``set_contact_mood``.
        Each applied change is noted in the world's action log.
        """
        if not self._world or not self._drift_events:
            return
        step = self._state.step_count
        for ev in self._drift_events:
            if ev.get("after_step") != step:
                continue
            if "shift_all_meetings_hours" in ev:
                delta = int(ev["shift_all_meetings_hours"])
                for i, m in enumerate(self._world.meetings):
                    # Normalise to UTC before dropping tzinfo: _parse_dt reads naive
                    # strings as UTC, so stripping a non-UTC offset directly would
                    # silently move the meeting by that offset.
                    new_start = (
                        (_parse_dt(m.start) + timedelta(hours=delta))
                        .astimezone(timezone.utc)
                        .replace(tzinfo=None)
                    )
                    self._world.meetings[i] = m.model_copy(
                        update={"start": new_start.isoformat(timespec="seconds")}
                    )
                self._world.action_log.append(
                    f"schema drift: shifted all meeting starts by {delta:+d} hour(s) (calendar TZ policy)."
                )
            pref = ev.get("set_contact_preference")
            if isinstance(pref, dict):
                name = str(pref.get("name", ""))
                comm = str(pref.get("communication_preference", "text"))
                for i, c in enumerate(self._world.contacts):
                    if c.name == name:
                        self._world.contacts[i] = c.model_copy(
                            update={"communication_preference": comm}  # type: ignore[arg-type]
                        )
                        break
                self._world.action_log.append(
                    f"schema drift: contact {name!r} now prefers {comm} only (relationship channel change)."
                )
            td = ev.get("set_task_deadline")
            if isinstance(td, dict):
                tid = str(td.get("task_id", ""))
                dl = str(td.get("deadline", ""))
                for i, t in enumerate(self._world.tasks):
                    if t.id == tid:
                        self._world.tasks[i] = t.model_copy(update={"deadline": dl})
                        break
                self._world.action_log.append(
                    f"schema drift: task {tid!r} deadline moved earlier to {dl!r}."
                )
            for name in ev.get("suppress_reply_relationship_for_senders", []) or []:
                self._reply_relationship_suppressed.add(str(name))
                self._world.action_log.append(
                    f"schema drift: replies to emails from {name!r} yield zero relationship score this episode."
                )
            scm = ev.get("set_contact_mood")
            if isinstance(scm, dict):
                cname = str(scm.get("name", ""))
                mood_raw = str(scm.get("mood", "neutral"))
                allowed: tuple[Mood, ...] = ("happy", "neutral", "annoyed", "angry", "furious")
                if cname and mood_raw in allowed and self.update_contact_mood(cname, mood_raw):
                    self._world.action_log.append(
                        f"schema drift: stakeholder {cname!r} mood is now {mood_raw} (external pressure)."
                    )
        # Refresh conflicts/stress once if at least one event fired on this step.
        if any(ev.get("after_step") == step for ev in self._drift_events):
            self._rebuild_conflict_list()

    # --- internals ---

    def _reapply_task_overdue_flags(self) -> None:
        """Set status ``"overdue"`` on unfinished tasks whose deadline precedes the simulated time."""
        now = _parse_dt(self.world.simulation_time)
        for i, t in enumerate(self.world.tasks):
            if t.status == "done":
                continue
            if _parse_dt(t.deadline) < now and t.status != "overdue":
                self.world.tasks[i] = t.model_copy(update={"status": "overdue"})

    def _rebuild_conflict_list(self) -> None:
        """Recompute ``world.active_conflicts`` and derive ``world.stress`` from it.

        Conflicts are calendar overlaps plus unanswered critical emails. Stress
        is the scenario's base stress plus 2 per conflict (bump capped at 35),
        with the total capped at 100.
        """
        lines: list[str] = []
        for row in self.detect_meeting_conflicts():
            lines.append(
                f"Calendar overlap: {row['meeting_a']} vs {row['meeting_b']} "
                f"({row['overlap_start']} – {row['overlap_end']})"
            )
        for e in self.world.emails:
            if e.priority == "critical" and not e.replied:
                lines.append(f"Unanswered critical email {e.id}: {e.subject}")
        bump = min(35, len(lines) * 2)
        self.world.active_conflicts = lines
        self.world.stress = min(100, self._base_stress + bump)

    def _observation_from_briefing(
        self,
        briefing: str,
        reward: float,
        done: bool,
        reward_breakdown: RewardBreakdown | None = None,
    ) -> GhostexecObservation:
        """Wrap a briefing string and step outcome into a ``GhostexecObservation``.

        The metadata carries counters, the last step's status, and truncated id
        lists. Briefings longer than 48,000 characters are cut and terminated
        with an ellipsis.
        """
        w = self.world
        unread_sorted = self.get_unread_emails_sorted()
        meta: dict[str, Any] = {
            "simulation_time": w.simulation_time,
            "stress": w.stress,
            "unread_email_count": sum(1 for e in w.emails if not e.read),
            "calendar_conflict_pairs": len(self.detect_meeting_conflicts()),
            "episode_step": self._state.step_count,
            "max_episode_steps": w.max_episode_steps,
            "episode_active": w.episode_active,
            "episode_end_reason": w.episode_end_reason,
            "step_ok": self._last_step_ok,
            "step_error": self._last_step_error,
            "step_detail": self._last_step_detail,
            # Compact ids for remote trainers / Colab (briefing stays plain text).
            "critical_unreplied_email_ids": [
                e.id for e in w.emails if e.priority == "critical" and not e.replied
            ][:12],
            "unread_email_ids": [e.id for e in unread_sorted[:15]],
            "overdue_task_ids": [t.id for t in self.overdue_tasks_at(w.simulation_time)][:12],
            "active_meeting_ids": [m.id for m in w.meetings if not m.cancelled][:20],
        }
        if reward_breakdown is not None:
            meta["reward_breakdown"] = reward_breakdown.model_dump()
        cap = 48_000
        text = briefing if len(briefing) <= cap else briefing[: cap - 1] + "…"
        return GhostexecObservation(
            echoed_message=text,
            message_length=len(text),
            done=done,
            reward=reward,
            metadata=meta,
        )
702
+
703
+
704
def _fmt_meeting_line(m: Meeting) -> str:
    """Format a meeting as ``"HH:MM: <title> (<duration>min)"`` using its parsed start time."""
    clock = _parse_dt(m.start).strftime("%H:%M")
    return f"{clock}: {m.title} ({m.duration_minutes}min)"
server/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openenv[core]>=0.2.0
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.24.0
4
+
5
+
6
+
server/reward.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Phase 4 reward: weighted (0.35 / 0.35 / 0.30) with potential-style deltas, critical-queue
9
+ shaping, full sub-scores even on invalid steps (+ explicit invalid penalty), and mild output
10
+ scaling.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from datetime import datetime, timedelta, timezone
16
+ from typing import Any
17
+
18
+ try:
19
+ from ..models import GhostexecAction, RewardBreakdown, WorldState
20
+ except ImportError:
21
+ from models import GhostexecAction, RewardBreakdown, WorldState
22
+
23
# Channel weights applied to the conflict / relationship / task sub-scores.
W_CONFLICT = 0.35
W_REL = 0.35
W_TASK = 0.30

# Raw conflict units (pre-weight) are clamped to keep invalid / idle steps from exploding.
# The clamp is symmetric: values are limited to [-CONFLICT_RAW_CAP, +CONFLICT_RAW_CAP].
CONFLICT_RAW_CAP: float = 6.0

# Scales the weighted sum of the three channels (weights stay fixed per hackathon rules).
WEIGHTED_OUTPUT_SCALE: float = 0.48

# Tone misfit penalties kept small vs outcome terms (~<20% of a strong +2 conflict step after weights).
TONE_PENALTY_CASUAL_ANGRY_BOARD: float = 0.35
TONE_PENALTY_FORMAL_PERSONAL: float = 0.08

# Added on top of the base credit for each conflict pair that disappears in a step.
_RESOLVE_MICRO_BONUS: float = 0.12
# Credit per unit reduction in the count of unreplied critical emails.
_CRITICAL_PER_EMAIL_BONUS: float = 0.22
# Small credits for a valid action of the named type.
_RESCHEDULE_VALID_MICRO_BONUS: float = 0.10
_SEND_MESSAGE_VALID_MICRO_BONUS: float = 0.08
_COMPLETE_TASK_VALID_MICRO_BONUS: float = 0.06
_DELEGATE_TASK_VALID_MICRO_BONUS: float = 0.10
# Added to the final reward whenever the action type is "do_nothing".
_DO_NOTHING_STRICT_PENALTY: float = -0.15
# Reply credit keyed by lower-cased email priority; unknown priorities earn 0.0.
_REPLY_PRIORITY_MICRO_BONUS: dict[str, float] = {
    "critical": 0.30,
    "high": 0.15,
    "normal": 0.04,
    "low": 0.02,
}

# Ordinal mood scale: a higher rank is a better mood.
_MOOD_RANK: dict[str, int] = {
    "happy": 4,
    "neutral": 3,
    "annoyed": 2,
    "angry": 1,
    "furious": 0,
}
58
+
59
+
60
+ def _parse_dt(value: str) -> datetime:
61
+ if value.endswith("Z"):
62
+ return datetime.fromisoformat(value[:-1]).replace(tzinfo=timezone.utc)
63
+ dt = datetime.fromisoformat(value)
64
+ if dt.tzinfo is None:
65
+ return dt.replace(tzinfo=timezone.utc)
66
+ return dt
67
+
68
+
69
def _meeting_end(m: Any) -> datetime:
    """Return when meeting ``m`` finishes: its parsed ``start`` plus ``duration_minutes``."""
    begins = _parse_dt(m.start)
    length = timedelta(minutes=m.duration_minutes)
    return begins + length
72
+
73
+
74
+ def _overlap(a0: datetime, a1: datetime, b0: datetime, b1: datetime) -> bool:
75
+ return a0 < b1 and b0 < a1
76
+
77
+
78
def meeting_conflicts(world: WorldState) -> list[dict[str, Any]]:
    """List every pair of non-cancelled meetings whose time spans overlap.

    Returns:
        One dict per overlapping pair with keys ``meeting_a``, ``meeting_b``,
        ``overlap_start`` and ``overlap_end`` (the last two as ISO strings). Pairs
        follow the order of ``world.meetings``; each pair is reported once.
    """
    # Parse each live meeting's span once, then compare every later meeting against it.
    spans = [
        (meeting.id, _parse_dt(meeting.start), _meeting_end(meeting))
        for meeting in world.meetings
        if not meeting.cancelled
    ]
    found: list[dict[str, Any]] = []
    for idx, (first_id, first_start, first_end) in enumerate(spans):
        for second_id, second_start, second_end in spans[idx + 1 :]:
            if not _overlap(first_start, first_end, second_start, second_end):
                continue
            found.append(
                {
                    "meeting_a": first_id,
                    "meeting_b": second_id,
                    "overlap_start": max(first_start, second_start).isoformat(),
                    "overlap_end": min(first_end, second_end).isoformat(),
                }
            )
    return found
96
+
97
+
98
+ def _pair_set(rows: list[dict[str, Any]]) -> set[frozenset[str]]:
99
+ return {frozenset((r["meeting_a"], r["meeting_b"])) for r in rows}
100
+
101
+
102
+ def _attendee_moods_ok(world: WorldState, pair: frozenset[str]) -> bool:
103
+ names: set[str] = set()
104
+ for mid in pair:
105
+ m = next((x for x in world.meetings if x.id == mid), None)
106
+ if m:
107
+ names.update(m.attendees)
108
+ for n in names:
109
+ c = next((x for x in world.contacts if x.name == n), None)
110
+ if c is None:
111
+ continue
112
+ if c.mood not in ("happy", "neutral"):
113
+ return False
114
+ return True
115
+
116
+
117
def score_conflict_resolution(
    before: WorldState,
    after: WorldState,
    action: GhostexecAction,
    *,
    action_ok: bool,
) -> float:
    """Score how a step changed the set of overlapping meeting pairs.

    Each pair present before but gone after earns ``2.0 + _RESOLVE_MICRO_BONUS``, plus
    ``1.0`` when ``_attendee_moods_ok`` holds for that pair in ``after``. Each pair that
    is new in ``after`` costs ``3.0``. A valid ``reschedule_meeting`` action adds
    ``_RESCHEDULE_VALID_MICRO_BONUS``.
    """
    pairs_before = _pair_set(meeting_conflicts(before))
    pairs_after = _pair_set(meeting_conflicts(after))

    total = 0.0
    for resolved in pairs_before - pairs_after:
        total += 2.0 + _RESOLVE_MICRO_BONUS
        if _attendee_moods_ok(after, resolved):
            total += 1.0
    for _introduced in pairs_after - pairs_before:
        total -= 3.0

    rescheduled = action_ok and action.action_type == "reschedule_meeting"
    if rescheduled:
        total += _RESCHEDULE_VALID_MICRO_BONUS
    return total
136
+
137
+
138
def critical_unreplied_count(world: WorldState) -> int:
    """Count emails whose priority is ``"critical"`` and that have not been replied to."""
    pending = [mail for mail in world.emails if mail.priority == "critical" and not mail.replied]
    return len(pending)
140
+
141
+
142
def score_critical_queue_bonus(before: WorldState, after: WorldState) -> float:
    """Reward shrinking the unreplied-critical queue; growth earns nothing (never negative)."""
    cleared = critical_unreplied_count(before) - critical_unreplied_count(after)
    if cleared < 0:
        cleared = 0
    return _CRITICAL_PER_EMAIL_BONUS * cleared
145
+
146
+
147
+ def _classify_tone(text: str) -> str:
148
+ t = text.lower()
149
+ if any(w in t for w in ("sorry", "apologize", "apologies", "my mistake")):
150
+ return "apologetic"
151
+ if any(w in t for w in ("dear ", "sincerely", "best regards", "respectfully", "cordially")):
152
+ return "formal"
153
+ if any(w in t for w in ("hey", "lol", "haha", "👋", "no worries", "cheers")):
154
+ return "casual"
155
+ if any(w in t for w in ("must", "immediately", "asap", "non-negotiable", "demand")):
156
+ return "assertive"
157
+ return "neutral"
158
+
159
+
160
def score_relationship(
    before: WorldState,
    after: WorldState,
    action: GhostexecAction,
    *,
    action_ok: bool,
    relationship_suppressed_for_email_to: frozenset[str] | None = None,
) -> float:
    """Score relationship effects of a step: mood shifts, reply credit and tone fit.

    Args:
        before: World snapshot prior to the action.
        after: World snapshot after the action.
        action: The action taken.
        action_ok: Whether the action was accepted as valid.
        relationship_suppressed_for_email_to: Sender names for which a
            ``reply_email`` to one of their emails yields exactly ``0.0``.

    Returns:
        Sum of: per-contact mood deltas (+3/+1 for improvement, -4/-2 for decline,
        the larger magnitude when ``importance >= 4``); a priority-keyed reply bonus
        (doubled for ``VIP`` senders) for a valid non-empty reply; tone penalties;
        and a small bonus for a valid non-empty ``send_message`` to a known contact.
    """
    suppressed = relationship_suppressed_for_email_to or frozenset()
    score = 0.0

    # Mood deltas for contacts present in both snapshots.
    contacts_before = {c.name: c for c in before.contacts}
    contacts_after = {c.name: c for c in after.contacts}
    for name, now_contact in contacts_after.items():
        was_contact = contacts_before.get(name)
        if not was_contact:
            continue
        rank_now = _MOOD_RANK[now_contact.mood]
        rank_was = _MOOD_RANK[was_contact.mood]
        is_vip = now_contact.importance >= 4
        if rank_now > rank_was:
            score += 3.0 if is_vip else 1.0
        elif rank_now < rank_was:
            score -= 4.0 if is_vip else 2.0

    if action.action_type == "reply_email" and action.email_id:
        email = next((e for e in before.emails if e.id == action.email_id), None)
        if email and email.sender in suppressed:
            # Suppression zeroes the whole channel, including the mood deltas above.
            return 0.0
        if email:
            # message_body may be None; normalise once so the tone classifier never
            # receives None (previously a body-less reply raised AttributeError).
            body = action.message_body or ""
            if action_ok and body.strip():
                priority = (email.priority or "").lower()
                micro = _REPLY_PRIORITY_MICRO_BONUS.get(priority, 0.0)
                if email.sender_relationship == "VIP":
                    micro *= 2.0
                score += micro
            # NOTE(review): nesting reconstructed from a flattened diff — the tone
            # checks are assumed to run for every matched email regardless of
            # action_ok; confirm against the upstream file.
            tone = _classify_tone(body)
            sender = next((c for c in before.contacts if c.name == email.sender), None)
            if (
                sender
                and sender.relationship_type == "board_member"
                and sender.mood in ("angry", "furious", "annoyed")
                and tone == "casual"
            ):
                score -= TONE_PENALTY_CASUAL_ANGRY_BOARD
            if email.sender_relationship == "personal" and tone == "formal":
                score -= TONE_PENALTY_FORMAL_PERSONAL

    if action_ok and action.action_type == "send_message" and action.contact_name:
        known_contact = any(c.name == action.contact_name for c in before.contacts)
        if known_contact and (action.message_body or "").strip():
            score += _SEND_MESSAGE_VALID_MICRO_BONUS
    return score
210
+
211
+
212
def _overdue_tasks(world: WorldState) -> list[Any]:
    """Return tasks that are not ``done`` and whose deadline is before the simulation time."""
    cutoff = _parse_dt(world.simulation_time)
    return [
        task
        for task in world.tasks
        if task.status != "done" and _parse_dt(task.deadline) < cutoff
    ]
221
+
222
+
223
def score_task_completion(
    before: WorldState,
    after: WorldState,
    action: GhostexecAction,
    *,
    action_ok: bool,
) -> float:
    """Score task-related changes between two world snapshots.

    For each task present in both snapshots:
      * becoming ``overdue`` (from neither ``overdue`` nor ``done``) costs 2.0;
      * becoming ``done`` earns 2.0 if its deadline is at or after the post-step
        simulation time, otherwise 0.5;
      * gaining a delegate earns 1.0 when that delegate is a contact with
        ``importance <= 3``.
    A valid ``complete_task`` or ``delegate_task`` action adds its micro bonus.
    """
    total = 0.0
    now = _parse_dt(after.simulation_time)

    previous = {task.id: task for task in before.tasks}
    current_tasks = {task.id: task for task in after.tasks}
    for task_id, current in current_tasks.items():
        prior = previous.get(task_id)
        if not prior:
            continue

        if prior.status not in ("overdue", "done") and current.status == "overdue":
            total -= 2.0

        if prior.status != "done" and current.status == "done":
            on_time = _parse_dt(prior.deadline) >= now
            total += 2.0 if on_time else 0.5

        if (not prior.delegated_to) and current.delegated_to:
            delegate = next(
                (c for c in after.contacts if c.name == current.delegated_to), None
            )
            if delegate and delegate.importance <= 3:
                total += 1.0

    # Micro bonuses apply once per step, not per task.
    if action_ok and action.action_type == "complete_task":
        total += _COMPLETE_TASK_VALID_MICRO_BONUS
    if action_ok and action.action_type == "delegate_task":
        total += _DELEGATE_TASK_VALID_MICRO_BONUS
    return total
256
+
257
+
258
def catastrophic(world: WorldState) -> bool:
    """Report a catastrophic world state.

    True when any contact with ``importance >= 4`` is ``furious``, or when more than
    three critical emails remain unreplied.
    """
    for contact in world.contacts:
        if contact.importance >= 4 and contact.mood == "furious":
            return True
    open_critical = [
        mail for mail in world.emails if mail.priority == "critical" and not mail.replied
    ]
    return len(open_critical) > 3
262
+
263
+
264
def aggregate_scores(
    conflict: float,
    relationship: float,
    task: float,
    *,
    conflict_raw: float,
    critical_queue_bonus: float,
    weighted_inner: float,
    action_ok: bool,
    episode_done: bool,
    world_after: WorldState,
) -> RewardBreakdown:
    """Combine channel scores and step flags into a ``RewardBreakdown``.

    ``final`` is the scaled weighted sum, plus -0.25 for an invalid action, plus (at
    episode end) +10.0 when stress is below 40 and -15.0 when the world is
    catastrophic. ``do_nothing_floor`` is always emitted as 0.0 here.
    """
    scaled = WEIGHTED_OUTPUT_SCALE * weighted_inner
    invalid_adjustment = 0.0 if action_ok else -0.25

    completion_bonus = 0.0
    catastrophe = 0.0
    # NOTE(review): nesting reconstructed from a flattened diff — both end-of-episode
    # terms are assumed to be gated on episode_done; confirm against the upstream file.
    if episode_done:
        if world_after.stress < 40:
            completion_bonus = 10.0
        if catastrophic(world_after):
            catastrophe = -15.0

    return RewardBreakdown(
        conflict_raw=conflict_raw,
        critical_queue_bonus=critical_queue_bonus,
        conflict=conflict,
        relationship=relationship,
        task=task,
        weighted_base=scaled,
        output_scale=WEIGHTED_OUTPUT_SCALE,
        invalid_step_adjustment=invalid_adjustment,
        episode_completion_bonus=completion_bonus,
        catastrophic_penalty=catastrophe,
        do_nothing_floor=0.0,
        final=scaled + invalid_adjustment + completion_bonus + catastrophe,
    )
302
+
303
+
304
def apply_do_nothing_penalty_floor(
    action: GhostexecAction,
    breakdown: RewardBreakdown,
) -> RewardBreakdown:
    """Apply the idle penalty to a breakdown when the action is ``do_nothing``.

    Despite the name, the penalty is additive: ``_DO_NOTHING_STRICT_PENALTY`` is
    recorded in ``do_nothing_floor`` and added to ``final`` on a copy. Other action
    types get the breakdown back unchanged.
    """
    if action.action_type == "do_nothing":
        penalty = _DO_NOTHING_STRICT_PENALTY
        return breakdown.model_copy(
            update={"do_nothing_floor": penalty, "final": breakdown.final + penalty},
        )
    return breakdown
315
+
316
+
317
def compute_step_reward(
    before: WorldState,
    after: WorldState,
    action: GhostexecAction,
    *,
    action_ok: bool,
    episode_done: bool,
    relationship_suppressed_for_email_to: frozenset[str] | None = None,
) -> RewardBreakdown:
    """Compute the full reward breakdown for one environment step.

    The conflict channel is the conflict-resolution score plus the critical-queue
    bonus, clamped to ``±CONFLICT_RAW_CAP``. The three channels are combined with the
    fixed weights, aggregated with step/episode adjustments, and finally the
    do-nothing penalty is applied.
    """
    conflict_core = score_conflict_resolution(before, after, action, action_ok=action_ok)
    queue_bonus = score_critical_queue_bonus(before, after)
    conflict_raw = conflict_core + queue_bonus
    conflict = max(-CONFLICT_RAW_CAP, min(CONFLICT_RAW_CAP, conflict_raw))

    relationship = score_relationship(
        before,
        after,
        action,
        action_ok=action_ok,
        relationship_suppressed_for_email_to=relationship_suppressed_for_email_to,
    )
    task = score_task_completion(before, after, action, action_ok=action_ok)

    breakdown = aggregate_scores(
        conflict,
        relationship,
        task,
        conflict_raw=conflict_raw,
        critical_queue_bonus=queue_bonus,
        weighted_inner=W_CONFLICT * conflict + W_REL * relationship + W_TASK * task,
        action_ok=action_ok,
        episode_done=episode_done,
        world_after=after,
    )
    return apply_do_nothing_penalty_floor(action, breakdown)
tests/test_api_reward_dead_500.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hard API dead-test: 500+ calls with reward-consistency checks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pytest
8
+ from fastapi.testclient import TestClient
9
+
10
+ from ghostexec.server.app import app
11
+
12
+ W_CONFLICT = 0.35
13
+ W_REL = 0.35
14
+ W_TASK = 0.30
15
+ OUTPUT_SCALE = 0.48
16
+
17
+
18
+ def _step_payload_for(i: int) -> dict[str, Any]:
19
+ templates: list[dict[str, Any]] = [
20
+ {"action": {"action_type": "do_nothing"}},
21
+ {"action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}},
22
+ {"action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}},
23
+ {"action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}},
24
+ {"action": {"action_type": "archive_email", "email_id": "e09"}},
25
+ {"action": {"action_type": "archive_email", "email_id": "bad_id"}},
26
+ {
27
+ "action": {
28
+ "action_type": "reschedule_meeting",
29
+ "meeting_id": "m02",
30
+ "new_time": "2026-04-21T18:00:00",
31
+ }
32
+ },
33
+ {
34
+ "action": {
35
+ "action_type": "reschedule_meeting",
36
+ "meeting_id": "m03",
37
+ "new_time": "2026-04-21T09:30:00", # overlap -> invalid semantic
38
+ }
39
+ },
40
+ {"action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}},
41
+ {"action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}},
42
+ {"action": {"action_type": "complete_task", "task_id": "t07"}},
43
+ {"action": {"action_type": "complete_task", "task_id": "t09"}}, # already done
44
+ {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}},
45
+ {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}},
46
+ {
47
+ "action": {
48
+ "action_type": "send_message",
49
+ "contact_name": "Jamie Liu",
50
+ "message_body": "Quick sync please.",
51
+ }
52
+ },
53
+ {
54
+ "action": {
55
+ "action_type": "send_message",
56
+ "contact_name": "Nobody",
57
+ "message_body": "hello",
58
+ }
59
+ },
60
+ ]
61
+ return templates[i % len(templates)]
62
+
63
+
64
@pytest.fixture(scope="module")
def client() -> TestClient:
    """Module-scoped in-process client for ``app``; server exceptions are re-raised."""
    test_client = TestClient(app, raise_server_exceptions=True)
    return test_client
67
+
68
+
69
def test_api_surface_all_endpoints(client: TestClient) -> None:
    """Touch every public route: GET pages, method contracts, reset bodies and /mcp."""
    # Core GET endpoints.
    get_paths = ("/health", "/metadata", "/state", "/schema", "/openapi.json", "/docs", "/redoc")
    for path in get_paths:
        status = client.get(path).status_code
        assert status == 200, f"{path} -> {status}"

    # Control routes: method contracts.
    for post_only in ("/reset", "/step"):
        assert client.get(post_only).status_code == 405
    assert client.put("/reset", json={}).status_code in (405, 422)
    assert client.get("/this-path-should-not-exist-ghostexec").status_code == 404

    # Reset variants.
    reset_bodies = (
        {},
        {"seed": 42},
        {"episode_id": "dead-api-001"},
        {"seed": 1, "future_field": True},
    )
    for body in reset_bodies:
        reset_resp = client.post("/reset", json=body)
        assert reset_resp.status_code == 200
        parsed = reset_resp.json()
        assert "observation" in parsed and "done" in parsed

    # MCP endpoint variants: a valid JSON-RPC call and a malformed JSON body both get 200.
    tools_list = {"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}}
    assert client.post("/mcp", json=tools_list).status_code == 200
    malformed = client.post("/mcp", content="{", headers={"Content-Type": "application/json"})
    assert malformed.status_code == 200
96
+
97
+
98
@pytest.mark.parametrize("idx", range(520))
def test_api_reward_dead_520_cases(client: TestClient, idx: int) -> None:
    """Reset, take one templated step, and verify the reward breakdown is self-consistent."""
    # Keep each case independent and deterministic.
    reset_resp = client.post("/reset", json={"episode_id": f"dead-{idx:04d}", "seed": 42})
    assert reset_resp.status_code == 200

    payload = _step_payload_for(idx)
    step_resp = client.post("/step", json=payload)
    assert step_resp.status_code == 200, (
        f"idx={idx} payload={payload} status={step_resp.status_code}"
    )

    body = step_resp.json()
    for required in ("observation", "reward", "done"):
        assert required in body
    obs = body["observation"]
    meta = obs.get("metadata") or {}
    bd = meta.get("reward_breakdown") or {}

    def part(key: str) -> float:
        # Breakdown component as a float; absent components count as 0.0.
        return float(bd.get(key, 0.0))

    # Structural contracts.
    echoed = obs.get("echoed_message", "")
    assert isinstance(echoed, str) and obs.get("echoed_message")
    for meta_key in ("step_ok", "step_detail"):
        assert meta_key in meta
    for bd_key in ("final", "weighted_base"):
        assert bd_key in bd

    # Reward identity: top-level reward must equal breakdown.final.
    reward = float(body["reward"])
    final = float(bd["final"])
    assert reward == pytest.approx(final, abs=1e-9)

    # Aggregation formula must hold exactly (within floating tolerance).
    weighted_inner = (
        W_CONFLICT * part("conflict") + W_REL * part("relationship") + W_TASK * part("task")
    )
    assert float(bd["weighted_base"]) == pytest.approx(OUTPUT_SCALE * weighted_inner, abs=1e-9)

    expected_final = (
        part("weighted_base")
        + part("invalid_step_adjustment")
        + part("episode_completion_bonus")
        + part("catastrophic_penalty")
        + part("do_nothing_floor")
    )
    assert final == pytest.approx(expected_final, abs=1e-9)

    if payload["action"]["action_type"] == "do_nothing":
        assert part("do_nothing_floor") == pytest.approx(-0.15, abs=1e-12)
        assert reward < 0

    if meta.get("step_ok") is False:
        assert part("invalid_step_adjustment") == pytest.approx(-0.25, abs=1e-12)
150
+
tests/test_complete_integration.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # End-to-end stack test: FastAPI/OpenEnv HTTP + WebSocket, GhostExec env,
4
+ # and (optionally) GhostexecEnv client over ASGI TestClient.
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import os
10
+ import shutil
11
+ import subprocess
12
+ import sys
13
+ from pathlib import Path
14
+ import pytest
15
+ from fastapi.testclient import TestClient
16
+
17
+ from ghostexec.models import GhostexecAction
18
+ from ghostexec.server.app import app
19
+ from ghostexec.server.ghostexec_environment import GhostexecEnvironment
20
+
21
+ ROOT = Path(__file__).resolve().parents[1]
22
+ SCENARIO = ROOT / "scenarios" / "phase2_core.json"
23
+ MONDAY = ROOT / "scenarios" / "monday_morning.json"
24
+
25
+
26
def _http_paths(client: TestClient) -> set[str]:
    """Collect the non-empty string ``path`` of every route registered on ``app``.

    ``client`` is accepted but not used.
    """
    candidates = (getattr(route, "path", None) for route in app.routes)
    return {path for path in candidates if isinstance(path, str) and path}
33
+
34
+
35
def test_server_app_import_matches_uvicorn_server_string() -> None:
    """``import server.app`` must succeed in a subprocess whose cwd is the repo root.

    This mirrors ``uvicorn server.app:app``, which loads the module without the
    ``ghostexec.`` package prefix.
    """
    probe = [sys.executable, "-c", "import server.app; assert server.app.app is not None"]
    completed = subprocess.run(probe, cwd=str(ROOT), check=False)
    assert completed.returncode == 0, "import server.app must work from ghostexec repo root"
43
+
44
+
45
def test_openapi_docs_and_schema_discovery() -> None:
    """The OpenAPI document is served with paths, and /docs and /redoc render pages."""
    with TestClient(app, raise_server_exceptions=True) as client:
        spec_resp = client.get("/openapi.json")
        assert spec_resp.status_code == 200
        spec = spec_resp.json()
        assert spec.get("openapi")
        assert "paths" in spec and spec["paths"]

        for page in ("/docs", "/redoc"):
            page_resp = client.get(page)
            assert page_resp.status_code == 200
            assert len(page_resp.text) > 100
57
+
58
+
59
def test_openapi_examples_match_ghostexec_observation_shape() -> None:
    """The 200-response examples for /reset and /step use the Ghostexec observation keys."""
    paths = app.openapi()["paths"]
    for route in ("/reset", "/step"):
        example = paths[route]["post"]["responses"]["200"]["content"]["application/json"]["example"]
        observation = example["observation"]
        assert "echoed_message" in observation and "message_length" in observation
        assert "status" not in observation and "data" not in observation
        assert "reward" in example and "done" in example
67
+
68
+
69
def test_openapi_info_documents_http_vs_websocket_episode() -> None:
    """The OpenAPI ``info.description`` mentions the HTTP surface and the ``/ws`` WebSocket."""
    info = app.openapi().get("info", {})
    description = info.get("description") or ""
    assert "Ghostexec / OpenEnv HTTP" in description
    assert "/ws" in description and "WebSocket" in description
75
+
76
+
77
def test_all_registered_get_post_routes_smoke() -> None:
    """Check the expected routes are registered, then smoke the read-only GET endpoints."""
    with TestClient(app, raise_server_exceptions=True) as client:
        registered = _http_paths(client)
        expected_routes = (
            "/health",
            "/metadata",
            "/schema",
            "/state",
            "/reset",
            "/step",
            "/ws",
            "/mcp",
        )
        for route in expected_routes:
            assert route in registered

        health = client.get("/health")
        assert health.status_code == 200
        assert health.json().get("status") == "healthy"

        metadata = client.get("/metadata")
        assert metadata.status_code == 200
        described = metadata.json()
        assert described.get("name") in ("ghostexec", "GhostexecEnvironment")
        assert "description" in described

        state = client.get("/state")
        assert state.status_code == 200
        assert "step_count" in state.json()

        schema = client.get("/schema")
        assert schema.status_code == 200
        schema_json = schema.json()
        assert "action" in schema_json and "observation" in schema_json and "state" in schema_json
        action_schema = schema_json["action"]
        assert action_schema.get("title") or action_schema.get("properties")
109
+
110
+
111
def test_http_reset_and_step_return_valid_payloads() -> None:
    """
    POST /reset returns a briefing observation and POST /step returns a reward.

    Stateless HTTP: each request builds a fresh env (OpenEnv design), so /step on a
    new instance loads the scenario and then applies the action (primed reset).
    """
    reply_action = {
        "action_type": "reply_email",
        "email_id": "e05",
        "message_body": "On it.",
    }
    with TestClient(app, raise_server_exceptions=True) as client:
        reset_resp = client.post("/reset", json={})
        assert reset_resp.status_code == 200
        reset_json = reset_resp.json()
        assert "observation" in reset_json
        observation = reset_json["observation"]
        assert "echoed_message" in observation
        assert "GHOSTEXEC BRIEFING" in (observation.get("echoed_message") or "")

        step_resp = client.post("/step", json={"action": reply_action})
        assert step_resp.status_code == 200
        step_json = step_resp.json()
        assert "observation" in step_json
        # The reward may be reported at the top level or inside the observation.
        assert (
            step_json.get("reward") is not None
            or step_json["observation"].get("reward") is not None
        )
139
+
140
+
141
def test_http_step_invalid_action_422() -> None:
    """A non-object ``action`` in the /step body is rejected with HTTP 422."""
    with TestClient(app, raise_server_exceptions=True) as client:
        rejected = client.post("/step", json={"action": "not-an-object"})
        assert rejected.status_code == 422
145
+
146
+
147
def test_mcp_jsonrpc_tools_list() -> None:
    """A JSON-RPC ``tools/list`` call to /mcp returns 200 with ``result`` or ``error``."""
    request_body = {"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}}
    with TestClient(app, raise_server_exceptions=True) as client:
        response = client.post("/mcp", json=request_body)
        assert response.status_code == 200
        reply = response.json()
        assert "result" in reply or "error" in reply
154
+
155
+
156
def test_websocket_full_episode_reset_step_state_close() -> None:
    """Drive one /ws session through reset, a reschedule step, a state query and close."""
    reschedule = {
        "action_type": "reschedule_meeting",
        "meeting_id": "m02",
        "new_time": "2026-04-21T18:00:00",
    }
    with TestClient(app, raise_server_exceptions=True) as client:
        with client.websocket_connect("/ws") as ws:
            # reset -> observation carrying the briefing text
            ws.send_json({"type": "reset", "data": {}})
            reset_msg = ws.receive_json()
            assert reset_msg.get("type") == "observation"
            reset_data = reset_msg.get("data") or {}
            assert "observation" in reset_data
            inner = reset_data["observation"]
            assert "echoed_message" in inner
            assert "GHOSTEXEC BRIEFING" in inner.get("echoed_message", "")

            # step -> observation with a reward
            ws.send_json({"type": "step", "data": reschedule})
            step_msg = ws.receive_json()
            assert step_msg.get("type") == "observation"
            step_data = step_msg.get("data") or {}
            assert step_data.get("reward") is not None

            # state -> step counter has advanced
            ws.send_json({"type": "state"})
            state_msg = ws.receive_json()
            assert state_msg.get("type") == "state", state_msg
            state_data = state_msg.get("data") or {}
            assert state_data.get("step_count", 0) >= 1

            ws.send_json({"type": "close", "data": {}})
190
+
191
+
192
def test_inprocess_env_matches_ws_briefing_shape() -> None:
    """An in-process env yields a briefing on reset and a valid rewarded reschedule step."""
    env = GhostexecEnvironment(SCENARIO)
    first = env.reset()
    assert "BRIEFING" in first.echoed_message

    move_meeting = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    second = env.step(move_meeting)
    assert second.reward is not None
    assert second.metadata.get("step_ok") is True
205
+
206
+
207
def test_monday_morning_scenario_reward_signal() -> None:
    """The monday_morning scenario exists and a ``do_nothing`` step yields a float reward."""
    assert MONDAY.is_file()
    env = GhostexecEnvironment(MONDAY)
    env.reset()
    idle_step = env.step(GhostexecAction(action_type="do_nothing"))
    assert isinstance(idle_step.reward, float)
213
+
214
+
215
def test_ghostexec_env_client_against_live_url_if_set() -> None:
    """
    Exercise the ``GhostexecEnv`` client against a live server, when one is configured.

    GhostexecEnv opens a real TCP WebSocket; Starlette TestClient uses the
    non-resolvable host ``testserver`` on some platforms, so this only runs when
    ``GHOSTEXEC_WS_BASE_URL`` points at a live server (e.g. local uvicorn).
    """
    base = os.environ.get("GHOSTEXEC_WS_BASE_URL", "").strip().rstrip("/")
    if base == "":
        pytest.skip("Set GHOSTEXEC_WS_BASE_URL (e.g. http://127.0.0.1:8000) to test GhostexecEnv client.")

    # Imported lazily so the module is only loaded when the test actually runs.
    from ghostexec.client import GhostexecEnv

    with GhostexecEnv(base_url=base).sync() as session:
        after_reset = session.reset()
        assert after_reset.observation.echoed_message
        after_step = session.step(GhostexecAction(action_type="do_nothing"))
        assert after_step.observation.echoed_message
233
+
234
+
235
+
tests/test_docker_build.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Opt-in Docker build smoke test for Phase 1 deployment readiness."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import shutil
7
+ import subprocess
8
+ from pathlib import Path
9
+
10
+ import pytest
11
+
12
+ ROOT = Path(__file__).resolve().parents[1]
13
+
14
+
15
@pytest.mark.skipif(
    shutil.which("docker") is None or os.environ.get("GHOSTEXEC_RUN_DOCKER_BUILD") != "1",
    reason="Set GHOSTEXEC_RUN_DOCKER_BUILD=1 and ensure docker is installed to run this test.",
)
def test_server_dockerfile_builds():
    """Build the image from the repo root and confirm the resulting tag can be inspected.

    Skips when ``docker version`` fails (daemon not available).
    """

    def docker(args, timeout):
        # Run one docker subcommand from the repo root, capturing text output.
        return subprocess.run(
            ["docker", *args],
            cwd=str(ROOT),
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )

    if docker(["version"], 60).returncode != 0:
        pytest.skip("Docker daemon is unavailable on this machine.")

    image_tag = "ghostexec-env:ci"

    built = docker(["build", "-t", image_tag, "."], 900)
    assert built.returncode == 0, (
        "docker build failed\n"
        f"stdout:\n{built.stdout}\n"
        f"stderr:\n{built.stderr}\n"
    )

    inspected = docker(["image", "inspect", image_tag], 120)
    assert inspected.returncode == 0, (
        f"image inspect failed for {image_tag}\n"
        f"stdout:\n{inspected.stdout}\n"
        f"stderr:\n{inspected.stderr}\n"
    )
tests/test_env.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenEnv Phase 2 submission guardrails (graders + manifest wiring)."""
2
+ from __future__ import annotations
3
+
4
+ import importlib
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ import pytest
9
+
10
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
11
+
12
+ from graders import (
13
+ dinner_disaster_grader,
14
+ monday_morning_grader,
15
+ phase2_core_grader,
16
+ )
17
+
18
+ PUBLIC_GRADERS = (phase2_core_grader, monday_morning_grader, dinner_disaster_grader)
19
+
20
+
21
@pytest.mark.parametrize("grader", PUBLIC_GRADERS)
def test_public_graders_are_strictly_bounded(grader):
    """Each public grader clamps to 0.01 / 0.99 and stays strictly inside (0, 1)."""
    clamped_cases = (
        ({"rewards": [1.0]}, 0.99),
        ({"rewards": [0.0]}, 0.01),
        ({"rewards": [-5.0]}, 0.01),
        ({"score": 1.5}, 0.99),
        ({"score": -0.5}, 0.01),
        ({"reward": {"total": 1.0}}, 0.99),
    )
    for payload, expected in clamped_cases:
        assert grader(payload) == expected

    # Missing or empty input must still produce a value strictly between 0 and 1.
    for degenerate in (None, {}):
        assert 0.0 < grader(degenerate) < 1.0
33
+
34
+
35
def test_openenv_yaml_declares_three_tasks_with_graders():
    """``openenv.yaml`` lists at least three tasks, each naming an importable callable grader."""
    import yaml

    manifest = Path(__file__).resolve().parent.parent / "openenv.yaml"
    with manifest.open("r", encoding="utf-8") as handle:
        spec = yaml.safe_load(handle)

    tasks = spec.get("tasks", [])
    assert len(tasks) >= 3, "Phase 2 requires >= 3 tasks"
    for entry in tasks:
        assert "grader" in entry, f"Task {entry.get('id')} missing grader"
        # The grader is a dotted path: everything before the last dot is the module.
        module_path, _, func_name = entry["grader"].rpartition(".")
        grader_fn = getattr(importlib.import_module(module_path), func_name)
        assert callable(grader_fn), f"{entry['grader']} not callable"
tests/test_live_server_exhaustive.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # Exhaustive / adversarial probes against a RUNNING GhostExec HTTP server.
4
+ # Default: http://127.0.0.1:8000 (override with GHOSTEXEC_LIVE_BASE_URL).
5
+ # Skips all tests if /health is unreachable.
6
+
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import json
11
+ import os
12
+ import urllib.error
13
+ import urllib.request
14
+ from typing import Any
15
+
16
+ import pytest
17
+
18
+ BASE = os.environ.get("GHOSTEXEC_LIVE_BASE_URL", "http://127.0.0.1:8000").rstrip("/")
19
+
20
+
21
def _req(
    method: str,
    path: str,
    *,
    data: bytes | None = None,
    headers: dict[str, str] | None = None,
    timeout: float = 15.0,
) -> tuple[int, bytes]:
    """Send one HTTP request to ``BASE + path`` and return ``(status, body)``.

    An ``HTTPError`` is converted into its status code plus whatever body can
    still be read (empty bytes if reading the error body fails). Other
    exceptions raised by ``urlopen`` are not caught here.
    """
    request = urllib.request.Request(
        BASE + path,
        data=data,
        headers=headers or {},
        method=method,
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            status, payload = response.status, response.read()
    except urllib.error.HTTPError as http_error:
        status = http_error.code
        try:
            payload = http_error.read()
        except (ConnectionResetError, OSError):
            # The error body could not be read; report the status alone.
            payload = b""
    return status, payload
40
+
41
+
42
@pytest.fixture(scope="module")
def live() -> str:
    """Return the live server base URL, skipping when ``/health`` is not a clean 200."""
    try:
        health_status = _req("GET", "/health", timeout=3.0)[0]
    except OSError as exc:
        pytest.skip(f"Live server not reachable at {BASE!r}: {exc}")
    if health_status == 200:
        return BASE
    pytest.skip(f"Live /health returned {health_status} at {BASE!r}")
51
+
52
+
53
def test_get_core_docs(live: str) -> None:
    """Every read-only route answers 200 with a body of at least a minimum length."""
    # route -> smallest acceptable body length in bytes
    minimum_body_length = {
        "/health": 10,
        "/metadata": 20,
        "/state": 10,
        "/schema": 500,
        "/openapi.json": 1000,
        "/docs": 200,
        "/redoc": 200,
    }
    for route, floor in minimum_body_length.items():
        status, payload = _req("GET", route)
        assert status == 200, f"{route} -> {status}"
        assert len(payload) >= floor, f"{route} body tiny"
66
+
67
+
68
def test_wrong_http_methods_on_control_routes(live: str) -> None:
    """Unsupported verbs and unknown paths are answered with 4xx codes."""
    for control_route in ("/reset", "/step"):
        assert _req("GET", control_route)[0] == 405

    put_status, _ = _req("PUT", "/reset", data=b"{}")
    assert put_status in (405, 422)

    delete_status = _req("DELETE", "/health")[0]
    assert delete_status in (405, 404)

    unknown_status, _ = _req("GET", "/this-path-should-not-exist-ghostexec")
    assert unknown_status == 404
75
+
76
+
77
def test_reset_payload_variants(live: str) -> None:
    """POST /reset accepts several payload shapes and always returns an observation."""
    json_headers = {"Content-Type": "application/json"}
    variants = {
        "empty": {},
        "seed": {"seed": 42},
        "episode_id": {"episode_id": "probe-episode-1"},
        "extra_ignored": {"seed": 1, "unknown_future_field_xyz": True},
    }
    for label, payload in variants.items():
        status, raw = _req(
            "POST",
            "/reset",
            data=json.dumps(payload).encode(),
            headers=json_headers,
        )
        assert status == 200, f"reset {label}: {status}"
        reply = json.loads(raw.decode())
        assert "observation" in reply and "done" in reply
        assert "echoed_message" in reply["observation"]
95
+
96
+
97
def test_step_valid_action_types(live: str) -> None:
    """POST /step answers 200 with ``step_ok`` metadata for each action type."""
    json_headers = {"Content-Type": "application/json"}
    # One payload per action type; the action_type doubles as the case label.
    actions: list[dict[str, Any]] = [
        {"action_type": "do_nothing"},
        {"action_type": "reply_email", "email_id": "e14", "message_body": "Live exhaustive probe."},
        {"action_type": "archive_email", "email_id": "e09"},
        {
            "action_type": "reschedule_meeting",
            "meeting_id": "m02",
            "new_time": "2026-04-21T18:00:00",
        },
        {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "probe cancel"},
        {"action_type": "complete_task", "task_id": "t07"},
        {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"},
        {
            "action_type": "send_message",
            "contact_name": "Jamie Liu",
            "message_body": "Exhaustive live test ping.",
        },
    ]
    for action in actions:
        name = action["action_type"]
        status, raw = _req(
            "POST",
            "/step",
            data=json.dumps({"action": action}).encode(),
            headers=json_headers,
        )
        assert status == 200, f"step {name}: HTTP {status} {raw[:200]!r}"
        reply = json.loads(raw.decode())
        assert "observation" in reply
        metadata = (reply.get("observation") or {}).get("metadata") or {}
        assert "step_ok" in metadata, f"step {name}: missing step_ok"
147
+
148
+
149
def test_step_invalid_contracts(live: str) -> None:
    """Malformed /step bodies are rejected; illegal actions return 200 with step_ok False.

    Each probe is sent exactly once, and the HTTP status and the response body
    are both checked on that single response. Sending a probe twice (once per
    assertion) would verify two different responses and consume extra steps of
    the running episode.
    """
    json_headers = {"Content-Type": "application/json"}

    def post_step(raw: bytes) -> tuple[int, bytes]:
        """POST ``raw`` to /step as JSON and return ``(status, body)``."""
        return _req("POST", "/step", data=raw, headers=json_headers)

    # Body is not JSON at all.
    status, _ = post_step(b"not-json")
    assert status in (400, 422)

    # Valid JSON, but "action" is not an object.
    status, _ = post_step(json.dumps({"action": "not-a-dict"}).encode())
    assert status == 422

    # Well-formed actions the environment is expected to refuse.
    # NOTE(review): the t09 case relies on the server's scenario having t09
    # already completed - confirm the live server loads that scenario.
    illegal_actions = [
        {"action_type": "reply_email", "email_id": "nope", "message_body": "x"},
        {"action_type": "complete_task", "task_id": "t09"},
    ]
    for action in illegal_actions:
        status, raw = post_step(json.dumps({"action": action}).encode())
        assert status == 200, f"step {action['action_type']}: HTTP {status} {raw[:200]!r}"
        reply = json.loads(raw.decode())
        assert reply["observation"]["metadata"].get("step_ok") is False
202
+
203
+
204
def test_step_unicode_and_long_message(live: str) -> None:
    """A long, multi-line reply body containing non-ASCII text is accepted with 200."""
    message = "".join(["Line note.\n"] * 80) + " café naïve résumé 日本語"
    action = {"action_type": "reply_email", "email_id": "e05", "message_body": message}
    status, _ = _req(
        "POST",
        "/step",
        data=json.dumps({"action": action}).encode(),
        headers={"Content-Type": "application/json"},
    )
    assert status == 200
215
+
216
+
217
def test_step_wrong_content_type(live: str) -> None:
    """A form-encoded /step body is refused with 400, 415 or 422."""
    form_headers = {"Content-Type": "application/x-www-form-urlencoded"}
    status = _req("POST", "/step", data=b"action_type=do_nothing", headers=form_headers)[0]
    assert status in (400, 415, 422)
225
+
226
+
227
def test_reset_invalid_json(live: str) -> None:
    """A truncated JSON body on /reset is refused with 400 or 422."""
    json_headers = {"Content-Type": "application/json"}
    status = _req("POST", "/reset", data=b"{", headers=json_headers)[0]
    assert status in (400, 422)
230
+
231
+
232
def test_mcp_variants(live: str) -> None:
    """/mcp answers 200 to a broken body and returns a JSON-RPC style reply for an unknown method."""
    json_headers = {"Content-Type": "application/json"}

    malformed_status, _ = _req("POST", "/mcp", data=b"{", headers=json_headers)
    assert malformed_status == 200

    rpc_call = {"jsonrpc": "2.0", "id": 1, "method": "bogus/thing", "params": {}}
    _, raw = _req("POST", "/mcp", data=json.dumps(rpc_call).encode(), headers=json_headers)
    reply = json.loads(raw.decode())
    assert "error" in reply or "result" in reply
242
+
243
+
244
def test_openapi_lists_expected_paths(live: str) -> None:
    """The OpenAPI document lists every core route under ``paths``."""
    raw = _req("GET", "/openapi.json")[1]
    declared_paths = json.loads(raw.decode()).get("paths") or {}
    expected_routes = ("/health", "/reset", "/step", "/schema", "/metadata", "/state", "/mcp")
    for route in expected_routes:
        assert route in declared_paths, f"missing path {route} in OpenAPI"
250
+
251
+
252
def test_websocket_dead_ends(live: str) -> None:
    """Drive the /ws endpoint through bad frames, a reset, an illegal step and a state query."""
    try:
        import websockets
    except ImportError:
        pytest.skip("websockets not installed")

    async def _run() -> None:
        ws_url = live.replace("http://", "ws://").replace("https://", "wss://") + "/ws"
        async with websockets.connect(ws_url, max_size=10_000_000) as ws:

            async def exchange(frame: str) -> dict:
                """Send one text frame and return the next reply parsed as JSON."""
                await ws.send(frame)
                return json.loads(await ws.recv())

            # Unparseable frame, then an unknown message type: both must yield errors.
            not_json_reply = await exchange("{ not json")
            assert not_json_reply.get("type") == "error"

            unknown_type_reply = await exchange(json.dumps({"type": "nosuch", "data": {}}))
            assert unknown_type_reply.get("type") == "error"

            reset_reply = await exchange(json.dumps({"type": "reset", "data": {}}))
            assert reset_reply.get("type") == "observation"

            # A step referencing a missing email still yields an observation, flagged not-ok.
            step_reply = await exchange(
                json.dumps({"type": "step", "data": {"action_type": "reply_email", "email_id": "missing"}})
            )
            assert step_reply.get("type") == "observation"
            metadata = (step_reply.get("data") or {}).get("observation", {}).get("metadata") or {}
            assert metadata.get("step_ok") is False

            state_reply = await exchange(json.dumps({"type": "state"}))
            assert state_reply.get("type") == "state"

            # No reply is awaited for the close frame.
            await ws.send(json.dumps({"type": "close", "data": {}}))

    asyncio.run(_run())
tests/test_phase1.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Phase 1: scaffold, OpenEnv manifest, layout, and HTTP health surface."""
2
+
3
+ from pathlib import Path
4
+
5
+ import yaml
6
+ from starlette.testclient import TestClient
7
+
8
+ ROOT = Path(__file__).resolve().parents[1]
9
+
10
+
11
def test_openenv_yaml_exists_and_metadata():
    """openenv.yaml exists at the project root and carries the expected metadata."""
    manifest_path = ROOT / "openenv.yaml"
    assert manifest_path.is_file(), "openenv.yaml must exist at project root"
    manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))

    # Keys that must match an exact value.
    exact_values = {
        "name": "ghostexec",
        "spec_version": 1,
        "type": "space",
        "runtime": "fastapi",
        "app": "server.app:app",
    }
    for key, wanted in exact_values.items():
        assert manifest.get(key) == wanted

    # Keys that only need to hold a non-blank string.
    for key in ("description", "version"):
        text = manifest.get(key)
        assert text and isinstance(text, str) and len(text.strip()) > 0
24
+
25
+
26
def test_expected_folder_structure():
    """The project contains the files the scaffold is expected to ship."""
    server_dir = ROOT / "server"
    required_files = (
        ROOT / "models.py",
        ROOT / "client.py",
        ROOT / "pyproject.toml",
        server_dir / "app.py",
        server_dir / "ghostexec_environment.py",
    )
    for required in required_files:
        assert required.is_file()

    # The Dockerfile may sit at the root or inside server/.
    dockerfile_candidates = (ROOT / "Dockerfile", server_dir / "Dockerfile")
    assert any(candidate.is_file() for candidate in dockerfile_candidates)
    assert (server_dir / "requirements.txt").is_file()
34
+
35
+
36
def test_server_health_ping():
    """GET /health on the in-process app returns 200 with status ``healthy``."""
    from ghostexec.server.app import app

    health = TestClient(app).get("/health")
    assert health.status_code == 200
    assert health.json().get("status") == "healthy"
tests/test_phase2.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Phase 2: world state, inbox, calendar, contacts, tasks (scenario-driven)."""
2
+
3
+ from pathlib import Path
4
+
5
+ from ghostexec.server.ghostexec_environment import GhostexecEnvironment
6
+
7
+ ROOT = Path(__file__).resolve().parents[1]
8
+ SCENARIO = ROOT / "scenarios" / "phase2_core.json"
9
+
10
+
11
def test_scenario_file_exists():
    """The scenario JSON referenced by this module is present on disk."""
    scenario_present = SCENARIO.is_file()
    assert scenario_present
13
+
14
+
15
def test_world_json_roundtrip():
    """Serialising a loaded world to JSON and back preserves time and collection sizes."""
    original = GhostexecEnvironment.load_world_from_json(SCENARIO)
    serialised = GhostexecEnvironment.world_to_json(original)
    restored = GhostexecEnvironment.world_from_json(serialised)

    assert restored.simulation_time == original.simulation_time
    for collection in ("emails", "meetings"):
        assert len(getattr(restored, collection)) == len(getattr(original, collection))
22
+
23
+
24
def test_pool_sizes_from_scenario():
    """The scenario provides at least the minimum number of items per pool."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    minimum_sizes = {"emails": 30, "meetings": 8, "contacts": 15, "tasks": 10}
    for pool, floor in minimum_sizes.items():
        assert len(getattr(world, pool)) >= floor
30
+
31
+
32
def test_inbox_unread_priority_order():
    """Unread emails come back ordered critical, high, normal, low, with a critical one first."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    unread = env.get_unread_emails_sorted()

    # Lower rank sorts first.
    rank = {label: position for position, label in enumerate(("critical", "high", "normal", "low"))}
    observed = [mail.priority for mail in unread]
    assert observed == sorted(observed, key=rank.__getitem__)
    assert unread[0].priority == "critical"
40
+
41
+
42
def test_calendar_detects_four_conflicts():
    """A freshly reset environment reports at least four meeting conflicts."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    conflict_count = len(env.detect_meeting_conflicts())
    assert conflict_count >= 4
47
+
48
+
49
def test_contact_mood_update():
    """A contact's mood starts as ``angry`` and can be updated to ``neutral``."""
    who = "David Okonkwo"
    env = GhostexecEnvironment(SCENARIO)
    env.reset()

    contact = env.get_contact(who)
    assert contact is not None
    assert contact.mood == "angry"

    assert env.update_contact_mood(who, "neutral")
    # Look the contact up again rather than trusting the earlier reference.
    assert env.get_contact(who) is not None
    assert env.get_contact(who).mood == "neutral"
58
+
59
+
60
def test_overdue_tasks_after_time_advance():
    """After advancing the clock, at least two tasks are reported and all are ``overdue``."""
    later = "2026-04-22T12:00:00"
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    env.set_simulation_time(later)

    late_tasks = env.overdue_tasks_at(later)
    assert len(late_tasks) >= 2
    not_overdue = [task for task in late_tasks if not task.status == "overdue"]
    assert not not_overdue
68
+
69
+
70
def test_mark_email_read_and_reschedule_reduces_calendar_conflicts():
    """Rescheduling m02 lowers the conflict count; marking e01 read succeeds."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()

    conflicts_before = len(env.detect_meeting_conflicts())
    rescheduled = env.reschedule_meeting("m02", "2026-04-21T18:00:00")
    assert rescheduled
    conflicts_after = len(env.detect_meeting_conflicts())

    assert conflicts_after < conflicts_before
    assert env.mark_email_read("e01")
tests/test_phase3.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Phase 3: plain-text briefing, eight legal actions, validation without crashes."""
2
+
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ from ghostexec.models import GhostexecAction
8
+ from ghostexec.server.ghostexec_environment import GhostexecEnvironment
9
+
10
+ ROOT = Path(__file__).resolve().parents[1]
11
+ SCENARIO = ROOT / "scenarios" / "phase2_core.json"
12
+
13
+
14
def _env() -> GhostexecEnvironment:
    """Create an environment for ``SCENARIO``, reset it once, and return it."""
    fresh = GhostexecEnvironment(SCENARIO)
    fresh.reset()
    return fresh
18
+
19
+
20
def test_briefing_is_plain_text_after_reset():
    """The reset observation carries a briefing containing every expected section heading."""
    env = _env()
    obs = env.reset()
    briefing = obs.echoed_message

    expected_headings = (
        "=== GHOSTEXEC BRIEFING",
        "UNREAD EMAILS",
        "CALENDAR CONFLICTS IN NEXT 4 HOURS",
        "CONTACTS TO WATCH",
        "OVERDUE OR DUE-SOON TASKS",
        "EXEC STRESS LEVEL",
        "STEPS REMAINING",
    )
    for heading in expected_headings:
        assert heading in briefing
    assert obs.message_length == len(briefing)
32
+
33
+
34
def _first_with_id(items, wanted_id):
    """Return the first element of ``items`` whose ``id`` equals ``wanted_id``."""
    return next(item for item in items if item.id == wanted_id)


@pytest.mark.parametrize(
    "action,check",
    [
        (
            GhostexecAction(action_type="reply_email", email_id="e05", message_body="On it."),
            lambda env: _first_with_id(env.world.emails, "e05").replied is True,
        ),
        (
            GhostexecAction(action_type="archive_email", email_id="e09"),
            lambda env: _first_with_id(env.world.emails, "e09").read is True,
        ),
        (
            GhostexecAction(
                action_type="reschedule_meeting",
                meeting_id="m03",
                new_time="2026-04-21T18:00:00",
            ),
            lambda env: _first_with_id(env.world.meetings, "m03").start == "2026-04-21T18:00:00",
        ),
        (
            GhostexecAction(
                action_type="cancel_meeting",
                meeting_id="m10",
                reason="Merged into ops review",
            ),
            lambda env: _first_with_id(env.world.meetings, "m10").cancelled is True,
        ),
        (
            GhostexecAction(action_type="complete_task", task_id="t07"),
            lambda env: _first_with_id(env.world.tasks, "t07").status == "done",
        ),
        (
            GhostexecAction(
                action_type="delegate_task",
                task_id="t08",
                contact_name="Jordan Lee",
            ),
            lambda env: _first_with_id(env.world.tasks, "t08").delegated_to == "Jordan Lee",
        ),
        (
            GhostexecAction(
                action_type="send_message",
                contact_name="Jamie Liu",
                message_body="Thanks for the demo feedback.",
            ),
            lambda env: any("message to Jamie Liu" in entry for entry in env.world.action_log),
        ),
        (
            # Idle step: nothing to verify beyond the step itself succeeding.
            GhostexecAction(action_type="do_nothing"),
            lambda env: True,
        ),
    ],
)
def test_each_legal_action_runs_without_crash(action, check):
    """Each action steps without raising, yields a briefing, and leaves the expected trace."""
    env = _env()
    observation = env.step(action)
    assert observation.echoed_message
    assert check(env)
93
+
94
+
95
def test_reply_marks_email_handled():
    """Replying to e14 flips it from unread to both read and replied."""
    env = _env()

    def email_e14():
        # Re-read from the world each time so the post-step state is observed.
        return next(mail for mail in env.world.emails if mail.id == "e14")

    assert not email_e14().read
    env.step(GhostexecAction(action_type="reply_email", email_id="e14", message_body="Noted."))
    handled = email_e14()
    assert handled.read and handled.replied
102
+
103
+
104
def test_invalid_actions_return_error_metadata_not_exception():
    """Illegal actions are reported through ``step_ok``/``step_error`` metadata."""
    # Reference reward: a single idle step on a separate, fresh environment.
    baseline_env = _env()
    idle_reward = baseline_env.step(GhostexecAction(action_type="do_nothing")).reward

    env = _env()

    unknown_email = env.step(
        GhostexecAction(action_type="reply_email", email_id="nope", message_body="x")
    )
    assert unknown_email.metadata.get("step_ok") is False
    assert unknown_email.metadata.get("step_error")
    # Same before→after sub-scores as do_nothing, plus explicit invalid add-on.
    # do_nothing has an additional strict additive floor (-0.15), so the delta is -0.10 here.
    assert unknown_email.reward == pytest.approx((idle_reward or 0) - (0.25 - 0.15))

    finished_task = env.step(GhostexecAction(action_type="complete_task", task_id="t09"))
    assert finished_task.metadata.get("step_ok") is False
    assert "already done" in (finished_task.metadata.get("step_error") or "").lower()

    unknown_contact = env.step(
        GhostexecAction(
            action_type="send_message",
            contact_name="Nobody By That Name",
            message_body="hello",
        )
    )
    assert unknown_contact.metadata.get("step_ok") is False

    overlapping_move = env.step(
        GhostexecAction(
            action_type="reschedule_meeting",
            meeting_id="m03",
            new_time="2026-04-21T09:30:00",
        )
    )
    assert overlapping_move.metadata.get("step_ok") is False
    assert "overlap" in (overlapping_move.metadata.get("step_error") or "").lower()
138
+
139
+
140
def test_reschedule_resolves_prior_conflict_pair():
    """Moving m02 removes the (m01, m02) pair from the detected conflicts."""
    env = _env()
    watched_pair = frozenset(("m01", "m02"))

    def conflict_pairs():
        # Order-independent view of every conflicting meeting pair.
        return {
            frozenset((row["meeting_a"], row["meeting_b"]))
            for row in env.detect_meeting_conflicts()
        }

    assert watched_pair in conflict_pairs()
    obs = env.step(
        GhostexecAction(
            action_type="reschedule_meeting",
            meeting_id="m02",
            new_time="2026-04-21T18:00:00",
        )
    )
    assert obs.metadata.get("step_ok") is True
    assert watched_pair not in conflict_pairs()
tests/test_phase4.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Phase 4: reward sub-scores, aggregation, logging, schema drift."""
2
+
3
+ import json
4
+ import random
5
+ import statistics
6
+ from pathlib import Path
7
+
8
+ import pytest
9
+
10
+ from ghostexec.models import GhostexecAction
11
+ from ghostexec.server import reward as reward_mod
12
+ from ghostexec.server.reward import aggregate_scores
13
+ from ghostexec.server.ghostexec_environment import GhostexecEnvironment
14
+
15
+ ROOT = Path(__file__).resolve().parents[1]
16
+ SCENARIO = ROOT / "scenarios" / "phase2_core.json"
17
+ DRIFT = ROOT / "scenarios" / "schema_drift_test.json"
18
+
19
+
20
def test_reward_weights_and_aggregator_helpers():
    """``aggregate_scores`` reports ``weighted_base`` as the output scale times the weighted inner sum."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    conflict_score, relationship_score, task_score = 1.0, -1.0, 2.5
    inner = (
        reward_mod.W_CONFLICT * conflict_score
        + reward_mod.W_REL * relationship_score
        + reward_mod.W_TASK * task_score
    )
    breakdown = aggregate_scores(
        conflict_score,
        relationship_score,
        task_score,
        conflict_raw=conflict_score,
        critical_queue_bonus=0.0,
        weighted_inner=inner,
        action_ok=True,
        episode_done=False,
        world_after=world,
    )
    expected_base = reward_mod.WEIGHTED_OUTPUT_SCALE * inner
    assert breakdown.weighted_base == pytest.approx(expected_base)
36
+
37
+
38
def test_catastrophic_and_completion_bonuses_only_when_episode_done():
    """Completion bonus and catastrophic penalty are zero mid-episode and set at episode end."""
    loaded = GhostexecEnvironment.load_world_from_json(SCENARIO)
    calm_before = loaded.model_copy(deep=True)
    calm_before.stress = 30
    calm_after = calm_before.model_copy(deep=True)
    idle = GhostexecAction(action_type="do_nothing")

    mid_episode = reward_mod.compute_step_reward(
        calm_before, calm_after, idle, action_ok=True, episode_done=False
    )
    assert mid_episode.episode_completion_bonus == 0.0
    assert mid_episode.catastrophic_penalty == 0.0

    # Same world, but with Marcus Webb's mood set to furious (if he is present).
    furious_world = calm_before.model_copy(deep=True)
    marcus_index = next(
        (
            position
            for position, contact in enumerate(furious_world.contacts)
            if contact.name == "Marcus Webb"
        ),
        None,
    )
    if marcus_index is not None:
        furious_world.contacts[marcus_index] = furious_world.contacts[marcus_index].model_copy(
            update={"mood": "furious"}
        )

    final_step = reward_mod.compute_step_reward(
        calm_before, furious_world, idle, action_ok=True, episode_done=True
    )
    assert final_step.episode_completion_bonus == pytest.approx(10.0)
    assert final_step.catastrophic_penalty == pytest.approx(-15.0)
56
+
57
+
58
def test_invalid_step_matches_do_nothing_subscores_plus_invalid_addon():
    """A failed action on an unchanged world differs from an idle step by a fixed offset."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    idle_action = GhostexecAction(action_type="do_nothing")
    failed_action = GhostexecAction(action_type="reply_email", email_id="missing", message_body="x")

    idle_breakdown = reward_mod.compute_step_reward(
        world, world, idle_action, action_ok=True, episode_done=False
    )
    failed_breakdown = reward_mod.compute_step_reward(
        world, world, failed_action, action_ok=False, episode_done=False
    )

    assert failed_breakdown.invalid_step_adjustment == pytest.approx(-0.25)
    # do_nothing carries an additional strict additive floor (-0.15) not applied to invalid non-idle actions.
    assert failed_breakdown.final == pytest.approx(idle_breakdown.final - (0.25 - 0.15))
67
+
68
+
69
def test_scripted_episode_reward_direction_and_log(tmp_path, monkeypatch):
    """A conflict-resolving step out-scores an idle one, and both steps are written to the reward log."""
    log_file = tmp_path / "rewards.jsonl"
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    # Redirect the environment's reward log into the per-test temp directory.
    monkeypatch.setattr(env, "_reward_log_path", log_file)

    reschedule = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    resolved = env.step(reschedule)
    idled = env.step(GhostexecAction(action_type="do_nothing"))

    assert resolved.metadata.get("step_ok") is True
    assert idled.metadata.get("step_ok") is True
    assert (resolved.reward or 0) > (idled.reward or 0)

    assert log_file.is_file()
    records = log_file.read_text(encoding="utf-8").strip().splitlines()
    assert len(records) >= 2
    first_record = json.loads(records[0])
    assert "reward" in first_record and "episode_id" in first_record
    assert first_record.get("action_type") == "reschedule_meeting"
    assert "conflict_raw" in first_record and "step_ok" in first_record
95
+
96
+
97
def test_schema_drift_events_mutate_world():
    """Three idle steps with drift events loaded change the world in the expected ways."""
    env = GhostexecEnvironment(SCENARIO, schema_drift_events_path=DRIFT)
    env.reset()

    def idle_step():
        # A fresh action object per step, as in a normal client loop.
        return env.step(GhostexecAction(action_type="do_nothing"))

    # After step 1: the action log records a shift.
    assert idle_step().metadata.get("step_ok") is True
    assert any("schema drift: shifted" in entry for entry in env.world.action_log)

    # After step 2: Sarah Chen's communication preference is "text".
    idle_step()
    sarah = env.get_contact("Sarah Chen")
    assert sarah is not None
    assert sarah.communication_preference == "text"

    # After step 3: t02 has the expected deadline and Marcus Webb is in the suppression set.
    idle_step()
    task_t02 = next(task for task in env.world.tasks if task.id == "t02")
    assert task_t02.deadline == "2026-04-21T07:00:00"
    assert "Marcus Webb" in env._reply_relationship_suppressed  # noqa: SLF001
110
+
111
+
112
def test_rewards_differ_between_helpful_and_idle_steps():
    """A reschedule step and an idle step both produce rewards, and the rewards differ."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()

    helpful_action = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    helpful_reward = env.step(helpful_action).reward
    idle_reward = env.step(GhostexecAction(action_type="do_nothing")).reward

    assert helpful_reward is not None and idle_reward is not None
    assert helpful_reward != idle_reward
125
+
126
+
127
+ # Whitelisted reschedules (known non-overlapping targets for phase2_core at 08:00).
128
+ _SAFE_RESCHEDULES: list[tuple[str, str]] = [
129
+ ("m02", "2026-04-21T18:00:00"),
130
+ ("m03", "2026-04-21T18:30:00"),
131
+ ("m06", "2026-04-21T20:00:00"),
132
+ ("m09", "2026-04-21T21:00:00"),
133
+ ]
134
+
135
+
136
def test_seeded_stochastic_policy_reward_spread():
    """A seeded random mix of actions yields rewards with a minimum spread."""
    random.seed(1234)
    total_steps = 80
    archive_ids = ["e%02d" % number for number in range(1, 31)]
    contacts = ["Jordan Lee", "Jamie Liu", "Marcus Webb", "Sarah Chen"]
    env = GhostexecEnvironment(SCENARIO)
    env.reset()

    rewards: list[float] = []
    # shared_cursor is advanced by BOTH the archive and the send_message branches.
    shared_cursor = 0
    reschedule_cursor = 0
    for _ in range(total_steps):
        draw = random.random()
        if draw < 0.32:
            action = GhostexecAction(action_type="do_nothing")
        elif draw < 0.58:
            email_id = archive_ids[shared_cursor % len(archive_ids)]
            shared_cursor += 1
            action = GhostexecAction(action_type="archive_email", email_id=email_id)
        elif draw < 0.78:
            meeting_id, new_time = _SAFE_RESCHEDULES[reschedule_cursor % len(_SAFE_RESCHEDULES)]
            reschedule_cursor += 1
            action = GhostexecAction(
                action_type="reschedule_meeting", meeting_id=meeting_id, new_time=new_time
            )
        else:
            contact_name = contacts[shared_cursor % len(contacts)]
            shared_cursor += 1
            action = GhostexecAction(
                action_type="send_message",
                contact_name=contact_name,
                message_body="Quick sync on priorities.",
            )
        obs = env.step(action)
        assert obs.reward is not None
        rewards.append(float(obs.reward))

    spread = statistics.pstdev(rewards)
    ordered = sorted(rewards)
    last_index = len(ordered) - 1
    # Index-based 5th / 95th percentile picks, clamped to the list bounds.
    p5 = ordered[max(0, int(0.05 * last_index))]
    p95 = ordered[min(last_index, int(0.95 * last_index))]
    assert spread > 0.06
    assert (p95 - p5) > 0.09
178
+
179
+
180
def test_good_script_beats_do_nothing_spam_on_mean_reward():
    """Five productive actions average more than 0.2 above five idle steps."""

    def mean_reward(rewards):
        return sum(float(value) for value in rewards) / len(rewards)

    productive_env = GhostexecEnvironment(SCENARIO)
    productive_env.reset()
    productive_script = [
        GhostexecAction(
            action_type="reschedule_meeting",
            meeting_id="m02",
            new_time="2026-04-21T18:00:00",
        ),
        GhostexecAction(action_type="reply_email", email_id="e01", message_body="Drafting revised figures now."),
        GhostexecAction(action_type="archive_email", email_id="e09"),
        GhostexecAction(
            action_type="send_message",
            contact_name="Jordan Lee",
            message_body="Standup notes attached.",
        ),
        GhostexecAction(action_type="complete_task", task_id="t06"),
    ]
    productive_rewards = [productive_env.step(step).reward for step in productive_script]
    productive_mean = mean_reward(productive_rewards)

    idle_env = GhostexecEnvironment(SCENARIO)
    idle_env.reset()
    idle_rewards = []
    for _ in range(5):
        idle_rewards.append(idle_env.step(GhostexecAction(action_type="do_nothing")).reward)
    idle_mean = mean_reward(idle_rewards)

    assert productive_mean > idle_mean + 0.2
tests/test_reward_dead_suite.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # Dead-test suite for Phase 4 step rewards: 100+ independent scenarios on
4
+ # phase2_core.json. Asserts penalization (do_nothing, invalid), priority
5
+ # ordering (VIP critical > normal), and legal-action signatures for GRPO-style
6
+ # post-training signal quality.
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+
12
+ import pytest
13
+
14
+ from ghostexec.models import GhostexecAction
15
+ from ghostexec.server import reward as reward_mod
16
+ from ghostexec.server.ghostexec_environment import GhostexecEnvironment
17
+
18
# Directory two levels above this file, and the scenario JSON used by every case.
ROOT = Path(__file__).resolve().parents[1]
SCENARIO = ROOT.joinpath("scenarios", "phase2_core.json")

# All inbox ids from phase2_core (e01–e30).
REPLY_EMAIL_IDS = ["e%02d" % number for number in range(1, 31)]

# Email ids used for the archive cases (e01–e15).
ARCHIVE_EMAIL_IDS = ["e%02d" % number for number in range(1, 16)]

# Pending / in-progress tasks only (t09 is done in fixture).
COMPLETE_TASK_IDS = ["t%02d" % number for number in range(1, 13) if number != 9]

# Known non-overlapping reschedules for 08:00 sim time (from phase4 tests),
# written as (meeting id, HH:MM on 2026-04-21).
_SAFE_RESCHEDULES: list[tuple[str, str]] = [
    (meeting_id, f"2026-04-21T{clock}:00")
    for meeting_id, clock in (
        ("m02", "18:00"),
        ("m03", "18:30"),
        ("m06", "20:00"),
        ("m09", "21:00"),
        ("m04", "19:00"),
        ("m05", "19:30"),
        ("m07", "20:30"),
        ("m08", "21:30"),
        ("m01", "17:00"),
        ("m10", "22:00"),
    )
]

MEETING_IDS_CANCEL = ["m%02d" % number for number in range(1, 11)]

KNOWN_CONTACTS = [
    "Jordan Lee",
    "Jamie Liu",
    "Marcus Webb",
    "Sarah Chen",
    "Priya Sharma",
    "David Okonkwo",
]

# Reply text shared by every reply_email case in this module.
_BODY = "Thanks — triaging and will follow up shortly."
49
+
50
+
51
+ # --- 30 cases: reply every email id -------------------------------------------
52
+
53
+
54
@pytest.mark.parametrize("email_id", REPLY_EMAIL_IDS)
def test_dead_reply_email_each_id_positive_or_neutral(email_id: str) -> None:
    """Replying to any inbox email is a legal step whose reward stays above -0.5."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    obs = env.step(reply)

    assert obs.metadata.get("step_ok") is True
    assert obs.reward is not None
    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    # Neither penalty component may be present on a legal reply.
    for penalty_component in ("invalid_step_adjustment", "do_nothing_floor"):
        assert breakdown.get(penalty_component, 0) == pytest.approx(0.0)
    # No snapshot -4 conflict tax: legal reply should not tank below -0.5
    assert float(obs.reward) > -0.5
66
+
67
+
68
@pytest.mark.parametrize("email_id", ("e01", "e03", "e12", "e21", "e27"))
def test_dead_reply_vip_critical_queue_bonus(email_id: str) -> None:
    """Replying to these emails earns a reward above 0.06 and a positive critical-queue bonus."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    obs = env.step(reply)

    assert obs.metadata.get("step_ok") is True
    # VIP+critical micro + critical_queue bonus; exact float varies slightly (0.48 scale).
    assert float(obs.reward or 0) > 0.06
    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    queue_bonus = float(breakdown.get("critical_queue_bonus") or 0)
    assert queue_bonus > 0
78
+
79
+
80
@pytest.mark.parametrize("email_id", ("e02", "e04", "e06", "e14", "e23"))
def test_dead_reply_high_or_normal_small_positive(email_id: str) -> None:
    """Replying to these emails is legal and yields a strictly positive reward."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    obs = env.step(reply)

    assert obs.metadata.get("step_ok") is True
    reward = float(obs.reward or 0)
    assert reward > 0.0
87
+
88
+
89
+ # --- 20 cases: do_nothing always penalized ------------------------------------
90
+
91
+
92
@pytest.mark.parametrize("seed", range(20))
def test_dead_do_nothing_strict_penalty(seed: int) -> None:
    """Idling is a valid step with a negative reward and the strict do-nothing floor applied."""
    # `seed` only numbers the 20 repetitions; it is not passed to the environment here.
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    idle = env.step(GhostexecAction(action_type="do_nothing"))

    assert idle.metadata.get("step_ok") is True
    assert float(idle.reward or 0) < 0

    breakdown = (idle.metadata or {}).get("reward_breakdown") or {}
    floor = float(breakdown.get("do_nothing_floor") or 0)
    assert floor == pytest.approx(reward_mod._DO_NOTHING_STRICT_PENALTY)
101
+
102
+
103
+ # --- 15 cases: archive --------------------------------------------------------
104
+
105
+
106
@pytest.mark.parametrize("email_id", ARCHIVE_EMAIL_IDS)
def test_dead_archive_email_step_ok(email_id: str) -> None:
    """Archiving each id in ARCHIVE_EMAIL_IDS is accepted and produces a reward value."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    archive = GhostexecAction(action_type="archive_email", email_id=email_id)
    result = env.step(archive)
    assert result.metadata.get("step_ok") is True
    assert result.reward is not None
113
+
114
+
115
+ # --- 11 cases: complete pending task -----------------------------------------
116
+
117
+
118
@pytest.mark.parametrize("task_id", COMPLETE_TASK_IDS)
def test_dead_complete_task_step_ok(task_id: str) -> None:
    """Completing each id in COMPLETE_TASK_IDS succeeds and the task channel reaches the micro bonus."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    result = env.step(GhostexecAction(action_type="complete_task", task_id=task_id))

    assert result.metadata.get("step_ok") is True
    assert result.reward is not None

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    task_channel = float(breakdown.get("task") or 0)
    assert task_channel >= reward_mod._COMPLETE_TASK_VALID_MICRO_BONUS
127
+
128
+
129
+ # --- 10 cases: reschedule safe slots -----------------------------------------
130
+
131
+
132
@pytest.mark.parametrize("meeting_id,new_time", _SAFE_RESCHEDULES)
def test_dead_reschedule_meeting_resolves_or_micro(meeting_id: str, new_time: str) -> None:
    """Each (meeting, slot) pair in _SAFE_RESCHEDULES is accepted and out-scores idling."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    moved = env.step(
        GhostexecAction(
            action_type="reschedule_meeting",
            meeting_id=meeting_id,
            new_time=new_time,
        )
    )
    assert moved.metadata.get("step_ok") is True
    assert moved.reward is not None

    # Should beat idle do-nothing on same fresh env
    baseline_env = GhostexecEnvironment(SCENARIO)
    baseline_env.reset()
    idle = baseline_env.step(GhostexecAction(action_type="do_nothing"))

    moved_reward = float(moved.reward or 0)
    idle_reward = float(idle.reward or 0)
    assert moved_reward > idle_reward
146
+
147
+
148
+ # --- 10 cases: cancel meeting --------------------------------------------------
149
+
150
+
151
@pytest.mark.parametrize("meeting_id", MEETING_IDS_CANCEL)
def test_dead_cancel_meeting_step_ok(meeting_id: str) -> None:
    """Cancelling each id in MEETING_IDS_CANCEL is accepted and produces a reward value."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    cancel = GhostexecAction(
        action_type="cancel_meeting",
        meeting_id=meeting_id,
        reason="dead test cancel",
    )
    result = env.step(cancel)
    assert result.metadata.get("step_ok") is True
    assert result.reward is not None
160
+
161
+
162
+ # --- 6 cases: send_message -----------------------------------------------------
163
+
164
+
165
@pytest.mark.parametrize("contact_name", KNOWN_CONTACTS)
def test_dead_send_message_known_contact(contact_name: str) -> None:
    """Messaging each name in KNOWN_CONTACTS succeeds and credits the relationship channel."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    message = GhostexecAction(
        action_type="send_message",
        contact_name=contact_name,
        message_body="Quick sync on priorities.",
    )
    result = env.step(message)
    assert result.metadata.get("step_ok") is True

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    relationship = float(breakdown.get("relationship") or 0)
    # Allow 0.01 of slack below the micro bonus.
    assert relationship >= reward_mod._SEND_MESSAGE_VALID_MICRO_BONUS - 0.01
179
+
180
+
181
+ # --- 5 cases: delegate_task ---------------------------------------------------
182
+
183
+
184
@pytest.mark.parametrize(
    ("task_id", "contact"),
    [
        ("t08", "Jordan Lee"),
        ("t07", "Jamie Liu"),
        ("t01", "Marcus Webb"),
        ("t02", "Sarah Chen"),
        ("t11", "Casey Nguyen"),
    ],
)
def test_dead_delegate_task(task_id: str, contact: str) -> None:
    """Delegating each listed task to its contact succeeds and credits the task channel."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    delegate = GhostexecAction(
        action_type="delegate_task",
        task_id=task_id,
        contact_name=contact,
    )
    result = env.step(delegate)
    assert result.metadata.get("step_ok") is True

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    task_channel = float(breakdown.get("task") or 0)
    # Allow 0.01 of slack below the micro bonus.
    assert task_channel >= reward_mod._DELEGATE_TASK_VALID_MICRO_BONUS - 0.01
203
+
204
+
205
+ # --- 10 cases: invalid actions ------------------------------------------------
206
+
207
+
208
# Actions the environment is expected to reject (unknown ids, empty ids, unknown contacts, ...).
_INVALID_ACTIONS = [
    GhostexecAction(action_type="reply_email", email_id="nope", message_body="x"),
    GhostexecAction(action_type="complete_task", task_id="t09"),
    GhostexecAction(action_type="archive_email", email_id="nope"),
    GhostexecAction(action_type="reschedule_meeting", meeting_id="m99", new_time="2026-04-21T18:00:00"),
    GhostexecAction(action_type="cancel_meeting", meeting_id="m99", reason="x"),
    GhostexecAction(action_type="delegate_task", task_id="t01", contact_name="Nobody"),
    GhostexecAction(action_type="send_message", contact_name="Nobody", message_body="hi"),
    GhostexecAction(action_type="reply_email", email_id="", message_body="hi"),
    GhostexecAction(action_type="complete_task", task_id=""),
    GhostexecAction(action_type="archive_email", email_id=""),
]


@pytest.mark.parametrize(
    "action,expect_ok",
    [(candidate, False) for candidate in _INVALID_ACTIONS],
)
def test_dead_invalid_action_step_ok_false(action: GhostexecAction, expect_ok: bool) -> None:
    """Each rejected action reports step_ok False and an invalid_step_adjustment of -0.25."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    result = env.step(action)
    assert result.metadata.get("step_ok") is expect_ok

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    adjustment = float(breakdown.get("invalid_step_adjustment") or 0)
    assert adjustment == pytest.approx(-0.25)
230
+
231
+
232
+ # --- Ordering: VIP critical reply >> do_nothing --------------------------------
233
+
234
+
235
def test_dead_priority_ordering_vip_critical_over_normal_over_idle() -> None:
    """Over five fresh runs: e01 reply and e14 reply both beat idling, and e01 beats e14 on average."""

    def first_step_reward(**fields) -> float:
        # Fresh environment per sample so the three actions never share state.
        env = GhostexecEnvironment(SCENARIO)
        env.reset()
        return float(env.step(GhostexecAction(**fields)).reward or 0)

    vip_rewards: list[float] = []
    normal_rewards: list[float] = []
    idle_rewards: list[float] = []
    for _ in range(5):
        vip_rewards.append(
            first_step_reward(action_type="reply_email", email_id="e01", message_body=_BODY)
        )
        normal_rewards.append(
            first_step_reward(action_type="reply_email", email_id="e14", message_body=_BODY)
        )
        idle_rewards.append(first_step_reward(action_type="do_nothing"))

    worst_idle_margin = max(idle_rewards)
    assert min(vip_rewards) > worst_idle_margin
    assert min(normal_rewards) > worst_idle_margin
    assert sum(vip_rewards) / len(vip_rewards) > sum(normal_rewards) / len(normal_rewards)
252
+
253
+
254
+ # --- Tone penalty: casual to angry board contact ------------------------------
255
+
256
+
257
def test_dead_tone_penalty_casual_to_angry_board() -> None:
    """A formal reply to e01 must earn more than a casual reply to the same email."""
    # Marcus Webb is board; ensure angry mood in scenario or pick contact - phase2 has Marcus ANGRY in briefing
    casual_env = GhostexecEnvironment(SCENARIO)
    casual_env.reset()
    casual = casual_env.step(
        GhostexecAction(
            action_type="reply_email",
            email_id="e01",
            message_body="hey lol no worries",
        )
    )
    assert casual.metadata.get("step_ok") is True

    formal_env = GhostexecEnvironment(SCENARIO)
    formal_env.reset()
    formal = formal_env.step(
        GhostexecAction(
            action_type="reply_email",
            email_id="e01",
            message_body="Dear Marcus, sincerely addressing the board request now.",
        )
    )

    formal_reward = float(formal.reward or 0)
    casual_reward = float(casual.reward or 0)
    assert formal_reward > casual_reward
279
+
280
+
281
+ # --- Reschedule adds conflict channel micro even if overlap unchanged ---------
282
+
283
+
284
def test_dead_reschedule_micro_in_breakdown() -> None:
    """Rescheduling m07 is accepted and conflict_raw reaches the reschedule micro bonus (0.01 slack)."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    move = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m07",
        new_time="2026-04-21T20:30:00",
    )
    result = env.step(move)
    assert result.metadata.get("step_ok") is True

    breakdown = (result.metadata or {}).get("reward_breakdown") or {}
    conflict_raw = float(breakdown.get("conflict_raw") or 0)
    assert conflict_raw >= reward_mod._RESCHEDULE_VALID_MICRO_BONUS - 0.01
293
+
294
+
295
+ # --- Unit: compute_step_reward invalid vs noop delta matches contract ---------
296
+
297
+
298
def test_dead_compute_reward_invalid_vs_noop_delta() -> None:
    """With identical before/after worlds, an invalid step scores 0.25 - 0.15 below a valid no-op."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    noop = GhostexecAction(action_type="do_nothing")
    bad = GhostexecAction(action_type="reply_email", email_id="missing", message_body="x")

    valid_noop = reward_mod.compute_step_reward(
        world, world, noop, action_ok=True, episode_done=False
    )
    invalid_step = reward_mod.compute_step_reward(
        world, world, bad, action_ok=False, episode_done=False
    )

    # NOTE(review): 0.25 matches the invalid_step_adjustment magnitude asserted elsewhere in
    # this module; 0.15 is presumably the do-nothing penalty magnitude - confirm against reward_mod.
    expected_gap = 0.25 - 0.15
    assert invalid_step.final == pytest.approx(valid_noop.final - expected_gap)
305
+
306
+
307
def test_dead_vip_critical_reply_outscores_professional_critical() -> None:
    """VIP x2 micro on critical senders should dominate professional critical."""

    def reply_reward(email_id: str) -> float:
        # Each reply is scored on its own freshly reset environment.
        env = GhostexecEnvironment(SCENARIO)
        env.reset()
        result = env.step(
            GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
        )
        return float(result.reward or 0)

    r_vip = reply_reward("e01")
    r_pro = reply_reward("e21")
    assert r_vip > r_pro
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
validate-submission.sh ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
#
# validate-submission.sh — Ghostexec OpenEnv Submission Validator
#
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
#
# Usage: validate-submission.sh <ping_url> [repo_dir]
#
# Environment:
#   DOCKER_BUILD_TIMEOUT  Seconds allowed for `docker build` (default: 600).

# Note: -e is not enabled; the steps below inspect exit codes themselves.
set -uo pipefail

# The build timeout may be overridden from the environment. Anything that is
# not a plain non-negative integer falls back to the default.
DOCKER_BUILD_TIMEOUT="${DOCKER_BUILD_TIMEOUT:-600}"
case "$DOCKER_BUILD_TIMEOUT" in
  '' | *[!0-9]*) DOCKER_BUILD_TIMEOUT=600 ;;
esac

# Colour escapes are only defined when stdout is a terminal; otherwise they
# are empty so redirected output carries no escape sequences.
if [ -t 1 ]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[1;33m'
  BOLD='\033[1m'
  NC='\033[0m'
else
  RED='' GREEN='' YELLOW='' BOLD='' NC=''
fi
19
+
20
#######################################
# Run a command, terminating it if it exceeds a time limit.
# Uses timeout(1) or gtimeout(1) when available; otherwise runs the command in
# the background next to a watcher that signals it once the limit has elapsed.
# Arguments: $1 - limit in seconds; remaining arguments - the command to run
# Returns:   the command's exit status (non-zero when it was terminated)
#######################################
run_with_timeout() {
  local secs="$1"; shift
  if command -v timeout &>/dev/null; then
    timeout "$secs" "$@"
  elif command -v gtimeout &>/dev/null; then
    gtimeout "$secs" "$@"
  else
    "$@" &
    local pid=$!
    # The watcher is detached from our stdout/stderr: callers capture this
    # function with $(...), and a sleep still holding the pipe open would keep
    # that substitution blocked until the whole limit had elapsed. The TERM
    # trap also stops the sleep itself so it does not linger afterwards.
    (
      sleeper=""
      trap 'kill "$sleeper" 2>/dev/null; exit 0' TERM
      sleep "$secs" &
      sleeper=$!
      wait "$sleeper" && kill "$pid" 2>/dev/null
    ) >/dev/null 2>&1 &
    local watcher=$!
    wait "$pid" 2>/dev/null
    local rc=$?
    kill "$watcher" 2>/dev/null
    wait "$watcher" 2>/dev/null
    return $rc
  fi
}
38
+
39
#######################################
# Create a temporary file and print its path.
# Tries "<TMPDIR or /tmp>/<prefix>-XXXXXX" first and falls back to a bare
# mktemp call if that invocation fails.
# Arguments: $1 - filename prefix (default: validate)
# Outputs:   path of the created file on stdout
#######################################
portable_mktemp() {
  local stem="${1:-validate}"
  local base="${TMPDIR:-/tmp}"
  if ! mktemp "${base}/${stem}-XXXXXX" 2>/dev/null; then
    mktemp
  fi
}
43
+
44
# Paths appended to CLEANUP_FILES are deleted when the script exits.
CLEANUP_FILES=()

# Remove every registered temp file; nothing to do while the list is empty.
cleanup() {
  if [ "${#CLEANUP_FILES[@]}" -gt 0 ]; then
    rm -f "${CLEANUP_FILES[@]}"
  fi
}

trap cleanup EXIT
47
+
48
# Positional arguments: the Space URL to ping and, optionally, the repo path.
PING_URL="${1:-}"
REPO_DIR="${2:-.}"

# Without a URL there is nothing to validate: show usage and stop.
if [ -z "$PING_URL" ]; then
  printf '%s\n' \
    "Usage: $0 <ping_url> [repo_dir]" \
    "" \
    " ping_url Your HuggingFace Space URL (e.g. https://modelbuilderhq-ghostexec.hf.space)" \
    " repo_dir Path to your repo (default: current directory)"
  exit 1
fi

# Resolve the repo path to an absolute directory; a failed cd means it does
# not exist (or cannot be entered).
REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)" || {
  printf "Error: directory '%s' not found\n" "${2:-.}"
  exit 1
}

# Drop one trailing slash so "$PING_URL/reset" has a single separator.
PING_URL="${PING_URL%/}"
# Number of checks passed so far.
PASS=0
65
+
66
# Output helpers. Colour variables hold backslash escapes (or are empty), so
# they are always passed as %b arguments rather than spliced into a printf
# format string.

# Print a UTC-timestamped message; backslash escapes in the message are expanded.
log() { printf '[%s] %b\n' "$(date -u +%H:%M:%S)" "$*"; }

# Report a passed check and count it in PASS.
pass() {
  log "${GREEN}PASSED${NC} -- $1"
  PASS=$((PASS + 1))
}

# Report a failed check.
fail() { log "${RED}FAILED${NC} -- $1"; }

# Print a remediation hint under a failure.
hint() { printf ' %bHint:%b %b\n' "$YELLOW" "$NC" "$1"; }

# Announce which step stopped the run, then exit with status 1.
stop_at() {
  printf '\n'
  printf '%b%bValidation stopped at %s.%b Fix the above before continuing.\n' \
    "$RED" "$BOLD" "$1" "$NC"
  exit 1
}
75
+
76
# Opening banner followed by the resolved inputs.
banner_rule='========================================'
printf '\n'
printf '%b%s%b\n' "$BOLD" "$banner_rule" "$NC"
printf '%b Ghostexec OpenEnv Validator%b\n' "$BOLD" "$NC"
printf '%b%s%b\n' "$BOLD" "$banner_rule" "$NC"
log "Repo: $REPO_DIR"
log "Ping URL: $PING_URL"
printf '\n'
83
+
84
# Step 1: POST an empty JSON body to <ping_url>/reset and require HTTP 200.
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

# The response body and curl's diagnostics go to separate temp files so that
# neither redirection truncates the other.
CURL_OUTPUT=$(portable_mktemp "validate-curl")
CURL_ERROR=$(portable_mktemp "validate-curl-err")
CLEANUP_FILES+=("$CURL_OUTPUT" "$CURL_ERROR")

# When the transfer fails curl already reports the status as 000 through -w,
# so nothing is appended on error (appending would yield "000000" and skip
# the "not reachable" branch). Anything that is not a three-digit code is
# normalised to 000.
HTTP_CODE=$(curl -sS -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  --max-time 30 "$PING_URL/reset" 2>"$CURL_ERROR") || true
case "$HTTP_CODE" in
  [0-9][0-9][0-9]) ;;
  *) HTTP_CODE="000" ;;
esac

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  # Surface curl's own explanation when it produced one.
  CURL_REASON=$(head -n 1 "$CURL_ERROR" 2>/dev/null) || CURL_REASON=""
  if [ -n "$CURL_REASON" ]; then
    printf ' %s\n' "$CURL_REASON"
  fi
  hint "Check your network connection and that the Space is running."
  # hint prints its argument with %b, which leaves '%' untouched, so the
  # write-out format is spelled with a single '%'.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
105
+
106
# Step 2: build the Docker image from the repo root or from server/.
log "${BOLD}Step 2/3: Running docker build${NC} ..."

if ! command -v docker &>/dev/null; then
  fail "docker command not found"
  hint "Install Docker: https://docs.docker.com/get-docker/"
  stop_at "Step 2"
fi

# The first directory that contains a Dockerfile becomes the build context.
DOCKER_CONTEXT=""
for candidate in "$REPO_DIR" "$REPO_DIR/server"; do
  if [ -f "$candidate/Dockerfile" ]; then
    DOCKER_CONTEXT="$candidate"
    break
  fi
done

if [ -z "$DOCKER_CONTEXT" ]; then
  fail "No Dockerfile found in repo root or server/ directory"
  stop_at "Step 2"
fi

log " Found Dockerfile in $DOCKER_CONTEXT"

# Capture the combined build output so only its tail is shown on failure.
if BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1); then
  BUILD_OK=true
else
  BUILD_OK=false
fi

case "$BUILD_OK" in
  true)
    pass "Docker build succeeded"
    ;;
  *)
    fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
    printf "%s\n" "$BUILD_OUTPUT" | tail -n 20
    stop_at "Step 2"
    ;;
esac
135
+
136
# Step 3: run `openenv validate` from inside the repo directory.
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."

if ! command -v openenv &>/dev/null; then
  fail "openenv command not found"
  hint "Install it with your project env, e.g.: uv run pip install openenv-core"
  stop_at "Step 3"
fi

# The cd happens inside the command substitution, so this shell's working
# directory is left unchanged.
if VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1); then
  VALIDATE_OK=true
else
  VALIDATE_OK=false
fi

case "$VALIDATE_OK" in
  true)
    pass "openenv validate passed"
    if [ -n "$VALIDATE_OUTPUT" ]; then
      log " $VALIDATE_OUTPUT"
    fi
    ;;
  *)
    fail "openenv validate failed"
    printf "%s\n" "$VALIDATE_OUTPUT"
    stop_at "Step 3"
    ;;
esac
155
+
156
# Closing banner. This point is only reached when no step called stop_at.
printf '\n'
printf '%b========================================%b\n' "$BOLD" "$NC"
printf '%b%b All 3/3 checks passed!%b\n' "$GREEN" "$BOLD" "$NC"
printf '%b%b Ghostexec is ready for submission.%b\n' "$GREEN" "$BOLD" "$NC"
printf '%b========================================%b\n' "$BOLD" "$NC"
printf '\n'

exit 0