Spaces:

modelbuilderhq
/

ghostexec

Running

App Files Files Community

modelbuilderhq committed 5 days ago

Commit

ff293b1

verified ·

1 Parent(s): fef31e6

Upload folder using huggingface_hub

Browse files

Files changed (50) hide show

Dockerfile +83 -0
README.md +303 -7
__init__.py +20 -0
client.py +64 -0
conftest.py +12 -0
graders.py +121 -0
inference.py +265 -0
models.py +204 -0
openenv.yaml +32 -0
openenv_ghostexec.egg-info/PKG-INFO +15 -0
openenv_ghostexec.egg-info/SOURCES.txt +45 -0
openenv_ghostexec.egg-info/dependency_links.txt +1 -0
openenv_ghostexec.egg-info/entry_points.txt +2 -0
openenv_ghostexec.egg-info/requires.txt +13 -0
openenv_ghostexec.egg-info/top_level.txt +1 -0
outputs/logs/api_dead_live_500.jsonl +500 -0
outputs/logs/episode_rewards.jsonl +0 -0
outputs/training/_integration_ckpt/run_summary.json +28 -0
outputs/training/checkpoints/run_summary.json +28 -0
outputs/training/episode_returns.jsonl +10 -0
outputs/training/smoke/checkpoints/run_summary.json +28 -0
outputs/training/smoke/reinforce_returns.jsonl +48 -0
outputs/training/test_returns.jsonl +25 -0
pyproject.toml +61 -0
scenarios/dinner_disaster.json +107 -0
scenarios/monday_morning.json +257 -0
scenarios/phase2_core.json +83 -0
scenarios/schema_drift_test.json +27 -0
scenarios/vip_meltdown.json +63 -0
scenarios/vip_meltdown_drift.json +25 -0
scripts/__init__.py +1 -0
scripts/http_endpoint_smoke.py +184 -0
scripts/run_live_api_dead_500.py +196 -0
server/__init__.py +11 -0
server/app.py +169 -0
server/ghostexec_environment.py +706 -0
server/requirements.txt +6 -0
server/reward.py +350 -0
tests/test_api_reward_dead_500.py +150 -0
tests/test_complete_integration.py +235 -0
tests/test_docker_build.py +60 -0
tests/test_env.py +48 -0
tests/test_live_server_exhaustive.py +287 -0
tests/test_phase1.py +42 -0
tests/test_phase2.py +77 -0
tests/test_phase3.py +153 -0
tests/test_phase4.py +206 -0
tests/test_reward_dead_suite.py +319 -0
uv.lock +0 -0
validate-submission.sh +163 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,83 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Multi-stage build using openenv-base
+# This Dockerfile is flexible and works for both:
+# - In-repo environments (with local OpenEnv sources)
+# - Standalone environments (with openenv from PyPI/Git)
+# The build script (openenv build) handles context detection and sets appropriate build args.
+# Declared before the first FROM so that the FROM line of every stage can reference it.
+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE} AS builder
+WORKDIR /app
+# Ensure git is available (required for installing dependencies from VCS)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
+    rm -rf /var/lib/apt/lists/*
+# Build argument to control whether we're building standalone or in-repo
+# NOTE(review): BUILD_MODE and ENV_NAME are not referenced by any later instruction
+# in this Dockerfile; presumably they are only set by `openenv build` — confirm.
+ARG BUILD_MODE=in-repo
+ARG ENV_NAME=ghostexec
+# Copy environment code (always at root of build context)
+COPY . /app/env
+# For in-repo builds, openenv is already vendored in the build context
+# For standalone builds, openenv will be installed via pyproject.toml
+WORKDIR /app/env
+# Ensure uv is available (for local builds where base image lacks it)
+# NOTE(review): this fallback assumes curl exists in the base image; only git is
+# installed above — confirm.
+RUN if ! command -v uv >/dev/null 2>&1; then \
+        curl -LsSf https://astral.sh/uv/install.sh | sh && \
+        mv /root/.local/bin/uv /usr/local/bin/uv && \
+        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+    fi
+# Install dependencies using uv sync
+# If uv.lock exists, use it; otherwise resolve on the fly
+# Pass 1: third-party dependencies only (--no-install-project skips the project itself).
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-install-project --no-editable; \
+    else \
+        uv sync --no-install-project --no-editable; \
+    fi
+# Pass 2: same sync without --no-install-project, so the project itself is installed too.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-editable; \
+    else \
+        uv sync --no-editable; \
+    fi
+# Final runtime stage
+FROM ${BASE_IMAGE}
+WORKDIR /app
+# Copy the virtual environment from builder
+COPY --from=builder /app/env/.venv /app/.venv
+# Copy the environment code
+COPY --from=builder /app/env /app/env
+# Set PATH to use the virtual environment
+ENV PATH="/app/.venv/bin:$PATH"
+# Set PYTHONPATH so imports work correctly
+ENV PYTHONPATH="/app/env:$PYTHONPATH"
+# Mount Gradio OpenEnv UI at /web (matches HF Space README expectations)
+ENV ENABLE_WEB_INTERFACE=true
+# Health check
+# NOTE(review): relies on curl being present in the runtime image — confirm.
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD sh -c 'curl -f "http://localhost:${PORT:-7860}/health" || exit 1'
+# Same entrypoint as local `uv run server` (console script from the project venv)
+WORKDIR /app/env
+# The shell is only needed to expand ${PORT:-7860}; `exec` then replaces it with the
+# server process so the server is the container's main process and receives the
+# stop signal directly instead of it being delivered to the wrapper shell.
+CMD ["/bin/sh", "-lc", "exec /app/.venv/bin/server --port ${PORT:-7860}"]

README.md CHANGED Viewed

@@ -1,12 +1,308 @@
 ---
-title: Ghostexec
-emoji: 🌖
-colorFrom: blue
-colorTo: blue
 sdk: docker
 pinned: false
-license: apache-2.0
-short_description: OpenEnv RL environment for executive chief-of-staff decision
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Ghostexec Environment Server
+emoji: 📢
+colorFrom: pink
+colorTo: yellow
 sdk: docker
 pinned: false
+app_port: 7860
+base_path: /web
+tags:
+  - openenv
 ---
+# Ghostexec
+**Ghostexec** is an [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible environment that simulates a busy executive’s world: inbox, calendar, contacts, tasks, and stakeholder moods. The agent chooses **structured actions** (reply, reschedule, delegate, …); the server returns a **plain-text briefing** as the main observation and a **scalar reward** shaped around conflict, relationships, and task progress. Scenario data lives in `scenarios/*.json` — nothing is hardcoded in Python for world content.
+**Manifest:** `openenv.yaml` (name **`ghostexec`**, HF Space identifier).
+**Package:** `openenv-ghostexec` in `pyproject.toml` (import as `ghostexec`).
+---
+## Deliverables
+| Deliverable | URL |
+|-------------|-----|
+| Public HF Space (required) | `TODO: https://huggingface.co/spaces/<org>/ghostexec` |
+| Write-up / blog (HF post preferred) | `TODO: https://huggingface.co/blog/...` |
+| Short demo video (&lt;2 min) | `TODO: https://youtube.com/...` |
+Fill these URLs before submission freeze so reviewers can verify everything from one place.
+---
+## OpenEnv Hackathon alignment (themes + submission checklist)
+**Theme fit (examples, not exhaustive):** Ghostexec targets **Theme 3.2 — Personalized tasks** (executive-style inbox, calendar, conflicts, delegation via structured actions). **Theme 4** is partially supported via curriculum + perturb (`GHOSTEXEC_CURRICULUM`, `GHOSTEXEC_PERTURB`) and diverse scenarios under `scenarios/`.
+**Minimum submission checklist (fill before freeze):**
+| Item | Status |
+|------|--------|
+| OpenEnv-based env + `openenv.yaml` | Done in-repo (`openenv-core[core]>=0.2.3` in `pyproject.toml`; aligns with current PyPI release line). |
+| Short write-up or &lt;2 min video | **You:** publish and paste links in [Deliverables](#deliverables). |
+| Public HF Space URL | **You:** `openenv push` and paste the URL in [Deliverables](#deliverables). |
+---
+## Design narrative
+Ghostexec is intentionally built as an **AI Chief of Staff** environment, not a grid-world clone: the model must triage inbox, calendar, stakeholder mood, and task deadlines under conflict pressure while taking only legal structured actions.
+- **Environment Innovation (40%)** — scenario-driven executive operations with competing priorities, conflict queues, and relationship-sensitive outcomes in `scenarios/*.json` + `server/ghostexec_environment.py`.
+- **Storytelling & Presentation (30%)** — each scenario encodes a narrative arc (VIP escalations, family/professional collisions, deadline cascades) so policy behavior reads like realistic assistant decisions rather than abstract moves.
+- **Showing Improvement in Rewards (20%)** — environment reward remains deterministic, inspectable, and traceable through metadata + episode logs under `outputs/logs/`.
+- **Reward Quality (10%)** — fixed weighted core signal (0.35 conflict / 0.35 relationship / 0.30 task), bounded shaping terms, explicit invalid-action handling, and do_nothing penalties.
+This framing gives judges a clear throughline: **realistic executive chaos -> constrained legal actions -> measurable policy improvement on held-out scenarios**.
+---
+## Features
+- **Legal action set** — `reply_email`, `archive_email`, `reschedule_meeting`, `cancel_meeting`, `complete_task`, `delegate_task`, `send_message`, `do_nothing` (see `models.py`).
+- **Human-readable observations** — `GhostexecObservation.echoed_message` is the full briefing text for the model (not raw JSON).
+- **Invalid actions** — Handled in-process: structured metadata (e.g. `step_ok`), no server crash.
+- **Reward** — Weighted blend of conflict, relationship, and task signals (see [Reward](#reward)); per-step logging under `outputs/logs/` (gitignored).
+- **HTTP + WebSocket** — FastAPI app in `server/app.py`; `GhostexecEnv` uses WebSockets for persistent episodes.
+---
+## Quick start (Python client)
+From the repo root (`ghostexec/` — where `pyproject.toml` lives):
+```bash
+uv sync
+uv run server --port 8000
+```
+In another terminal or notebook:
+```python
+from ghostexec import GhostexecAction, GhostexecEnv
+with GhostexecEnv(base_url="http://127.0.0.1:8000") as env:
+    out = env.reset()
+    print(out.observation.echoed_message[:500], "…")  # plain-text briefing
+    step = env.step(
+        GhostexecAction(
+            action_type="reply_email",
+            email_id="e01",
+            message_body=(
+                "Marcus — acknowledged. Revised figures and short rationale "
+                "before noon. — Exec"
+            ),
+        )
+    )
+    print("reward:", step.reward)
+    print("metadata keys:", sorted((step.observation.metadata or {}).keys()))
+```
+**Docker image** (optional): if your OpenEnv client supports it, you can point `GhostexecEnv` at a container built from the root `Dockerfile`. Build from repo root:
+```bash
+docker build -t ghostexec-env:latest .
+```
+---
+## Actions and fields
+`GhostexecAction` (`models.py`) includes:
+| `action_type`          | Typical fields used |
+|------------------------|----------------------|
+| `reply_email`          | `email_id`, `message_body` |
+| `archive_email`      | `email_id` |
+| `reschedule_meeting` | `meeting_id`, `new_time`, `reason` |
+| `cancel_meeting`     | `meeting_id`, `reason` |
+| `complete_task`      | `task_id` |
+| `delegate_task`      | `task_id`, `contact_name` |
+| `send_message`       | `contact_name`, `message` (channel text) |
+| `do_nothing`         | — (intentionally weak / penalised path) |
+Unknown or malformed HTTP payloads deserialize safely to `do_nothing`-style defaults where applicable so older clients do not crash.
+---
+## Observation
+`GhostexecObservation`:
+- **`echoed_message`** — Full briefing (emails, conflicts, contacts, tasks, stress, steps remaining).
+- **`message_length`** — Length of `echoed_message` for quick checks.
+- **`reward`**, **`done`**, **`metadata`** — Step outcome; metadata carries flags such as `step_ok`, reward breakdown fields, and ids for debugging.
+---
+## Reward
+Phase-4 scoring (`server/reward.py`) combines three channels with **fixed weights**:
+\[
+\text{weighted base} = 0.35 \cdot \text{conflict} + 0.35 \cdot \text{relationship} + 0.30 \cdot \text{task}
+\]
+Then applies output scaling, invalid-step adjustments, bonuses/penalties, and a floor for `do_nothing`. Full component values are available on `RewardBreakdown` and are mirrored into observation metadata where configured. **Episode reward traces** append to `outputs/logs/episode_rewards.jsonl` (directory gitignored).
+**Reward-engineering provenance.** The design follows the reward-shaping playbook surveyed in *Comprehensive Overview of Reward Engineering and Shaping in Advancing Reinforcement Learning Applications* ([arXiv:2408.10215](https://arxiv.org/abs/2408.10215)): dense per-step shaping around proxy signals (conflict / relationship / task) instead of a single sparse end-of-episode reward, fixed weights to keep channel trade-offs inspectable, and bounded per-step magnitudes to resist hacking.
+---
+## HTTP vs WebSocket (episode state)
+- **HTTP** `POST /reset` and `POST /step` often bind to **short-lived** environment instances depending on deployment; consecutive HTTP calls may not share one in-memory episode.
+- **Ghostexec** still applies your action against a scenario-primed instance so a lone `POST /step` can return a meaningful reward and metadata.
+- **WebSocket `/ws`** — Use this (or `GhostexecEnv(base_url=...)`, which speaks WebSocket) for **multi-step episodes** on the same session.
+Endpoints (typical OpenEnv layout): **`/web`**, **`/docs`**, **`/health`**, **`/ws`**.
+---
+## Running and testing locally
+```bash
+# Dev server (package layout)
+uv run uvicorn ghostexec.server.app:app --reload --host 0.0.0.0 --port 8000
+# Or console entrypoint (matches Dockerfile)
+uv run server --port 8000
+```
+**Smoke script** (HTTP):
+```bash
+uv run python scripts/http_endpoint_smoke.py --local
+uv run python scripts/http_endpoint_smoke.py --url http://127.0.0.1:8000
+uv run python scripts/http_endpoint_smoke.py --print-curl
+```
+**Tests:**
+```bash
+uv run pytest tests/ -q
+```
+Opt-in Docker build smoke (Phase 1 gate):
+```bash
+GHOSTEXEC_RUN_DOCKER_BUILD=1 uv run pytest tests/test_docker_build.py -q
+```
+With the server already on port 8000:
+```bash
+uv run pytest tests/test_live_server_exhaustive.py -v --tb=short
+```
+Override live URL (Windows PowerShell example):
+```powershell
+$env:GHOSTEXEC_LIVE_BASE_URL = "http://127.0.0.1:9000"
+uv run pytest tests/test_live_server_exhaustive.py -q
+```
+Optional real WebSocket client check:
+```bash
+# Terminal 1
+uv run server --port 8000
+# Terminal 2
+export GHOSTEXEC_WS_BASE_URL=http://127.0.0.1:8000
+uv run pytest tests/test_complete_integration.py::test_ghostexec_env_client_against_live_url_if_set -q
+```
+---
+## Hugging Face Spaces
+Full OpenEnv CLI flow from this directory (matches steps 5–8 of the [Packaging & Deploying guide](https://meta-pytorch.org/OpenEnv/auto_getting_started/environment-builder.html)):
+```bash
+openenv serve                       # local dev server on :8000
+openenv build                       # build the Docker image
+openenv validate --verbose          # structure + Dockerfile + entrypoint checks
+openenv push                        # deploy to HF Spaces
+# openenv push --repo-id your-username/ghostexec
+```
+Use a **public** Space for the default hackathon flow unless you intentionally need a private Space. Authenticate with Hugging Face first (`huggingface-cli login` or equivalent).
+---
+## Scenarios
+| File | Role |
+|------|------|
+| `scenarios/phase2_core.json` | Default dense inbox/calendar/tasks fixture |
+| `scenarios/monday_morning.json`, `dinner_disaster.json`, `vip_meltdown.json` | Narrative demos |
+| `scenarios/vip_meltdown_drift.json` | Mood / escalation drift |
+| `scenarios/schema_drift_test.json` | Drift-event harness |
+---
+## Concurrent WebSocket sessions
+`server/app.py` passes **`GhostexecEnvironment`** (the class) into `create_app` with `max_concurrent_envs=1` by default. Increase `max_concurrent_envs` if you need multiple simultaneous WebSocket clients.
+---
+## Project layout
+```
+ghostexec/
+├── openenv.yaml           # OpenEnv name, version, description
+├── pyproject.toml         # Package metadata + optional extras
+├── uv.lock
+├── models.py              # World + GhostexecAction / GhostexecObservation
+├── client.py              # GhostexecEnv (WebSocket client)
+├── scenarios/             # World JSON (source of truth for episodes)
+├── scripts/               # http_endpoint_smoke.py
+├── tests/
+├── Dockerfile             # Image build (run `docker build` from the repo root)
+└── server/
+    ├── app.py             # FastAPI + create_app
+    ├── ghostexec_environment.py
+    └── reward.py
+```
+---
+## Resources & references
+Ghostexec is built against the official Meta PyTorch OpenEnv stack. Every design choice below is traceable to one of these sources.
+**OpenEnv core.** The Gymnasium-style `reset()` / `step()` / `state` interface in `server/ghostexec_environment.py`, the `EnvClient` subclass in `client.py`, and the `create_app(...)` wiring in `server/app.py` follow the [Packaging & Deploying guide](https://meta-pytorch.org/OpenEnv/auto_getting_started/environment-builder.html) exactly.
+- Core repo: [meta-pytorch/OpenEnv](https://github.com/meta-pytorch/OpenEnv)
+- Docs: [meta-pytorch.org/OpenEnv](https://meta-pytorch.org/OpenEnv/)
+**OpenEnv Hub (Hugging Face).** Target deployment for `openenv push`. The Space metadata at the top of this README + `openenv.yaml` are the knobs HF Spaces reads.
+- Environments: [huggingface.co/openenv](https://huggingface.co/openenv)
+- Spaces: [huggingface.co/openenv/spaces](https://huggingface.co/openenv/spaces)
+**Tutorials.** General OpenEnv environment patterns are documented in the official tutorial pages and examples.
+- All tutorials: [OpenEnv/tutorial](https://github.com/meta-pytorch/OpenEnv/tree/main/tutorial)
+- Environment examples: [OpenEnv/envs](https://github.com/meta-pytorch/OpenEnv/tree/main/envs)
+**YouTube — Building RL environments.** Talks from Meta / OpenEnv contributors that informed the scenario-driven reset, WebSocket session model, and reward breakdown used here:
+- [Building RL Environments with OpenEnv](https://www.youtube.com/watch?v=0airz7BhBiA)
+- [OpenEnv Deep Dive](https://www.youtube.com/watch?v=ap4q4sAK4OY)
+- [Agentic RL Environments](https://www.youtube.com/watch?v=Jew4lhAiqnw)
+- [OpenEnv Livestream (4-hour walkthrough)](https://www.youtube.com/live/kkCNMz0Ptd8)
+**Reward-engineering papers.** See [Reward](#reward) for how each paper maps to specific components of `server/reward.py`.
+- Jnadi, A. (2024). *Comprehensive Overview of Reward Engineering and Shaping in Advancing Reinforcement Learning Applications*. [arXiv:2408.10215](https://arxiv.org/abs/2408.10215). Informs the dense per-step conflict / relationship / task shaping and the bounded-magnitude design.
+---
+## License
+BSD-style — see the license notice at the top of each source file (Meta / OpenEnv lineage).

__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Ghostexec Environment."""
+from .models import GhostexecAction, GhostexecObservation
+# Importing ghostexec.models in notebooks should not require websocket client deps.
+# Keep client import optional so package imports survive OpenEnv layout differences.
+try:
+    from .client import GhostexecEnv
+except Exception:  # pragma: no cover - import-compat shim
+    GhostexecEnv = None  # type: ignore[assignment]
+__all__ = ["GhostexecAction", "GhostexecObservation"]
+if GhostexecEnv is not None:
+    __all__.append("GhostexecEnv")

client.py ADDED Viewed

	@@ -0,0 +1,64 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Ghostexec Environment Client."""
+from typing import Any, Dict
+try:
+    # OpenEnv newer layout.
+    from openenv.client import EnvClient
+except ImportError:
+    try:
+        # Some builds expose the class one level deeper.
+        from openenv.client.client import EnvClient
+    except ImportError:
+        # Backward compatibility with older OpenEnv versions.
+        from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from openenv.core.env_server.types import State
+from .models import GhostexecAction, GhostexecObservation
+class GhostexecEnv(
+    EnvClient[GhostexecAction, GhostexecObservation, State]
+):
+    """
+    Client for the Ghostexec Environment.
+    This client maintains a persistent WebSocket connection to the environment server,
+    enabling efficient multi-step interactions with lower latency.
+    Each client instance has its own dedicated environment session on the server.
+    """
+    def _step_payload(self, action: GhostexecAction) -> Dict[str, Any]:
+        payload = action.model_dump(mode="json")
+        if not payload.get("metadata"):
+            payload.pop("metadata", None)
+        return payload
+    def _parse_result(self, payload: Dict) -> StepResult[GhostexecObservation]:
+        obs_data = payload.get("observation", {})
+        observation = GhostexecObservation(
+            echoed_message=obs_data.get("echoed_message", ""),
+            message_length=obs_data.get("message_length", 0),
+            done=payload.get("done", False),
+            reward=payload.get("reward"),
+            metadata=obs_data.get("metadata", {}),
+        )
+        return StepResult(
+            observation=observation,
+            reward=payload.get("reward"),
+            done=payload.get("done", False),
+        )
+    def _parse_state(self, payload: Dict) -> State:
+        return State(
+            episode_id=payload.get("episode_id"),
+            step_count=payload.get("step_count", 0),
+        )

conftest.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Put repo root on sys.path before test collection (supports `uv run pytest` without editable install).
+from __future__ import annotations
+import sys
+from pathlib import Path
+_ROOT = Path(__file__).resolve().parent
+if str(_ROOT) not in sys.path:
+    sys.path.insert(0, str(_ROOT))

graders.py ADDED Viewed

	@@ -0,0 +1,121 @@

+"""
+Public trajectory graders for OpenEnv Phase 2 / HF deep validation.
+These are **episode-level** scores (strictly inside (0, 1)), separate from per-step
+rewards in `server/reward.py`. The hackathon validator reads `openenv.yaml`
+`tasks[].grader` and calls these functions with trajectory dicts.
+"""
+from __future__ import annotations
+from typing import Iterable, List
+STRICT_MIN = 0.01
+STRICT_MAX = 0.99
+def _bounded(value: float) -> float:
+    return min(max(round(float(value), 4), STRICT_MIN), STRICT_MAX)
+def _as_reward_list(trajectory: dict | None) -> List[float]:
+    payload = trajectory or {}
+    rewards = payload.get("rewards")
+    if isinstance(rewards, list) and rewards:
+        return [float(r) for r in rewards]
+    if "score" in payload:
+        return [float(payload["score"])]
+    reward = payload.get("reward")
+    if isinstance(reward, dict) and "total" in reward:
+        return [float(reward["total"])]
+    if reward is not None:
+        return [float(reward)]
+    return []
+def _profile(reward: float) -> str:
+    if reward <= 0.05:
+        return "unsafe_miss"
+    if reward <= 0.20:
+        return "bad_call"
+    if reward < 0.50:
+        return "weak"
+    if reward < 0.80:
+        return "workable"
+    if reward < 0.95:
+        return "strong"
+    return "expert"
+def _score_episode(
+    rewards: List[float],
+    *,
+    miss_cost: float,
+    overcall_cost: float,
+    stability_gain: float,
+    expertise_gain: float,
+) -> float:
+    if not rewards:
+        return _bounded(0.5)
+    labels = [_profile(r) for r in rewards]
+    mean_r = sum(rewards) / len(rewards)
+    n = len(rewards)
+    miss = labels.count("unsafe_miss")
+    bad = labels.count("bad_call")
+    weak = labels.count("weak")
+    strong = labels.count("strong") + labels.count("expert")
+    expert = labels.count("expert")
+    downward = (
+        min(miss * miss_cost, 0.35)
+        + min(bad * overcall_cost, 0.15)
+        + min(weak * 0.015, 0.06)
+    )
+    upward = 0.0
+    if strong / n >= 0.80:
+        upward += stability_gain
+    if expert / n >= 0.60:
+        upward += expertise_gain
+    return _bounded(mean_r - downward + upward)
+def phase2_core_grader(trajectory: dict | None = None) -> float:
+    """Easy tier — dense default inbox (scenarios/phase2_core.json)."""
+    return _score_episode(
+        _as_reward_list(trajectory),
+        miss_cost=0.12,
+        overcall_cost=0.03,
+        stability_gain=0.05,
+        expertise_gain=0.01,
+    )
+def monday_morning_grader(trajectory: dict | None = None) -> float:
+    """Medium tier — stacked Monday conflicts (scenarios/monday_morning.json)."""
+    return _score_episode(
+        _as_reward_list(trajectory),
+        miss_cost=0.09,
+        overcall_cost=0.04,
+        stability_gain=0.03,
+        expertise_gain=0.02,
+    )
+def dinner_disaster_grader(trajectory: dict | None = None) -> float:
+    """Hard tier — personal/professional collision (scenarios/dinner_disaster.json)."""
+    return _score_episode(
+        _as_reward_list(trajectory),
+        miss_cost=0.07,
+        overcall_cost=0.03,
+        stability_gain=0.02,
+        expertise_gain=0.04,
+    )
+# Public API: the three per-scenario graders plus the score bounds they clamp to.
+__all__ = [
+    "phase2_core_grader",
+    "monday_morning_grader",
+    "dinner_disaster_grader",
+    "STRICT_MIN",
+    "STRICT_MAX",
+]

inference.py ADDED Viewed

	@@ -0,0 +1,265 @@

+"""
+Baseline runner for the Ghostexec submission.
+This script queries a chat model through the OpenAI client, sends its decision
+to the environment server, and prints machine-readable lines expected by simple
+evaluators/log parsers.
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+from typing import Any, Iterable
+import requests
+from pydantic import ValidationError
+try:
+    from .graders import dinner_disaster_grader, monday_morning_grader, phase2_core_grader
+    from .models import GhostexecAction
+except ImportError:
+    from graders import dinner_disaster_grader, monday_morning_grader, phase2_core_grader
+    from models import GhostexecAction
+# Chat-completions endpoint and model name; both can be overridden via environment variables.
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+# API credential: HF_TOKEN is used when set and non-empty, otherwise API_KEY.
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
+# Base URL of the environment server; trailing slashes are stripped before paths are appended.
+ENV_URL = os.getenv("ENV_URL", "http://localhost:7860").rstrip("/")
+# Optional single-task override read from TASK_NAME (surrounding whitespace removed).
+TASK_OVERRIDE = os.getenv("TASK_NAME", "").strip()
+# Value printed as `env=` in the [START] line.
+BENCHMARK = "ghostexec"
+# --difficulty choice -> task names run for that choice.
+TASK_SETS: dict[str, tuple[str, ...]] = {
+    "easy": ("phase2_core",),
+    "medium": ("monday_morning",),
+    "hard": ("dinner_disaster",),
+    "all": ("phase2_core", "monday_morning", "dinner_disaster"),
+}
+# Task name -> episode-level grader function imported from graders.py.
+TASK_TO_GRADER = {
+    "phase2_core": phase2_core_grader,
+    "monday_morning": monday_morning_grader,
+    "dinner_disaster": dinner_disaster_grader,
+}
+# System prompt sent with every chat request; leading/trailing whitespace is stripped.
+SYSTEM_MESSAGE = """
+You are acting as an AI Chief-of-Staff assistant in Ghostexec.
+You must output exactly one JSON object that matches GhostexecAction.
+Allowed action_type values:
+- reply_email
+- archive_email
+- reschedule_meeting
+- cancel_meeting
+- complete_task
+- delegate_task
+- send_message
+- do_nothing
+Allowed keys:
+- action_type
+- email_id
+- message_body
+- meeting_id
+- new_time
+- reason
+- task_id
+- contact_name
+- message
+Rules:
+- Output valid JSON only (no markdown, no prose).
+- Prefer high-impact conflict-reducing actions over do_nothing.
+- Only reference ids/entities that appear in the briefing.
+- If unsure, output {"action_type":"do_nothing"}.
+""".strip()
+def emit_start(task_name: str) -> None:
+    print(f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}", flush=True)
+def emit_step(step_no: int, action_text: str, reward: float, done: bool, error: str | None) -> None:
+    """Print one ``[STEP]`` line and flush stdout.
+    ``reward`` is shown with two decimals, ``done`` in lowercase, and a missing
+    or empty ``error`` as the literal ``null``.
+    """
+    fields = (
+        f"step={step_no}",
+        f"action={action_text}",
+        f"reward={reward:.2f}",
+        f"done={str(done).lower()}",
+        f"error={error or 'null'}",
+    )
+    print("[STEP] " + " ".join(fields), flush=True)
+def emit_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
+    reward_text = ",".join(f"{reward:.2f}" for reward in rewards)
+    print(
+        f"[END] success={str(success).lower()} steps={steps} "
+        f"score={score:.6f} rewards={reward_text}",
+        flush=True,
+    )
+def choose_tasks(selection: str) -> Iterable[str]:
+    if TASK_OVERRIDE:
+        return (TASK_OVERRIDE,)
+    return TASK_SETS[selection]
+def client() -> Any:
+    if not HF_TOKEN:
+        raise EnvironmentError("HF_TOKEN or API_KEY must be set before running inference.py")
+    from openai import OpenAI
+    return OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
+def fetch_reset(task_name: str) -> dict[str, Any]:
+    response = requests.post(
+        f"{ENV_URL}/reset",
+        json={"task_id": task_name},
+        timeout=30,
+    )
+    response.raise_for_status()
+    return response.json()
+def submit_action(action: GhostexecAction) -> dict[str, Any]:
+    response = requests.post(
+        f"{ENV_URL}/step",
+        json={"action": action.model_dump()},
+        timeout=30,
+    )
+    response.raise_for_status()
+    return response.json()
+def _extract_json_object(text: str) -> str:
+    s = text.strip()
+    if s.startswith("```"):
+        # tolerate fenced output from weak model instruction following
+        s = s.strip("`")
+        if "\n" in s:
+            s = s.split("\n", 1)[1]
+    start = s.find("{")
+    end = s.rfind("}")
+    if start == -1 or end == -1 or end <= start:
+        raise json.JSONDecodeError("No JSON object found", s, 0)
+    return s[start : end + 1]
+def prompt_for_case(observation: dict[str, Any]) -> str:
+    return (
+        "Take one best next action for the Ghostexec environment.\n\n"
+        "Return one final structured GhostexecAction JSON object.\n\n"
+        f"{json.dumps(observation, ensure_ascii=True, indent=2)}\n\n"
+        "Choose the action that most reduces conflicts, protects relationships, "
+        "and advances urgent tasks."
+    )
+def ask_model(llm: Any, observation: dict[str, Any]) -> GhostexecAction:
+    completion = llm.chat.completions.create(
+        model=MODEL_NAME,
+        messages=[
+            {"role": "system", "content": SYSTEM_MESSAGE},
+            {"role": "user", "content": prompt_for_case(observation)},
+        ],
+        temperature=0.0,
+        max_tokens=260,
+        stream=False,
+    )
+    text = (completion.choices[0].message.content or "").strip()
+    payload = json.loads(_extract_json_object(text))
+    return GhostexecAction(**payload)
+def compact_action(action: GhostexecAction) -> str:
+    label = action.action_type
+    for candidate in (action.email_id, action.meeting_id, action.task_id, action.contact_name):
+        if candidate:
+            return f"{label}/{candidate}"
+    return label
+def _extract_reward(payload: dict[str, Any]) -> float:
+    reward_payload = payload.get("reward")
+    if isinstance(reward_payload, dict):
+        return float(reward_payload.get("total", 0.0))
+    if reward_payload is not None:
+        return float(reward_payload)
+    obs = payload.get("observation")
+    if isinstance(obs, dict) and obs.get("reward") is not None:
+        return float(obs["reward"])
+    return 0.0
+def final_score(task_name: str, rewards: list[float]) -> float:
+    grader = TASK_TO_GRADER.get(task_name)
+    if grader is None:
+        score = sum(rewards) / len(rewards) if rewards else 0.0
+        return min(max(round(score, 4), 0.01), 0.99)
+    return float(grader({"rewards": rewards}))
+def run_one_task(llm: Any, task_name: str) -> None:
+    rewards: list[float] = []
+    steps_taken = 0
+    score = 0.0
+    success = False
+    emit_start(task_name)
+    try:
+        result = fetch_reset(task_name)
+        done = bool(result.get("done", False))
+        while not done:
+            observation = result.get("observation", result)
+            action = ask_model(llm, observation if isinstance(observation, dict) else result)
+            action_text = compact_action(action)
+            result = submit_action(action)
+            reward = _extract_reward(result)
+            done = bool(result.get("done", False))
+            rewards.append(reward)
+            steps_taken += 1
+            emit_step(steps_taken, action_text, reward, done, None)
+        score = final_score(task_name, rewards)
+        success = score >= 0.60
+    except json.JSONDecodeError:
+        rewards = [0.0]
+        steps_taken = 1
+        emit_step(1, "parse_error", 0.0, True, "parse_error")
+    except ValidationError:
+        rewards = [0.0]
+        steps_taken = 1
+        emit_step(1, "schema_error", 0.0, True, "schema_error")
+    except Exception as exc:
+        rewards = [0.0]
+        steps_taken = 1
+        emit_step(1, "error", 0.0, True, str(exc))
+    finally:
+        emit_end(success, steps_taken, score, rewards or [0.0])
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Run the Ghostexec baseline agent")
+    parser.add_argument(
+        "--difficulty",
+        choices=["easy", "medium", "hard", "all"],
+        default="all",
+        help="Which task subset to run",
+    )
+    args = parser.parse_args()
+    llm = client()
+    for task_name in choose_tasks(args.difficulty):
+        run_one_task(llm, task_name)
+if __name__ == "__main__":
+    main()

models.py ADDED Viewed

	@@ -0,0 +1,204 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Data models for GhostExec — all world and API types live here."""
+from __future__ import annotations
+from typing import Any, Literal
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+try:
+    from openenv.core.env_server.types import Action as _OpenEnvAction
+    from openenv.core.env_server.types import Observation as _OpenEnvObservation
+except Exception:
+    _OpenEnvAction = BaseModel  # type: ignore[assignment]
+    _OpenEnvObservation = BaseModel  # type: ignore[assignment]
+def _is_pydantic_model_class(cls: object) -> bool:
+    try:
+        return isinstance(cls, type) and issubclass(cls, BaseModel)
+    except TypeError:
+        return False
+# Some OpenEnv builds expose dataclass-style Action/Observation that do not accept
+# additional keyword fields, which breaks GhostexecAction/GhostexecObservation
+# construction in Colab. Fall back to BaseModel in that case.
+# When the openenv import above failed, both names are already bound to
+# BaseModel, so the check passes and BaseModel is used here as well.
+ActionBase = _OpenEnvAction if _is_pydantic_model_class(_OpenEnvAction) else BaseModel
+ObservationBase = (
+    _OpenEnvObservation if _is_pydantic_model_class(_OpenEnvObservation) else BaseModel
+)
+# --- Aliases for scenario / world strings ---
+# Each alias is a closed set of string values; the models below use them as
+# field types, so any other string fails validation.
+# Email.priority
+EmailPriority = Literal["critical", "high", "normal", "low"]
+# Email.sender_relationship
+SenderRelationship = Literal["VIP", "personal", "professional", "unknown"]
+# Contact.relationship_type
+ContactRelationship = Literal[
+    "board_member",
+    "spouse",
+    "investor",
+    "direct_report",
+    "client",
+    "friend",
+    "team_member",
+]
+# Contact.communication_preference
+CommPreference = Literal["email", "text", "call"]
+# Contact.mood
+Mood = Literal["happy", "neutral", "annoyed", "angry", "furious"]
+# Task.status
+TaskStatus = Literal["pending", "in-progress", "done", "overdue"]
+# Task.effort
+Effort = Literal["low", "medium", "high"]
+# Meeting.priority (same value set as EmailPriority, declared separately)
+MeetingPriority = Literal["critical", "high", "normal", "low"]
+# GhostexecAction.action_type: the full set of actions an agent may submit.
+GhostexecActionType = Literal[
+    "reply_email",
+    "archive_email",
+    "reschedule_meeting",
+    "cancel_meeting",
+    "complete_task",
+    "delegate_task",
+    "send_message",
+    "do_nothing",
+]
+class Email(BaseModel):
+    """Single inbox message."""
+    # extra="forbid": keys not declared on this model are rejected at validation.
+    model_config = ConfigDict(extra="forbid")
+    # Required string fields.
+    id: str
+    sender: str
+    subject: str
+    body: str
+    # Both flags default to False.
+    read: bool = False
+    replied: bool = False
+    # Required; limited to the EmailPriority / SenderRelationship literal values.
+    priority: EmailPriority
+    sender_relationship: SenderRelationship
+class Meeting(BaseModel):
+    """Calendar block."""
+    # extra="forbid": keys not declared on this model are rejected at validation.
+    model_config = ConfigDict(extra="forbid")
+    id: str
+    title: str
+    # Kept as a plain string; this model does not itself parse or check the
+    # ISO 8601 format.
+    start: str = Field(..., description="ISO 8601 start datetime")
+    # Required, and must be at least 1.
+    duration_minutes: int = Field(..., ge=1)
+    # default_factory gives every instance its own empty list.
+    attendees: list[str] = Field(default_factory=list)
+    location: str = ""
+    priority: MeetingPriority = "normal"
+    cancelled: bool = False
+class Contact(BaseModel):
+    """Stakeholder in the exec's network."""
+    # extra="forbid": keys not declared on this model are rejected at validation.
+    model_config = ConfigDict(extra="forbid")
+    name: str
+    relationship_type: ContactRelationship
+    communication_preference: CommPreference
+    # Required integer in the inclusive range 1..5.
+    importance: int = Field(..., ge=1, le=5)
+    mood: Mood = "neutral"
+class Task(BaseModel):
+    """To-do item."""
+    # extra="forbid": keys not declared on this model are rejected at validation.
+    model_config = ConfigDict(extra="forbid")
+    id: str
+    description: str
+    # Kept as a plain string; this model does not itself parse or check the
+    # ISO 8601 format.
+    deadline: str = Field(..., description="ISO 8601 deadline")
+    owner: str
+    status: TaskStatus = "pending"
+    effort: Effort = "medium"
+    # None until set; presumably a contact name once delegated — confirm against
+    # the environment code that writes it.
+    delegated_to: str | None = None
+class WorldState(BaseModel):
+    """Full simulated world — JSON-serialisable."""
+    # extra="forbid": keys not declared on this model are rejected at validation.
+    model_config = ConfigDict(extra="forbid")
+    # The only required field; stored as a string.
+    simulation_time: str = Field(..., description="Current simulated instant, ISO 8601")
+    # Integer in the inclusive range 0..100, starting at 0.
+    stress: int = Field(default=0, ge=0, le=100)
+    active_conflicts: list[str] = Field(default_factory=list)
+    action_log: list[str] = Field(default_factory=list)
+    episode_active: bool = True
+    episode_end_reason: str | None = None
+    # Defaults to 48; allowed range is 1..10_000 inclusive.
+    max_episode_steps: int = Field(default=48, ge=1, le=10_000)
+    # World entities; each list defaults to a fresh empty list per instance.
+    emails: list[Email] = Field(default_factory=list)
+    meetings: list[Meeting] = Field(default_factory=list)
+    contacts: list[Contact] = Field(default_factory=list)
+    tasks: list[Task] = Field(default_factory=list)
+class GhostexecAction(ActionBase):
+    """
+    Legal agent actions (Phase 3). Unknown HTTP payloads default to do_nothing
+    so older clients do not crash deserialization.
+    """
+    action_type: GhostexecActionType = Field(
+        default="do_nothing",
+        description="Which legal action to execute this step",
+    )
+    # Every argument field below is optional and defaults to the empty string.
+    # Which ones a given action_type actually reads is decided by the
+    # environment, not by this model.
+    email_id: str = ""
+    message_body: str = ""
+    meeting_id: str = ""
+    new_time: str = ""
+    reason: str = ""
+    task_id: str = ""
+    contact_name: str = ""
+    message: str = Field(default="", description="Optional note for action_log (legacy / debug)")
+    # Runs on the raw input before field validation: a dict payload without an
+    # "action_type" key gets "do_nothing" added. A new dict is built, so the
+    # caller's dict is not mutated; non-dict input is passed through unchanged.
+    @model_validator(mode="before")
+    @classmethod
+    def _default_action_type(cls, data: Any) -> Any:
+        if isinstance(data, dict) and "action_type" not in data:
+            data = {**data, "action_type": "do_nothing"}
+        return data
+class GhostexecObservation(ObservationBase):
+    """
+    Primary LLM-facing field is `echoed_message`: full plain-text briefing (Phase 3).
+    """
+    # Keep these fields explicit for compatibility with OpenEnv builds where
+    # Observation is not a pydantic base carrying done/reward/metadata.
+    done: bool = False
+    # None when no reward is attached to this observation.
+    reward: float | None = None
+    metadata: dict[str, Any] = Field(default_factory=dict)
+    echoed_message: str = Field(
+        default="",
+        description="Human-readable briefing text for the LLM (not JSON)",
+    )
+    # Not computed by this model; whoever builds the observation must set it.
+    message_length: int = Field(default=0, description="Byte length of echoed_message for quick checks")
+class RewardBreakdown(BaseModel):
+    """Phase 4 reward components (logged and exposed in observation metadata)."""
+    # extra="forbid": keys not declared on this model are rejected at validation.
+    model_config = ConfigDict(extra="forbid")
+    # Every component is a float defaulting to 0.0, except output_scale (1.0).
+    # NOTE(review): how these combine into `final` is not defined here —
+    # presumably in server/reward.py; confirm there.
+    conflict_raw: float = 0.0
+    critical_queue_bonus: float = 0.0
+    conflict: float = 0.0
+    relationship: float = 0.0
+    task: float = 0.0
+    weighted_base: float = 0.0
+    output_scale: float = 1.0
+    invalid_step_adjustment: float = 0.0
+    episode_completion_bonus: float = 0.0
+    catastrophic_penalty: float = 0.0
+    do_nothing_floor: float = 0.0
+    final: float = 0.0

openenv.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+spec_version: 1
+name: ghostexec
+version: "0.1.0"
+description: "GhostExec — RL training environment for personal and executive task conflict resolution (The AI Chief of Staff)."
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000
+difficulties: [easy, medium, hard]
+max_steps: 20
+tasks:
+  - id: phase2_core
+    difficulty: easy
+    description: >
+      Default dense inbox/calendar fixture (scenarios/phase2_core.json).
+      Stress-test triage, VIP queues, and calendar relief.
+    grader: graders.phase2_core_grader
+  - id: monday_morning
+    difficulty: medium
+    description: >
+      Monday morning rush with stacked conflicts (scenarios/monday_morning.json).
+    grader: graders.monday_morning_grader
+  - id: dinner_disaster
+    difficulty: hard
+    description: >
+      Personal/professional collision with escalation risk
+      (scenarios/dinner_disaster.json).
+    grader: graders.dinner_disaster_grader

openenv_ghostexec.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,15 @@

+Metadata-Version: 2.4
+Name: openenv-ghostexec
+Version: 0.1.0
+Summary: Ghostexec environment for OpenEnv
+Requires-Python: >=3.10
+Requires-Dist: openenv-core[core]>=0.2.3
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: pyyaml>=6.0.0; extra == "dev"
+Requires-Dist: matplotlib>=3.8.0; extra == "dev"
+Provides-Extra: constrained
+Requires-Dist: lm-format-enforcer>=0.10; extra == "constrained"
+Provides-Extra: constrained-outlines
+Requires-Dist: outlines>=0.1; extra == "constrained-outlines"

openenv_ghostexec.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,45 @@

+README.md
+__init__.py
+client.py
+conftest.py
+graders.py
+models.py
+pyproject.toml
+./__init__.py
+./client.py
+./conftest.py
+./graders.py
+./models.py
+./scenarios/dinner_disaster.json
+./scenarios/monday_morning.json
+./scenarios/phase2_core.json
+./scenarios/schema_drift_test.json
+./scenarios/vip_meltdown.json
+./scenarios/vip_meltdown_drift.json
+openenv_ghostexec.egg-info/PKG-INFO
+openenv_ghostexec.egg-info/SOURCES.txt
+openenv_ghostexec.egg-info/dependency_links.txt
+openenv_ghostexec.egg-info/entry_points.txt
+openenv_ghostexec.egg-info/requires.txt
+openenv_ghostexec.egg-info/top_level.txt
+scenarios/dinner_disaster.json
+scenarios/monday_morning.json
+scenarios/phase2_core.json
+scenarios/schema_drift_test.json
+scenarios/vip_meltdown.json
+scenarios/vip_meltdown_drift.json
+server/__init__.py
+server/app.py
+server/ghostexec_environment.py
+server/reward.py
+tests/test_api_reward_dead_500.py
+tests/test_complete_integration.py
+tests/test_docker_build.py
+tests/test_env.py
+tests/test_live_server_exhaustive.py
+tests/test_phase1.py
+tests/test_phase2.py
+tests/test_phase3.py
+tests/test_phase4.py
+tests/test_reward_dead_suite.py
+tests/test_submission_plots_committed.py

openenv_ghostexec.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

openenv_ghostexec.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [console_scripts]
2	+ server = ghostexec.server.app:main

openenv_ghostexec.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+openenv-core[core]>=0.2.3
+[constrained]
+lm-format-enforcer>=0.10
+[constrained-outlines]
+outlines>=0.1
+[dev]
+pytest>=8.0.0
+pytest-cov>=4.0.0
+pyyaml>=6.0.0
+matplotlib>=3.8.0

openenv_ghostexec.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ghostexec

outputs/logs/api_dead_live_500.jsonl ADDED Viewed

	@@ -0,0 +1,500 @@

+{"idx": 0, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 1, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 2, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 3, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 4, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 5, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 6, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 7, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 8, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 9, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 10, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 11, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 12, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 13, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 14, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 15, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 16, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 17, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 18, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 19, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 20, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 21, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 22, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 23, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 24, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 25, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 26, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 27, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 28, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 29, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 30, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 31, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 32, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 33, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 34, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 35, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 36, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 37, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 38, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 39, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 40, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 41, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 42, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 43, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 44, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 45, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 46, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 47, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 48, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 49, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 50, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 51, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 52, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 53, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 54, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 55, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 56, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 57, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 58, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 59, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 60, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 61, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 62, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 63, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 64, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 65, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 66, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 67, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 68, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 69, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 70, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 71, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 72, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 73, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 74, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 75, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 76, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 77, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 78, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 79, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 80, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 81, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 82, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 83, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 84, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 85, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 86, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 87, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 88, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 89, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 90, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 91, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 92, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 93, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 94, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 95, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 96, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 97, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 98, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 99, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 100, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 101, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 102, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 103, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 104, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 105, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 106, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 107, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 108, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 109, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 110, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 111, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 112, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 113, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 114, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 115, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 116, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 117, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 118, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 119, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 120, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 121, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 122, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 123, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 124, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 125, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 126, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 127, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 128, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 129, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 130, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 131, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 132, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 133, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 134, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 135, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 136, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 137, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 138, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 139, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 140, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 141, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 142, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 143, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 144, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 145, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 146, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 147, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 148, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 149, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 150, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 151, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 152, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 153, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 154, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 155, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 156, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 157, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 158, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 159, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 160, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 161, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 162, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 163, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 164, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 165, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 166, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 167, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 168, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 169, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 170, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 171, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 172, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 173, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 174, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 175, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 176, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 177, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 178, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 179, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 180, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 181, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 182, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 183, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 184, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 185, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 186, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 187, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 188, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 189, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 190, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 191, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 192, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 193, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 194, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 195, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 196, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 197, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 198, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 199, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 200, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 201, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 202, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 203, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 204, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 205, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 206, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 207, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 208, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 209, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 210, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 211, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 212, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 213, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 214, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 215, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 216, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 217, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 218, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 219, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 220, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 221, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 222, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 223, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 224, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 225, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 226, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 227, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 228, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 229, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 230, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 231, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 232, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 233, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 234, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 235, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 236, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 237, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 238, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 239, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 240, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 241, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 242, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 243, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 244, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 245, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 246, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 247, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 248, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 249, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 250, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 251, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 252, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 253, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 254, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 255, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 256, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 257, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 258, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 259, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 260, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 261, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 262, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 263, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 264, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 265, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 266, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 267, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 268, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 269, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 270, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 271, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 272, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 273, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 274, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 275, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 276, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 277, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 278, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 279, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 280, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 281, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 282, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 283, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 284, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 285, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 286, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 287, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 288, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 289, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 290, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 291, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 292, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 293, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 294, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 295, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 296, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 297, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 298, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 299, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 300, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 301, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 302, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 303, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 304, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 305, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 306, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 307, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 308, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 309, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 310, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 311, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 312, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 313, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 314, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 315, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 316, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 317, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 318, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 319, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 320, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 321, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 322, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 323, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 324, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 325, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 326, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 327, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 328, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 329, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 330, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 331, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 332, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 333, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 334, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 335, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 336, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 337, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 338, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 339, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 340, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 341, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 342, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 343, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 344, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 345, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 346, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 347, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 348, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 349, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 350, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 351, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 352, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 353, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 354, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 355, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 356, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 357, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 358, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 359, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 360, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 361, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 362, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 363, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 364, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 365, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 366, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 367, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 368, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 369, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 370, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 371, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 372, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 373, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 374, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 375, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 376, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 377, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 378, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 379, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 380, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 381, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 382, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 383, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 384, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 385, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 386, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 387, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 388, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 389, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 390, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 391, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 392, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 393, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 394, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 395, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 396, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 397, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 398, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 399, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 400, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 401, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 402, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 403, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 404, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 405, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 406, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 407, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 408, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 409, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 410, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 411, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 412, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 413, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 414, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 415, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 416, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 417, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 418, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 419, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 420, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 421, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 422, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 423, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 424, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 425, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 426, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 427, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 428, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 429, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 430, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 431, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 432, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 433, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 434, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 435, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 436, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 437, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 438, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 439, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 440, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 441, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 442, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 443, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 444, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 445, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 446, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 447, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 448, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 449, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 450, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 451, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 452, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 453, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 454, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 455, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 456, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 457, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 458, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 459, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 460, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 461, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 462, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 463, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 464, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 465, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 466, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 467, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 468, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 469, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 470, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 471, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 472, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 473, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 474, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 475, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 476, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 477, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 478, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 479, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 480, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 481, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 482, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 483, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
+{"idx": 484, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
+{"idx": 485, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
+{"idx": 486, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
+{"idx": 487, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
+{"idx": 488, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
+{"idx": 489, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
+{"idx": 490, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
+{"idx": 491, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
+{"idx": 492, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
+{"idx": 493, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
+{"idx": 494, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
+{"idx": 495, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
+{"idx": 496, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
+{"idx": 497, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
+{"idx": 498, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
+{"idx": 499, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}

outputs/logs/episode_rewards.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

outputs/training/_integration_ckpt/run_summary.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "episodes": 5,
+  "log_path": "D:\\Scalar Final\\Final\\ghostexec\\outputs\\training\\_integration_train_smoke.jsonl",
+  "first_episode_first_action": {
+    "metadata": {},
+    "action_type": "reply_email",
+    "email_id": "e01",
+    "message_body": "On it \u2014 drafting a response and owners now.",
+    "meeting_id": "",
+    "new_time": "",
+    "reason": "",
+    "task_id": "",
+    "contact_name": "",
+    "message": ""
+  },
+  "last_episode_first_action": {
+    "metadata": {},
+    "action_type": "reply_email",
+    "email_id": "e01",
+    "message_body": "On it \u2014 drafting a response and owners now.",
+    "meeting_id": "",
+    "new_time": "",
+    "reason": "",
+    "task_id": "",
+    "contact_name": "",
+    "message": ""
+  }
+}

outputs/training/checkpoints/run_summary.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "episodes": 5,
+  "log_path": "D:\\Scalar Final\\Final\\ghostexec\\outputs\\training\\episode_returns.jsonl",
+  "first_episode_first_action": {
+    "metadata": {},
+    "action_type": "reply_email",
+    "email_id": "e01",
+    "message_body": "On it \u2014 drafting a response and owners now.",
+    "meeting_id": "",
+    "new_time": "",
+    "reason": "",
+    "task_id": "",
+    "contact_name": "",
+    "message": ""
+  },
+  "last_episode_first_action": {
+    "metadata": {},
+    "action_type": "reply_email",
+    "email_id": "e01",
+    "message_body": "On it \u2014 drafting a response and owners now.",
+    "meeting_id": "",
+    "new_time": "",
+    "reason": "",
+    "task_id": "",
+    "contact_name": "",
+    "message": ""
+  }
+}

outputs/training/episode_returns.jsonl ADDED Viewed

	@@ -0,0 +1,10 @@

+{"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
+{"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
+{"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
+{"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
+{"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
+{"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
+{"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
+{"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
+{"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
+{"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}

outputs/training/smoke/checkpoints/run_summary.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "episodes": 48,
+  "log_path": "D:\\Scalar Final\\Final\\ghostexec\\outputs\\training\\smoke\\reinforce_returns.jsonl",
+  "first_episode_first_action": {
+    "metadata": {},
+    "action_type": "reply_email",
+    "email_id": "e01",
+    "message_body": "Acknowledged \u2014 working the thread now.",
+    "meeting_id": "",
+    "new_time": "",
+    "reason": "",
+    "task_id": "",
+    "contact_name": "",
+    "message": ""
+  },
+  "last_episode_first_action": {
+    "metadata": {},
+    "action_type": "archive_email",
+    "email_id": "e06",
+    "message_body": "",
+    "meeting_id": "",
+    "new_time": "",
+    "reason": "",
+    "task_id": "",
+    "contact_name": "",
+    "message": ""
+  }
+}

outputs/training/smoke/reinforce_returns.jsonl ADDED Viewed

	@@ -0,0 +1,48 @@

+{"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.391999999999998, "length": 14, "mean_step_reward": -0.5279999999999998}
+{"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.297919999999997, "length": 14, "mean_step_reward": -0.5212799999999997}
+{"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -9.334079999999998, "length": 14, "mean_step_reward": -0.6667199999999999}
+{"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.641919999999997, "length": 14, "mean_step_reward": -0.6172799999999998}
+{"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.924959999999997, "length": 14, "mean_step_reward": -0.4946399999999998}
+{"episode": 5, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.465919999999997, "length": 14, "mean_step_reward": -0.5332799999999998}
+{"episode": 6, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
+{"episode": 7, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.633919999999996, "length": 14, "mean_step_reward": -0.5452799999999998}
+{"episode": 8, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.998879999999997, "length": 14, "mean_step_reward": -0.4999199999999998}
+{"episode": 9, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
+{"episode": 10, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.998879999999997, "length": 14, "mean_step_reward": -0.4999199999999998}
+{"episode": 11, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.428959999999997, "length": 14, "mean_step_reward": -0.5306399999999998}
+{"episode": 12, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
+{"episode": 13, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
+{"episode": 14, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.297919999999998, "length": 14, "mean_step_reward": -0.5212799999999999}
+{"episode": 15, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.756959999999998, "length": 14, "mean_step_reward": -0.48263999999999985}
+{"episode": 16, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.670879999999997, "length": 14, "mean_step_reward": -0.5479199999999997}
+{"episode": 17, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.174879999999996, "length": 14, "mean_step_reward": -0.5839199999999998}
+{"episode": 18, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
+{"episode": 19, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
+{"episode": 20, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8678399999999975, "length": 14, "mean_step_reward": -0.49055999999999983}
+{"episode": 21, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.551999999999998, "length": 14, "mean_step_reward": -0.46799999999999986}
+{"episode": 22, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
+{"episode": 23, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
+{"episode": 24, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.5889599999999975, "length": 14, "mean_step_reward": -0.47063999999999984}
+{"episode": 25, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
+{"episode": 26, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999997, "length": 14, "mean_step_reward": -0.4852799999999998}
+{"episode": 27, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.5889599999999975, "length": 14, "mean_step_reward": -0.47063999999999984}
+{"episode": 28, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
+{"episode": 29, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
+{"episode": 30, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.924959999999998, "length": 14, "mean_step_reward": -0.49463999999999986}
+{"episode": 31, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
+{"episode": 32, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999997, "length": 14, "mean_step_reward": -0.4852799999999998}
+{"episode": 33, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.830879999999997, "length": 14, "mean_step_reward": -0.4879199999999998}
+{"episode": 34, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
+{"episode": 35, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.559999999999997, "length": 14, "mean_step_reward": -0.5399999999999998}
+{"episode": 36, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.334879999999997, "length": 14, "mean_step_reward": -0.5239199999999998}
+{"episode": 37, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
+{"episode": 38, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.924959999999997, "length": 14, "mean_step_reward": -0.4946399999999998}
+{"episode": 39, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
+{"episode": 40, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
+{"episode": 41, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
+{"episode": 42, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
+{"episode": 43, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8678399999999975, "length": 14, "mean_step_reward": -0.49055999999999983}
+{"episode": 44, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
+{"episode": 45, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.830879999999997, "length": 14, "mean_step_reward": -0.4879199999999998}
+{"episode": 46, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.551999999999998, "length": 14, "mean_step_reward": -0.46799999999999986}
+{"episode": 47, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}

outputs/training/test_returns.jsonl ADDED Viewed

	@@ -0,0 +1,25 @@

+{"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
+{"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.8019199999999955, "length": 14, "mean_step_reward": -0.5572799999999997}
+{"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.641919999999999, "length": 14, "mean_step_reward": -0.6172799999999999}
+{"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.801919999999996, "length": 14, "mean_step_reward": -0.5572799999999998}
+{"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.735999999999997, "length": 14, "mean_step_reward": -0.6239999999999998}
+{"episode": 5, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
+{"episode": 6, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.137919999999998, "length": 14, "mean_step_reward": -0.5812799999999998}
+{"episode": 7, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.428959999999997, "length": 14, "mean_step_reward": -0.5306399999999998}
+{"episode": 8, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.830879999999997, "length": 14, "mean_step_reward": -0.4879199999999998}
+{"episode": 9, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.334879999999997, "length": 14, "mean_step_reward": -0.5239199999999998}
+{"episode": 10, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.588959999999998, "length": 14, "mean_step_reward": -0.4706399999999999}
+{"episode": 11, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
+{"episode": 12, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999997, "length": 14, "mean_step_reward": -0.4852799999999998}
+{"episode": 13, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
+{"episode": 14, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999998, "length": 14, "mean_step_reward": -0.4852799999999999}
+{"episode": 15, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
+{"episode": 16, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.187039999999998, "length": 14, "mean_step_reward": -0.5133599999999998}
+{"episode": 17, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.719999999999998, "length": 14, "mean_step_reward": -0.47999999999999987}
+{"episode": 18, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.719999999999998, "length": 14, "mean_step_reward": -0.47999999999999987}
+{"episode": 19, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8678399999999975, "length": 14, "mean_step_reward": -0.49055999999999983}
+{"episode": 20, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8510399999999985, "length": 14, "mean_step_reward": -0.4893599999999999}
+{"episode": 21, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.719999999999998, "length": 14, "mean_step_reward": -0.47999999999999987}
+{"episode": 22, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.092959999999998, "length": 14, "mean_step_reward": -0.5066399999999999}
+{"episode": 23, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.887999999999998, "length": 14, "mean_step_reward": -0.4919999999999999}
+{"episode": 24, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}

pyproject.toml ADDED Viewed

	@@ -0,0 +1,61 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-ghostexec"
+version = "0.1.0"
+description = "Ghostexec environment for OpenEnv"
+requires-python = ">=3.10"
+dependencies = [
+    # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
+    # Alternative: to install from GitHub, swap the versioned requirement below for this line:
+    # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
+    "openenv-core[core]>=0.2.3",
+    # Environment-specific dependencies
+    # Add all dependencies needed for your environment here
+    # Examples:
+    # "numpy>=1.19.0",
+    # "torch>=2.0.0",
+    # "gymnasium>=0.29.0",
+    # "openspiel>=1.0.0",
+    # "smolagents>=1.22.0,<2",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+    "pyyaml>=6.0.0",
+    "matplotlib>=3.8.0",
+]
+# Optional JSON-schema-constrained decoding backends (pick one).
+constrained = [
+    "lm-format-enforcer>=0.10",
+]
+constrained-outlines = [
+    "outlines>=0.1",
+]
+[project.scripts]
+# Server entry point - enables running via: uv run --project . server
+# or: python -m ghostexec.server.app
+server = "ghostexec.server.app:main"
+[tool.setuptools]
+include-package-data = true
+packages = ["ghostexec", "ghostexec.server"]
+package-dir = { "ghostexec" = ".", "ghostexec.server" = "server" }
+[tool.setuptools.package-data]
+ghostexec = ["scenarios/*.json"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["."]

scenarios/dinner_disaster.json ADDED Viewed

	@@ -0,0 +1,107 @@

+{
+  "simulation_time": "2026-04-21T18:45:00",
+  "stress": 61,
+  "active_conflicts": [],
+  "action_log": [
+    "Client call ran long; dinner reservation at 19:00."
+  ],
+  "episode_active": true,
+  "episode_end_reason": null,
+  "max_episode_steps": 40,
+  "emails": [
+    {
+      "id": "d1",
+      "sender": "Sarah Chen",
+      "subject": "Dinner \u2014 I am at the restaurant",
+      "body": "We have held the table until 7:15. Please leave the call.",
+      "read": false,
+      "replied": false,
+      "priority": "high",
+      "sender_relationship": "personal"
+    },
+    {
+      "id": "d2",
+      "sender": "Taylor Brooks",
+      "subject": "Need sign-off tonight",
+      "body": "Board-adjacent ask: one paragraph on risk posture before morning.",
+      "read": false,
+      "replied": false,
+      "priority": "critical",
+      "sender_relationship": "VIP"
+    }
+  ],
+  "meetings": [
+    {
+      "id": "dc1",
+      "title": "Client escalation call",
+      "start": "2026-04-21T17:30:00",
+      "duration_minutes": 90,
+      "attendees": [
+        "David Okonkwo"
+      ],
+      "location": "virtual",
+      "priority": "high",
+      "cancelled": false
+    },
+    {
+      "id": "dc2",
+      "title": "Dinner reservation window",
+      "start": "2026-04-21T19:00:00",
+      "duration_minutes": 120,
+      "attendees": [
+        "Sarah Chen"
+      ],
+      "location": "Osteria",
+      "priority": "normal",
+      "cancelled": false
+    }
+  ],
+  "contacts": [
+    {
+      "name": "Sarah Chen",
+      "relationship_type": "spouse",
+      "communication_preference": "text",
+      "importance": 5,
+      "mood": "annoyed"
+    },
+    {
+      "name": "Taylor Brooks",
+      "relationship_type": "investor",
+      "communication_preference": "call",
+      "importance": 4,
+      "mood": "neutral"
+    },
+    {
+      "name": "David Okonkwo",
+      "relationship_type": "client",
+      "communication_preference": "email",
+      "importance": 4,
+      "mood": "angry"
+    },
+    {
+      "name": "Jordan Lee",
+      "relationship_type": "direct_report",
+      "communication_preference": "call",
+      "importance": 3,
+      "mood": "neutral"
+    }
+  ],
+  "tasks": [
+    {
+      "id": "dt1",
+      "description": "Text Sarah ETA for dinner",
+      "deadline": "2026-04-21T18:50:00",
+      "owner": "Self",
+      "status": "pending",
+      "effort": "low"
+    },
+    {
+      "id": "dt2",
+      "description": "Send Taylor the risk paragraph",
+      "deadline": "2026-04-21T23:00:00",
+      "owner": "Self",
+      "status": "pending",
+      "effort": "medium"
+    }
+  ]
+}

scenarios/monday_morning.json ADDED Viewed

	@@ -0,0 +1,257 @@

+{
+  "simulation_time": "2026-04-22T07:00:00",
+  "stress": 84,
+  "active_conflicts": [],
+  "action_log": [
+    "Monday: board + investor travel + home commitments collide."
+  ],
+  "episode_active": true,
+  "episode_end_reason": null,
+  "max_episode_steps": 56,
+  "emails": [
+    {
+      "id": "mb1",
+      "sender": "Marcus Webb",
+      "subject": "Board deck \u2014 need numbers before market open",
+      "body": "We are missing revised Q3 figures. This is blocking the packet.",
+      "read": false,
+      "replied": false,
+      "priority": "critical",
+      "sender_relationship": "VIP"
+    },
+    {
+      "id": "mb2",
+      "sender": "Marcus Webb",
+      "subject": "Re: investor flight \u2014 your call",
+      "body": "Taylor is airborne in 4 hours. We need a decision on the alternate slot.",
+      "read": false,
+      "replied": false,
+      "priority": "critical",
+      "sender_relationship": "VIP"
+    },
+    {
+      "id": "mb3",
+      "sender": "Marcus Webb",
+      "subject": "Confidential \u2014 board sentiment",
+      "body": "Side channel: two directors are rattled. Your visible handling today matters.",
+      "read": false,
+      "replied": false,
+      "priority": "critical",
+      "sender_relationship": "VIP"
+    },
+    {
+      "id": "sp1",
+      "sender": "Sarah Chen",
+      "subject": "Dinner 7pm \u2014 please confirm",
+      "body": "Reservation is 7pm. If you are going to be late again, tell me now.",
+      "read": false,
+      "replied": false,
+      "priority": "high",
+      "sender_relationship": "personal"
+    },
+    {
+      "id": "fl1",
+      "sender": "Alex Rivera",
+      "subject": "Flight cancelled \u2014 investor dinner at risk",
+      "body": "Airline put me on a 6am tomorrow. Investor dinner prep is slipping.",
+      "read": false,
+      "replied": false,
+      "priority": "high",
+      "sender_relationship": "professional"
+    }
+  ],
+  "meetings": [
+    {
+      "id": "mx1",
+      "title": "Commitment block 1",
+      "start": "2026-04-22T07:00:00",
+      "duration_minutes": 60,
+      "attendees": [
+        "Marcus Webb"
+      ],
+      "location": "virtual",
+      "priority": "critical",
+      "cancelled": false
+    },
+    {
+      "id": "mx2",
+      "title": "Commitment block 2",
+      "start": "2026-04-22T07:00:00",
+      "duration_minutes": 60,
+      "attendees": [
+        "Taylor Brooks"
+      ],
+      "location": "virtual",
+      "priority": "critical",
+      "cancelled": false
+    },
+    {
+      "id": "mx3",
+      "title": "Commitment block 3",
+      "start": "2026-04-22T07:00:00",
+      "duration_minutes": 60,
+      "attendees": [
+        "Marcus Webb"
+      ],
+      "location": "virtual",
+      "priority": "high",
+      "cancelled": false
+    },
+    {
+      "id": "mx4",
+      "title": "Commitment block 4",
+      "start": "2026-04-22T07:00:00",
+      "duration_minutes": 60,
+      "attendees": [
+        "Taylor Brooks"
+      ],
+      "location": "virtual",
+      "priority": "high",
+      "cancelled": false
+    },
+    {
+      "id": "mx5",
+      "title": "Commitment block 5",
+      "start": "2026-04-22T07:00:00",
+      "duration_minutes": 60,
+      "attendees": [
+        "Marcus Webb"
+      ],
+      "location": "virtual",
+      "priority": "high",
+      "cancelled": false
+    },
+    {
+      "id": "mx6",
+      "title": "Commitment block 6",
+      "start": "2026-04-22T07:00:00",
+      "duration_minutes": 60,
+      "attendees": [
+        "Taylor Brooks"
+      ],
+      "location": "virtual",
+      "priority": "high",
+      "cancelled": false
+    },
+    {
+      "id": "m_pm",
+      "title": "Afternoon sync",
+      "start": "2026-04-22T15:00:00",
+      "duration_minutes": 30,
+      "attendees": [
+        "Jordan Lee"
+      ],
+      "location": "virtual",
+      "priority": "normal",
+      "cancelled": false
+    }
+  ],
+  "contacts": [
+    {
+      "name": "Marcus Webb",
+      "relationship_type": "board_member",
+      "communication_preference": "email",
+      "importance": 5,
+      "mood": "angry"
+    },
+    {
+      "name": "Sarah Chen",
+      "relationship_type": "spouse",
+      "communication_preference": "text",
+      "importance": 5,
+      "mood": "annoyed"
+    },
+    {
+      "name": "Taylor Brooks",
+      "relationship_type": "investor",
+      "communication_preference": "call",
+      "importance": 4,
+      "mood": "neutral"
+    },
+    {
+      "name": "Alex Rivera",
+      "relationship_type": "direct_report",
+      "communication_preference": "text",
+      "importance": 3,
+      "mood": "annoyed"
+    },
+    {
+      "name": "Jordan Lee",
+      "relationship_type": "direct_report",
+      "communication_preference": "call",
+      "importance": 3,
+      "mood": "happy"
+    },
+    {
+      "name": "Priya Sharma",
+      "relationship_type": "investor",
+      "communication_preference": "email",
+      "importance": 5,
+      "mood": "annoyed"
+    },
+    {
+      "name": "Elena Vogt",
+      "relationship_type": "team_member",
+      "communication_preference": "email",
+      "importance": 3,
+      "mood": "neutral"
+    },
+    {
+      "name": "David Okonkwo",
+      "relationship_type": "client",
+      "communication_preference": "email",
+      "importance": 4,
+      "mood": "neutral"
+    }
+  ],
+  "tasks": [
+    {
+      "id": "ov1",
+      "description": "Finalize board packet figures",
+      "deadline": "2026-04-22T06:00:00",
+      "owner": "Self",
+      "status": "pending",
+      "effort": "high"
+    },
+    {
+      "id": "ov2",
+      "description": "Callback legal on redlines",
+      "deadline": "2026-04-22T05:30:00",
+      "owner": "Self",
+      "status": "pending",
+      "effort": "medium"
+    },
+    {
+      "id": "ov3",
+      "description": "Approve investor comms draft",
+      "deadline": "2026-04-22T06:15:00",
+      "owner": "Self",
+      "status": "pending",
+      "effort": "high"
+    },
+    {
+      "id": "ov4",
+      "description": "Expense report sign-off",
+      "deadline": "2026-04-22T04:00:00",
+      "owner": "Self",
+      "status": "overdue",
+      "effort": "low"
+    },
+    {
+      "id": "ov5",
+      "description": "Brief EA on calendar triage",
+      "deadline": "2026-04-22T05:00:00",
+      "owner": "Self",
+      "status": "overdue",
+      "effort": "low"
+    },
+    {
+      "id": "fu1",
+      "description": "Team social RSVP",
+      "deadline": "2026-04-25T12:00:00",
+      "owner": "Self",
+      "status": "pending",
+      "effort": "low"
+    }
+  ]
+}

scenarios/phase2_core.json ADDED Viewed

	@@ -0,0 +1,83 @@

+{
+  "simulation_time": "2026-04-21T08:00:00",
+  "stress": 52,
+  "active_conflicts": [],
+  "action_log": [],
+  "episode_active": true,
+  "episode_end_reason": null,
+  "emails": [
+    {"id": "e01", "sender": "Marcus Webb", "subject": "RE: Q3 Numbers — we need to talk", "body": "Board expects revised figures before noon. This is urgent.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
+    {"id": "e02", "sender": "Sarah Chen", "subject": "Dinner tonight", "body": "Reservation at 7pm — please confirm you will make it.", "read": false, "replied": false, "priority": "high", "sender_relationship": "personal"},
+    {"id": "e03", "sender": "Priya Sharma", "subject": "Still waiting on the deck", "body": "Investor read-through is tomorrow. Where is the version you promised?", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
+    {"id": "e04", "sender": "Legal", "subject": "Contract redlines due", "body": "Please return comments on the MSA by EOD.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
+    {"id": "e05", "sender": "Jordan Lee", "subject": "Quick question on roadmap", "body": "Can we grab 10 minutes before standup?", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
+    {"id": "e06", "sender": "Alex Rivera", "subject": "Flight cancelled — options?", "body": "Airline moved me to a 6am tomorrow. Need guidance.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
+    {"id": "e07", "sender": "HR Benefits", "subject": "Open enrollment reminder", "body": "Friendly reminder window closes Friday.", "read": true, "replied": false, "priority": "low", "sender_relationship": "professional"},
+    {"id": "e08", "sender": "David Okonkwo", "subject": "Angry about last meeting", "body": "We were not heard. Expect a follow-up call.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
+    {"id": "e09", "sender": "Newsletter", "subject": "Your weekly digest", "body": "Top stories in tech leadership.", "read": false, "replied": false, "priority": "low", "sender_relationship": "unknown"},
+    {"id": "e10", "sender": "Elena Vogt", "subject": "Board prep materials", "body": "Slides uploaded to the secure folder.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
+    {"id": "e11", "sender": "Chris Park", "subject": "Lunch?", "body": "Want to grab something casual near the office?", "read": false, "replied": false, "priority": "low", "sender_relationship": "personal"},
+    {"id": "e12", "sender": "Morgan Blake", "subject": "RE: Budget variance", "body": "Finance needs sign-off today or we slip the quarter.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
+    {"id": "e13", "sender": "IT Security", "subject": "Password rotation", "body": "Your account expires in 48 hours.", "read": true, "replied": true, "priority": "normal", "sender_relationship": "professional"},
+    {"id": "e14", "sender": "Jamie Liu", "subject": "Sprint demo feedback", "body": "Mostly positive — a few UX nits to track.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
+    {"id": "e15", "sender": "Taylor Brooks", "subject": "Investor dinner follow-up", "body": "Thanks for last night — next steps attached.", "read": false, "replied": false, "priority": "high", "sender_relationship": "VIP"},
+    {"id": "e16", "sender": "Operations", "subject": "Incident report #4421", "body": "Minor outage resolved; postmortem scheduled.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
+    {"id": "e17", "sender": "Riley Santos", "subject": "Can you approve PTO?", "body": "Team coverage looks fine for next week.", "read": false, "replied": false, "priority": "low", "sender_relationship": "professional"},
+    {"id": "e18", "sender": "Noah Patel", "subject": "Vendor pricing", "body": "They moved numbers again — need a decision.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
+    {"id": "e19", "sender": "Calendar Bot", "subject": "You have 12 conflicts today", "body": "Automated summary of overlapping meetings.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "unknown"},
+    {"id": "e20", "sender": "Casey Nguyen", "subject": "Design review moved", "body": "We shifted to 3pm — hope that works.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
+    {"id": "e21", "sender": "Samira Haddad", "subject": "Formal complaint received", "body": "Please acknowledge receipt per policy.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "professional"},
+    {"id": "e22", "sender": "Spouse", "subject": "Kids pickup", "body": "I have a dentist appointment — can you cover 4pm?", "read": false, "replied": false, "priority": "high", "sender_relationship": "personal"},
+    {"id": "e23", "sender": "Marketing", "subject": "Launch blog draft", "body": "Casual tone OK? LMK by 2pm.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
+    {"id": "e24", "sender": "Vikram Singh", "subject": "Partnership term sheet", "body": "Legal asked for your eyes on section 4 only.", "read": false, "replied": false, "priority": "high", "sender_relationship": "VIP"},
+    {"id": "e25", "sender": "Facilities", "subject": "Office move checklist", "body": "Please label your boxes by Friday.", "read": true, "replied": false, "priority": "low", "sender_relationship": "professional"},
+    {"id": "e26", "sender": "Quinn Murphy", "subject": "Sorry for the tone earlier", "body": "Rough morning — can we reset?", "read": false, "replied": false, "priority": "normal", "sender_relationship": "personal"},
+    {"id": "e27", "sender": "Board Secretary", "subject": "Confidential — agenda", "body": "Materials under embargo until 5pm.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
+    {"id": "e28", "sender": "Recruiting", "subject": "VP Eng loop feedback", "body": "Candidate availability Thursday.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
+    {"id": "e29", "sender": "Avery Cole", "subject": "Weekend golf?", "body": "Totally casual — no pressure.", "read": false, "replied": false, "priority": "low", "sender_relationship": "personal"},
+    {"id": "e30", "sender": "Support", "subject": "Ticket #9982 closed", "body": "Your laptop swap is complete.", "read": true, "replied": false, "priority": "low", "sender_relationship": "unknown"}
+  ],
+  "meetings": [
+    {"id": "m01", "title": "Board Call", "start": "2026-04-21T09:00:00", "duration_minutes": 60, "attendees": ["Marcus Webb", "Elena Vogt"], "location": "virtual", "priority": "critical", "cancelled": false},
+    {"id": "m02", "title": "Client Demo", "start": "2026-04-21T09:00:00", "duration_minutes": 60, "attendees": ["David Okonkwo"], "location": "virtual", "priority": "high", "cancelled": false},
+    {"id": "m03", "title": "Coffee with Jordan", "start": "2026-04-21T10:30:00", "duration_minutes": 30, "attendees": ["Jordan Lee"], "location": "Cafe North", "priority": "low", "cancelled": false},
+    {"id": "m04", "title": "Team Standup", "start": "2026-04-21T11:00:00", "duration_minutes": 30, "attendees": ["Jordan Lee", "Jamie Liu"], "location": "virtual", "priority": "normal", "cancelled": false},
+    {"id": "m05", "title": "Lunch with Priya", "start": "2026-04-21T11:00:00", "duration_minutes": 90, "attendees": ["Priya Sharma"], "location": "Osteria", "priority": "high", "cancelled": false},
+    {"id": "m06", "title": "1:1 Avery", "start": "2026-04-21T13:00:00", "duration_minutes": 60, "attendees": ["Avery Cole"], "location": "Office 12B", "priority": "normal", "cancelled": false},
+    {"id": "m07", "title": "Investor Update", "start": "2026-04-21T14:00:00", "duration_minutes": 60, "attendees": ["Taylor Brooks"], "location": "virtual", "priority": "critical", "cancelled": false},
+    {"id": "m08", "title": "Legal Review", "start": "2026-04-21T14:00:00", "duration_minutes": 60, "attendees": ["Legal"], "location": "virtual", "priority": "high", "cancelled": false},
+    {"id": "m09", "title": "Sales QBR", "start": "2026-04-21T15:00:00", "duration_minutes": 60, "attendees": ["Noah Patel"], "location": "virtual", "priority": "normal", "cancelled": false},
+    {"id": "m10", "title": "Ops Incident Review", "start": "2026-04-21T15:30:00", "duration_minutes": 60, "attendees": ["Operations"], "location": "virtual", "priority": "high", "cancelled": false}
+  ],
+  "contacts": [
+    {"name": "Marcus Webb", "relationship_type": "board_member", "communication_preference": "email", "importance": 5, "mood": "angry"},
+    {"name": "Sarah Chen", "relationship_type": "spouse", "communication_preference": "text", "importance": 5, "mood": "neutral"},
+    {"name": "Priya Sharma", "relationship_type": "investor", "communication_preference": "email", "importance": 5, "mood": "annoyed"},
+    {"name": "Jordan Lee", "relationship_type": "direct_report", "communication_preference": "call", "importance": 3, "mood": "happy"},
+    {"name": "David Okonkwo", "relationship_type": "client", "communication_preference": "email", "importance": 4, "mood": "angry"},
+    {"name": "Elena Vogt", "relationship_type": "team_member", "communication_preference": "email", "importance": 3, "mood": "neutral"},
+    {"name": "Taylor Brooks", "relationship_type": "investor", "communication_preference": "call", "importance": 4, "mood": "neutral"},
+    {"name": "Alex Rivera", "relationship_type": "direct_report", "communication_preference": "text", "importance": 2, "mood": "neutral"},
+    {"name": "Jamie Liu", "relationship_type": "team_member", "communication_preference": "email", "importance": 2, "mood": "happy"},
+    {"name": "Chris Park", "relationship_type": "friend", "communication_preference": "text", "importance": 2, "mood": "happy"},
+    {"name": "Morgan Blake", "relationship_type": "board_member", "communication_preference": "email", "importance": 5, "mood": "neutral"},
+    {"name": "Riley Santos", "relationship_type": "direct_report", "communication_preference": "email", "importance": 3, "mood": "neutral"},
+    {"name": "Noah Patel", "relationship_type": "client", "communication_preference": "email", "importance": 4, "mood": "annoyed"},
+    {"name": "Casey Nguyen", "relationship_type": "team_member", "communication_preference": "email", "importance": 2, "mood": "neutral"},
+    {"name": "Vikram Singh", "relationship_type": "investor", "communication_preference": "email", "importance": 4, "mood": "neutral"}
+  ],
+  "tasks": [
+    {"id": "t01", "description": "Send Q3 deck to Marcus", "deadline": "2026-04-21T09:30:00", "owner": "Marcus Webb", "status": "pending", "effort": "high"},
+    {"id": "t02", "description": "Confirm dinner reservation", "deadline": "2026-04-21T10:00:00", "owner": "Sarah Chen", "status": "pending", "effort": "low"},
+    {"id": "t03", "description": "Rebook investor flight", "deadline": "2026-04-21T11:00:00", "owner": "Alex Rivera", "status": "pending", "effort": "medium"},
+    {"id": "t04", "description": "Legal MSA redlines", "deadline": "2026-04-21T17:00:00", "owner": "Legal", "status": "in-progress", "effort": "high"},
+    {"id": "t05", "description": "Approve vendor SOW", "deadline": "2026-04-22T12:00:00", "owner": "Noah Patel", "status": "pending", "effort": "medium"},
+    {"id": "t06", "description": "Prep board talking points", "deadline": "2026-04-21T08:30:00", "owner": "Self", "status": "pending", "effort": "high"},
+    {"id": "t07", "description": "Expense report Q1", "deadline": "2026-04-25T23:59:59", "owner": "Finance", "status": "pending", "effort": "low"},
+    {"id": "t08", "description": "Callback David Okonkwo", "deadline": "2026-04-21T12:00:00", "owner": "Self", "status": "pending", "effort": "low"},
+    {"id": "t09", "description": "Review design mocks", "deadline": "2026-04-21T15:00:00", "owner": "Casey Nguyen", "status": "done", "effort": "medium"},
+    {"id": "t10", "description": "Submit benefits election", "deadline": "2026-04-28T23:59:59", "owner": "HR", "status": "pending", "effort": "low"},
+    {"id": "t11", "description": "Brief PR on launch timing", "deadline": "2026-04-21T14:00:00", "owner": "Marketing", "status": "pending", "effort": "medium"},
+    {"id": "t12", "description": "Sign birthday card for Avery", "deadline": "2026-04-21T16:00:00", "owner": "Jamie Liu", "status": "pending", "effort": "low"}
+  ]
+}

scenarios/schema_drift_test.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "description": "Patronus-style schema drift bundle (three mid-episode rule changes).",
+  "events": [
+    {
+      "after_step": 1,
+      "shift_all_meetings_hours": 1,
+      "comment": "Scenario 1: calendar system shifts all local meeting times by +1 hour."
+    },
+    {
+      "after_step": 2,
+      "set_contact_preference": {
+        "name": "Sarah Chen",
+        "communication_preference": "text"
+      },
+      "comment": "Scenario 2: VIP/personal contact switches to text-only preference."
+    },
+    {
+      "after_step": 3,
+      "set_task_deadline": {
+        "task_id": "t02",
+        "deadline": "2026-04-21T07:00:00"
+      },
+      "suppress_reply_relationship_for_senders": ["Marcus Webb"],
+      "comment": "Scenario 3: task deadline moved earlier; Marcus email replies yield zero relationship score."
+    }
+  ]
+}

scenarios/vip_meltdown.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "simulation_time": "2026-04-21T09:00:00",
+  "stress": 44,
+  "active_conflicts": [],
+  "action_log": [
+    "VIP meltdown demo: external pressure escalates if ignored."
+  ],
+  "episode_active": true,
+  "episode_end_reason": null,
+  "max_episode_steps": 24,
+  "emails": [
+    {
+      "id": "v1",
+      "sender": "Taylor Brooks",
+      "subject": "We need alignment now",
+      "body": "Neutral opening: waiting on your stance before we brief others.",
+      "read": false,
+      "replied": false,
+      "priority": "critical",
+      "sender_relationship": "VIP"
+    }
+  ],
+  "meetings": [
+    {
+      "id": "vm1",
+      "title": "Investor sync",
+      "start": "2026-04-21T10:00:00",
+      "duration_minutes": 45,
+      "attendees": [
+        "Taylor Brooks"
+      ],
+      "location": "virtual",
+      "priority": "high",
+      "cancelled": false
+    }
+  ],
+  "contacts": [
+    {
+      "name": "Taylor Brooks",
+      "relationship_type": "investor",
+      "communication_preference": "call",
+      "importance": 4,
+      "mood": "neutral"
+    },
+    {
+      "name": "Jordan Lee",
+      "relationship_type": "direct_report",
+      "communication_preference": "email",
+      "importance": 3,
+      "mood": "happy"
+    }
+  ],
+  "tasks": [
+    {
+      "id": "vt1",
+      "description": "Prep one-pager for Taylor",
+      "deadline": "2026-04-21T11:00:00",
+      "owner": "Self",
+      "status": "pending",
+      "effort": "medium"
+    }
+  ]
+}

scenarios/vip_meltdown_drift.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "events": [
+    {
+      "after_step": 1,
+      "set_contact_mood": {
+        "name": "Taylor Brooks",
+        "mood": "annoyed"
+      }
+    },
+    {
+      "after_step": 2,
+      "set_contact_mood": {
+        "name": "Taylor Brooks",
+        "mood": "angry"
+      }
+    },
+    {
+      "after_step": 3,
+      "set_contact_mood": {
+        "name": "Taylor Brooks",
+        "mood": "furious"
+      }
+    }
+  ]
+}

scripts/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Makes ``scripts.*`` importable when repo root is on PYTHONPATH (pytest).

scripts/http_endpoint_smoke.py ADDED Viewed

	@@ -0,0 +1,184 @@

+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# CLI: hit GhostExec HTTP endpoints (live URL or --local in-process app).
+#
+#   uv run python scripts/http_endpoint_smoke.py --local
+#   uv run python scripts/http_endpoint_smoke.py --url http://127.0.0.1:8000
+from __future__ import annotations
+import argparse
+import json
+import sys
+import urllib.error
+import urllib.request
+from typing import Any
+from urllib.parse import urljoin
+ROOT = __import__("pathlib").Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+def _print_curl(base: str) -> None:
+    print("# --- copy/paste (bash) ---")
+    for method, path in [
+        ("GET", "/health"),
+        ("GET", "/metadata"),
+        ("GET", "/state"),
+        ("GET", "/schema"),
+        ("GET", "/openapi.json"),
+    ]:
+        print(f"curl -sS -X {method} '{base.rstrip('/')}{path}' | head -c 200 && echo")
+    print(
+        "curl -sS -X POST '{base}/reset' -H 'Content-Type: application/json' -d '{{}}' | head -c 300 && echo".format(
+            base=base.rstrip("/")
+        )
+    )
+    print(
+        "curl -sS -X POST '{base}/step' -H 'Content-Type: application/json' "
+        "-d '{{\"action\":{{\"action_type\":\"do_nothing\"}}}}' | head -c 300 && echo".format(base=base.rstrip("/"))
+    )
+    print(
+        "# Note: HTTP uses a new env per request — not one multi-step episode; use WebSocket /ws for that."
+    )
+class LiveClient:
+    def __init__(self, base: str) -> None:
+        self.base = base.rstrip("/")
+    def request(
+        self,
+        method: str,
+        path: str,
+        *,
+        data: bytes | None = None,
+        headers: dict[str, str] | None = None,
+    ) -> tuple[int, str]:
+        url = urljoin(self.base + "/", path.lstrip("/"))
+        req = urllib.request.Request(url, data=data, headers=headers or {}, method=method)
+        try:
+            with urllib.request.urlopen(req, timeout=20) as resp:
+                return resp.status, resp.read().decode(errors="replace")
+        except urllib.error.HTTPError as e:
+            return e.code, e.read().decode(errors="replace")
+class LocalClient:
+    def __init__(self) -> None:
+        from fastapi.testclient import TestClient
+        from ghostexec.server.app import app
+        self._client = TestClient(app, raise_server_exceptions=True)
+    def request(
+        self,
+        method: str,
+        path: str,
+        *,
+        data: bytes | None = None,
+        headers: dict[str, str] | None = None,
+    ) -> tuple[int, str]:
+        hdrs = headers or {}
+        kwargs: dict[str, Any] = {}
+        if data is not None:
+            kwargs["content"] = data
+            kwargs["headers"] = hdrs
+        r = self._client.request(method, path, **kwargs)
+        return r.status_code, r.text
+def main() -> int:
+    """Run the HTTP smoke checks against a live URL or the in-process app.
+    Returns 0 when every check passes (or right after ``--print-curl``). The first
+    failing check raises ``SystemExit(1)``.
+    """
+    p = argparse.ArgumentParser(description="GhostExec HTTP endpoint smoke (CLI).")
+    p.add_argument(
+        "--url",
+        default="http://127.0.0.1:8000",
+        help="Live server base URL (ignored with --local).",
+    )
+    p.add_argument(
+        "--local",
+        action="store_true",
+        help="Use in-process FastAPI TestClient (no server required).",
+    )
+    p.add_argument(
+        "--print-curl",
+        action="store_true",
+        help="Print example curl commands and exit 0.",
+    )
+    args = p.parse_args()
+    # --print-curl only prints commands; no request is sent.
+    if args.print_curl:
+        _print_curl(args.url)
+        return 0
+    client: LiveClient | LocalClient
+    label: str
+    if args.local:
+        client = LocalClient()
+        label = "local TestClient"
+    else:
+        client = LiveClient(args.url)
+        label = args.url
+    def check_get(path: str) -> None:
+        # Any 2xx status counts as success; anything else aborts the run.
+        code, body = client.request("GET", path)
+        ok = 200 <= code < 300
+        status = "OK" if ok else "FAIL"
+        print(f"[{status}] GET {path} -> HTTP {code} (body ~{len(body)} chars)")
+        if not ok:
+            raise SystemExit(1)
+    print(f"GhostExec HTTP smoke ({label})\n")
+    for path in (
+        "/health",
+        "/metadata",
+        "/state",
+        "/schema",
+        "/openapi.json",
+        "/docs",
+        "/redoc",
+    ):
+        check_get(path)
+    # POST /reset with an empty JSON object; exactly HTTP 200 is required.
+    body = json.dumps({}).encode()
+    hdrs = {"Content-Type": "application/json"}
+    code, txt = client.request("POST", "/reset", data=body, headers=hdrs)
+    print(f"[{'OK' if code == 200 else 'FAIL'}] POST /reset -> HTTP {code}")
+    if code != 200:
+        raise SystemExit(1)
+    j = json.loads(txt)
+    # Show the first 50 characters of observation.echoed_message as the briefing prefix.
+    em = (j.get("observation") or {}).get("echoed_message", "")[:50]
+    print(f"      briefing prefix: {em!r}")
+    # POST /step with a do_nothing action; exactly HTTP 200 is required.
+    step_payload = json.dumps({"action": {"action_type": "do_nothing"}}).encode()
+    code2, txt2 = client.request("POST", "/step", data=step_payload, headers=hdrs)
+    print(f"[{'OK' if code2 == 200 else 'FAIL'}] POST /step do_nothing -> HTTP {code2}")
+    if code2 != 200:
+        raise SystemExit(1)
+    print(
+        "\nNote: OpenEnv HTTP may use a new env per request, so separate POSTs do not advance "
+        "one long episode; each POST /step runs a single action on a fresh instance. "
+        "Multi-step learning on one episode: WebSocket /ws (see ghostexec/README.md)."
+    )
+    # JSON-RPC tools/list call against /mcp; only the status code is checked.
+    code3, _ = client.request("POST", "/mcp", data=json.dumps({"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}}).encode(), headers=hdrs)
+    print(f"[{'OK' if code3 == 200 else 'FAIL'}] POST /mcp tools/list -> HTTP {code3}")
+    if code3 != 200:
+        raise SystemExit(1)
+    # Negative check: GET on /reset is expected to be rejected with 405.
+    code4, _ = client.request("GET", "/reset")
+    print(f"[{'OK' if code4 == 405 else 'FAIL'}] GET /reset (expect 405) -> HTTP {code4}")
+    if code4 != 405:
+        raise SystemExit(1)
+    print("\nAll checks passed.")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

scripts/run_live_api_dead_500.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""Run 500+ LIVE HTTP API reward dead-tests against a running GhostExec server.
+Usage:
+    uv run python scripts/run_live_api_dead_500.py --url http://127.0.0.1:8002 --cases 500
+"""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+from urllib.parse import urljoin
+import urllib.error
+import urllib.request
+# Weights applied to the conflict / relationship / task scores when main() re-derives
+# ``weighted_base``. NOTE(review): presumably these mirror the server's reward constants — confirm.
+W_CONFLICT = 0.35
+W_REL = 0.35
+W_TASK = 0.30
+# Multiplier applied to the weighted sum of the three scores in main().
+OUTPUT_SCALE = 0.48
+def _request(
+    base_url: str,
+    method: str,
+    path: str,
+    *,
+    body: dict[str, Any] | None = None,
+    timeout: float = 20.0,
+) -> tuple[int, str]:
+    data = None
+    headers = {"Accept": "application/json"}
+    if body is not None:
+        data = json.dumps(body).encode()
+        headers["Content-Type"] = "application/json"
+    req = urllib.request.Request(
+        urljoin(base_url.rstrip("/") + "/", path.lstrip("/")),
+        data=data,
+        headers=headers,
+        method=method,
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return resp.status, resp.read().decode(errors="replace")
+    except urllib.error.HTTPError as e:
+        return e.code, e.read().decode(errors="replace")
+def _step_payload_for(i: int) -> dict[str, Any]:
+    templates: list[dict[str, Any]] = [
+        {"action": {"action_type": "do_nothing"}},
+        {"action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}},
+        {"action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}},
+        {"action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}},
+        {"action": {"action_type": "archive_email", "email_id": "e09"}},
+        {"action": {"action_type": "archive_email", "email_id": "bad_id"}},
+        {
+            "action": {
+                "action_type": "reschedule_meeting",
+                "meeting_id": "m02",
+                "new_time": "2026-04-21T18:00:00",
+            }
+        },
+        {
+            "action": {
+                "action_type": "reschedule_meeting",
+                "meeting_id": "m03",
+                "new_time": "2026-04-21T09:30:00",  # overlap -> invalid semantic
+            }
+        },
+        {"action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}},
+        {"action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}},
+        {"action": {"action_type": "complete_task", "task_id": "t07"}},
+        {"action": {"action_type": "complete_task", "task_id": "t09"}},  # already done
+        {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}},
+        {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}},
+        {
+            "action": {
+                "action_type": "send_message",
+                "contact_name": "Jamie Liu",
+                "message_body": "Quick sync please.",
+            }
+        },
+        {
+            "action": {
+                "action_type": "send_message",
+                "contact_name": "Nobody",
+                "message_body": "hello",
+            }
+        },
+    ]
+    return templates[i % len(templates)]
+def _assert_api_surface(base_url: str) -> None:
+    for path in ("/health", "/metadata", "/state", "/schema", "/openapi.json", "/docs", "/redoc"):
+        code, _ = _request(base_url, "GET", path)
+        assert code == 200, f"{path} -> {code}"
+    assert _request(base_url, "GET", "/reset")[0] == 405
+    assert _request(base_url, "GET", "/step")[0] == 405
+    assert _request(base_url, "GET", "/this-path-should-not-exist-ghostexec")[0] == 404
+    assert _request(base_url, "POST", "/mcp", body={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}})[0] == 200
+def main() -> int:
+    """Run ``--cases`` independent /reset + /step checks and write one JSONL record per case.
+    Returns 0 when every case passes, 1 otherwise. The report is written to
+    ``outputs/logs/api_dead_live_<cases>.jsonl`` relative to the current directory.
+    """
+    p = argparse.ArgumentParser(description="Run live 500+ reward dead-tests.")
+    p.add_argument("--url", default="http://127.0.0.1:8002", help="Base server URL")
+    p.add_argument("--cases", type=int, default=500, help="Number of /reset+/step cases")
+    args = p.parse_args()
+    base_url = args.url.rstrip("/")
+    # Always run at least one case, even if --cases is zero or negative.
+    cases = max(1, args.cases)
+    # Route-level sanity check first; an AssertionError here aborts before any case runs.
+    _assert_api_surface(base_url)
+    out_dir = Path("outputs") / "logs"
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_path = out_dir / f"api_dead_live_{cases}.jsonl"
+    passed = 0
+    failed = 0
+    # Only the first 10 failure messages are kept for the console summary.
+    failures: list[str] = []
+    with out_path.open("w", encoding="utf-8") as f:
+        for idx in range(cases):
+            rec: dict[str, Any] = {"idx": idx, "ok": False, "error": None}
+            try:
+                rc, rb = _request(
+                    base_url,
+                    "POST",
+                    "/reset",
+                    body={"episode_id": f"live-dead-{idx:04d}", "seed": 42},
+                )
+                assert rc == 200, f"reset status {rc}"
+                payload = _step_payload_for(idx)
+                rec["action"] = payload["action"]
+                sc, sb = _request(base_url, "POST", "/step", body=payload)
+                assert sc == 200, f"step status {sc}"
+                body = json.loads(sb)
+                obs = body["observation"]
+                meta = obs.get("metadata") or {}
+                bd = meta.get("reward_breakdown") or {}
+                # The top-level reward must equal the breakdown's ``final`` exactly.
+                reward = float(body["reward"])
+                final = float(bd["final"])
+                assert reward == final, "reward != breakdown.final"
+                c = float(bd.get("conflict", 0.0))
+                r = float(bd.get("relationship", 0.0))
+                t = float(bd.get("task", 0.0))
+                # Re-derive weighted_base from the three component scores (exact float compare).
+                expected_weighted = OUTPUT_SCALE * (W_CONFLICT * c + W_REL * r + W_TASK * t)
+                assert float(bd["weighted_base"]) == expected_weighted, "weighted_base mismatch"
+                # ``final`` must be the sum of weighted_base and the four adjustment terms.
+                expected_final = (
+                    float(bd.get("weighted_base", 0.0))
+                    + float(bd.get("invalid_step_adjustment", 0.0))
+                    + float(bd.get("episode_completion_bonus", 0.0))
+                    + float(bd.get("catastrophic_penalty", 0.0))
+                    + float(bd.get("do_nothing_floor", 0.0))
+                )
+                assert final == expected_final, "final aggregation mismatch"
+                # do_nothing must carry a -0.15 floor term and end up with a negative reward.
+                if payload["action"]["action_type"] == "do_nothing":
+                    assert float(bd.get("do_nothing_floor", 0.0)) == -0.15, "do_nothing floor mismatch"
+                    assert reward < 0, "do_nothing should be negative"
+                # Steps reported as not ok must carry the -0.25 invalid-step adjustment.
+                if meta.get("step_ok") is False:
+                    assert float(bd.get("invalid_step_adjustment", 0.0)) == -0.25, "invalid penalty mismatch"
+                rec["ok"] = True
+                rec["reward"] = reward
+                rec["step_ok"] = meta.get("step_ok")
+                passed += 1
+            except Exception as e:  # noqa: BLE001
+                # Any failure (assertion, network, JSON, missing key) is recorded and the run continues.
+                rec["ok"] = False
+                rec["error"] = str(e)
+                failed += 1
+                if len(failures) < 10:
+                    failures.append(f"idx={idx}: {e}")
+            finally:
+                # One JSON line per case, pass or fail.
+                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+    print(f"Live API dead-test complete: passed={passed} failed={failed} total={cases}")
+    print(f"Report: {out_path}")
+    if failures:
+        print("First failures:")
+        for row in failures:
+            print(" -", row)
+    return 0 if failed == 0 else 1
+if __name__ == "__main__":
+    raise SystemExit(main())

server/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Ghostexec environment server components."""
+from .ghostexec_environment import GhostexecEnvironment
+__all__ = ["GhostexecEnvironment"]

server/app.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+FastAPI application for the Ghostexec Environment.
+This module creates an HTTP server that exposes the GhostexecEnvironment
+over HTTP and WebSocket endpoints, compatible with EnvClient.
+Endpoints:
+    - POST /reset: Reset the environment
+    - POST /step: Execute an action
+    - GET /state: Get current environment state
+    - GET /schema: Get action/observation schemas
+    - WS /ws: WebSocket endpoint for persistent sessions
+Usage:
+    # Development (with auto-reload):
+    uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
+    # Production:
+    uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
+    # Or run directly:
+    python -m server.app
+"""
+try:
+    import openenv.core.env_server.http_server as _openenv_http
+except Exception as e:  # pragma: no cover
+    raise ImportError(
+        "openenv is required for the web interface. Install dependencies with '\n    uv sync\n'"
+    ) from e
+# OpenEnv's serialize_observation drops `metadata` from the JSON body; Ghostexec
+# trainers and live tests rely on step_ok / ids inside observation.metadata.
+_orig_serialize_observation = _openenv_http.serialize_observation
+def _ghostexec_serialize_observation(observation):  # type: ignore[no-untyped-def]
+    payload = _orig_serialize_observation(observation)
+    inner = payload.get("observation")
+    if isinstance(inner, dict):
+        meta = getattr(observation, "metadata", None) or {}
+        inner["metadata"] = _openenv_http._make_json_serializable(meta)
+    return payload
+_openenv_http.serialize_observation = _ghostexec_serialize_observation
+from openenv.core.env_server.http_server import create_app  # noqa: E402
+try:
+    # Editable / normal install (package name `ghostexec`).
+    from ghostexec.models import GhostexecAction, GhostexecObservation
+    from ghostexec.server.ghostexec_environment import GhostexecEnvironment
+except ImportError:
+    # Plain `uvicorn server.app:app` from repo root: top-level `models` + `server` package.
+    from models import GhostexecAction, GhostexecObservation
+    from server.ghostexec_environment import GhostexecEnvironment
+# Create the app with web interface and README integration
+app = create_app(
+    GhostexecEnvironment,
+    GhostexecAction,
+    GhostexecObservation,
+    env_name="ghostexec",
+    max_concurrent_envs=1,  # increase this number to allow more concurrent WebSocket sessions
+)
+def _patch_openapi_ghostexec_examples(schema: dict) -> None:
+    """Replace OpenEnv's generic observation examples with GhostExec's plain-text briefing shape."""
+    briefing = (
+        "=== GHOSTEXEC BRIEFING — Tue 21 Apr 2026 08:00 ===\n\n"
+        "UNREAD EMAILS (…): …\n\n"
+        "CALENDAR CONFLICTS IN NEXT 4 HOURS: …\n\n"
+        "CONTACTS TO WATCH: …\n\n"
+        "OVERDUE OR DUE-SOON TASKS: …\n\n"
+        "EXEC STRESS LEVEL: 52/100\n"
+        "STEPS REMAINING: 48"
+    )
+    obs = {"echoed_message": briefing, "message_length": len(briefing)}
+    reset_ex = {"observation": obs, "reward": 0.0, "done": False}
+    step_ex = {"observation": obs, "reward": -0.42, "done": False}
+    for path, example in (("/reset", reset_ex), ("/step", step_ex)):
+        try:
+            cell = (
+                schema["paths"][path]["post"]["responses"]["200"]["content"]["application/json"]
+            )
+            if isinstance(cell, dict):
+                cell["example"] = example
+        except KeyError:
+            continue
+_OPENAPI_HTTP_EPISODE_SENTINEL = "Ghostexec / OpenEnv HTTP"
+_OPENAPI_HTTP_EPISODE_NOTE = f"""
+---
+## {_OPENAPI_HTTP_EPISODE_SENTINEL}
+Each `POST /reset` and `POST /step` may run on a **new** environment instance, so
+separate HTTP requests do **not** share one in-memory episode across calls. A lone
+`POST /step` still applies your action once (after internal scenario load). For
+**many steps on the same episode**, use **WebSocket `/ws`**: open a connection,
+reset once, then send many step messages on that same socket. See **ghostexec/README.md**
+for details.
+"""
+def _patch_openapi_ghostexec_http_note(schema: dict) -> None:
+    """Document HTTP statelessness vs /ws so Swagger and OpenAPI clients see it."""
+    try:
+        info = schema.get("info")
+        if not isinstance(info, dict):
+            return
+        desc = info.get("description") or ""
+        if _OPENAPI_HTTP_EPISODE_SENTINEL in desc:
+            return
+        info["description"] = desc + _OPENAPI_HTTP_EPISODE_NOTE
+    except (TypeError, KeyError):
+        return
+_fastapi_openapi = type(app).openapi.__get__(app, type(app))
+def _ghostexec_openapi() -> dict:
+    if app.openapi_schema is None:
+        _fastapi_openapi()
+        _patch_openapi_ghostexec_examples(app.openapi_schema)
+        _patch_openapi_ghostexec_http_note(app.openapi_schema)
+    return app.openapi_schema  # type: ignore[return-value]
+app.openapi = _ghostexec_openapi  # type: ignore[method-assign]
+def main() -> None:
+    """
+    Entry point for direct execution via uv run or python -m.
+    This function enables running the server without Docker:
+        uv run --project . server
+        uv run --project . server --port 8001
+        python -m ghostexec.server.app
+    For production deployments, consider using uvicorn directly with
+    multiple workers:
+        uvicorn ghostexec.server.app:app --workers 4
+    """
+    import argparse
+    import uvicorn
+    parser = argparse.ArgumentParser(description="GhostExec OpenEnv HTTP server")
+    parser.add_argument("--host", type=str, default="0.0.0.0", help="Bind address")
+    parser.add_argument("--port", type=int, default=8000, help="Listen port")
+    args = parser.parse_args()
+    uvicorn.run(app, host=args.host, port=args.port)
+if __name__ == '__main__':
+    main()

server/ghostexec_environment.py ADDED Viewed

	@@ -0,0 +1,706 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+GhostExec simulated world, agent step (Phases 2–3), and reward (Phase 4).
+Scenario payloads load from scenarios/*.json. Observations are plain-text briefings.
+Invalid actions return a structured error in observation metadata without raising.
+Rewards aggregate conflict / relationship / task scores and log each step to outputs/logs/.
+"""
+from __future__ import annotations
+import json
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any
+from uuid import uuid4
+from openenv.core.env_server.interfaces import Environment
+from openenv.core.env_server.types import State
+try:
+    from ..models import (
+        Contact,
+        Email,
+        GhostexecAction,
+        GhostexecObservation,
+        Meeting,
+        Mood,
+        RewardBreakdown,
+        Task,
+        TaskStatus,
+        WorldState,
+    )
+except ImportError:
+    from models import (
+        Contact,
+        Email,
+        GhostexecAction,
+        GhostexecObservation,
+        Meeting,
+        Mood,
+        RewardBreakdown,
+        Task,
+        TaskStatus,
+        WorldState,
+    )
+try:
+    from . import reward as _reward
+except ImportError:
+    try:
+        from server import reward as _reward
+    except ImportError:
+        import reward as _reward  # type: ignore[no-redef]
+# Priority label -> integer rank, lower for more urgent labels.
+# NOTE(review): presumably used as a sort key; usage is not visible in this part of the file.
+_PRIORITY_RANK: dict[str, int] = {"critical": 0, "high": 1, "normal": 2, "low": 3}
+# Contact ``relationship_type`` value -> short human-readable label.
+_REL_DISPLAY: dict[str, str] = {
+    "board_member": "Board",
+    "spouse": "Spouse",
+    "investor": "Investor",
+    "direct_report": "Direct report",
+    "client": "Client",
+    "friend": "Friend",
+    "team_member": "Team",
+}
+# NOTE(review): presumably the reward adjustment for an invalid action — confirm against step().
+_INVALID_ACTION_REWARD = -0.25
+# Reward attached to the observation returned by reset().
+_DEFAULT_STEP_REWARD = 0.0
+def _default_scenario_path() -> Path:
+    return Path(__file__).resolve().parent.parent / "scenarios" / "phase2_core.json"
+def _parse_dt(value: str) -> datetime:
+    if value.endswith("Z"):
+        return datetime.fromisoformat(value[:-1]).replace(tzinfo=timezone.utc)
+    dt = datetime.fromisoformat(value)
+    if dt.tzinfo is None:
+        return dt.replace(tzinfo=timezone.utc)
+    return dt
+def _meeting_end(m: Meeting) -> datetime:
+    start = _parse_dt(m.start)
+    return start + timedelta(minutes=m.duration_minutes)
+def _windows_overlap(a_start: datetime, a_end: datetime, b_start: datetime, b_end: datetime) -> bool:
+    return a_start < b_end and b_start < a_end
+class GhostexecEnvironment(Environment):
+    """Inbox, calendar, contacts, tasks, actions, briefings, and Phase 4 rewards."""
+    SUPPORTS_CONCURRENT_SESSIONS: bool = True
+    def __init__(
+        self,
+        scenario_path: str | Path | None = None,
+        schema_drift_events_path: str | Path | None = None,
+    ) -> None:
+        """Configure the environment; the scenario itself is only loaded by ``reset()``.
+
+        Args:
+            scenario_path: Scenario JSON to load on ``reset()``. A falsy value selects
+                ``_default_scenario_path()``.
+            schema_drift_events_path: Optional JSON file whose ``"events"`` list drives
+                ``_maybe_apply_schema_drift_events``. If the path is not an existing file the
+                event list simply stays empty.
+        """
+        self._scenario_path = Path(scenario_path) if scenario_path else _default_scenario_path()
+        self._drift_events_path = (
+            Path(schema_drift_events_path) if schema_drift_events_path is not None else None
+        )
+        self._drift_events: list[dict[str, Any]] = []
+        if self._drift_events_path and self._drift_events_path.is_file():
+            drift_raw = json.loads(self._drift_events_path.read_text(encoding="utf-8"))
+            self._drift_events = list(drift_raw.get("events", []))
+        # Sender names added by drift events; passed to the reward function on every step.
+        self._reply_relationship_suppressed: set[str] = set()
+        # JSONL reward log at <two dirs above this file>/outputs/logs/episode_rewards.jsonl.
+        self._reward_log_path = (
+            Path(__file__).resolve().parent.parent / "outputs" / "logs" / "episode_rewards.jsonl"
+        )
+        self._world: WorldState | None = None
+        # Stress value captured from the scenario at reset(); conflict bumps are added on top.
+        self._base_stress: int = 0
+        self._state = State(episode_id=str(uuid4()), step_count=0)
+        # Outcome of the most recent action, surfaced in the briefing and observation metadata.
+        self._last_step_ok: bool = True
+        self._last_step_error: str | None = None
+        self._last_step_detail: str = ""
+        self._last_reward_breakdown: RewardBreakdown | None = None
+    # --- lifecycle ---
+    def reset(self) -> GhostexecObservation:  # type: ignore[override]
+        """Reload the scenario from disk, start a new episode, and return the first observation.
+
+        The returned observation carries ``_DEFAULT_STEP_REWARD``, ``done=False`` and no reward
+        breakdown.
+        """
+        self._world = self.load_world_from_json(self._scenario_path)
+        # Remember the scenario's own stress so conflict bumps are always relative to it.
+        self._base_stress = self._world.stress
+        self._rebuild_conflict_list()
+        self._state = State(episode_id=str(uuid4()), step_count=0)
+        self._last_step_ok = True
+        self._last_step_error = None
+        self._last_step_detail = "Episode started."
+        # Drift-driven suppression is per episode, so it is cleared here.
+        self._reply_relationship_suppressed.clear()
+        self._last_reward_breakdown = None
+        self._ensure_reward_log_dir()
+        briefing = self.build_briefing_text()
+        return self._observation_from_briefing(
+            briefing,
+            reward=_DEFAULT_STEP_REWARD,
+            done=False,
+            reward_breakdown=None,
+        )
+    def step(self, action: GhostexecAction) -> GhostexecObservation:  # type: ignore[override]
+        """Apply one action, score it, log the reward, and return the next observation.
+
+        If no world is loaded yet, ``reset()`` is called first. If the episode is already
+        inactive, the step counter is not advanced and a ``done`` observation carrying
+        ``_INVALID_ACTION_REWARD`` is returned. Otherwise the episode ends once the step count
+        reaches ``max_episode_steps``.
+        """
+        if self._world is None:
+            # OpenEnv HTTP uses a new env per request; prime the world so this step still
+            # runs the requested action (invalid actions get step_ok False, rewards apply).
+            self.reset()
+        assert self._world is not None
+        if not self._world.episode_active:
+            self._last_step_ok = False
+            self._last_step_error = "Episode is already finished."
+            bd = RewardBreakdown(
+                final=_INVALID_ACTION_REWARD,
+                invalid_step_adjustment=_INVALID_ACTION_REWARD,
+            )
+            self._last_reward_breakdown = bd
+            return self._observation_from_briefing(
+                self.build_briefing_text(),
+                reward=bd.final,
+                done=True,
+                reward_breakdown=bd,
+            )
+        self._state.step_count += 1
+        self._maybe_apply_schema_drift_events()
+        if action.message.strip():
+            self._world.action_log.append(f"note: {action.message.strip()}")
+        # Snapshot is taken after drift events and the note, so the reward's before/after
+        # comparison only sees changes made by the action itself.
+        before = self.world.model_copy(deep=True)
+        action_ok = self._apply_action(action)
+        self._rebuild_conflict_list()
+        episode_done = False
+        if self._state.step_count >= self._world.max_episode_steps:
+            episode_done = True
+            self._world.episode_active = False
+            # Keep an end reason that was already set; otherwise record the step limit.
+            self._world.episode_end_reason = self._world.episode_end_reason or "step_limit"
+        breakdown = _reward.compute_step_reward(
+            before,
+            self.world,
+            action,
+            action_ok=action_ok,
+            episode_done=episode_done,
+            relationship_suppressed_for_email_to=frozenset(self._reply_relationship_suppressed),
+        )
+        self._last_reward_breakdown = breakdown
+        self._append_reward_log(breakdown, episode_done, action)
+        briefing = self.build_briefing_text()
+        return self._observation_from_briefing(
+            briefing,
+            reward=breakdown.final,
+            done=episode_done,
+            reward_breakdown=breakdown,
+        )
+    @property
+    def state(self) -> State:
+        """Return the current ``State`` object (episode id and step counter)."""
+        return self._state
+    @property
+    def world(self) -> WorldState:
+        if self._world is None:
+            raise RuntimeError("World not initialised; call reset() first.")
+        return self._world
+    # --- Phase 3 briefing (plain text for LLM) ---
+    def build_briefing_text(self) -> str:
+        """Render the world as the plain-text briefing placed in the observation.
+
+        Sections, in order: header with the simulation time, unread emails (first 20 shown),
+        calendar conflicts overlapping the next 4 hours, top 5 contacts by importance, tasks that
+        are overdue or due within 24 hours (first 15 shown), stress level, steps remaining, and
+        the outcome of the last action.
+        """
+        w = self.world
+        now = _parse_dt(w.simulation_time)
+        header = now.strftime("=== GHOSTEXEC BRIEFING — %a %d %b %Y %H:%M ===")
+        unread = self.get_unread_emails_sorted()
+        # Body preview is cut to 100 characters (with an ellipsis) and newlines become spaces.
+        email_lines = [
+            f"- [{e.priority.upper()}] From: {e.sender} ({_REL_DISPLAY.get(e.sender_relationship, e.sender_relationship)}) — "
+            f'"{e.subject}"\n  Preview: {(e.body[:100] + ("…" if len(e.body) > 100 else "")).replace(chr(10), " ")}'
+            for e in unread[:20]
+        ]
+        email_block = "\n".join(email_lines) if email_lines else "(none)"
+        horizon = now + timedelta(hours=4)
+        conflict_lines: list[str] = []
+        for row in self.detect_meeting_conflicts():
+            o0 = _parse_dt(row["overlap_start"])
+            o1 = _parse_dt(row["overlap_end"])
+            # Skip overlaps that already ended or that start at/after the 4-hour horizon.
+            if o1 <= now or o0 >= horizon:
+                continue
+            ma = self._meeting_by_id(row["meeting_a"])
+            mb = self._meeting_by_id(row["meeting_b"])
+            if not ma or not mb or ma.cancelled or mb.cancelled:
+                continue
+            conflict_lines.append(
+                f"- {_fmt_meeting_line(ma)} CLASHES WITH -> {_fmt_meeting_line(mb)}"
+            )
+        conflict_block = "\n".join(conflict_lines) if conflict_lines else "(none in next 4 hours)"
+        # Highest importance first; ties are ordered by name.
+        top_contacts = sorted(w.contacts, key=lambda c: (-c.importance, c.name))[:5]
+        contact_lines = [
+            f"- {c.name}: {c.mood.upper()} — {_REL_DISPLAY.get(c.relationship_type, c.relationship_type)}; "
+            f"prefers {c.communication_preference}"
+            for c in top_contacts
+        ]
+        contact_block = "\n".join(contact_lines) if contact_lines else "(none)"
+        soon = now + timedelta(hours=24)
+        task_lines: list[str] = []
+        for t in w.tasks:
+            if t.status == "done":
+                continue
+            dl = _parse_dt(t.deadline)
+            # Listed when the deadline has passed or falls within the next 24 hours.
+            if dl < now or (now <= dl <= soon):
+                flag = "OVERDUE" if dl < now else "due soon"
+                task_lines.append(f"- [{flag}] {t.description} (deadline {t.deadline}, owner {t.owner})")
+        task_block = "\n".join(task_lines[:15]) if task_lines else "(none)"
+        remaining = max(0, w.max_episode_steps - self._state.step_count)
+        parts = [
+            header,
+            "",
+            f"UNREAD EMAILS ({len(unread)} unread):",
+            email_block,
+            "",
+            "CALENDAR CONFLICTS IN NEXT 4 HOURS:",
+            conflict_block,
+            "",
+            "CONTACTS TO WATCH (top 5 by importance):",
+            contact_block,
+            "",
+            "OVERDUE OR DUE-SOON TASKS (next 24h window):",
+            task_block,
+            "",
+            f"EXEC STRESS LEVEL: {w.stress}/100",
+            f"STEPS REMAINING: {remaining}",
+        ]
+        # An error takes precedence over a success detail; neither line appears if both are empty.
+        if self._last_step_error:
+            parts += ["", f"LAST ACTION: ERROR — {self._last_step_error}"]
+        elif self._last_step_detail:
+            parts += ["", f"LAST ACTION: OK — {self._last_step_detail}"]
+        return "\n".join(parts)
+    def _meeting_by_id(self, mid: str) -> Meeting | None:
+        for m in self.world.meetings:
+            if m.id == mid:
+                return m
+        return None
+    # --- scenario IO ---
+    @staticmethod
+    def load_world_from_json(path: str | Path) -> WorldState:
+        raw = Path(path).read_text(encoding="utf-8")
+        data = json.loads(raw)
+        return WorldState.model_validate(data)
+    @staticmethod
+    def world_to_json(world: WorldState) -> str:
+        """Serialise ``world`` to a JSON string via the model's ``model_dump_json()``."""
+        return world.model_dump_json()
+    @staticmethod
+    def world_from_json(blob: str) -> WorldState:
+        """Rebuild a ``WorldState`` from a JSON string via ``WorldState.model_validate_json``."""
+        return WorldState.model_validate_json(blob)
+    # --- inbox ---
+    def get_unread_emails_sorted(self) -> list[Email]:
+        w = self.world
+        unread = [e for e in w.emails if not e.read]
+        return sorted(
+            unread,
+            key=lambda e: (_PRIORITY_RANK.get(e.priority, 99), e.id),
+        )
+    def mark_email_read(self, email_id: str) -> bool:
+        for i, e in enumerate(self.world.emails):
+            if e.id == email_id:
+                self.world.emails[i] = e.model_copy(update={"read": True})
+                return True
+        return False
+    def mark_email_replied(self, email_id: str) -> bool:
+        for i, e in enumerate(self.world.emails):
+            if e.id == email_id:
+                self.world.emails[i] = e.model_copy(update={"read": True, "replied": True})
+                return True
+        return False
+    # --- calendar ---
+    def detect_meeting_conflicts(self) -> list[dict[str, Any]]:
+        active = [m for m in self.world.meetings if not m.cancelled]
+        out: list[dict[str, Any]] = []
+        for i, a in enumerate(active):
+            a_start = _parse_dt(a.start)
+            a_end = _meeting_end(a)
+            for b in active[i + 1 :]:
+                b_start = _parse_dt(b.start)
+                b_end = _meeting_end(b)
+                if _windows_overlap(a_start, a_end, b_start, b_end):
+                    overlap_start = max(a_start, b_start)
+                    overlap_end = min(a_end, b_end)
+                    out.append(
+                        {
+                            "meeting_a": a.id,
+                            "meeting_b": b.id,
+                            "overlap_start": overlap_start.isoformat(),
+                            "overlap_end": overlap_end.isoformat(),
+                        }
+                    )
+        return out
+    def _reschedule_causes_overlap(self, meeting_id: str, new_start_iso: str) -> bool:
+        idx = next((i for i, m in enumerate(self.world.meetings) if m.id == meeting_id), None)
+        if idx is None:
+            return True
+        cand = self.world.meetings[idx].model_copy(update={"start": new_start_iso})
+        c_start = _parse_dt(cand.start)
+        c_end = _meeting_end(cand)
+        for m in self.world.meetings:
+            if m.cancelled or m.id == meeting_id:
+                continue
+            if _windows_overlap(c_start, c_end, _parse_dt(m.start), _meeting_end(m)):
+                return True
+        return False
+    def reschedule_meeting(self, meeting_id: str, new_start_iso: str) -> bool:
+        for i, m in enumerate(self.world.meetings):
+            if m.id == meeting_id and not m.cancelled:
+                self.world.meetings[i] = m.model_copy(update={"start": new_start_iso})
+                self._rebuild_conflict_list()
+                return True
+        return False
+    def cancel_meeting(self, meeting_id: str) -> bool:
+        for i, m in enumerate(self.world.meetings):
+            if m.id == meeting_id:
+                self.world.meetings[i] = m.model_copy(update={"cancelled": True})
+                self._rebuild_conflict_list()
+                return True
+        return False
+    def add_meeting(self, meeting: Meeting) -> None:
+        """Append ``meeting`` to the calendar (no overlap check) and rebuild the conflict list."""
+        self.world.meetings.append(meeting)
+        self._rebuild_conflict_list()
+    # --- contacts ---
+    def get_contact(self, name: str) -> Contact | None:
+        for c in self.world.contacts:
+            if c.name == name:
+                return c
+        return None
+    def update_contact_mood(self, name: str, mood: Mood) -> bool:
+        for i, c in enumerate(self.world.contacts):
+            if c.name == name:
+                self.world.contacts[i] = c.model_copy(update={"mood": mood})
+                return True
+        return False
+    # --- tasks ---
+    def update_task_status(self, task_id: str, status: TaskStatus) -> bool:
+        for i, t in enumerate(self.world.tasks):
+            if t.id == task_id:
+                self.world.tasks[i] = t.model_copy(update={"status": status})
+                return True
+        return False
+    def overdue_tasks_at(self, simulation_iso: str) -> list[Task]:
+        now = _parse_dt(simulation_iso)
+        out: list[Task] = []
+        for t in self.world.tasks:
+            if t.status in ("done",):
+                continue
+            if _parse_dt(t.deadline) < now:
+                out.append(t)
+        return out
+    def set_simulation_time(self, simulation_iso: str) -> None:
+        """Set the simulation clock, then refresh task overdue flags and the conflict list."""
+        self.world.simulation_time = simulation_iso
+        self._reapply_task_overdue_flags()
+        self._rebuild_conflict_list()
+    # --- Phase 3 action execution ---
+    def _apply_action(self, action: GhostexecAction) -> bool:
+        self._last_step_ok = True
+        self._last_step_error = None
+        self._last_step_detail = ""
+        at = action.action_type
+        if at == "do_nothing":
+            self._last_step_detail = "No action taken."
+            return True
+        if at == "reply_email":
+            if not action.email_id:
+                return self._fail("reply_email requires email_id")
+            if not any(e.id == action.email_id for e in self.world.emails):
+                return self._fail(f"Unknown email_id {action.email_id!r}")
+            if not action.message_body.strip():
+                return self._fail("reply_email requires non-empty message_body")
+            self.mark_email_replied(action.email_id)
+            self._last_step_detail = f"Replied to email {action.email_id}."
+            return True
+        if at == "archive_email":
+            if not action.email_id:
+                return self._fail("archive_email requires email_id")
+            if not self.mark_email_read(action.email_id):
+                return self._fail(f"Unknown email_id {action.email_id!r}")
+            self._last_step_detail = f"Archived (read) email {action.email_id}."
+            return True
+        if at == "reschedule_meeting":
+            if not action.meeting_id or not action.new_time:
+                return self._fail("reschedule_meeting requires meeting_id and new_time")
+            if not any(m.id == action.meeting_id for m in self.world.meetings):
+                return self._fail(f"Unknown meeting_id {action.meeting_id!r}")
+            if self._reschedule_causes_overlap(action.meeting_id, action.new_time):
+                return self._fail("Target time overlaps another active meeting.")
+            if not self.reschedule_meeting(action.meeting_id, action.new_time):
+                return self._fail("Could not reschedule meeting.")
+            self._last_step_detail = f"Rescheduled {action.meeting_id} to {action.new_time}."
+            return True
+        if at == "cancel_meeting":
+            if not action.meeting_id:
+                return self._fail("cancel_meeting requires meeting_id")
+            if not any(m.id == action.meeting_id for m in self.world.meetings):
+                return self._fail(f"Unknown meeting_id {action.meeting_id!r}")
+            if not self.cancel_meeting(action.meeting_id):
+                return self._fail("Could not cancel meeting.")
+            reason = action.reason.strip() or "(no reason given)"
+            self._world.action_log.append(f"cancelled {action.meeting_id}: {reason}")
+            self._last_step_detail = f"Cancelled meeting {action.meeting_id}."
+            return True
+        if at == "complete_task":
+            if not action.task_id:
+                return self._fail("complete_task requires task_id")
+            t = next((x for x in self.world.tasks if x.id == action.task_id), None)
+            if not t:
+                return self._fail(f"Unknown task_id {action.task_id!r}")
+            if t.status == "done":
+                return self._fail("Task is already done.")
+            self.update_task_status(action.task_id, "done")
+            self._last_step_detail = f"Completed task {action.task_id}."
+            return True
+        if at == "delegate_task":
+            if not action.task_id or not action.contact_name.strip():
+                return self._fail("delegate_task requires task_id and contact_name")
+            if not any(t.id == action.task_id for t in self.world.tasks):
+                return self._fail(f"Unknown task_id {action.task_id!r}")
+            if not self.get_contact(action.contact_name.strip()):
+                return self._fail(f"Unknown contact {action.contact_name.strip()!r}")
+            for i, t in enumerate(self.world.tasks):
+                if t.id == action.task_id:
+                    self.world.tasks[i] = t.model_copy(
+                        update={
+                            "delegated_to": action.contact_name.strip(),
+                            "status": "in-progress",
+                        }
+                    )
+                    break
+            self._last_step_detail = f"Delegated {action.task_id} to {action.contact_name.strip()}."
+            return True
+        if at == "send_message":
+            name = action.contact_name.strip()
+            if not name:
+                return self._fail("send_message requires contact_name")
+            if not self.get_contact(name):
+                return self._fail(f"Unknown contact {name!r}")
+            if not action.message_body.strip():
+                return self._fail("send_message requires non-empty message_body")
+            self._world.action_log.append(f"message to {name}: {action.message_body.strip()[:500]}")
+            self._last_step_detail = f"Message sent to {name}."
+            return True
+        return self._fail(f"Unsupported action_type {at!r}")
+    def _fail(self, msg: str) -> bool:
+        """Record ``msg`` as the last step's error, log it to the action log, and return ``False``.
+
+        Returning ``False`` lets callers write ``return self._fail(...)``.
+        """
+        self._last_step_ok = False
+        self._last_step_error = msg
+        self._last_step_detail = ""
+        self._world.action_log.append(f"error: {msg}")
+        return False
+    def _ensure_reward_log_dir(self) -> None:
+        """Create the reward log's parent directory (and any missing ancestors) if needed."""
+        self._reward_log_path.parent.mkdir(parents=True, exist_ok=True)
+    def _append_reward_log(
+        self,
+        breakdown: RewardBreakdown,
+        episode_done: bool,
+        action: GhostexecAction,
+    ) -> None:
+        """Append one JSON line describing this step's reward to the reward log file.
+
+        The line combines the reward breakdown fields with a few world counters (overlapping
+        meeting pairs, unreplied critical emails, overdue tasks) measured after the action.
+        """
+        self._ensure_reward_log_dir()
+        w = self.world
+        crit_open = sum(1 for e in w.emails if e.priority == "critical" and not e.replied)
+        overdue_n = len(self.overdue_tasks_at(w.simulation_time))
+        line = {
+            "episode_id": self._state.episode_id,
+            "step": self._state.step_count,
+            "action_type": action.action_type,
+            "step_ok": self._last_step_ok,
+            "reward": breakdown.final,
+            "conflict_raw": breakdown.conflict_raw,
+            "critical_queue_bonus": breakdown.critical_queue_bonus,
+            "conflict": breakdown.conflict,
+            "relationship": breakdown.relationship,
+            "task": breakdown.task,
+            "weighted_base": breakdown.weighted_base,
+            "output_scale": breakdown.output_scale,
+            "invalid_step_adjustment": breakdown.invalid_step_adjustment,
+            "episode_completion_bonus": breakdown.episode_completion_bonus,
+            "catastrophic_penalty": breakdown.catastrophic_penalty,
+            "episode_done": episode_done,
+            "calendar_overlap_pairs": len(self.detect_meeting_conflicts()),
+            "critical_unreplied": crit_open,
+            "overdue_tasks": overdue_n,
+        }
+        # Opened in append mode on every call, so the file accumulates across steps and episodes.
+        with self._reward_log_path.open("a", encoding="utf-8") as fh:
+            fh.write(json.dumps(line) + "\n")
+    def _maybe_apply_schema_drift_events(self) -> None:
+        if not self._world or not self._drift_events:
+            return
+        step = self._state.step_count
+        for ev in self._drift_events:
+            if ev.get("after_step") != step:
+                continue
+            if "shift_all_meetings_hours" in ev:
+                delta = int(ev["shift_all_meetings_hours"])
+                for i, m in enumerate(self._world.meetings):
+                    new_start = (_parse_dt(m.start) + timedelta(hours=delta)).replace(tzinfo=None)
+                    self._world.meetings[i] = m.model_copy(
+                        update={"start": new_start.isoformat(timespec="seconds")}
+                    )
+                self._world.action_log.append(
+                    f"schema drift: shifted all meeting starts by {delta:+d} hour(s) (calendar TZ policy)."
+                )
+            pref = ev.get("set_contact_preference")
+            if isinstance(pref, dict):
+                name = str(pref.get("name", ""))
+                comm = str(pref.get("communication_preference", "text"))
+                for i, c in enumerate(self._world.contacts):
+                    if c.name == name:
+                        self._world.contacts[i] = c.model_copy(
+                            update={"communication_preference": comm}  # type: ignore[arg-type]
+                        )
+                        break
+                self._world.action_log.append(
+                    f"schema drift: contact {name!r} now prefers {comm} only (relationship channel change)."
+                )
+            td = ev.get("set_task_deadline")
+            if isinstance(td, dict):
+                tid = str(td.get("task_id", ""))
+                dl = str(td.get("deadline", ""))
+                for i, t in enumerate(self._world.tasks):
+                    if t.id == tid:
+                        self._world.tasks[i] = t.model_copy(update={"deadline": dl})
+                        break
+                self._world.action_log.append(
+                    f"schema drift: task {tid!r} deadline moved earlier to {dl!r}."
+                )
+            for name in ev.get("suppress_reply_relationship_for_senders", []) or []:
+                self._reply_relationship_suppressed.add(str(name))
+                self._world.action_log.append(
+                    f"schema drift: replies to emails from {name!r} yield zero relationship score this episode."
+                )
+            scm = ev.get("set_contact_mood")
+            if isinstance(scm, dict):
+                cname = str(scm.get("name", ""))
+                mood_raw = str(scm.get("mood", "neutral"))
+                allowed: tuple[Mood, ...] = ("happy", "neutral", "annoyed", "angry", "furious")
+                if cname and mood_raw in allowed and self.update_contact_mood(cname, mood_raw):
+                    self._world.action_log.append(
+                        f"schema drift: stakeholder {cname!r} mood is now {mood_raw} (external pressure)."
+                    )
+        if any(ev.get("after_step") == step for ev in self._drift_events):
+            self._rebuild_conflict_list()
+    # --- internals ---
+    def _reapply_task_overdue_flags(self) -> None:
+        now = _parse_dt(self.world.simulation_time)
+        for i, t in enumerate(self.world.tasks):
+            if t.status == "done":
+                continue
+            if _parse_dt(t.deadline) < now and t.status != "overdue":
+                self.world.tasks[i] = t.model_copy(update={"status": "overdue"})
+    def _rebuild_conflict_list(self) -> None:
+        lines: list[str] = []
+        for row in self.detect_meeting_conflicts():
+            lines.append(
+                f"Calendar overlap: {row['meeting_a']} vs {row['meeting_b']} "
+                f"({row['overlap_start']} – {row['overlap_end']})"
+            )
+        for e in self.world.emails:
+            if e.priority == "critical" and not e.replied:
+                lines.append(f"Unanswered critical email {e.id}: {e.subject}")
+        bump = min(35, len(lines) * 2)
+        self.world.active_conflicts = lines
+        self.world.stress = min(100, self._base_stress + bump)
+    def _observation_from_briefing(
+        self,
+        briefing: str,
+        reward: float,
+        done: bool,
+        reward_breakdown: RewardBreakdown | None = None,
+    ) -> GhostexecObservation:
+        """Wrap the briefing text and a metadata summary into a ``GhostexecObservation``.
+
+        Args:
+            briefing: Plain-text briefing; truncated to 48,000 characters (ellipsis included).
+            reward: Reward to report for this observation.
+            done: Whether the episode has ended.
+            reward_breakdown: When given, its ``model_dump()`` is added to the metadata under
+                ``"reward_breakdown"``.
+        """
+        w = self.world
+        unread_sorted = self.get_unread_emails_sorted()
+        meta: dict[str, Any] = {
+            "simulation_time": w.simulation_time,
+            "stress": w.stress,
+            "unread_email_count": sum(1 for e in w.emails if not e.read),
+            "calendar_conflict_pairs": len(self.detect_meeting_conflicts()),
+            "episode_step": self._state.step_count,
+            "max_episode_steps": w.max_episode_steps,
+            "episode_active": w.episode_active,
+            "episode_end_reason": w.episode_end_reason,
+            "step_ok": self._last_step_ok,
+            "step_error": self._last_step_error,
+            "step_detail": self._last_step_detail,
+            # Compact ids for remote trainers / Colab (briefing stays plain text).
+            "critical_unreplied_email_ids": [
+                e.id for e in w.emails if e.priority == "critical" and not e.replied
+            ][:12],
+            "unread_email_ids": [e.id for e in unread_sorted[:15]],
+            "overdue_task_ids": [t.id for t in self.overdue_tasks_at(w.simulation_time)][:12],
+            "active_meeting_ids": [m.id for m in w.meetings if not m.cancelled][:20],
+        }
+        if reward_breakdown is not None:
+            meta["reward_breakdown"] = reward_breakdown.model_dump()
+        # Keep the text within the cap; the last kept character is replaced by an ellipsis.
+        cap = 48_000
+        text = briefing if len(briefing) <= cap else briefing[: cap - 1] + "…"
+        return GhostexecObservation(
+            echoed_message=text,
+            message_length=len(text),
+            done=done,
+            reward=reward,
+            metadata=meta,
+        )
+def _fmt_meeting_line(m: Meeting) -> str:
+    st = _parse_dt(m.start)
+    return f"{st.strftime('%H:%M')}: {m.title} ({m.duration_minutes}min)"

server/requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+openenv[core]>=0.2.0
+fastapi>=0.115.0
+uvicorn>=0.24.0

server/reward.py ADDED Viewed

	@@ -0,0 +1,350 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Phase 4 reward: weighted (0.35 / 0.35 / 0.30) with potential-style deltas, critical-queue
+shaping, full sub-scores even on invalid steps (+ explicit invalid penalty), and mild output
+scaling.
+"""
+from __future__ import annotations
+from datetime import datetime, timedelta, timezone
+from typing import Any
+try:
+    from ..models import GhostexecAction, RewardBreakdown, WorldState
+except ImportError:
+    from models import GhostexecAction, RewardBreakdown, WorldState
+# Channel weights (conflict / relationship / task); they sum to 1.0.
+# NOTE(review): applied outside this excerpt, presumably in compute_step_reward — confirm.
+W_CONFLICT = 0.35
+W_REL = 0.35
+W_TASK = 0.30
+# Raw conflict units (pre-weight) are clamped to keep invalid / idle steps from exploding.
+CONFLICT_RAW_CAP: float = 6.0
+# Scales the weighted sum of the three channels (weights stay fixed per hackathon rules).
+WEIGHTED_OUTPUT_SCALE: float = 0.48
+# Tone misfit penalties kept small vs outcome terms (~<20% of a strong +2 conflict step after weights).
+TONE_PENALTY_CASUAL_ANGRY_BOARD: float = 0.35
+TONE_PENALTY_FORMAL_PERSONAL: float = 0.08
+# Added on top of the +2.0 awarded for each calendar overlap that disappears.
+_RESOLVE_MICRO_BONUS: float = 0.12
+# Awarded per critical email that stops being unreplied during the step.
+_CRITICAL_PER_EMAIL_BONUS: float = 0.22
+# Small bonuses for valid actions of the named type.
+_RESCHEDULE_VALID_MICRO_BONUS: float = 0.10
+_SEND_MESSAGE_VALID_MICRO_BONUS: float = 0.08
+_COMPLETE_TASK_VALID_MICRO_BONUS: float = 0.06
+_DELEGATE_TASK_VALID_MICRO_BONUS: float = 0.10
+_DO_NOTHING_STRICT_PENALTY: float = -0.15
+# Relationship bonus for a valid, non-empty reply, keyed by the email's priority.
+_REPLY_PRIORITY_MICRO_BONUS: dict[str, float] = {
+    "critical": 0.30,
+    "high": 0.15,
+    "normal": 0.04,
+    "low": 0.02,
+}
+# Ordinal mood scale (higher is better); score_relationship compares before/after ranks.
+_MOOD_RANK: dict[str, int] = {
+    "happy": 4,
+    "neutral": 3,
+    "annoyed": 2,
+    "angry": 1,
+    "furious": 0,
+}
+def _parse_dt(value: str) -> datetime:
+    if value.endswith("Z"):
+        return datetime.fromisoformat(value[:-1]).replace(tzinfo=timezone.utc)
+    dt = datetime.fromisoformat(value)
+    if dt.tzinfo is None:
+        return dt.replace(tzinfo=timezone.utc)
+    return dt
+def _meeting_end(m: Any) -> datetime:
+    start = _parse_dt(m.start)
+    return start + timedelta(minutes=m.duration_minutes)
+def _overlap(a0: datetime, a1: datetime, b0: datetime, b1: datetime) -> bool:
+    return a0 < b1 and b0 < a1
+def meeting_conflicts(world: WorldState) -> list[dict[str, Any]]:
+    active = [m for m in world.meetings if not m.cancelled]
+    out: list[dict[str, Any]] = []
+    for i, a in enumerate(active):
+        a0, a1 = _parse_dt(a.start), _meeting_end(a)
+        for b in active[i + 1 :]:
+            b0, b1 = _parse_dt(b.start), _meeting_end(b)
+            if _overlap(a0, a1, b0, b1):
+                o0, o1 = max(a0, b0), min(a1, b1)
+                out.append(
+                    {
+                        "meeting_a": a.id,
+                        "meeting_b": b.id,
+                        "overlap_start": o0.isoformat(),
+                        "overlap_end": o1.isoformat(),
+                    }
+                )
+    return out
+def _pair_set(rows: list[dict[str, Any]]) -> set[frozenset[str]]:
+    return {frozenset((r["meeting_a"], r["meeting_b"])) for r in rows}
+def _attendee_moods_ok(world: WorldState, pair: frozenset[str]) -> bool:
+    names: set[str] = set()
+    for mid in pair:
+        m = next((x for x in world.meetings if x.id == mid), None)
+        if m:
+            names.update(m.attendees)
+    for n in names:
+        c = next((x for x in world.contacts if x.name == n), None)
+        if c is None:
+            continue
+        if c.mood not in ("happy", "neutral"):
+            return False
+    return True
+def score_conflict_resolution(
+    before: WorldState,
+    after: WorldState,
+    action: GhostexecAction,
+    *,
+    action_ok: bool,
+) -> float:
+    b = _pair_set(meeting_conflicts(before))
+    a = _pair_set(meeting_conflicts(after))
+    s = 0.0
+    for _p in b - a:
+        s += 2.0 + _RESOLVE_MICRO_BONUS
+        if _attendee_moods_ok(after, _p):
+            s += 1.0
+    for _ in a - b:
+        s -= 3.0
+    if action_ok and action.action_type == "reschedule_meeting":
+        s += _RESCHEDULE_VALID_MICRO_BONUS
+    return s
+def critical_unreplied_count(world: WorldState) -> int:
+    return sum(1 for e in world.emails if e.priority == "critical" and not e.replied)
+def score_critical_queue_bonus(before: WorldState, after: WorldState) -> float:
+    reduction = critical_unreplied_count(before) - critical_unreplied_count(after)
+    return _CRITICAL_PER_EMAIL_BONUS * max(0, reduction)
+def _classify_tone(text: str) -> str:
+    t = text.lower()
+    if any(w in t for w in ("sorry", "apologize", "apologies", "my mistake")):
+        return "apologetic"
+    if any(w in t for w in ("dear ", "sincerely", "best regards", "respectfully", "cordially")):
+        return "formal"
+    if any(w in t for w in ("hey", "lol", "haha", "👋", "no worries", "cheers")):
+        return "casual"
+    if any(w in t for w in ("must", "immediately", "asap", "non-negotiable", "demand")):
+        return "assertive"
+    return "neutral"
+def score_relationship(
+    before: WorldState,
+    after: WorldState,
+    action: GhostexecAction,
+    *,
+    action_ok: bool,
+    relationship_suppressed_for_email_to: frozenset[str] | None = None,
+) -> float:
+    """Score the relationship channel for one step.
+
+    Combines (1) mood changes of contacts present in both snapshots, (2) reply-specific bonuses
+    and tone penalties when the action is ``reply_email``, and (3) a small bonus for a valid
+    ``send_message`` to a known contact.
+
+    Args:
+        before: World snapshot taken before the action.
+        after: World after the action.
+        action: The action being scored.
+        action_ok: Whether the environment accepted the action.
+        relationship_suppressed_for_email_to: Sender names for which a reply scores 0.0 for
+            this whole channel.
+    """
+    rel_sup = relationship_suppressed_for_email_to or frozenset()
+    s = 0.0
+    before_map = {c.name: c for c in before.contacts}
+    after_map = {c.name: c for c in after.contacts}
+    for name, ca in after_map.items():
+        cb = before_map.get(name)
+        if not cb:
+            continue
+        ra, rb = _MOOD_RANK[ca.mood], _MOOD_RANK[cb.mood]
+        # Contacts with importance >= 4 count as VIPs: bigger reward and bigger penalty.
+        vip = ca.importance >= 4
+        if ra > rb:
+            s += 3.0 if vip else 1.0
+        elif ra < rb:
+            s -= 4.0 if vip else 2.0
+    if action.action_type == "reply_email" and action.email_id:
+        em = next((e for e in before.emails if e.id == action.email_id), None)
+        # NOTE(review): this early return also discards any mood deltas accumulated above —
+        # confirm that zeroing the whole channel is intended.
+        if em and em.sender in rel_sup:
+            return 0.0
+        if em:
+            if action_ok and (action.message_body or "").strip():
+                pri = (em.priority or "").lower()
+                micro = _REPLY_PRIORITY_MICRO_BONUS.get(pri, 0.0)
+                # NOTE(review): "VIP" (and "personal" below) are not among the relationship keys
+                # seen elsewhere in this commit (board_member, spouse, investor, ...) — verify
+                # these values actually occur in scenario data.
+                if em.sender_relationship == "VIP":
+                    micro *= 2.0
+                s += micro
+            # Tone penalties apply even when the action itself was rejected.
+            tone = _classify_tone(action.message_body)
+            contact = next((c for c in before.contacts if c.name == em.sender), None)
+            if (
+                contact
+                and contact.relationship_type == "board_member"
+                and contact.mood in ("angry", "furious", "annoyed")
+                and tone == "casual"
+            ):
+                s -= TONE_PENALTY_CASUAL_ANGRY_BOARD
+            if em.sender_relationship == "personal" and tone == "formal":
+                s -= TONE_PENALTY_FORMAL_PERSONAL
+    if action_ok and action.action_type == "send_message" and action.contact_name:
+        known_contact = any(c.name == action.contact_name for c in before.contacts)
+        if known_contact and (action.message_body or "").strip():
+            s += _SEND_MESSAGE_VALID_MICRO_BONUS
+    return s
+def _overdue_tasks(world: WorldState) -> list[Any]:
+    now = _parse_dt(world.simulation_time)
+    out = []
+    for t in world.tasks:
+        if t.status == "done":
+            continue
+        if _parse_dt(t.deadline) < now:
+            out.append(t)
+    return out
+def score_task_completion(
+    before: WorldState,
+    after: WorldState,
+    action: GhostexecAction,
+    *,
+    action_ok: bool,
+) -> float:
+    s = 0.0
+    now = _parse_dt(after.simulation_time)
+    before_tasks = {t.id: t for t in before.tasks}
+    after_tasks = {t.id: t for t in after.tasks}
+    for tid, ta in after_tasks.items():
+        tb = before_tasks.get(tid)
+        if not tb:
+            continue
+        if tb.status != "overdue" and tb.status != "done" and ta.status == "overdue":
+            s -= 2.0
+        if tb.status != "done" and ta.status == "done":
+            dl = _parse_dt(tb.deadline)
+            if dl >= now:
+                s += 2.0
+            else:
+                s += 0.5
+        if (not tb.delegated_to) and ta.delegated_to:
+            de = next((c for c in after.contacts if c.name == ta.delegated_to), None)
+            if de and de.importance <= 3:
+                s += 1.0
+    if action_ok and action.action_type == "complete_task":
+        s += _COMPLETE_TASK_VALID_MICRO_BONUS
+    if action_ok and action.action_type == "delegate_task":
+        s += _DELEGATE_TASK_VALID_MICRO_BONUS
+    return s
+def catastrophic(world: WorldState) -> bool:
+    vip_furious = any(c.importance >= 4 and c.mood == "furious" for c in world.contacts)
+    critical_open = sum(1 for e in world.emails if e.priority == "critical" and not e.replied)
+    return vip_furious or critical_open > 3
+def aggregate_scores(
+    conflict: float,
+    relationship: float,
+    task: float,
+    *,
+    conflict_raw: float,
+    critical_queue_bonus: float,
+    weighted_inner: float,
+    action_ok: bool,
+    episode_done: bool,
+    world_after: WorldState,
+) -> RewardBreakdown:
+    weighted = WEIGHTED_OUTPUT_SCALE * weighted_inner
+    inv = 0.0
+    if not action_ok:
+        inv = -0.25
+    bonus = 0.0
+    cata = 0.0
+    if episode_done:
+        if world_after.stress < 40:
+            bonus = 10.0
+        if catastrophic(world_after):
+            cata = -15.0
+    final = weighted + inv + bonus + cata
+    return RewardBreakdown(
+        conflict_raw=conflict_raw,
+        critical_queue_bonus=critical_queue_bonus,
+        conflict=conflict,
+        relationship=relationship,
+        task=task,
+        weighted_base=weighted,
+        output_scale=WEIGHTED_OUTPUT_SCALE,
+        invalid_step_adjustment=inv,
+        episode_completion_bonus=bonus,
+        catastrophic_penalty=cata,
+        do_nothing_floor=0.0,
+        final=final,
+    )
+def apply_do_nothing_penalty_floor(
+    action: GhostexecAction,
+    breakdown: RewardBreakdown,
+) -> RewardBreakdown:
+    if action.action_type != "do_nothing":
+        return breakdown
+    floor_delta = _DO_NOTHING_STRICT_PENALTY
+    new_final = breakdown.final + floor_delta
+    return breakdown.model_copy(
+        update={"do_nothing_floor": floor_delta, "final": new_final},
+    )
+def compute_step_reward(
+    before: WorldState,
+    after: WorldState,
+    action: GhostexecAction,
+    *,
+    action_ok: bool,
+    episode_done: bool,
+    relationship_suppressed_for_email_to: frozenset[str] | None = None,
+) -> RewardBreakdown:
+    c_core = score_conflict_resolution(before, after, action, action_ok=action_ok)
+    crit_b = score_critical_queue_bonus(before, after)
+    c_raw = c_core + crit_b
+    c = max(-CONFLICT_RAW_CAP, min(CONFLICT_RAW_CAP, c_raw))
+    r = score_relationship(
+        before,
+        after,
+        action,
+        action_ok=action_ok,
+        relationship_suppressed_for_email_to=relationship_suppressed_for_email_to,
+    )
+    t = score_task_completion(before, after, action, action_ok=action_ok)
+    weighted_inner = W_CONFLICT * c + W_REL * r + W_TASK * t
+    bd = aggregate_scores(
+        c,
+        r,
+        t,
+        conflict_raw=c_raw,
+        critical_queue_bonus=crit_b,
+        weighted_inner=weighted_inner,
+        action_ok=action_ok,
+        episode_done=episode_done,
+        world_after=after,
+    )
+    return apply_do_nothing_penalty_floor(action, bd)

tests/test_api_reward_dead_500.py ADDED Viewed

	@@ -0,0 +1,150 @@

+"""Hard API dead-test: 500+ calls with reward-consistency checks."""
+from __future__ import annotations
+from collections.abc import Iterator
+from typing import Any
+import pytest
+from fastapi.testclient import TestClient
+from ghostexec.server.app import app
+W_CONFLICT = 0.35
+W_REL = 0.35
+W_TASK = 0.30
+OUTPUT_SCALE = 0.48
+def _step_payload_for(i: int) -> dict[str, Any]:
+    """Return the POST /step payload for case index ``i``.
+    Payloads cycle through a fixed template list (``i`` modulo its length), so
+    the order of entries below determines which case index exercises which action.
+    """
+    templates: list[dict[str, Any]] = [
+        {"action": {"action_type": "do_nothing"}},
+        {"action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}},
+        {"action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}},
+        {"action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}},
+        {"action": {"action_type": "archive_email", "email_id": "e09"}},
+        {"action": {"action_type": "archive_email", "email_id": "bad_id"}},
+        {
+            "action": {
+                "action_type": "reschedule_meeting",
+                "meeting_id": "m02",
+                "new_time": "2026-04-21T18:00:00",
+            }
+        },
+        {
+            "action": {
+                "action_type": "reschedule_meeting",
+                "meeting_id": "m03",
+                "new_time": "2026-04-21T09:30:00",  # overlap -> invalid semantic
+            }
+        },
+        {"action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}},
+        {"action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}},
+        {"action": {"action_type": "complete_task", "task_id": "t07"}},
+        {"action": {"action_type": "complete_task", "task_id": "t09"}},  # already done
+        {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}},
+        {"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}},
+        {
+            "action": {
+                "action_type": "send_message",
+                "contact_name": "Jamie Liu",
+                "message_body": "Quick sync please.",
+            }
+        },
+        {
+            "action": {
+                "action_type": "send_message",
+                "contact_name": "Nobody",
+                "message_body": "hello",
+            }
+        },
+    ]
+    return templates[i % len(templates)]
+@pytest.fixture(scope="module")
+def client() -> TestClient:
+    return TestClient(app, raise_server_exceptions=True)
+def test_api_surface_all_endpoints(client: TestClient) -> None:
+    # Core GET endpoints.
+    for path in ("/health", "/metadata", "/state", "/schema", "/openapi.json", "/docs", "/redoc"):
+        r = client.get(path)
+        assert r.status_code == 200, f"{path} -> {r.status_code}"
+    # Control routes: method contracts.
+    assert client.get("/reset").status_code == 405
+    assert client.get("/step").status_code == 405
+    assert client.put("/reset", json={}).status_code in (405, 422)
+    assert client.get("/this-path-should-not-exist-ghostexec").status_code == 404
+    # Reset variants.
+    for body in ({}, {"seed": 42}, {"episode_id": "dead-api-001"}, {"seed": 1, "future_field": True}):
+        rr = client.post("/reset", json=body)
+        assert rr.status_code == 200
+        j = rr.json()
+        assert "observation" in j and "done" in j
+    # MCP endpoint variants.
+    mcp_ok = client.post(
+        "/mcp",
+        json={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}},
+    )
+    assert mcp_ok.status_code == 200
+    mcp_bad_json = client.post("/mcp", content="{", headers={"Content-Type": "application/json"})
+    assert mcp_bad_json.status_code == 200
+@pytest.mark.parametrize("idx", range(520))
+def test_api_reward_dead_520_cases(client: TestClient, idx: int) -> None:
+    """Reset, take one templated step, and check the reward breakdown is self-consistent.
+    Verifies the response structure, that the top-level reward equals
+    ``reward_breakdown["final"]``, that ``weighted_base`` matches the weighted
+    formula built from this module's constants, and that ``final`` is the sum
+    of its reported components.
+    """
+    # Keep each case independent and deterministic.
+    rr = client.post("/reset", json={"episode_id": f"dead-{idx:04d}", "seed": 42})
+    assert rr.status_code == 200
+    payload = _step_payload_for(idx)
+    rs = client.post("/step", json=payload)
+    assert rs.status_code == 200, f"idx={idx} payload={payload} status={rs.status_code}"
+    body = rs.json()
+    assert "observation" in body and "reward" in body and "done" in body
+    obs = body["observation"]
+    meta = obs.get("metadata") or {}
+    bd = meta.get("reward_breakdown") or {}
+    # Structural contracts.
+    assert isinstance(obs.get("echoed_message", ""), str) and obs.get("echoed_message")
+    assert "step_ok" in meta
+    assert "step_detail" in meta
+    assert "final" in bd
+    assert "weighted_base" in bd
+    # Reward identity: top-level reward must equal breakdown.final.
+    reward = float(body["reward"])
+    final = float(bd["final"])
+    assert reward == pytest.approx(final, abs=1e-9)
+    # Aggregation formula must hold exactly (within floating tolerance).
+    conflict = float(bd.get("conflict", 0.0))
+    relationship = float(bd.get("relationship", 0.0))
+    task = float(bd.get("task", 0.0))
+    weighted_inner = W_CONFLICT * conflict + W_REL * relationship + W_TASK * task
+    expected_weighted = OUTPUT_SCALE * weighted_inner
+    assert float(bd["weighted_base"]) == pytest.approx(expected_weighted, abs=1e-9)
+    # final must be the plain sum of every reported component.
+    expected_final = (
+        float(bd.get("weighted_base", 0.0))
+        + float(bd.get("invalid_step_adjustment", 0.0))
+        + float(bd.get("episode_completion_bonus", 0.0))
+        + float(bd.get("catastrophic_penalty", 0.0))
+        + float(bd.get("do_nothing_floor", 0.0))
+    )
+    assert final == pytest.approx(expected_final, abs=1e-9)
+    action_type = payload["action"]["action_type"]
+    # Action-specific adjustments: the literals below are the values this test expects the server to report.
+    if action_type == "do_nothing":
+        assert float(bd.get("do_nothing_floor", 0.0)) == pytest.approx(-0.15, abs=1e-12)
+        assert reward < 0
+    if meta.get("step_ok") is False:
+        assert float(bd.get("invalid_step_adjustment", 0.0)) == pytest.approx(-0.25, abs=1e-12)

tests/test_complete_integration.py ADDED Viewed

	@@ -0,0 +1,235 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# End-to-end stack test: FastAPI/OpenEnv HTTP + WebSocket, GhostExec env,
+# and (optionally) GhostexecEnv client over ASGI TestClient.
+from __future__ import annotations
+import json
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+import pytest
+from fastapi.testclient import TestClient
+from ghostexec.models import GhostexecAction
+from ghostexec.server.app import app
+from ghostexec.server.ghostexec_environment import GhostexecEnvironment
+ROOT = Path(__file__).resolve().parents[1]
+SCENARIO = ROOT / "scenarios" / "phase2_core.json"
+MONDAY = ROOT / "scenarios" / "monday_morning.json"
+def _http_paths(client: TestClient) -> set[str]:
+    paths: set[str] = set()
+    for r in app.routes:
+        p = getattr(r, "path", None)
+        if isinstance(p, str) and p:
+            paths.add(p)
+    return paths
+def test_server_app_import_matches_uvicorn_server_string() -> None:
+    """`uvicorn server.app:app` loads `server.app` with cwd on path (no `ghostexec.` prefix)."""
+    rc = subprocess.run(
+        [sys.executable, "-c", "import server.app; assert server.app.app is not None"],
+        cwd=str(ROOT),
+        check=False,
+    )
+    assert rc.returncode == 0, "import server.app must work from ghostexec repo root"
+def test_openapi_docs_and_schema_discovery() -> None:
+    with TestClient(app, raise_server_exceptions=True) as client:
+        r = client.get("/openapi.json")
+        assert r.status_code == 200
+        spec = r.json()
+        assert spec.get("openapi")
+        assert "paths" in spec and spec["paths"]
+        for path in ("/docs", "/redoc"):
+            resp = client.get(path)
+            assert resp.status_code == 200
+            assert len(resp.text) > 100
+def test_openapi_examples_match_ghostexec_observation_shape() -> None:
+    spec = app.openapi()
+    for path in ("/reset", "/step"):
+        ex = spec["paths"][path]["post"]["responses"]["200"]["content"]["application/json"]["example"]
+        obs = ex["observation"]
+        assert "echoed_message" in obs and "message_length" in obs
+        assert "status" not in obs and "data" not in obs
+        assert "reward" in ex and "done" in ex
+def test_openapi_info_documents_http_vs_websocket_episode() -> None:
+    """Runtime-visible API docs: HTTP reset/step are not one persistent episode; /ws is."""
+    spec = app.openapi()
+    desc = spec.get("info", {}).get("description") or ""
+    assert "Ghostexec / OpenEnv HTTP" in desc
+    assert "/ws" in desc and "WebSocket" in desc
+def test_all_registered_get_post_routes_smoke() -> None:
+    """Smoke every stable OpenEnv HTTP route (simulation mode, no Gradio /web)."""
+    with TestClient(app, raise_server_exceptions=True) as client:
+        paths = _http_paths(client)
+        assert "/health" in paths
+        assert "/metadata" in paths
+        assert "/schema" in paths
+        assert "/state" in paths
+        assert "/reset" in paths
+        assert "/step" in paths
+        assert "/ws" in paths
+        assert "/mcp" in paths
+        h = client.get("/health")
+        assert h.status_code == 200
+        assert h.json().get("status") == "healthy"
+        meta = client.get("/metadata")
+        assert meta.status_code == 200
+        body = meta.json()
+        assert body.get("name") in ("ghostexec", "GhostexecEnvironment")
+        assert "description" in body
+        st = client.get("/state")
+        assert st.status_code == 200
+        assert "step_count" in st.json()
+        sch = client.get("/schema")
+        assert sch.status_code == 200
+        sj = sch.json()
+        assert "action" in sj and "observation" in sj and "state" in sj
+        assert sj["action"].get("title") or sj["action"].get("properties")
+def test_http_reset_and_step_return_valid_payloads() -> None:
+    """
+    Stateless HTTP: each request builds a fresh env (OpenEnv design).
+    POST /step on a new instance loads the scenario then applies the action (primed reset).
+    """
+    with TestClient(app, raise_server_exceptions=True) as client:
+        reset = client.post("/reset", json={})
+        assert reset.status_code == 200
+        rj = reset.json()
+        assert "observation" in rj
+        obs = rj["observation"]
+        assert "echoed_message" in obs
+        assert "GHOSTEXEC BRIEFING" in (obs.get("echoed_message") or "")
+        step = client.post(
+            "/step",
+            json={
+                "action": {
+                    "action_type": "reply_email",
+                    "email_id": "e05",
+                    "message_body": "On it.",
+                }
+            },
+        )
+        assert step.status_code == 200
+        sj = step.json()
+        assert "observation" in sj
+        assert sj.get("reward") is not None or sj["observation"].get("reward") is not None
+def test_http_step_invalid_action_422() -> None:
+    with TestClient(app, raise_server_exceptions=True) as client:
+        bad = client.post("/step", json={"action": "not-an-object"})
+        assert bad.status_code == 422
+def test_mcp_jsonrpc_tools_list() -> None:
+    with TestClient(app, raise_server_exceptions=True) as client:
+        payload = {"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}}
+        r = client.post("/mcp", json=payload)
+        assert r.status_code == 200
+        data = r.json()
+        assert "result" in data or "error" in data
+def test_websocket_full_episode_reset_step_state_close() -> None:
+    """Drive one episode over /ws: reset, a reschedule step, a state query, then close."""
+    with TestClient(app, raise_server_exceptions=True) as client:
+        with client.websocket_connect("/ws") as ws:
+            # reset -> observation message carrying the briefing text.
+            ws.send_json({"type": "reset", "data": {}})
+            msg = ws.receive_json()
+            assert msg.get("type") == "observation"
+            data = msg.get("data") or {}
+            assert "observation" in data
+            inner = data["observation"]
+            assert "echoed_message" in inner
+            assert "GHOSTEXEC BRIEFING" in inner.get("echoed_message", "")
+            # step -> observation message with a reward; the action fields are sent directly as "data".
+            ws.send_json(
+                {
+                    "type": "step",
+                    "data": {
+                        "action_type": "reschedule_meeting",
+                        "meeting_id": "m02",
+                        "new_time": "2026-04-21T18:00:00",
+                    },
+                }
+            )
+            msg2 = ws.receive_json()
+            assert msg2.get("type") == "observation"
+            d2 = msg2.get("data") or {}
+            assert d2.get("reward") is not None
+            # state -> step_count reflects the step taken on this same connection.
+            ws.send_json({"type": "state"})
+            msg3 = ws.receive_json()
+            assert msg3.get("type") == "state", msg3
+            st = msg3.get("data") or {}
+            assert st.get("step_count", 0) >= 1
+            # close is sent without waiting for a reply.
+            ws.send_json({"type": "close", "data": {}})
+def test_inprocess_env_matches_ws_briefing_shape() -> None:
+    env = GhostexecEnvironment(SCENARIO)
+    obs = env.reset()
+    assert "BRIEFING" in obs.echoed_message
+    o2 = env.step(
+        GhostexecAction(
+            action_type="reschedule_meeting",
+            meeting_id="m02",
+            new_time="2026-04-21T18:00:00",
+        )
+    )
+    assert o2.reward is not None
+    assert o2.metadata.get("step_ok") is True
+def test_monday_morning_scenario_reward_signal() -> None:
+    assert MONDAY.is_file()
+    env = GhostexecEnvironment(MONDAY)
+    env.reset()
+    r = env.step(GhostexecAction(action_type="do_nothing")).reward
+    assert isinstance(r, float)
+def test_ghostexec_env_client_against_live_url_if_set() -> None:
+    """
+    GhostexecEnv opens a real TCP WebSocket; Starlette TestClient uses the
+    non-resolvable host ``testserver`` on some platforms, so this only runs when
+    ``GHOSTEXEC_WS_BASE_URL`` points at a live server (e.g. local uvicorn).
+    """
+    base = os.environ.get("GHOSTEXEC_WS_BASE_URL", "").strip().rstrip("/")
+    if not base:
+        pytest.skip("Set GHOSTEXEC_WS_BASE_URL (e.g. http://127.0.0.1:8000) to test GhostexecEnv client.")
+    from ghostexec.client import GhostexecEnv
+    sync_client = GhostexecEnv(base_url=base).sync()
+    with sync_client:
+        res = sync_client.reset()
+        assert res.observation.echoed_message
+        res2 = sync_client.step(GhostexecAction(action_type="do_nothing"))
+        assert res2.observation.echoed_message

tests/test_docker_build.py ADDED Viewed

	@@ -0,0 +1,60 @@

+"""Opt-in Docker build smoke test for Phase 1 deployment readiness."""
+from __future__ import annotations
+import os
+import shutil
+import subprocess
+from pathlib import Path
+import pytest
+ROOT = Path(__file__).resolve().parents[1]
+@pytest.mark.skipif(
+    shutil.which("docker") is None or os.environ.get("GHOSTEXEC_RUN_DOCKER_BUILD") != "1",
+    reason="Set GHOSTEXEC_RUN_DOCKER_BUILD=1 and ensure docker is installed to run this test.",
+)
+def test_server_dockerfile_builds():
+    daemon = subprocess.run(
+        ["docker", "version"],
+        cwd=str(ROOT),
+        capture_output=True,
+        text=True,
+        timeout=60,
+        check=False,
+    )
+    if daemon.returncode != 0:
+        pytest.skip("Docker daemon is unavailable on this machine.")
+    image_tag = "ghostexec-env:ci"
+    build_cmd = ["docker", "build", "-t", image_tag, "."]
+    built = subprocess.run(
+        build_cmd,
+        cwd=str(ROOT),
+        capture_output=True,
+        text=True,
+        timeout=900,
+        check=False,
+    )
+    assert built.returncode == 0, (
+        "docker build failed\n"
+        f"stdout:\n{built.stdout}\n"
+        f"stderr:\n{built.stderr}\n"
+    )
+    inspect_cmd = ["docker", "image", "inspect", image_tag]
+    inspected = subprocess.run(
+        inspect_cmd,
+        cwd=str(ROOT),
+        capture_output=True,
+        text=True,
+        timeout=120,
+        check=False,
+    )
+    assert inspected.returncode == 0, (
+        f"image inspect failed for {image_tag}\n"
+        f"stdout:\n{inspected.stdout}\n"
+        f"stderr:\n{inspected.stderr}\n"
+    )

tests/test_env.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""OpenEnv Phase 2 submission guardrails (graders + manifest wiring)."""
+from __future__ import annotations
+import importlib
+import sys
+from pathlib import Path
+import pytest
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from graders import (
+    dinner_disaster_grader,
+    monday_morning_grader,
+    phase2_core_grader,
+)
+PUBLIC_GRADERS = (phase2_core_grader, monday_morning_grader, dinner_disaster_grader)
+@pytest.mark.parametrize("grader", PUBLIC_GRADERS)
+def test_public_graders_are_strictly_bounded(grader):
+    assert grader({"rewards": [1.0]}) == 0.99
+    assert grader({"rewards": [0.0]}) == 0.01
+    assert grader({"rewards": [-5.0]}) == 0.01
+    assert grader({"score": 1.5}) == 0.99
+    assert grader({"score": -0.5}) == 0.01
+    assert grader({"reward": {"total": 1.0}}) == 0.99
+    v = grader(None)
+    assert 0.0 < v < 1.0
+    v = grader({})
+    assert 0.0 < v < 1.0
+def test_openenv_yaml_declares_three_tasks_with_graders():
+    import yaml
+    root = Path(__file__).resolve().parent.parent
+    with (root / "openenv.yaml").open("r", encoding="utf-8") as f:
+        spec = yaml.safe_load(f)
+    tasks = spec.get("tasks", [])
+    assert len(tasks) >= 3, "Phase 2 requires >= 3 tasks"
+    for t in tasks:
+        assert "grader" in t, f"Task {t.get('id')} missing grader"
+        module_path, _, func_name = t["grader"].rpartition(".")
+        mod = importlib.import_module(module_path)
+        assert callable(getattr(mod, func_name)), f"{t['grader']} not callable"

tests/test_live_server_exhaustive.py ADDED Viewed

	@@ -0,0 +1,287 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Exhaustive / adversarial probes against a RUNNING GhostExec HTTP server.
+# Default: http://127.0.0.1:8000  (override with GHOSTEXEC_LIVE_BASE_URL).
+# Skips all tests if /health is unreachable.
+from __future__ import annotations
+import asyncio
+import json
+import os
+import urllib.error
+import urllib.request
+from typing import Any
+import pytest
+BASE = os.environ.get("GHOSTEXEC_LIVE_BASE_URL", "http://127.0.0.1:8000").rstrip("/")
+def _req(
+    method: str,
+    path: str,
+    *,
+    data: bytes | None = None,
+    headers: dict[str, str] | None = None,
+    timeout: float = 15.0,
+) -> tuple[int, bytes]:
+    url = BASE + path
+    h = urllib.request.Request(url, data=data, headers=headers or {}, method=method)
+    try:
+        with urllib.request.urlopen(h, timeout=timeout) as resp:
+            return resp.status, resp.read()
+    except urllib.error.HTTPError as e:
+        try:
+            body = e.read()
+        except (ConnectionResetError, OSError):
+            body = b""
+        return e.code, body
+@pytest.fixture(scope="module")
+def live() -> str:
+    try:
+        code, _ = _req("GET", "/health", timeout=3.0)
+    except OSError as e:
+        pytest.skip(f"Live server not reachable at {BASE!r}: {e}")
+    if code != 200:
+        pytest.skip(f"Live /health returned {code} at {BASE!r}")
+    return BASE
+def test_get_core_docs(live: str) -> None:
+    for path, min_len in [
+        ("/health", 10),
+        ("/metadata", 20),
+        ("/state", 10),
+        ("/schema", 500),
+        ("/openapi.json", 1000),
+        ("/docs", 200),
+        ("/redoc", 200),
+    ]:
+        code, body = _req("GET", path)
+        assert code == 200, f"{path} -> {code}"
+        assert len(body) >= min_len, f"{path} body tiny"
+def test_wrong_http_methods_on_control_routes(live: str) -> None:
+    assert _req("GET", "/reset")[0] == 405
+    assert _req("GET", "/step")[0] == 405
+    assert _req("PUT", "/reset", data=b"{}")[0] in (405, 422)
+    code, _ = _req("DELETE", "/health")
+    assert code in (405, 404)
+    assert _req("GET", "/this-path-should-not-exist-ghostexec")[0] == 404
+def test_reset_payload_variants(live: str) -> None:
+    for label, payload in [
+        ("empty", {}),
+        ("seed", {"seed": 42}),
+        ("episode_id", {"episode_id": "probe-episode-1"}),
+        ("extra_ignored", {"seed": 1, "unknown_future_field_xyz": True}),
+    ]:
+        code, body = _req(
+            "POST",
+            "/reset",
+            data=json.dumps(payload).encode(),
+            headers={"Content-Type": "application/json"},
+        )
+        assert code == 200, f"reset {label}: {code}"
+        j = json.loads(body.decode())
+        assert "observation" in j and "done" in j
+        obs = j["observation"]
+        assert "echoed_message" in obs
+def test_step_valid_action_types(live: str) -> None:
+    """POST /step with one payload per action type; each must return 200 and report ``step_ok``."""
+    # (label, action) pairs; the label is only used in assertion messages.
+    cases: list[tuple[str, dict[str, Any]]] = [
+        ("do_nothing", {"action_type": "do_nothing"}),
+        (
+            "reply_email",
+            {"action_type": "reply_email", "email_id": "e14", "message_body": "Live exhaustive probe."},
+        ),
+        ("archive_email", {"action_type": "archive_email", "email_id": "e09"}),
+        (
+            "reschedule_meeting",
+            {
+                "action_type": "reschedule_meeting",
+                "meeting_id": "m02",
+                "new_time": "2026-04-21T18:00:00",
+            },
+        ),
+        (
+            "cancel_meeting",
+            {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "probe cancel"},
+        ),
+        ("complete_task", {"action_type": "complete_task", "task_id": "t07"}),
+        (
+            "delegate_task",
+            {
+                "action_type": "delegate_task",
+                "task_id": "t08",
+                "contact_name": "Jordan Lee",
+            },
+        ),
+        (
+            "send_message",
+            {
+                "action_type": "send_message",
+                "contact_name": "Jamie Liu",
+                "message_body": "Exhaustive live test ping.",
+            },
+        ),
+    ]
+    for name, action in cases:
+        code, body = _req(
+            "POST",
+            "/step",
+            data=json.dumps({"action": action}).encode(),
+            headers={"Content-Type": "application/json"},
+        )
+        assert code == 200, f"step {name}: HTTP {code} {body[:200]!r}"
+        j = json.loads(body.decode())
+        assert "observation" in j
+        # Only the presence of step_ok is checked here, not its value.
+        meta = (j.get("observation") or {}).get("metadata") or {}
+        assert "step_ok" in meta, f"step {name}: missing step_ok"
+def test_step_invalid_contracts(live: str) -> None:
+    assert _req("POST", "/step", data=b"not-json", headers={"Content-Type": "application/json"})[0] in (
+        400,
+        422,
+    )
+    assert (
+        _req(
+            "POST",
+            "/step",
+            data=json.dumps({"action": "not-a-dict"}).encode(),
+            headers={"Content-Type": "application/json"},
+        )[0]
+        == 422
+    )
+    assert (
+        _req(
+            "POST",
+            "/step",
+            data=json.dumps({"action": {"action_type": "reply_email", "email_id": "nope", "message_body": "x"}}).encode(),
+            headers={"Content-Type": "application/json"},
+        )[0]
+        == 200
+    )
+    j = json.loads(
+        _req(
+            "POST",
+            "/step",
+            data=json.dumps(
+                {"action": {"action_type": "reply_email", "email_id": "nope", "message_body": "x"}}
+            ).encode(),
+            headers={"Content-Type": "application/json"},
+        )[1].decode()
+    )
+    assert j["observation"]["metadata"].get("step_ok") is False
+    assert (
+        _req(
+            "POST",
+            "/step",
+            data=json.dumps({"action": {"action_type": "complete_task", "task_id": "t09"}}).encode(),
+            headers={"Content-Type": "application/json"},
+        )[0]
+        == 200
+    )
+    j2 = json.loads(
+        _req(
+            "POST",
+            "/step",
+            data=json.dumps({"action": {"action_type": "complete_task", "task_id": "t09"}}).encode(),
+            headers={"Content-Type": "application/json"},
+        )[1].decode()
+    )
+    assert j2["observation"]["metadata"].get("step_ok") is False
+def test_step_unicode_and_long_message(live: str) -> None:
+    long_body = ("Line note.\n" * 80) + " café naïve résumé 日本語"
+    code, body = _req(
+        "POST",
+        "/step",
+        data=json.dumps(
+            {"action": {"action_type": "reply_email", "email_id": "e05", "message_body": long_body}}
+        ).encode(),
+        headers={"Content-Type": "application/json"},
+    )
+    assert code == 200
+def test_step_wrong_content_type(live: str) -> None:
+    code, _ = _req(
+        "POST",
+        "/step",
+        data=b"action_type=do_nothing",
+        headers={"Content-Type": "application/x-www-form-urlencoded"},
+    )
+    assert code in (400, 415, 422)
+def test_reset_invalid_json(live: str) -> None:
+    code, _ = _req("POST", "/reset", data=b"{", headers={"Content-Type": "application/json"})
+    assert code in (400, 422)
+def test_mcp_variants(live: str) -> None:
+    assert _req("POST", "/mcp", data=b"{", headers={"Content-Type": "application/json"})[0] == 200
+    body = _req(
+        "POST",
+        "/mcp",
+        data=json.dumps({"jsonrpc": "2.0", "id": 1, "method": "bogus/thing", "params": {}}).encode(),
+        headers={"Content-Type": "application/json"},
+    )[1].decode()
+    j = json.loads(body)
+    assert "error" in j or "result" in j
+def test_openapi_lists_expected_paths(live: str) -> None:
+    _, raw = _req("GET", "/openapi.json")
+    spec = json.loads(raw.decode())
+    paths = spec.get("paths") or {}
+    for p in ("/health", "/reset", "/step", "/schema", "/metadata", "/state", "/mcp"):
+        assert p in paths, f"missing path {p} in OpenAPI"
+def test_websocket_dead_ends(live: str) -> None:
+    """Send malformed and invalid frames over the live /ws socket and check the replies.
+    Skipped when the ``websockets`` package is not installed.
+    """
+    try:
+        import websockets
+    except ImportError:
+        pytest.skip("websockets not installed")
+    async def _run() -> None:
+        # Derive the ws:// or wss:// URL from the HTTP base URL.
+        ws_url = live.replace("http://", "ws://").replace("https://", "wss://") + "/ws"
+        async with websockets.connect(ws_url, max_size=10_000_000) as ws:
+            # Unparsable JSON -> error frame.
+            await ws.send("{ not json")
+            e1 = json.loads(await ws.recv())
+            assert e1.get("type") == "error"
+            # Unknown message type -> error frame.
+            await ws.send(json.dumps({"type": "nosuch", "data": {}}))
+            e2 = json.loads(await ws.recv())
+            assert e2.get("type") == "error"
+            # The same connection is still usable for a normal reset afterwards.
+            await ws.send(json.dumps({"type": "reset", "data": {}}))
+            ok = json.loads(await ws.recv())
+            assert ok.get("type") == "observation"
+            # A reply to a missing email (and with no message_body) is an observation with step_ok False.
+            await ws.send(
+                json.dumps({"type": "step", "data": {"action_type": "reply_email", "email_id": "missing"}})
+            )
+            bad = json.loads(await ws.recv())
+            assert bad.get("type") == "observation"
+            meta = (bad.get("data") or {}).get("observation", {}).get("metadata") or {}
+            assert meta.get("step_ok") is False
+            await ws.send(json.dumps({"type": "state"}))
+            st = json.loads(await ws.recv())
+            assert st.get("type") == "state"
+            # close is sent without waiting for a reply.
+            await ws.send(json.dumps({"type": "close", "data": {}}))
+    asyncio.run(_run())

tests/test_phase1.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""Phase 1: scaffold, OpenEnv manifest, layout, and HTTP health surface."""
+from pathlib import Path
+import yaml
+from starlette.testclient import TestClient
+ROOT = Path(__file__).resolve().parents[1]
+def test_openenv_yaml_exists_and_metadata():
+    path = ROOT / "openenv.yaml"
+    assert path.is_file(), "openenv.yaml must exist at project root"
+    data = yaml.safe_load(path.read_text(encoding="utf-8"))
+    assert data.get("name") == "ghostexec"
+    assert data.get("spec_version") == 1
+    assert data.get("type") == "space"
+    assert data.get("runtime") == "fastapi"
+    assert data.get("app") == "server.app:app"
+    desc = data.get("description")
+    assert desc and isinstance(desc, str) and len(desc.strip()) > 0
+    ver = data.get("version")
+    assert ver and isinstance(ver, str) and len(ver.strip()) > 0
+def test_expected_folder_structure():
+    assert (ROOT / "models.py").is_file()
+    assert (ROOT / "client.py").is_file()
+    assert (ROOT / "pyproject.toml").is_file()
+    assert (ROOT / "server" / "app.py").is_file()
+    assert (ROOT / "server" / "ghostexec_environment.py").is_file()
+    assert (ROOT / "Dockerfile").is_file() or (ROOT / "server" / "Dockerfile").is_file()
+    assert (ROOT / "server" / "requirements.txt").is_file()
+def test_server_health_ping():
+    from ghostexec.server.app import app
+    client = TestClient(app)
+    response = client.get("/health")
+    assert response.status_code == 200
+    assert response.json().get("status") == "healthy"

tests/test_phase2.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""Phase 2: world state, inbox, calendar, contacts, tasks (scenario-driven)."""
+from pathlib import Path
+from ghostexec.server.ghostexec_environment import GhostexecEnvironment
+ROOT = Path(__file__).resolve().parents[1]
+SCENARIO = ROOT / "scenarios" / "phase2_core.json"
+def test_scenario_file_exists():
+    """The phase2_core scenario fixture must be present on disk."""
+    assert SCENARIO.is_file()
+def test_world_json_roundtrip():
+    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
+    blob = GhostexecEnvironment.world_to_json(world)
+    again = GhostexecEnvironment.world_from_json(blob)
+    assert again.simulation_time == world.simulation_time
+    assert len(again.emails) == len(world.emails)
+    assert len(again.meetings) == len(world.meetings)
+def test_pool_sizes_from_scenario():
+    w = GhostexecEnvironment.load_world_from_json(SCENARIO)
+    assert len(w.emails) >= 30
+    assert len(w.meetings) >= 8
+    assert len(w.contacts) >= 15
+    assert len(w.tasks) >= 10
+def test_inbox_unread_priority_order():
+    env = GhostexecEnvironment(SCENARIO)
+    env.reset()
+    unread = env.get_unread_emails_sorted()
+    priorities = [e.priority for e in unread]
+    rank = {"critical": 0, "high": 1, "normal": 2, "low": 3}
+    assert priorities == sorted(priorities, key=lambda p: rank[p])
+    assert unread[0].priority == "critical"
+def test_calendar_detects_four_conflicts():
+    env = GhostexecEnvironment(SCENARIO)
+    env.reset()
+    conflicts = env.detect_meeting_conflicts()
+    assert len(conflicts) >= 4
+def test_contact_mood_update():
+    env = GhostexecEnvironment(SCENARIO)
+    env.reset()
+    c = env.get_contact("David Okonkwo")
+    assert c is not None
+    assert c.mood == "angry"
+    assert env.update_contact_mood("David Okonkwo", "neutral")
+    assert env.get_contact("David Okonkwo") is not None
+    assert env.get_contact("David Okonkwo").mood == "neutral"
+def test_overdue_tasks_after_time_advance():
+    env = GhostexecEnvironment(SCENARIO)
+    env.reset()
+    future = "2026-04-22T12:00:00"
+    env.set_simulation_time(future)
+    overdue = env.overdue_tasks_at(future)
+    assert len(overdue) >= 2
+    assert all(t.status == "overdue" for t in overdue)
+def test_mark_email_read_and_reschedule_reduces_calendar_conflicts():
+    env = GhostexecEnvironment(SCENARIO)
+    env.reset()
+    before = len(env.detect_meeting_conflicts())
+    assert env.reschedule_meeting("m02", "2026-04-21T18:00:00")
+    after = len(env.detect_meeting_conflicts())
+    assert after < before
+    assert env.mark_email_read("e01")

tests/test_phase3.py ADDED Viewed

	@@ -0,0 +1,153 @@

+"""Phase 3: plain-text briefing, eight legal actions, validation without crashes."""
+from pathlib import Path
+import pytest
+from ghostexec.models import GhostexecAction
+from ghostexec.server.ghostexec_environment import GhostexecEnvironment
+ROOT = Path(__file__).resolve().parents[1]
+SCENARIO = ROOT / "scenarios" / "phase2_core.json"
+def _env() -> GhostexecEnvironment:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    return e
+def test_briefing_is_plain_text_after_reset():
+    env = _env()
+    obs = env.reset()
+    text = obs.echoed_message
+    assert "=== GHOSTEXEC BRIEFING" in text
+    assert "UNREAD EMAILS" in text
+    assert "CALENDAR CONFLICTS IN NEXT 4 HOURS" in text
+    assert "CONTACTS TO WATCH" in text
+    assert "OVERDUE OR DUE-SOON TASKS" in text
+    assert "EXEC STRESS LEVEL" in text
+    assert "STEPS REMAINING" in text
+    assert obs.message_length == len(text)
+@pytest.mark.parametrize(
+    "action,check",
+    [
+        (
+            GhostexecAction(action_type="reply_email", email_id="e05", message_body="On it."),
+            lambda env: next(e for e in env.world.emails if e.id == "e05").replied is True,
+        ),
+        (
+            GhostexecAction(action_type="archive_email", email_id="e09"),
+            lambda env: next(e for e in env.world.emails if e.id == "e09").read is True,
+        ),
+        (
+            GhostexecAction(
+                action_type="reschedule_meeting",
+                meeting_id="m03",
+                new_time="2026-04-21T18:00:00",
+            ),
+            lambda env: next(m for m in env.world.meetings if m.id == "m03").start
+            == "2026-04-21T18:00:00",
+        ),
+        (
+            GhostexecAction(
+                action_type="cancel_meeting",
+                meeting_id="m10",
+                reason="Merged into ops review",
+            ),
+            lambda env: next(m for m in env.world.meetings if m.id == "m10").cancelled is True,
+        ),
+        (
+            GhostexecAction(action_type="complete_task", task_id="t07"),
+            lambda env: next(t for t in env.world.tasks if t.id == "t07").status == "done",
+        ),
+        (
+            GhostexecAction(
+                action_type="delegate_task",
+                task_id="t08",
+                contact_name="Jordan Lee",
+            ),
+            lambda env: next(t for t in env.world.tasks if t.id == "t08").delegated_to == "Jordan Lee",
+        ),
+        (
+            GhostexecAction(
+                action_type="send_message",
+                contact_name="Jamie Liu",
+                message_body="Thanks for the demo feedback.",
+            ),
+            lambda env: any("message to Jamie Liu" in line for line in env.world.action_log),
+        ),
+        (
+            GhostexecAction(action_type="do_nothing"),
+            lambda env: True,
+        ),
+    ],
+)
+def test_each_legal_action_runs_without_crash(action, check):
+    """Step one action of each of the eight types on a fresh env and verify its effect.
+
+    ``action`` is the GhostexecAction to apply; ``check`` is a predicate over the
+    environment that must hold after the step.
+    """
+    env = _env()
+    obs = env.step(action)
+    # Every step must echo a non-empty briefing.
+    assert obs.echoed_message
+    assert check(env)
+def test_reply_marks_email_handled():
+    env = _env()
+    e = next(x for x in env.world.emails if x.id == "e14")
+    assert not e.read
+    env.step(GhostexecAction(action_type="reply_email", email_id="e14", message_body="Noted."))
+    e2 = next(x for x in env.world.emails if x.id == "e14")
+    assert e2.read and e2.replied
+def test_invalid_actions_return_error_metadata_not_exception():
+    """Invalid actions are reported through observation metadata instead of raising.
+
+    Four invalid steps are applied to the same environment in sequence: an unknown
+    email id, completing task t09, messaging an unknown contact, and rescheduling
+    m03 to 09:30. The first is also compared against a do_nothing reward taken
+    from a separate fresh environment.
+    """
+    # Baseline reward for do_nothing, taken from its own fresh environment.
+    base = _env()
+    r_do_nothing = base.step(GhostexecAction(action_type="do_nothing")).reward
+    env = _env()
+    obs = env.step(GhostexecAction(action_type="reply_email", email_id="nope", message_body="x"))
+    assert obs.metadata.get("step_ok") is False
+    assert obs.metadata.get("step_error")
+    # Same before→after sub-scores as do_nothing, plus explicit invalid add-on.
+    # do_nothing has an additional strict additive floor (-0.15), so the delta is -0.10 here.
+    assert obs.reward == pytest.approx((r_do_nothing or 0) - (0.25 - 0.15))
+    # Completing t09 must fail with an "already done" error.
+    obs2 = env.step(GhostexecAction(action_type="complete_task", task_id="t09"))
+    assert obs2.metadata.get("step_ok") is False
+    assert "already done" in (obs2.metadata.get("step_error") or "").lower()
+    # Messaging a contact that does not exist must fail.
+    obs3 = env.step(
+        GhostexecAction(
+            action_type="send_message",
+            contact_name="Nobody By That Name",
+            message_body="hello",
+        )
+    )
+    assert obs3.metadata.get("step_ok") is False
+    # Rescheduling m03 to 09:30 must fail with an error mentioning an overlap.
+    obs4 = env.step(
+        GhostexecAction(
+            action_type="reschedule_meeting",
+            meeting_id="m03",
+            new_time="2026-04-21T09:30:00",
+        )
+    )
+    assert obs4.metadata.get("step_ok") is False
+    assert "overlap" in (obs4.metadata.get("step_error") or "").lower()
+def test_reschedule_resolves_prior_conflict_pair():
+    env = _env()
+    before = {frozenset((r["meeting_a"], r["meeting_b"])) for r in env.detect_meeting_conflicts()}
+    assert frozenset(("m01", "m02")) in before
+    obs = env.step(
+        GhostexecAction(
+            action_type="reschedule_meeting",
+            meeting_id="m02",
+            new_time="2026-04-21T18:00:00",
+        )
+    )
+    assert obs.metadata.get("step_ok") is True
+    after = {frozenset((r["meeting_a"], r["meeting_b"])) for r in env.detect_meeting_conflicts()}
+    assert frozenset(("m01", "m02")) not in after

tests/test_phase4.py ADDED Viewed

	@@ -0,0 +1,206 @@

+"""Phase 4: reward sub-scores, aggregation, logging, schema drift."""
+import json
+import random
+import statistics
+from pathlib import Path
+import pytest
+from ghostexec.models import GhostexecAction
+from ghostexec.server import reward as reward_mod
+from ghostexec.server.reward import aggregate_scores
+from ghostexec.server.ghostexec_environment import GhostexecEnvironment
+ROOT = Path(__file__).resolve().parents[1]
+SCENARIO = ROOT / "scenarios" / "phase2_core.json"
+DRIFT = ROOT / "scenarios" / "schema_drift_test.json"
+def test_reward_weights_and_aggregator_helpers():
+    w = GhostexecEnvironment.load_world_from_json(SCENARIO)
+    c, r, t = 1.0, -1.0, 2.5
+    weighted_inner = reward_mod.W_CONFLICT * c + reward_mod.W_REL * r + reward_mod.W_TASK * t
+    bd = aggregate_scores(
+        c,
+        r,
+        t,
+        conflict_raw=c,
+        critical_queue_bonus=0.0,
+        weighted_inner=weighted_inner,
+        action_ok=True,
+        episode_done=False,
+        world_after=w,
+    )
+    assert bd.weighted_base == pytest.approx(reward_mod.WEIGHTED_OUTPUT_SCALE * weighted_inner)
+def test_catastrophic_and_completion_bonuses_only_when_episode_done():
+    w0 = GhostexecEnvironment.load_world_from_json(SCENARIO)
+    w1 = w0.model_copy(deep=True)
+    w1.stress = 30
+    w2 = w1.model_copy(deep=True)
+    action = GhostexecAction(action_type="do_nothing")
+    mid = reward_mod.compute_step_reward(w1, w2, action, action_ok=True, episode_done=False)
+    assert mid.episode_completion_bonus == 0.0
+    assert mid.catastrophic_penalty == 0.0
+    w_bad = w1.model_copy(deep=True)
+    for i, c in enumerate(w_bad.contacts):
+        if c.name == "Marcus Webb":
+            w_bad.contacts[i] = c.model_copy(update={"mood": "furious"})
+            break
+    end = reward_mod.compute_step_reward(w1, w_bad, action, action_ok=True, episode_done=True)
+    assert end.episode_completion_bonus == pytest.approx(10.0)
+    assert end.catastrophic_penalty == pytest.approx(-15.0)
+def test_invalid_step_matches_do_nothing_subscores_plus_invalid_addon():
+    w = GhostexecEnvironment.load_world_from_json(SCENARIO)
+    noop = GhostexecAction(action_type="do_nothing")
+    bad = GhostexecAction(action_type="reply_email", email_id="missing", message_body="x")
+    bd_ok = reward_mod.compute_step_reward(w, w, noop, action_ok=True, episode_done=False)
+    bd_bad = reward_mod.compute_step_reward(w, w, bad, action_ok=False, episode_done=False)
+    assert bd_bad.invalid_step_adjustment == pytest.approx(-0.25)
+    # do_nothing carries an additional strict additive floor (-0.15) not applied to invalid non-idle actions.
+    assert bd_bad.final == pytest.approx(bd_ok.final - (0.25 - 0.15))
+def test_scripted_episode_reward_direction_and_log(tmp_path, monkeypatch):
+    """A conflict-resolving reschedule out-scores do_nothing, and both steps are logged as JSONL.
+
+    ``tmp_path`` and ``monkeypatch`` are pytest fixtures; the reward log is redirected
+    into ``tmp_path`` so the test can read it back.
+    """
+    logf = tmp_path / "rewards.jsonl"
+    env = GhostexecEnvironment(SCENARIO)
+    env.reset()
+    # Redirect the environment's reward log to the temporary file.
+    monkeypatch.setattr(env, "_reward_log_path", logf)
+    r_resolve = env.step(
+        GhostexecAction(
+            action_type="reschedule_meeting",
+            meeting_id="m02",
+            new_time="2026-04-21T18:00:00",
+        )
+    )
+    r_bad = env.step(GhostexecAction(action_type="do_nothing"))
+    assert r_resolve.metadata.get("step_ok") is True
+    assert r_bad.metadata.get("step_ok") is True
+    assert (r_resolve.reward or 0) > (r_bad.reward or 0)
+    assert logf.is_file()
+    lines = logf.read_text(encoding="utf-8").strip().splitlines()
+    assert len(lines) >= 2
+    # The first logged row must describe the reschedule step.
+    row = json.loads(lines[0])
+    assert "reward" in row and "episode_id" in row
+    assert row.get("action_type") == "reschedule_meeting"
+    assert "conflict_raw" in row and "step_ok" in row
+def test_schema_drift_events_mutate_world():
+    """With a drift-events file configured, successive steps mutate the world as expected.
+
+    Three do_nothing steps are taken; after each one a different mutation is checked
+    (presumably one drift event fires per step — confirm against the drift fixture).
+    """
+    env = GhostexecEnvironment(SCENARIO, schema_drift_events_path=DRIFT)
+    env.reset()
+    # After step 1: the action log records a "schema drift: shifted" entry.
+    assert env.step(GhostexecAction(action_type="do_nothing")).metadata.get("step_ok") is True
+    assert any("schema drift: shifted" in x for x in env.world.action_log)
+    # After step 2: Sarah Chen's communication preference is "text".
+    env.step(GhostexecAction(action_type="do_nothing"))
+    sarah = env.get_contact("Sarah Chen")
+    assert sarah is not None
+    assert sarah.communication_preference == "text"
+    # After step 3: t02 has the expected deadline and Marcus Webb is in the suppressed set.
+    env.step(GhostexecAction(action_type="do_nothing"))
+    t02 = next(t for t in env.world.tasks if t.id == "t02")
+    assert t02.deadline == "2026-04-21T07:00:00"
+    assert "Marcus Webb" in env._reply_relationship_suppressed  # noqa: SLF001
+def test_rewards_differ_between_helpful_and_idle_steps():
+    env = GhostexecEnvironment(SCENARIO)
+    env.reset()
+    r_help = env.step(
+        GhostexecAction(
+            action_type="reschedule_meeting",
+            meeting_id="m02",
+            new_time="2026-04-21T18:00:00",
+        )
+    ).reward
+    r_idle = env.step(GhostexecAction(action_type="do_nothing")).reward
+    assert r_help is not None and r_idle is not None
+    assert r_help != r_idle
+# Whitelisted reschedules (known non-overlapping targets for phase2_core at 08:00).
+_SAFE_RESCHEDULES: list[tuple[str, str]] = [
+    ("m02", "2026-04-21T18:00:00"),
+    ("m03", "2026-04-21T18:30:00"),
+    ("m06", "2026-04-21T20:00:00"),
+    ("m09", "2026-04-21T21:00:00"),
+]
+def test_seeded_stochastic_policy_reward_spread():
+    """A seeded random mix of actions must produce rewards with a minimum spread.
+
+    80 steps are drawn from do_nothing, archive_email, reschedule_meeting and
+    send_message; the population standard deviation and the p95 - p5 range of the
+    rewards must exceed fixed thresholds.
+    """
+    # Seed the module-level RNG so the action sequence is reproducible.
+    random.seed(1234)
+    K = 80
+    archive_ids = [f"e{i:02d}" for i in range(1, 31)]
+    contacts = ["Jordan Lee", "Jamie Liu", "Marcus Webb", "Sarah Chen"]
+    env = GhostexecEnvironment(SCENARIO)
+    env.reset()
+    rewards: list[float] = []
+    # ai cycles through archive ids and contacts (shared counter); ri cycles through reschedules.
+    ai = ri = 0
+    for _ in range(K):
+        u = random.random()
+        if u < 0.32:
+            obs = env.step(GhostexecAction(action_type="do_nothing"))
+        elif u < 0.58:
+            eid = archive_ids[ai % len(archive_ids)]
+            ai += 1
+            obs = env.step(GhostexecAction(action_type="archive_email", email_id=eid))
+        elif u < 0.78:
+            mid, nt = _SAFE_RESCHEDULES[ri % len(_SAFE_RESCHEDULES)]
+            ri += 1
+            obs = env.step(
+                GhostexecAction(action_type="reschedule_meeting", meeting_id=mid, new_time=nt)
+            )
+        else:
+            cname = contacts[ai % len(contacts)]
+            ai += 1
+            obs = env.step(
+                GhostexecAction(
+                    action_type="send_message",
+                    contact_name=cname,
+                    message_body="Quick sync on priorities.",
+                )
+            )
+        assert obs.reward is not None
+        rewards.append(float(obs.reward))
+    std = statistics.pstdev(rewards)
+    # Pick the 5th and 95th percentile by index into the sorted rewards.
+    sr = sorted(rewards)
+    p5 = sr[max(0, int(0.05 * (len(sr) - 1)))]
+    p95 = sr[min(len(sr) - 1, int(0.95 * (len(sr) - 1)))]
+    assert std > 0.06
+    assert (p95 - p5) > 0.09
+def test_good_script_beats_do_nothing_spam_on_mean_reward():
+    good = GhostexecEnvironment(SCENARIO)
+    good.reset()
+    good_actions = [
+        GhostexecAction(
+            action_type="reschedule_meeting",
+            meeting_id="m02",
+            new_time="2026-04-21T18:00:00",
+        ),
+        GhostexecAction(action_type="reply_email", email_id="e01", message_body="Drafting revised figures now."),
+        GhostexecAction(action_type="archive_email", email_id="e09"),
+        GhostexecAction(
+            action_type="send_message",
+            contact_name="Jordan Lee",
+            message_body="Standup notes attached.",
+        ),
+        GhostexecAction(action_type="complete_task", task_id="t06"),
+    ]
+    g_rewards = [good.step(a).reward for a in good_actions]
+    g_mean = sum(float(x) for x in g_rewards) / len(g_rewards)
+    bad = GhostexecEnvironment(SCENARIO)
+    bad.reset()
+    b_rewards = [bad.step(GhostexecAction(action_type="do_nothing")).reward for _ in range(5)]
+    b_mean = sum(float(x) for x in b_rewards) / len(b_rewards)
+    assert g_mean > b_mean + 0.2

tests/test_reward_dead_suite.py ADDED Viewed

	@@ -0,0 +1,319 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Dead-test suite for Phase 4 step rewards: 100+ independent scenarios on
+# phase2_core.json. Asserts penalization (do_nothing, invalid), priority
+# ordering (VIP critical > normal), and legal-action signatures for GRPO-style
+# post-training signal quality.
+from __future__ import annotations
+from pathlib import Path
+import pytest
+from ghostexec.models import GhostexecAction
+from ghostexec.server import reward as reward_mod
+from ghostexec.server.ghostexec_environment import GhostexecEnvironment
+ROOT = Path(__file__).resolve().parents[1]
+SCENARIO = ROOT / "scenarios" / "phase2_core.json"
+# All inbox ids from phase2_core (e01–e30).
+REPLY_EMAIL_IDS = [f"e{i:02d}" for i in range(1, 31)]
+# First fifteen inbox ids (e01–e15), used for the archive cases; all exist in phase2_core.
+ARCHIVE_EMAIL_IDS = [f"e{i:02d}" for i in range(1, 16)]
+# Pending / in-progress tasks only (t09 is done in fixture).
+COMPLETE_TASK_IDS = [f"t{i:02d}" for i in range(1, 13) if i != 9]
+# Known non-overlapping reschedules for 08:00 sim time (the first four match the phase4 tests; the rest are added here).
+_SAFE_RESCHEDULES: list[tuple[str, str]] = [
+    ("m02", "2026-04-21T18:00:00"),
+    ("m03", "2026-04-21T18:30:00"),
+    ("m06", "2026-04-21T20:00:00"),
+    ("m09", "2026-04-21T21:00:00"),
+    ("m04", "2026-04-21T19:00:00"),
+    ("m05", "2026-04-21T19:30:00"),
+    ("m07", "2026-04-21T20:30:00"),
+    ("m08", "2026-04-21T21:30:00"),
+    ("m01", "2026-04-21T17:00:00"),
+    ("m10", "2026-04-21T22:00:00"),
+]
+MEETING_IDS_CANCEL = [f"m{i:02d}" for i in range(1, 11)]
+KNOWN_CONTACTS = ["Jordan Lee", "Jamie Liu", "Marcus Webb", "Sarah Chen", "Priya Sharma", "David Okonkwo"]
+_BODY = "Thanks — triaging and will follow up shortly."
+# --- 30 cases: reply every email id -------------------------------------------
+@pytest.mark.parametrize("email_id", REPLY_EMAIL_IDS)
+def test_dead_reply_email_each_id_positive_or_neutral(email_id: str) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY))
+    assert obs.metadata.get("step_ok") is True
+    assert obs.reward is not None
+    bd = (obs.metadata or {}).get("reward_breakdown") or {}
+    assert bd.get("invalid_step_adjustment", 0) == pytest.approx(0.0)
+    assert bd.get("do_nothing_floor", 0) == pytest.approx(0.0)
+    # No snapshot -4 conflict tax: legal reply should not tank below -0.5
+    assert float(obs.reward) > -0.5
+@pytest.mark.parametrize("email_id", ("e01", "e03", "e12", "e21", "e27"))
+def test_dead_reply_vip_critical_queue_bonus(email_id: str) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY))
+    assert obs.metadata.get("step_ok") is True
+    # VIP+critical micro + critical_queue bonus; exact float varies slightly (0.48 scale).
+    assert float(obs.reward or 0) > 0.06
+    bd = (obs.metadata or {}).get("reward_breakdown") or {}
+    assert float(bd.get("critical_queue_bonus") or 0) > 0
+@pytest.mark.parametrize("email_id", ("e02", "e04", "e06", "e14", "e23"))
+def test_dead_reply_high_or_normal_small_positive(email_id: str) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY))
+    assert obs.metadata.get("step_ok") is True
+    assert float(obs.reward or 0) > 0.0
+# --- 20 cases: do_nothing always penalized ------------------------------------
+@pytest.mark.parametrize("seed", range(20))
+def test_dead_do_nothing_strict_penalty(seed: int) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(GhostexecAction(action_type="do_nothing"))
+    assert obs.metadata.get("step_ok") is True
+    assert float(obs.reward or 0) < 0
+    bd = (obs.metadata or {}).get("reward_breakdown") or {}
+    assert float(bd.get("do_nothing_floor") or 0) == pytest.approx(reward_mod._DO_NOTHING_STRICT_PENALTY)
+# --- 15 cases: archive --------------------------------------------------------
+@pytest.mark.parametrize("email_id", ARCHIVE_EMAIL_IDS)
+def test_dead_archive_email_step_ok(email_id: str) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(GhostexecAction(action_type="archive_email", email_id=email_id))
+    assert obs.metadata.get("step_ok") is True
+    assert obs.reward is not None
+# --- 11 cases: complete pending task -----------------------------------------
+@pytest.mark.parametrize("task_id", COMPLETE_TASK_IDS)
+def test_dead_complete_task_step_ok(task_id: str) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(GhostexecAction(action_type="complete_task", task_id=task_id))
+    assert obs.metadata.get("step_ok") is True
+    assert obs.reward is not None
+    bd = (obs.metadata or {}).get("reward_breakdown") or {}
+    assert float(bd.get("task") or 0) >= reward_mod._COMPLETE_TASK_VALID_MICRO_BONUS
+# --- 10 cases: reschedule safe slots -----------------------------------------
+@pytest.mark.parametrize("meeting_id,new_time", _SAFE_RESCHEDULES)
+def test_dead_reschedule_meeting_resolves_or_micro(meeting_id: str, new_time: str) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(
+        GhostexecAction(action_type="reschedule_meeting", meeting_id=meeting_id, new_time=new_time)
+    )
+    assert obs.metadata.get("step_ok") is True
+    assert obs.reward is not None
+    # Should beat idle do-nothing on same fresh env
+    e2 = GhostexecEnvironment(SCENARIO)
+    e2.reset()
+    idle = e2.step(GhostexecAction(action_type="do_nothing"))
+    assert float(obs.reward or 0) > float(idle.reward or 0)
+# --- 10 cases: cancel meeting --------------------------------------------------
+@pytest.mark.parametrize("meeting_id", MEETING_IDS_CANCEL)
+def test_dead_cancel_meeting_step_ok(meeting_id: str) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(
+        GhostexecAction(action_type="cancel_meeting", meeting_id=meeting_id, reason="dead test cancel")
+    )
+    assert obs.metadata.get("step_ok") is True
+    assert obs.reward is not None
+# --- 6 cases: send_message -----------------------------------------------------
+@pytest.mark.parametrize("contact_name", KNOWN_CONTACTS)
+def test_dead_send_message_known_contact(contact_name: str) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(
+        GhostexecAction(
+            action_type="send_message",
+            contact_name=contact_name,
+            message_body="Quick sync on priorities.",
+        )
+    )
+    assert obs.metadata.get("step_ok") is True
+    bd = (obs.metadata or {}).get("reward_breakdown") or {}
+    assert float(bd.get("relationship") or 0) >= reward_mod._SEND_MESSAGE_VALID_MICRO_BONUS - 0.01
+# --- 5 cases: delegate_task ---------------------------------------------------
+@pytest.mark.parametrize(
+    "task_id,contact",
+    [
+        ("t08", "Jordan Lee"),
+        ("t07", "Jamie Liu"),
+        ("t01", "Marcus Webb"),
+        ("t02", "Sarah Chen"),
+        ("t11", "Casey Nguyen"),
+    ],
+)
+def test_dead_delegate_task(task_id: str, contact: str) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(
+        GhostexecAction(action_type="delegate_task", task_id=task_id, contact_name=contact)
+    )
+    assert obs.metadata.get("step_ok") is True
+    bd = (obs.metadata or {}).get("reward_breakdown") or {}
+    assert float(bd.get("task") or 0) >= reward_mod._DELEGATE_TASK_VALID_MICRO_BONUS - 0.01
+# --- 10 cases: invalid actions ------------------------------------------------
+@pytest.mark.parametrize(
+    "action,expect_ok",
+    [
+        (GhostexecAction(action_type="reply_email", email_id="nope", message_body="x"), False),
+        (GhostexecAction(action_type="complete_task", task_id="t09"), False),
+        (GhostexecAction(action_type="archive_email", email_id="nope"), False),
+        (GhostexecAction(action_type="reschedule_meeting", meeting_id="m99", new_time="2026-04-21T18:00:00"), False),
+        (GhostexecAction(action_type="cancel_meeting", meeting_id="m99", reason="x"), False),
+        (GhostexecAction(action_type="delegate_task", task_id="t01", contact_name="Nobody"), False),
+        (GhostexecAction(action_type="send_message", contact_name="Nobody", message_body="hi"), False),
+        (GhostexecAction(action_type="reply_email", email_id="", message_body="hi"), False),
+        (GhostexecAction(action_type="complete_task", task_id=""), False),
+        (GhostexecAction(action_type="archive_email", email_id=""), False),
+    ],
+)
+def test_dead_invalid_action_step_ok_false(action: GhostexecAction, expect_ok: bool) -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(action)
+    assert obs.metadata.get("step_ok") is expect_ok
+    bd = (obs.metadata or {}).get("reward_breakdown") or {}
+    assert float(bd.get("invalid_step_adjustment") or 0) == pytest.approx(-0.25)
+# --- Ordering: VIP critical reply >> do_nothing --------------------------------
+def test_dead_priority_ordering_vip_critical_over_normal_over_idle() -> None:
+    r_vip: list[float] = []
+    r_norm: list[float] = []
+    r_idle: list[float] = []
+    for _ in range(5):
+        e1 = GhostexecEnvironment(SCENARIO)
+        e1.reset()
+        r_vip.append(float(e1.step(GhostexecAction(action_type="reply_email", email_id="e01", message_body=_BODY)).reward or 0))
+        e2 = GhostexecEnvironment(SCENARIO)
+        e2.reset()
+        r_norm.append(float(e2.step(GhostexecAction(action_type="reply_email", email_id="e14", message_body=_BODY)).reward or 0))
+        e3 = GhostexecEnvironment(SCENARIO)
+        e3.reset()
+        r_idle.append(float(e3.step(GhostexecAction(action_type="do_nothing")).reward or 0))
+    assert min(r_vip) > max(r_idle)
+    assert min(r_norm) > max(r_idle)
+    assert sum(r_vip) / len(r_vip) > sum(r_norm) / len(r_norm)
+# --- Tone penalty: casual to angry board contact ------------------------------
+def test_dead_tone_penalty_casual_to_angry_board() -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    # Marcus Webb is board; ensure angry mood in scenario or pick contact - phase2 has Marcus ANGRY in briefing
+    obs_bad = e.step(
+        GhostexecAction(
+            action_type="reply_email",
+            email_id="e01",
+            message_body="hey lol no worries",
+        )
+    )
+    assert obs_bad.metadata.get("step_ok") is True
+    e2 = GhostexecEnvironment(SCENARIO)
+    e2.reset()
+    obs_good = e2.step(
+        GhostexecAction(
+            action_type="reply_email",
+            email_id="e01",
+            message_body="Dear Marcus, sincerely addressing the board request now.",
+        )
+    )
+    assert float(obs_good.reward or 0) > float(obs_bad.reward or 0)
+# --- Reschedule adds conflict channel micro even if overlap unchanged ---------
+def test_dead_reschedule_micro_in_breakdown() -> None:
+    e = GhostexecEnvironment(SCENARIO)
+    e.reset()
+    obs = e.step(
+        GhostexecAction(action_type="reschedule_meeting", meeting_id="m07", new_time="2026-04-21T20:30:00")
+    )
+    assert obs.metadata.get("step_ok") is True
+    bd = (obs.metadata or {}).get("reward_breakdown") or {}
+    assert float(bd.get("conflict_raw") or 0) >= reward_mod._RESCHEDULE_VALID_MICRO_BONUS - 0.01
+# --- Unit: compute_step_reward invalid vs noop delta matches contract ---------
+def test_dead_compute_reward_invalid_vs_noop_delta() -> None:
+    w = GhostexecEnvironment.load_world_from_json(SCENARIO)
+    noop = GhostexecAction(action_type="do_nothing")
+    bad = GhostexecAction(action_type="reply_email", email_id="missing", message_body="x")
+    bd_ok = reward_mod.compute_step_reward(w, w, noop, action_ok=True, episode_done=False)
+    bd_bad = reward_mod.compute_step_reward(w, w, bad, action_ok=False, episode_done=False)
+    assert bd_bad.final == pytest.approx(bd_ok.final - (0.25 - 0.15))
+def test_dead_vip_critical_reply_outscores_professional_critical() -> None:
+    """VIP x2 micro on critical senders should dominate professional critical."""
+    e_vip = GhostexecEnvironment(SCENARIO)
+    e_vip.reset()
+    r_vip = float(
+        e_vip.step(GhostexecAction(action_type="reply_email", email_id="e01", message_body=_BODY)).reward or 0
+    )
+    e_pro = GhostexecEnvironment(SCENARIO)
+    e_pro.reset()
+    r_pro = float(
+        e_pro.step(GhostexecAction(action_type="reply_email", email_id="e21", message_body=_BODY)).reward or 0
+    )
+    assert r_vip > r_pro

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

validate-submission.sh ADDED Viewed

	@@ -0,0 +1,163 @@

+#!/usr/bin/env bash
+#
+# validate-submission.sh — Ghostexec OpenEnv Submission Validator
+#
+# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
+set -uo pipefail
+DOCKER_BUILD_TIMEOUT=600
+# Define colour escape sequences only when stdout is a terminal; otherwise
+# leave them empty so redirected output contains no escape codes.
+if [ -t 1 ]; then
+  RED='\033[0;31m'
+  GREEN='\033[0;32m'
+  YELLOW='\033[1;33m'
+  BOLD='\033[1m'
+  NC='\033[0m'
+else
+  RED='' GREEN='' YELLOW='' BOLD='' NC=''
+fi
+# Run a command with a time limit.
+# Arguments: $1 - limit in seconds; remaining arguments - the command to run.
+# Uses `timeout` if available, then `gtimeout`; otherwise runs the command in
+# the background with a watcher subshell that kills it after the limit.
+# Returns: the exit status of the command (or of timeout/gtimeout).
+run_with_timeout() {
+  local secs="$1"; shift
+  if command -v timeout &>/dev/null; then
+    timeout "$secs" "$@"
+  elif command -v gtimeout &>/dev/null; then
+    gtimeout "$secs" "$@"
+  else
+    # No timeout binary: start the command in the background ...
+    "$@" &
+    local pid=$!
+    # ... and a watcher that kills it once the limit has elapsed.
+    ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
+    local watcher=$!
+    wait "$pid" 2>/dev/null
+    local rc=$?
+    # The command is finished (or was killed): stop the watcher and reap it.
+    kill "$watcher" 2>/dev/null
+    wait "$watcher" 2>/dev/null
+    return $rc
+  fi
+}
+portable_mktemp() {
+  local prefix="${1:-validate}"
+  mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
+}
+# Paths appended to this array are removed when the shell exits.
+CLEANUP_FILES=()
+cleanup() {
+  # The ${arr[@]+...} form expands to nothing while the array is empty, so
+  # `set -u` does not reject the expansion on older bash versions.
+  rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"
+}
+trap cleanup EXIT
+PING_URL="${1:-}"
+REPO_DIR="${2:-.}"
+if [ -z "$PING_URL" ]; then
+  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
+  printf "\n"
+  printf "  ping_url   Your HuggingFace Space URL (e.g. https://modelbuilderhq-ghostexec.hf.space)\n"
+  printf "  repo_dir   Path to your repo (default: current directory)\n"
+  exit 1
+fi
+if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
+  printf "Error: directory '%s' not found\n" "${2:-.}"
+  exit 1
+fi
+PING_URL="${PING_URL%/}"
+PASS=0
+log()  { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
+pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
+fail() { log "${RED}FAILED${NC} -- $1"; }
+hint() { printf "  ${YELLOW}Hint:${NC} %b\n" "$1"; }
+stop_at() {
+  printf "\n"
+  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
+  exit 1
+}
+# Banner with the resolved inputs.
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${BOLD}  Ghostexec OpenEnv Validator${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+log "Repo:     $REPO_DIR"
+log "Ping URL: $PING_URL"
+printf "\n"
+# Step 1: POST an empty JSON object to <ping_url>/reset and require HTTP 200.
+log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
+CURL_OUTPUT=$(portable_mktemp "validate-curl")
+CLEANUP_FILES+=("$CURL_OUTPUT")
+# When the transfer fails curl exits non-zero and itself prints 000 for
+# %{http_code}. The fallback therefore assigns HTTP_CODE instead of printing
+# inside the substitution, which would produce "000000". stderr is discarded
+# rather than sent to the same file that -o writes the response body to.
+HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
+  -H "Content-Type: application/json" -d '{}' \
+  "$PING_URL/reset" --max-time 30 2>/dev/null) || HTTP_CODE="000"
+if [ "$HTTP_CODE" = "200" ]; then
+  pass "HF Space is live and responds to /reset"
+elif [ "$HTTP_CODE" = "000" ]; then
+  fail "HF Space not reachable (connection failed or timed out)"
+  hint "Check your network connection and that the Space is running."
+  # hint() prints its argument through %b, not as a printf format, so the
+  # percent sign is written once to give a command that can be pasted as is.
+  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
+  stop_at "Step 1"
+else
+  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
+  hint "Make sure your Space is running and the URL is correct."
+  hint "Try opening $PING_URL in your browser first."
+  stop_at "Step 1"
+fi
+log "${BOLD}Step 2/3: Running docker build${NC} ..."
+if ! command -v docker &>/dev/null; then
+  fail "docker command not found"
+  hint "Install Docker: https://docs.docker.com/get-docker/"
+  stop_at "Step 2"
+fi
+if [ -f "$REPO_DIR/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR"
+elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR/server"
+else
+  fail "No Dockerfile found in repo root or server/ directory"
+  stop_at "Step 2"
+fi
+log "  Found Dockerfile in $DOCKER_CONTEXT"
+BUILD_OK=false
+BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
+if [ "$BUILD_OK" = true ]; then
+  pass "Docker build succeeded"
+else
+  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
+  printf "%s\n" "$BUILD_OUTPUT" | tail -20
+  stop_at "Step 2"
+fi
+# Step 3: run `openenv validate` from inside the repo directory.
+log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
+command -v openenv &>/dev/null || {
+  fail "openenv command not found"
+  hint "Install it with your project env, e.g.: uv run pip install openenv-core"
+  stop_at "Step 3"
+}
+# The cd happens inside the command substitution, so the script's own
+# working directory is left unchanged.
+if VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1); then
+  VALIDATE_OK=true
+else
+  VALIDATE_OK=false
+fi
+if [ "$VALIDATE_OK" = true ]; then
+  pass "openenv validate passed"
+  # Echo the validator's output only when it produced any.
+  if [ -n "$VALIDATE_OUTPUT" ]; then
+    log "  $VALIDATE_OUTPUT"
+  fi
+else
+  fail "openenv validate failed"
+  printf "%s\n" "$VALIDATE_OUTPUT"
+  stop_at "Step 3"
+fi
+# Reaching this point means no step called stop_at: print the summary.
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${GREEN}${BOLD}  All 3/3 checks passed!${NC}\n"
+printf "${GREEN}${BOLD}  Ghostexec is ready for submission.${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+printf "\n"
+exit 0