Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- Dockerfile +83 -0
- README.md +303 -7
- __init__.py +20 -0
- client.py +64 -0
- conftest.py +12 -0
- graders.py +121 -0
- inference.py +265 -0
- models.py +204 -0
- openenv.yaml +32 -0
- openenv_ghostexec.egg-info/PKG-INFO +15 -0
- openenv_ghostexec.egg-info/SOURCES.txt +45 -0
- openenv_ghostexec.egg-info/dependency_links.txt +1 -0
- openenv_ghostexec.egg-info/entry_points.txt +2 -0
- openenv_ghostexec.egg-info/requires.txt +13 -0
- openenv_ghostexec.egg-info/top_level.txt +1 -0
- outputs/logs/api_dead_live_500.jsonl +500 -0
- outputs/logs/episode_rewards.jsonl +0 -0
- outputs/training/_integration_ckpt/run_summary.json +28 -0
- outputs/training/checkpoints/run_summary.json +28 -0
- outputs/training/episode_returns.jsonl +10 -0
- outputs/training/smoke/checkpoints/run_summary.json +28 -0
- outputs/training/smoke/reinforce_returns.jsonl +48 -0
- outputs/training/test_returns.jsonl +25 -0
- pyproject.toml +61 -0
- scenarios/dinner_disaster.json +107 -0
- scenarios/monday_morning.json +257 -0
- scenarios/phase2_core.json +83 -0
- scenarios/schema_drift_test.json +27 -0
- scenarios/vip_meltdown.json +63 -0
- scenarios/vip_meltdown_drift.json +25 -0
- scripts/__init__.py +1 -0
- scripts/http_endpoint_smoke.py +184 -0
- scripts/run_live_api_dead_500.py +196 -0
- server/__init__.py +11 -0
- server/app.py +169 -0
- server/ghostexec_environment.py +706 -0
- server/requirements.txt +6 -0
- server/reward.py +350 -0
- tests/test_api_reward_dead_500.py +150 -0
- tests/test_complete_integration.py +235 -0
- tests/test_docker_build.py +60 -0
- tests/test_env.py +48 -0
- tests/test_live_server_exhaustive.py +287 -0
- tests/test_phase1.py +42 -0
- tests/test_phase2.py +77 -0
- tests/test_phase3.py +153 -0
- tests/test_phase4.py +206 -0
- tests/test_reward_dead_suite.py +319 -0
- uv.lock +0 -0
- validate-submission.sh +163 -0
Dockerfile
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Multi-stage build using openenv-base
|
| 8 |
+
# This Dockerfile is flexible and works for both:
|
| 9 |
+
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
+
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
+
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
+
|
| 13 |
+
# BASE_IMAGE is declared before the first FROM so both stages can reference it.
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Ensure git is available (required for installing dependencies from VCS)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=ghostexec

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
# First pass installs third-party dependencies only (--no-install-project).
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

# Second pass installs the project itself into the same virtual environment.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the virtual environment from builder
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# Mount Gradio OpenEnv UI at /web (matches HF Space README expectations)
ENV ENABLE_WEB_INTERFACE=true

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD sh -c 'curl -f "http://localhost:${PORT:-7860}/health" || exit 1'

# Same entrypoint as local `uv run server` (console script from the project venv).
# A shell is needed to expand ${PORT:-7860}; `exec` then replaces that shell with
# the server process so it runs as PID 1 and receives SIGTERM from `docker stop`
# directly instead of being killed after the stop timeout.
WORKDIR /app/env
CMD ["/bin/sh", "-lc", "exec /app/.venv/bin/server --port ${PORT:-7860}"]
|
README.md
CHANGED
|
@@ -1,12 +1,308 @@
|
|
| 1 |
---
|
| 2 |
-
title: Ghostexec
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Ghostexec Environment Server
|
| 3 |
+
emoji: 📢
|
| 4 |
+
colorFrom: pink
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 7860
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# Ghostexec
|
| 15 |
+
|
| 16 |
+
**Ghostexec** is an [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible environment that simulates a busy executive’s world: inbox, calendar, contacts, tasks, and stakeholder moods. The agent chooses **structured actions** (reply, reschedule, delegate, …); the server returns a **plain-text briefing** as the main observation and a **scalar reward** shaped around conflict, relationships, and task progress. Scenario data lives in `scenarios/*.json` — nothing is hardcoded in Python for world content.
|
| 17 |
+
|
| 18 |
+
**Manifest:** `openenv.yaml` (name **`ghostexec`**, HF Space identifier).
|
| 19 |
+
**Package:** `openenv-ghostexec` in `pyproject.toml` (import as `ghostexec`).
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
## Deliverables
|
| 24 |
+
|
| 25 |
+
| Deliverable | URL |
|
| 26 |
+
|-------------|-----|
|
| 27 |
+
| Public HF Space (required) | `TODO: https://huggingface.co/spaces/<org>/ghostexec` |
|
| 28 |
+
| Write-up / blog (HF post preferred) | `TODO: https://huggingface.co/blog/...` |
|
| 29 |
+
| Short demo video (<2 min) | `TODO: https://youtube.com/...` |
|
| 30 |
+
|
| 31 |
+
Fill these URLs before submission freeze so reviewers can verify everything from one place.
|
| 32 |
+
|
| 33 |
+
---
|
| 34 |
+
|
| 35 |
+
## OpenEnv Hackathon alignment (themes + submission checklist)
|
| 36 |
+
|
| 37 |
+
**Theme fit (examples, not exhaustive):** Ghostexec targets **Theme 3.2 — Personalized tasks** (executive-style inbox, calendar, conflicts, delegation via structured actions). **Theme 4** is partially supported via curriculum + perturb (`GHOSTEXEC_CURRICULUM`, `GHOSTEXEC_PERTURB`) and diverse scenarios under `scenarios/`.
|
| 38 |
+
|
| 39 |
+
**Minimum submission checklist (fill before freeze):**
|
| 40 |
+
|
| 41 |
+
| Item | Status |
|
| 42 |
+
|------|--------|
|
| 43 |
+
| OpenEnv-based env + `openenv.yaml` | Done in-repo (`openenv-core[core]>=0.2.3` in `pyproject.toml`; aligns with current PyPI release line). |
|
| 44 |
+
| Short write-up or <2 min video | **You:** publish and paste links in [Deliverables](#deliverables). |
|
| 45 |
+
| Public HF Space URL | **You:** `openenv push` and paste the URL in [Deliverables](#deliverables). |
|
| 46 |
+
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
## Design narrative
|
| 50 |
+
|
| 51 |
+
Ghostexec is intentionally built as an **AI Chief of Staff** environment, not a grid-world clone: the model must triage inbox, calendar, stakeholder mood, and task deadlines under conflict pressure while taking only legal structured actions.
|
| 52 |
+
|
| 53 |
+
- **Environment Innovation (40%)** — scenario-driven executive operations with competing priorities, conflict queues, and relationship-sensitive outcomes in `scenarios/*.json` + `server/ghostexec_environment.py`.
|
| 54 |
+
- **Storytelling & Presentation (30%)** — each scenario encodes a narrative arc (VIP escalations, family/professional collisions, deadline cascades) so policy behavior reads like realistic assistant decisions rather than abstract moves.
|
| 55 |
+
- **Showing Improvement in Rewards (20%)** — environment reward remains deterministic, inspectable, and traceable through metadata + episode logs under `outputs/logs/`.
|
| 56 |
+
- **Reward Quality (10%)** — fixed weighted core signal (0.35 conflict / 0.35 relationship / 0.30 task), bounded shaping terms, explicit invalid-action handling, and do_nothing penalties.
|
| 57 |
+
|
| 58 |
+
This framing gives judges a clear throughline: **realistic executive chaos -> constrained legal actions -> measurable policy improvement on held-out scenarios**.
|
| 59 |
+
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
## Features
|
| 63 |
+
|
| 64 |
+
- **Legal action set** — `reply_email`, `archive_email`, `reschedule_meeting`, `cancel_meeting`, `complete_task`, `delegate_task`, `send_message`, `do_nothing` (see `models.py`).
|
| 65 |
+
- **Human-readable observations** — `GhostexecObservation.echoed_message` is the full briefing text for the model (not raw JSON).
|
| 66 |
+
- **Invalid actions** — Handled in-process: structured metadata (e.g. `step_ok`), no server crash.
|
| 67 |
+
- **Reward** — Weighted blend of conflict, relationship, and task signals (see [Reward](#reward)); per-step logging under `outputs/logs/` (gitignored).
|
| 68 |
+
- **HTTP + WebSocket** — FastAPI app in `server/app.py`; `GhostexecEnv` uses WebSockets for persistent episodes.
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
## Quick start (Python client)
|
| 73 |
+
|
| 74 |
+
From the repo root (`ghostexec/` — where `pyproject.toml` lives):
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
uv sync
|
| 78 |
+
uv run server --port 8000
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
In another terminal or notebook:
|
| 82 |
+
|
| 83 |
+
```python
|
| 84 |
+
from ghostexec import GhostexecAction, GhostexecEnv
|
| 85 |
+
|
| 86 |
+
with GhostexecEnv(base_url="http://127.0.0.1:8000") as env:
|
| 87 |
+
out = env.reset()
|
| 88 |
+
print(out.observation.echoed_message[:500], "…") # plain-text briefing
|
| 89 |
+
|
| 90 |
+
step = env.step(
|
| 91 |
+
GhostexecAction(
|
| 92 |
+
action_type="reply_email",
|
| 93 |
+
email_id="e01",
|
| 94 |
+
message_body=(
|
| 95 |
+
"Marcus — acknowledged. Revised figures and short rationale "
|
| 96 |
+
"before noon. — Exec"
|
| 97 |
+
),
|
| 98 |
+
)
|
| 99 |
+
)
|
| 100 |
+
print("reward:", step.reward)
|
| 101 |
+
print("metadata keys:", sorted((step.observation.metadata or {}).keys()))
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
**Docker image** (optional): if your OpenEnv client supports it, you can point `GhostexecEnv` at a container built from the root `Dockerfile`. Build from repo root:
|
| 105 |
+
|
| 106 |
+
```bash
|
| 107 |
+
docker build -t ghostexec-env:latest .
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
---
|
| 111 |
+
|
| 112 |
+
## Actions and fields
|
| 113 |
+
|
| 114 |
+
`GhostexecAction` (`models.py`) includes:
|
| 115 |
+
|
| 116 |
+
| `action_type` | Typical fields used |
|
| 117 |
+
|------------------------|----------------------|
|
| 118 |
+
| `reply_email` | `email_id`, `message_body` |
|
| 119 |
+
| `archive_email` | `email_id` |
|
| 120 |
+
| `reschedule_meeting` | `meeting_id`, `new_time`, `reason` |
|
| 121 |
+
| `cancel_meeting` | `meeting_id`, `reason` |
|
| 122 |
+
| `complete_task` | `task_id` |
|
| 123 |
+
| `delegate_task` | `task_id`, `contact_name` |
|
| 124 |
+
| `send_message` | `contact_name`, `message` (channel text) |
|
| 125 |
+
| `do_nothing` | — (intentionally weak / penalised path) |
|
| 126 |
+
|
| 127 |
+
Unknown or malformed HTTP payloads deserialize safely to `do_nothing`-style defaults where applicable so older clients do not crash.
|
| 128 |
+
|
| 129 |
+
---
|
| 130 |
+
|
| 131 |
+
## Observation
|
| 132 |
+
|
| 133 |
+
`GhostexecObservation`:
|
| 134 |
+
|
| 135 |
+
- **`echoed_message`** — Full briefing (emails, conflicts, contacts, tasks, stress, steps remaining).
|
| 136 |
+
- **`message_length`** — Length of `echoed_message` for quick checks.
|
| 137 |
+
- **`reward`**, **`done`**, **`metadata`** — Step outcome; metadata carries flags such as `step_ok`, reward breakdown fields, and ids for debugging.
|
| 138 |
+
|
| 139 |
+
---
|
| 140 |
+
|
| 141 |
+
## Reward
|
| 142 |
+
|
| 143 |
+
Phase-4 scoring (`server/reward.py`) combines three channels with **fixed weights**:
|
| 144 |
+
|
| 145 |
+
\[
|
| 146 |
+
\text{weighted base} = 0.35 \cdot \text{conflict} + 0.35 \cdot \text{relationship} + 0.30 \cdot \text{task}
|
| 147 |
+
\]
|
| 148 |
+
|
| 149 |
+
Then applies output scaling, invalid-step adjustments, bonuses/penalties, and a floor for `do_nothing`. Full component values are available on `RewardBreakdown` and are mirrored into observation metadata where configured. **Episode reward traces** append to `outputs/logs/episode_rewards.jsonl` (directory gitignored).
|
| 150 |
+
|
| 151 |
+
**Reward-engineering provenance.** The design follows the reward-shaping playbook surveyed in *Comprehensive Overview of Reward Engineering and Shaping in Advancing Reinforcement Learning Applications* ([arXiv:2408.10215](https://arxiv.org/abs/2408.10215)): dense per-step shaping around proxy signals (conflict / relationship / task) instead of a single sparse end-of-episode reward, fixed weights to keep channel trade-offs inspectable, and bounded per-step magnitudes to resist hacking.
|
| 152 |
+
|
| 153 |
+
---
|
| 154 |
+
|
| 155 |
+
## HTTP vs WebSocket (episode state)
|
| 156 |
+
|
| 157 |
+
- **HTTP** `POST /reset` and `POST /step` often bind to **short-lived** environment instances depending on deployment; consecutive HTTP calls may not share one in-memory episode.
|
| 158 |
+
- **Ghostexec** still applies your action against a scenario-primed instance so a lone `POST /step` can return a meaningful reward and metadata.
|
| 159 |
+
- **WebSocket `/ws`** — Use this (or `GhostexecEnv(base_url=...)`, which speaks WebSocket) for **multi-step episodes** on the same session.
|
| 160 |
+
|
| 161 |
+
Endpoints (typical OpenEnv layout): **`/web`**, **`/docs`**, **`/health`**, **`/ws`**.
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
## Running and testing locally
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
# Dev server (package layout)
|
| 169 |
+
uv run uvicorn ghostexec.server.app:app --reload --host 0.0.0.0 --port 8000
|
| 170 |
+
|
| 171 |
+
# Or console entrypoint (matches Dockerfile)
|
| 172 |
+
uv run server --port 8000
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
**Smoke script** (HTTP):
|
| 176 |
+
|
| 177 |
+
```bash
|
| 178 |
+
uv run python scripts/http_endpoint_smoke.py --local
|
| 179 |
+
uv run python scripts/http_endpoint_smoke.py --url http://127.0.0.1:8000
|
| 180 |
+
uv run python scripts/http_endpoint_smoke.py --print-curl
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
**Tests:**
|
| 184 |
+
|
| 185 |
+
```bash
|
| 186 |
+
uv run pytest tests/ -q
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
Opt-in Docker build smoke (Phase 1 gate):
|
| 190 |
+
|
| 191 |
+
```bash
|
| 192 |
+
GHOSTEXEC_RUN_DOCKER_BUILD=1 uv run pytest tests/test_docker_build.py -q
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
With the server already on port 8000:
|
| 196 |
+
|
| 197 |
+
```bash
|
| 198 |
+
uv run pytest tests/test_live_server_exhaustive.py -v --tb=short
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
Override live URL (Windows PowerShell example):
|
| 202 |
+
|
| 203 |
+
```powershell
|
| 204 |
+
$env:GHOSTEXEC_LIVE_BASE_URL = "http://127.0.0.1:9000"
|
| 205 |
+
uv run pytest tests/test_live_server_exhaustive.py -q
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
Optional real WebSocket client check:
|
| 209 |
+
|
| 210 |
+
```bash
|
| 211 |
+
# Terminal 1
|
| 212 |
+
uv run server --port 8000
|
| 213 |
+
# Terminal 2
|
| 214 |
+
export GHOSTEXEC_WS_BASE_URL=http://127.0.0.1:8000
|
| 215 |
+
uv run pytest tests/test_complete_integration.py::test_ghostexec_env_client_against_live_url_if_set -q
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
---
|
| 219 |
+
|
| 220 |
+
## Hugging Face Spaces
|
| 221 |
+
|
| 222 |
+
Full OpenEnv CLI flow from this directory (matches steps 5–8 of the [Packaging & Deploying guide](https://meta-pytorch.org/OpenEnv/auto_getting_started/environment-builder.html)):
|
| 223 |
+
|
| 224 |
+
```bash
|
| 225 |
+
openenv serve # local dev server on :8000
|
| 226 |
+
openenv build # build the Docker image
|
| 227 |
+
openenv validate --verbose # structure + Dockerfile + entrypoint checks
|
| 228 |
+
openenv push # deploy to HF Spaces
|
| 229 |
+
# openenv push --repo-id your-username/ghostexec
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
Use a **public** Space for the default hackathon flow unless you intentionally need a private Space. Authenticate with Hugging Face first (`huggingface-cli login` or equivalent).
|
| 233 |
+
|
| 234 |
+
---
|
| 235 |
+
|
| 236 |
+
## Scenarios
|
| 237 |
+
|
| 238 |
+
| File | Role |
|
| 239 |
+
|------|------|
|
| 240 |
+
| `scenarios/phase2_core.json` | Default dense inbox/calendar/tasks fixture |
|
| 241 |
+
| `scenarios/monday_morning.json`, `dinner_disaster.json`, `vip_meltdown.json` | Narrative demos |
|
| 242 |
+
| `scenarios/vip_meltdown_drift.json` | Mood / escalation drift |
|
| 243 |
+
| `scenarios/schema_drift_test.json` | Drift-event harness |
|
| 244 |
+
|
| 245 |
+
---
|
| 246 |
+
|
| 247 |
+
## Concurrent WebSocket sessions
|
| 248 |
+
|
| 249 |
+
`server/app.py` passes **`GhostexecEnvironment`** (the class) into `create_app` with `max_concurrent_envs=1` by default. Increase `max_concurrent_envs` if you need multiple simultaneous WebSocket clients.
|
| 250 |
+
|
| 251 |
+
---
|
| 252 |
+
|
| 253 |
+
## Project layout
|
| 254 |
+
|
| 255 |
+
```
|
| 256 |
+
ghostexec/
|
| 257 |
+
├── openenv.yaml # OpenEnv name, version, description
|
| 258 |
+
├── pyproject.toml # Package metadata + optional extras
|
| 259 |
+
├── uv.lock
|
| 260 |
+
├── models.py # World + GhostexecAction / GhostexecObservation
|
| 261 |
+
├── client.py # GhostexecEnv (WebSocket client)
|
| 262 |
+
├── scenarios/ # World JSON (source of truth for episodes)
|
| 263 |
+
├── scripts/ # http_endpoint_smoke.py
|
| 264 |
+
├── tests/
|
| 265 |
+
└── server/
|
| 266 |
+
├── app.py # FastAPI + create_app
|
| 267 |
+
├── ghostexec_environment.py
|
| 268 |
+
├── reward.py
|
| 269 |
+
└── Dockerfile
|
| 270 |
+
```
|
| 271 |
+
|
| 272 |
+
---
|
| 273 |
+
|
| 274 |
+
## Resources & references
|
| 275 |
+
|
| 276 |
+
Ghostexec is built against the official Meta PyTorch OpenEnv stack. Every design choice below is traceable to one of these sources.
|
| 277 |
+
|
| 278 |
+
**OpenEnv core.** The Gymnasium-style `reset()` / `step()` / `state` interface in `server/ghostexec_environment.py`, the `EnvClient` subclass in `client.py`, and the `create_app(...)` wiring in `server/app.py` follow the [Packaging & Deploying guide](https://meta-pytorch.org/OpenEnv/auto_getting_started/environment-builder.html) exactly.
|
| 279 |
+
|
| 280 |
+
- Core repo: [meta-pytorch/OpenEnv](https://github.com/meta-pytorch/OpenEnv)
|
| 281 |
+
- Docs: [meta-pytorch.org/OpenEnv](https://meta-pytorch.org/OpenEnv/)
|
| 282 |
+
|
| 283 |
+
**OpenEnv Hub (Hugging Face).** Target deployment for `openenv push`. The Space metadata at the top of this README + `openenv.yaml` are the knobs HF Spaces reads.
|
| 284 |
+
|
| 285 |
+
- Environments: [huggingface.co/openenv](https://huggingface.co/openenv)
|
| 286 |
+
- Spaces: [huggingface.co/openenv/spaces](https://huggingface.co/openenv/spaces)
|
| 287 |
+
|
| 288 |
+
**Tutorials.** General OpenEnv environment patterns are documented in the official tutorial pages and examples.
|
| 289 |
+
|
| 290 |
+
- All tutorials: [OpenEnv/tutorial](https://github.com/meta-pytorch/OpenEnv/tree/main/tutorial)
|
| 291 |
+
- Environment examples: [OpenEnv/envs](https://github.com/meta-pytorch/OpenEnv/tree/main/envs)
|
| 292 |
+
|
| 293 |
+
**YouTube — Building RL environments.** Talks from Meta / OpenEnv contributors that informed the scenario-driven reset, WebSocket session model, and reward breakdown used here:
|
| 294 |
+
|
| 295 |
+
- [Building RL Environments with OpenEnv](https://www.youtube.com/watch?v=0airz7BhBiA)
|
| 296 |
+
- [OpenEnv Deep Dive](https://www.youtube.com/watch?v=ap4q4sAK4OY)
|
| 297 |
+
- [Agentic RL Environments](https://www.youtube.com/watch?v=Jew4lhAiqnw)
|
| 298 |
+
- [OpenEnv Livestream (4-hour walkthrough)](https://www.youtube.com/live/kkCNMz0Ptd8)
|
| 299 |
+
|
| 300 |
+
**Reward-engineering papers.** See [Reward](#reward) for how the paper below maps to the shaping choices in `server/reward.py`.
|
| 301 |
+
|
| 302 |
+
- Jnadi, A. (2024). *Comprehensive Overview of Reward Engineering and Shaping in Advancing Reinforcement Learning Applications*. [arXiv:2408.10215](https://arxiv.org/abs/2408.10215). Informs the dense per-step conflict / relationship / task shaping and the bounded-magnitude design.
|
| 303 |
+
|
| 304 |
+
---
|
| 305 |
+
|
| 306 |
+
## License
|
| 307 |
+
|
| 308 |
+
BSD-style — see the license notice at the top of each source file (Meta / OpenEnv lineage).
|
__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Ghostexec Environment."""
|
| 8 |
+
|
| 9 |
+
from .models import GhostexecAction, GhostexecObservation
|
| 10 |
+
|
| 11 |
+
# Importing ghostexec.models in notebooks should not require websocket client deps.
|
| 12 |
+
# Keep client import optional so package imports survive OpenEnv layout differences.
|
| 13 |
+
try:
    from .client import GhostexecEnv
except Exception:  # pragma: no cover - import-compat shim
    # Any failure while importing the client leaves the models importable;
    # the client name is then bound to None instead of a class.
    GhostexecEnv = None  # type: ignore[assignment]

# The models are always exported; the client is exported only when it loaded.
__all__ = ["GhostexecAction", "GhostexecObservation"]
if GhostexecEnv is not None:
    __all__ += ["GhostexecEnv"]
|
client.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Ghostexec Environment Client."""
|
| 8 |
+
|
| 9 |
+
from typing import Any, Dict
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
# OpenEnv newer layout.
|
| 13 |
+
from openenv.client import EnvClient
|
| 14 |
+
except ImportError:
|
| 15 |
+
try:
|
| 16 |
+
# Some builds expose the class one level deeper.
|
| 17 |
+
from openenv.client.client import EnvClient
|
| 18 |
+
except ImportError:
|
| 19 |
+
# Backward compatibility with older OpenEnv versions.
|
| 20 |
+
from openenv.core import EnvClient
|
| 21 |
+
from openenv.core.client_types import StepResult
|
| 22 |
+
from openenv.core.env_server.types import State
|
| 23 |
+
|
| 24 |
+
from .models import GhostexecAction, GhostexecObservation
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class GhostexecEnv(
    EnvClient[GhostexecAction, GhostexecObservation, State]
):
    """
    Typed client for a Ghostexec environment server.

    The transport and session handling come from the ``EnvClient`` base class;
    this subclass only translates between Ghostexec models and the wire
    payloads: it serialises actions and rebuilds observations and state from
    the dictionaries the server sends back.
    """

    def _step_payload(self, action: GhostexecAction) -> Dict[str, Any]:
        """Serialise ``action`` to a JSON-compatible dict, omitting empty metadata."""
        body = action.model_dump(mode="json")
        if body.get("metadata"):
            return body
        # Missing, None, or empty metadata is dropped from the payload entirely.
        body.pop("metadata", None)
        return body

    def _parse_result(self, payload: Dict) -> StepResult[GhostexecObservation]:
        """Build a ``StepResult`` from a server response.

        ``reward`` and ``done`` are read from the top level of ``payload``;
        the briefing text, its length, and metadata are read from the nested
        ``"observation"`` mapping. Missing keys fall back to neutral defaults.
        """
        reward = payload.get("reward")
        finished = payload.get("done", False)
        raw_obs = payload.get("observation", {})

        parsed_obs = GhostexecObservation(
            echoed_message=raw_obs.get("echoed_message", ""),
            message_length=raw_obs.get("message_length", 0),
            done=finished,
            reward=reward,
            metadata=raw_obs.get("metadata", {}),
        )
        return StepResult(observation=parsed_obs, reward=reward, done=finished)

    def _parse_state(self, payload: Dict) -> State:
        """Build a ``State`` from a server state payload (step count defaults to 0)."""
        episode = payload.get("episode_id")
        steps = payload.get("step_count", 0)
        return State(episode_id=episode, step_count=steps)
|
conftest.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# Put repo root on sys.path before test collection (supports `uv run pytest` without editable install).
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Directory containing this conftest.py, i.e. the repo root.
_ROOT = Path(__file__).resolve().parent
_ROOT_ENTRY = str(_ROOT)

# Prepend once so repo modules win over any installed copy; skip if already present.
if _ROOT_ENTRY not in sys.path:
    sys.path.insert(0, _ROOT_ENTRY)
|
graders.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Public trajectory graders for OpenEnv Phase 2 / HF deep validation.
|
| 3 |
+
|
| 4 |
+
These are **episode-level** scores (strictly inside (0, 1)), separate from per-step
|
| 5 |
+
rewards in `server/reward.py`. The hackathon validator reads `openenv.yaml`
|
| 6 |
+
`tasks[].grader` and calls these functions with trajectory dicts.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from typing import Iterable, List
|
| 11 |
+
|
| 12 |
+
# Episode scores are kept strictly inside (0, 1); these are the clamp limits.
STRICT_MIN = 0.01
STRICT_MAX = 0.99


def _bounded(value: float) -> float:
    """Round ``value`` to 4 decimals and clamp it to [STRICT_MIN, STRICT_MAX].

    Args:
        value: Raw episode score; anything ``float()`` accepts.

    Returns:
        The clamped score. NaN maps to ``STRICT_MIN`` and +/-infinity clamp to
        the nearest limit, so the result is always a finite in-range number.

    Raises:
        TypeError, ValueError: If ``value`` cannot be converted by ``float()``.
    """
    score = float(value)
    # NaN is the only float that is not equal to itself. Every comparison with
    # it is false, so min()/max() would pass it through unclamped.
    if score != score:
        return STRICT_MIN
    return min(max(round(score, 4), STRICT_MIN), STRICT_MAX)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _as_reward_list(trajectory: dict | None) -> List[float]:
    """Extract per-step rewards from a trajectory payload as a list of floats.

    Sources are tried in this order, and the first usable one wins:

    1. ``trajectory["rewards"]`` - a non-empty list or tuple of step rewards.
    2. ``trajectory["score"]`` - a single scalar.
    3. ``trajectory["reward"]["total"]`` - when ``reward`` is a dict.
    4. ``trajectory["reward"]`` - a single scalar.

    Values that ``float()`` cannot convert (``None``, non-numeric strings,
    nested containers) are skipped rather than raised, so a malformed payload
    yields fewer rewards instead of crashing the grader.

    Args:
        trajectory: Trajectory dict, or ``None``. Any non-dict value is treated
            as an empty trajectory.

    Returns:
        The extracted rewards; an empty list when nothing usable is present.
    """

    def _coerce(raw: object) -> float | None:
        # Return raw as a float, or None when it is not convertible.
        try:
            return float(raw)  # type: ignore[arg-type]
        except (TypeError, ValueError, OverflowError):
            return None

    if not isinstance(trajectory, dict):
        return []

    rewards = trajectory.get("rewards")
    if isinstance(rewards, (list, tuple)) and rewards:
        return [step for step in map(_coerce, rewards) if step is not None]

    # Scalar fallbacks, in priority order.
    reward = trajectory.get("reward")
    fallbacks = (
        trajectory.get("score"),
        reward.get("total") if isinstance(reward, dict) else reward,
    )
    for candidate in fallbacks:
        scalar = _coerce(candidate)
        if scalar is not None:
            return [scalar]
    return []
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _profile(reward: float) -> str:
    """Map a single step reward to a coarse quality label.

    Bands (lowest first): ``unsafe_miss`` (<= 0.05), ``bad_call`` (<= 0.20),
    ``weak`` (< 0.50), ``workable`` (< 0.80), ``strong`` (< 0.95), and
    ``expert`` for everything else.

    Args:
        reward: Step reward to classify.

    Returns:
        The label of the band ``reward`` falls in. NaN is labelled
        ``unsafe_miss``.
    """
    # NaN fails every comparison below and would otherwise fall through to the
    # top label; an undefined reward must never be credited as "expert".
    if reward != reward:
        return "unsafe_miss"
    if reward <= 0.05:
        return "unsafe_miss"
    if reward <= 0.20:
        return "bad_call"
    if reward < 0.50:
        return "weak"
    if reward < 0.80:
        return "workable"
    if reward < 0.95:
        return "strong"
    return "expert"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _score_episode(
    rewards: List[float],
    *,
    miss_cost: float,
    overcall_cost: float,
    stability_gain: float,
    expertise_gain: float,
) -> float:
    """Collapse per-step rewards into one bounded episode score.

    The score is the mean reward, minus capped penalties for low-quality
    steps, plus bonuses when most steps are high quality. The result is passed
    through ``_bounded``.

    Args:
        rewards: Per-step rewards; an empty list scores ``_bounded(0.5)``.
        miss_cost: Penalty per ``unsafe_miss`` step (total capped at 0.35).
        overcall_cost: Penalty per ``bad_call`` step (total capped at 0.15).
        stability_gain: Bonus when at least 80% of steps are strong or expert.
        expertise_gain: Bonus when at least 60% of steps are expert.

    Returns:
        The bounded episode score.
    """
    if not rewards:
        return _bounded(0.5)

    # Tally how many steps landed in each quality band.
    tally: dict = {}
    for step_reward in rewards:
        band = _profile(step_reward)
        tally[band] = tally.get(band, 0) + 1

    step_total = len(rewards)
    average = sum(rewards) / step_total

    expert_steps = tally.get("expert", 0)
    strong_steps = tally.get("strong", 0) + expert_steps

    # Each penalty channel is capped so one failure mode cannot dominate.
    penalty = (
        min(tally.get("unsafe_miss", 0) * miss_cost, 0.35)
        + min(tally.get("bad_call", 0) * overcall_cost, 0.15)
        + min(tally.get("weak", 0) * 0.015, 0.06)
    )

    bonus = 0.0
    if strong_steps / step_total >= 0.80:
        bonus += stability_gain
    if expert_steps / step_total >= 0.60:
        bonus += expertise_gain

    return _bounded(average - penalty + bonus)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def phase2_core_grader(trajectory: dict | None = None) -> float:
    """Easy tier — dense default inbox (scenarios/phase2_core.json).

    Extracts the reward list from ``trajectory`` and scores it with this
    tier's penalty and bonus weights.
    """
    step_rewards = _as_reward_list(trajectory)
    weights = {
        "miss_cost": 0.12,
        "overcall_cost": 0.03,
        "stability_gain": 0.05,
        "expertise_gain": 0.01,
    }
    return _score_episode(step_rewards, **weights)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def monday_morning_grader(trajectory: dict | None = None) -> float:
    """Medium tier — stacked Monday conflicts (scenarios/monday_morning.json).

    Extracts the reward list from ``trajectory`` and scores it with this
    tier's penalty and bonus weights.
    """
    step_rewards = _as_reward_list(trajectory)
    weights = {
        "miss_cost": 0.09,
        "overcall_cost": 0.04,
        "stability_gain": 0.03,
        "expertise_gain": 0.02,
    }
    return _score_episode(step_rewards, **weights)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def dinner_disaster_grader(trajectory: dict | None = None) -> float:
    """Hard tier — personal/professional collision (scenarios/dinner_disaster.json).

    Extracts the reward list from ``trajectory`` and scores it with this
    tier's penalty and bonus weights.
    """
    step_rewards = _as_reward_list(trajectory)
    weights = {
        "miss_cost": 0.07,
        "overcall_cost": 0.03,
        "stability_gain": 0.02,
        "expertise_gain": 0.04,
    }
    return _score_episode(step_rewards, **weights)
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
__all__ = [
|
| 116 |
+
"phase2_core_grader",
|
| 117 |
+
"monday_morning_grader",
|
| 118 |
+
"dinner_disaster_grader",
|
| 119 |
+
"STRICT_MIN",
|
| 120 |
+
"STRICT_MAX",
|
| 121 |
+
]
|
inference.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline runner for the Ghostexec submission.
|
| 3 |
+
|
| 4 |
+
This script queries a chat model through the OpenAI client, sends its decision
|
| 5 |
+
to the environment server, and prints machine-readable lines expected by simple
|
| 6 |
+
evaluators/log parsers.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
from typing import Any, Iterable
|
| 15 |
+
|
| 16 |
+
import requests
|
| 17 |
+
from pydantic import ValidationError
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from .graders import dinner_disaster_grader, monday_morning_grader, phase2_core_grader
|
| 21 |
+
from .models import GhostexecAction
|
| 22 |
+
except ImportError:
|
| 23 |
+
from graders import dinner_disaster_grader, monday_morning_grader, phase2_core_grader
|
| 24 |
+
from models import GhostexecAction
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 28 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 29 |
+
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 30 |
+
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860").rstrip("/")
|
| 31 |
+
TASK_OVERRIDE = os.getenv("TASK_NAME", "").strip()
|
| 32 |
+
BENCHMARK = "ghostexec"
|
| 33 |
+
|
| 34 |
+
TASK_SETS: dict[str, tuple[str, ...]] = {
|
| 35 |
+
"easy": ("phase2_core",),
|
| 36 |
+
"medium": ("monday_morning",),
|
| 37 |
+
"hard": ("dinner_disaster",),
|
| 38 |
+
"all": ("phase2_core", "monday_morning", "dinner_disaster"),
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
TASK_TO_GRADER = {
|
| 42 |
+
"phase2_core": phase2_core_grader,
|
| 43 |
+
"monday_morning": monday_morning_grader,
|
| 44 |
+
"dinner_disaster": dinner_disaster_grader,
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
SYSTEM_MESSAGE = """
|
| 48 |
+
You are acting as an AI Chief-of-Staff assistant in Ghostexec.
|
| 49 |
+
|
| 50 |
+
You must output exactly one JSON object that matches GhostexecAction.
|
| 51 |
+
|
| 52 |
+
Allowed action_type values:
|
| 53 |
+
- reply_email
|
| 54 |
+
- archive_email
|
| 55 |
+
- reschedule_meeting
|
| 56 |
+
- cancel_meeting
|
| 57 |
+
- complete_task
|
| 58 |
+
- delegate_task
|
| 59 |
+
- send_message
|
| 60 |
+
- do_nothing
|
| 61 |
+
|
| 62 |
+
Allowed keys:
|
| 63 |
+
- action_type
|
| 64 |
+
- email_id
|
| 65 |
+
- message_body
|
| 66 |
+
- meeting_id
|
| 67 |
+
- new_time
|
| 68 |
+
- reason
|
| 69 |
+
- task_id
|
| 70 |
+
- contact_name
|
| 71 |
+
- message
|
| 72 |
+
|
| 73 |
+
Rules:
|
| 74 |
+
- Output valid JSON only (no markdown, no prose).
|
| 75 |
+
- Prefer high-impact conflict-reducing actions over do_nothing.
|
| 76 |
+
- Only reference ids/entities that appear in the briefing.
|
| 77 |
+
- If unsure, output {"action_type":"do_nothing"}.
|
| 78 |
+
""".strip()
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def emit_start(task_name: str) -> None:
    """Print the ``[START]`` record for a task and flush stdout immediately."""
    header = f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}"
    print(header, flush=True)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def emit_step(step_no: int, action_text: str, reward: float, done: bool, error: str | None) -> None:
    """Print one ``[STEP]`` record on a single line and flush stdout.

    Args:
        step_no: 1-based step index within the episode.
        action_text: Short label for the action taken.
        reward: Step reward, printed with two decimals.
        done: Whether the episode ended on this step (printed lowercase).
        error: Error description, or ``None``/empty for ``null``.
    """
    # Exception messages can span several lines; collapse all whitespace runs
    # so each record stays on exactly one line for line-oriented log parsers.
    error_text = " ".join(error.split()) if error else ""
    if not error_text:
        error_text = "null"
    print(
        f"[STEP] step={step_no} action={action_text} reward={reward:.2f} "
        f"done={str(done).lower()} error={error_text}",
        flush=True,
    )
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def emit_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Print the ``[END]`` summary record for an episode and flush stdout.

    ``score`` is printed with six decimals; ``rewards`` are printed as a
    comma-separated list with two decimals each.
    """
    formatted = [f"{reward:.2f}" for reward in rewards]
    reward_text = ",".join(formatted)
    success_text = str(success).lower()
    line = (
        f"[END] success={success_text} steps={steps} "
        f"score={score:.6f} rewards={reward_text}"
    )
    print(line, flush=True)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def choose_tasks(selection: str) -> Iterable[str]:
    """Return the task names to run.

    A non-empty ``TASK_OVERRIDE`` wins and yields a single-task tuple;
    otherwise the tuple registered in ``TASK_SETS`` for ``selection`` is used.
    """
    return (TASK_OVERRIDE,) if TASK_OVERRIDE else TASK_SETS[selection]
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def client() -> Any:
    """Build the OpenAI-compatible client used to query the chat model.

    Raises:
        EnvironmentError: If neither ``HF_TOKEN`` nor ``API_KEY`` was set.
    """
    token = HF_TOKEN
    if token:
        # Imported lazily so the module can be loaded without the openai package.
        from openai import OpenAI

        return OpenAI(base_url=API_BASE_URL, api_key=token)
    raise EnvironmentError("HF_TOKEN or API_KEY must be set before running inference.py")
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def fetch_reset(task_name: str) -> dict[str, Any]:
    """POST to the environment's ``/reset`` endpoint for ``task_name``.

    Returns the decoded JSON body; an HTTP error status raises via
    ``raise_for_status``.
    """
    endpoint = f"{ENV_URL}/reset"
    body = {"task_id": task_name}
    response = requests.post(endpoint, json=body, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def submit_action(action: GhostexecAction) -> dict[str, Any]:
    """POST ``action`` to the environment's ``/step`` endpoint.

    The action is serialised with ``model_dump`` and wrapped under the
    ``"action"`` key. Returns the decoded JSON body; an HTTP error status
    raises via ``raise_for_status``.
    """
    endpoint = f"{ENV_URL}/step"
    body = {"action": action.model_dump()}
    response = requests.post(endpoint, json=body, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _extract_json_object(text: str) -> str:
|
| 138 |
+
s = text.strip()
|
| 139 |
+
if s.startswith("```"):
|
| 140 |
+
# tolerate fenced output from weak model instruction following
|
| 141 |
+
s = s.strip("`")
|
| 142 |
+
if "\n" in s:
|
| 143 |
+
s = s.split("\n", 1)[1]
|
| 144 |
+
start = s.find("{")
|
| 145 |
+
end = s.rfind("}")
|
| 146 |
+
if start == -1 or end == -1 or end <= start:
|
| 147 |
+
raise json.JSONDecodeError("No JSON object found", s, 0)
|
| 148 |
+
return s[start : end + 1]
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def prompt_for_case(observation: dict[str, Any]) -> str:
    """Build the user prompt: instructions, the observation as JSON, and a goal.

    The observation is pretty-printed with two-space indentation and ASCII
    escaping; sections are separated by blank lines.
    """
    briefing = json.dumps(observation, ensure_ascii=True, indent=2)
    sections = [
        "Take one best next action for the Ghostexec environment.",
        "Return one final structured GhostexecAction JSON object.",
        briefing,
        "Choose the action that most reduces conflicts, protects relationships, "
        "and advances urgent tasks.",
    ]
    return "\n\n".join(sections)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def ask_model(llm: Any, observation: dict[str, Any]) -> GhostexecAction:
    """Ask the chat model for the next action and parse it.

    Sends the system message plus the prompt built from ``observation``,
    extracts the JSON object from the first choice's content, and constructs
    a ``GhostexecAction`` from it. JSON and validation errors propagate to
    the caller.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_MESSAGE},
        {"role": "user", "content": prompt_for_case(observation)},
    ]
    completion = llm.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        temperature=0.0,
        max_tokens=260,
        stream=False,
    )
    raw_reply = completion.choices[0].message.content
    # A missing/empty content field is treated as an empty reply.
    text = raw_reply.strip() if raw_reply else ""
    fields = json.loads(_extract_json_object(text))
    return GhostexecAction(**fields)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def compact_action(action: GhostexecAction) -> str:
    """Render an action as ``type`` or ``type/target`` for log lines.

    The target is the first non-empty value among ``email_id``,
    ``meeting_id``, ``task_id`` and ``contact_name``, in that order.
    """
    label = action.action_type
    targets = (action.email_id, action.meeting_id, action.task_id, action.contact_name)
    chosen = next((target for target in targets if target), None)
    if chosen is None:
        return label
    return f"{label}/{chosen}"
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _extract_reward(payload: dict[str, Any]) -> float:
|
| 186 |
+
reward_payload = payload.get("reward")
|
| 187 |
+
if isinstance(reward_payload, dict):
|
| 188 |
+
return float(reward_payload.get("total", 0.0))
|
| 189 |
+
if reward_payload is not None:
|
| 190 |
+
return float(reward_payload)
|
| 191 |
+
obs = payload.get("observation")
|
| 192 |
+
if isinstance(obs, dict) and obs.get("reward") is not None:
|
| 193 |
+
return float(obs["reward"])
|
| 194 |
+
return 0.0
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def final_score(task_name: str, rewards: list[float]) -> float:
    """Compute the episode score for ``task_name``.

    Uses the task's registered grader when one exists. Otherwise falls back
    to the mean reward (0.0 for an empty list), rounded to four decimals and
    clamped to ``[0.01, 0.99]``.
    """
    grade_fn = TASK_TO_GRADER.get(task_name)
    if grade_fn is not None:
        return float(grade_fn({"rewards": rewards}))

    mean_reward = sum(rewards) / len(rewards) if rewards else 0.0
    clamped_low = max(round(mean_reward, 4), 0.01)
    return min(clamped_low, 0.99)
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def run_one_task(llm: Any, task_name: str) -> None:
    """Run one episode of ``task_name`` and print START/STEP/END records.

    The loop asks the model for an action, submits it to the environment and
    logs the reward until the environment reports ``done``. Any failure ends
    the episode: it is logged as one extra zero-reward step, and the episode
    finishes with ``score=0.0`` and ``success=false``.

    Args:
        llm: Chat client exposing ``chat.completions.create``.
        task_name: Task id sent to ``/reset`` and used to pick the grader.
    """
    rewards: list[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    def record_failure(label: str, detail: str) -> None:
        # Log the failure as the NEXT step, keeping the rewards and step
        # count already emitted so the [END] line agrees with earlier
        # [STEP] lines instead of being reset to a single step.
        nonlocal steps_taken
        steps_taken += 1
        rewards.append(0.0)
        emit_step(steps_taken, label, 0.0, True, detail)

    emit_start(task_name)

    try:
        result = fetch_reset(task_name)
        done = bool(result.get("done", False))

        while not done:
            # Some servers nest the observation, others return it flat.
            observation = result.get("observation", result)
            action = ask_model(llm, observation if isinstance(observation, dict) else result)
            action_text = compact_action(action)

            result = submit_action(action)
            reward = _extract_reward(result)
            done = bool(result.get("done", False))

            rewards.append(reward)
            steps_taken += 1
            emit_step(steps_taken, action_text, reward, done, None)

        score = final_score(task_name, rewards)
        success = score >= 0.60

    except json.JSONDecodeError:
        record_failure("parse_error", "parse_error")
    except ValidationError:
        record_failure("schema_error", "schema_error")
    except Exception as exc:  # top-level boundary: always emit [END]
        record_failure("error", str(exc))
    finally:
        emit_end(success, steps_taken, score, rewards or [0.0])
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def main() -> None:
    """Parse ``--difficulty`` and run the baseline agent on each selected task."""
    parser = argparse.ArgumentParser(description="Run the Ghostexec baseline agent")
    parser.add_argument(
        "--difficulty",
        choices=["easy", "medium", "hard", "all"],
        default="all",
        help="Which task subset to run",
    )
    difficulty = parser.parse_args().difficulty

    # One client is shared across all tasks in the run.
    llm = client()
    for task_name in choose_tasks(difficulty):
        run_one_task(llm, task_name)
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
if __name__ == "__main__":
|
| 265 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Data models for GhostExec — all world and API types live here."""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from typing import Any, Literal
|
| 12 |
+
|
| 13 |
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
from openenv.core.env_server.types import Action as _OpenEnvAction
|
| 17 |
+
from openenv.core.env_server.types import Observation as _OpenEnvObservation
|
| 18 |
+
except Exception:
|
| 19 |
+
_OpenEnvAction = BaseModel # type: ignore[assignment]
|
| 20 |
+
_OpenEnvObservation = BaseModel # type: ignore[assignment]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _is_pydantic_model_class(cls: object) -> bool:
    """Return True only when ``cls`` is a class deriving from pydantic ``BaseModel``."""
    if not isinstance(cls, type):
        return False
    try:
        return issubclass(cls, BaseModel)
    except TypeError:
        # Defensive: treat anything issubclass cannot handle as "not a model".
        return False
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Some OpenEnv builds expose dataclass-style Action/Observation that do not accept
|
| 31 |
+
# additional keyword fields, which breaks GhostexecAction/GhostexecObservation
|
| 32 |
+
# construction in Colab. Fall back to BaseModel in that case.
|
| 33 |
+
ActionBase = _OpenEnvAction if _is_pydantic_model_class(_OpenEnvAction) else BaseModel
|
| 34 |
+
ObservationBase = (
|
| 35 |
+
_OpenEnvObservation if _is_pydantic_model_class(_OpenEnvObservation) else BaseModel
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# --- Aliases for scenario / world strings ---
|
| 39 |
+
|
| 40 |
+
EmailPriority = Literal["critical", "high", "normal", "low"]
|
| 41 |
+
SenderRelationship = Literal["VIP", "personal", "professional", "unknown"]
|
| 42 |
+
ContactRelationship = Literal[
|
| 43 |
+
"board_member",
|
| 44 |
+
"spouse",
|
| 45 |
+
"investor",
|
| 46 |
+
"direct_report",
|
| 47 |
+
"client",
|
| 48 |
+
"friend",
|
| 49 |
+
"team_member",
|
| 50 |
+
]
|
| 51 |
+
CommPreference = Literal["email", "text", "call"]
|
| 52 |
+
Mood = Literal["happy", "neutral", "annoyed", "angry", "furious"]
|
| 53 |
+
TaskStatus = Literal["pending", "in-progress", "done", "overdue"]
|
| 54 |
+
Effort = Literal["low", "medium", "high"]
|
| 55 |
+
MeetingPriority = Literal["critical", "high", "normal", "low"]
|
| 56 |
+
|
| 57 |
+
GhostexecActionType = Literal[
|
| 58 |
+
"reply_email",
|
| 59 |
+
"archive_email",
|
| 60 |
+
"reschedule_meeting",
|
| 61 |
+
"cancel_meeting",
|
| 62 |
+
"complete_task",
|
| 63 |
+
"delegate_task",
|
| 64 |
+
"send_message",
|
| 65 |
+
"do_nothing",
|
| 66 |
+
]
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class Email(BaseModel):
    """Single inbox message."""

    # Undeclared keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    id: str
    sender: str
    subject: str
    body: str
    # Both flags default to False when omitted.
    read: bool = False
    replied: bool = False
    # Required; restricted to the EmailPriority / SenderRelationship literals.
    priority: EmailPriority
    sender_relationship: SenderRelationship
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class Meeting(BaseModel):
    """Calendar block."""

    # Undeclared keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    id: str
    title: str
    # Stored as a string; the ISO 8601 format is not enforced by this model.
    start: str = Field(..., description="ISO 8601 start datetime")
    # Must be at least one minute.
    duration_minutes: int = Field(..., ge=1)
    attendees: list[str] = Field(default_factory=list)
    location: str = ""
    priority: MeetingPriority = "normal"
    cancelled: bool = False
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class Contact(BaseModel):
    """Stakeholder in the exec's network."""

    # Undeclared keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    name: str
    relationship_type: ContactRelationship
    communication_preference: CommPreference
    # Required integer constrained to the inclusive range 1..5.
    importance: int = Field(..., ge=1, le=5)
    mood: Mood = "neutral"
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
class Task(BaseModel):
    """To-do item."""

    # Undeclared keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    id: str
    description: str
    # Stored as a string; the ISO 8601 format is not enforced by this model.
    deadline: str = Field(..., description="ISO 8601 deadline")
    owner: str
    status: TaskStatus = "pending"
    effort: Effort = "medium"
    # None when the field is omitted.
    delegated_to: str | None = None
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
class WorldState(BaseModel):
    """Full simulated world — JSON-serialisable."""

    # Undeclared keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    # The only required field; every other field has a default.
    simulation_time: str = Field(..., description="Current simulated instant, ISO 8601")
    # Integer constrained to the inclusive range 0..100.
    stress: int = Field(default=0, ge=0, le=100)
    active_conflicts: list[str] = Field(default_factory=list)
    action_log: list[str] = Field(default_factory=list)
    episode_active: bool = True
    episode_end_reason: str | None = None
    # Defaults to 48; validated to lie in 1..10_000.
    max_episode_steps: int = Field(default=48, ge=1, le=10_000)
    # Entity collections; each defaults to a fresh empty list.
    emails: list[Email] = Field(default_factory=list)
    meetings: list[Meeting] = Field(default_factory=list)
    contacts: list[Contact] = Field(default_factory=list)
    tasks: list[Task] = Field(default_factory=list)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class GhostexecAction(ActionBase):
    """
    One agent action (Phase 3).

    ``action_type`` selects the action; the remaining string fields carry its
    arguments and default to empty strings. A dict payload that omits
    ``action_type`` is treated as ``do_nothing`` so older clients do not
    crash deserialization.
    """

    action_type: GhostexecActionType = Field(
        default="do_nothing",
        description="Which legal action to execute this step",
    )
    email_id: str = ""
    message_body: str = ""
    meeting_id: str = ""
    new_time: str = ""
    reason: str = ""
    task_id: str = ""
    contact_name: str = ""
    message: str = Field(default="", description="Optional note for action_log (legacy / debug)")

    @model_validator(mode="before")
    @classmethod
    def _default_action_type(cls, data: Any) -> Any:
        # Non-dict inputs and payloads that already name an action pass
        # through untouched; otherwise return a copy with the default filled in.
        if not isinstance(data, dict) or "action_type" in data:
            return data
        return {**data, "action_type": "do_nothing"}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
class GhostexecObservation(ObservationBase):
    """
    Primary LLM-facing field is `echoed_message`: full plain-text briefing (Phase 3).
    """

    # Keep these fields explicit for compatibility with OpenEnv builds where
    # Observation is not a pydantic base carrying done/reward/metadata.
    done: bool = False
    # None when no reward is supplied.
    reward: float | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    echoed_message: str = Field(
        default="",
        description="Human-readable briefing text for the LLM (not JSON)",
    )
    # Not computed by this model; whoever builds the observation must set it.
    message_length: int = Field(default=0, description="Byte length of echoed_message for quick checks")
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
class RewardBreakdown(BaseModel):
    """Phase 4 reward components (logged and exposed in observation metadata)."""

    # Undeclared keys are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")

    # Every component defaults to 0.0 except output_scale, which defaults to 1.0.
    # NOTE(review): how these combine into `final` is defined by the reward
    # code, not by this model — confirm against server/reward.py.
    conflict_raw: float = 0.0
    critical_queue_bonus: float = 0.0
    conflict: float = 0.0
    relationship: float = 0.0
    task: float = 0.0
    weighted_base: float = 0.0
    output_scale: float = 1.0
    invalid_step_adjustment: float = 0.0
    episode_completion_bonus: float = 0.0
    catastrophic_penalty: float = 0.0
    do_nothing_floor: float = 0.0
    final: float = 0.0
|
openenv.yaml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: ghostexec
|
| 3 |
+
version: "0.1.0"
|
| 4 |
+
description: "GhostExec — RL training environment for personal and executive task conflict resolution (The AI Chief of Staff)."
|
| 5 |
+
type: space
|
| 6 |
+
runtime: fastapi
|
| 7 |
+
app: server.app:app
|
| 8 |
+
port: 8000
|
| 9 |
+
|
| 10 |
+
difficulties: [easy, medium, hard]
|
| 11 |
+
max_steps: 20
|
| 12 |
+
|
| 13 |
+
tasks:
|
| 14 |
+
- id: phase2_core
|
| 15 |
+
difficulty: easy
|
| 16 |
+
description: >
|
| 17 |
+
Default dense inbox/calendar fixture (scenarios/phase2_core.json).
|
| 18 |
+
Stress-test triage, VIP queues, and calendar relief.
|
| 19 |
+
grader: graders.phase2_core_grader
|
| 20 |
+
|
| 21 |
+
- id: monday_morning
|
| 22 |
+
difficulty: medium
|
| 23 |
+
description: >
|
| 24 |
+
Monday morning rush with stacked conflicts (scenarios/monday_morning.json).
|
| 25 |
+
grader: graders.monday_morning_grader
|
| 26 |
+
|
| 27 |
+
- id: dinner_disaster
|
| 28 |
+
difficulty: hard
|
| 29 |
+
description: >
|
| 30 |
+
Personal/professional collision with escalation risk
|
| 31 |
+
(scenarios/dinner_disaster.json).
|
| 32 |
+
grader: graders.dinner_disaster_grader
|
openenv_ghostexec.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-ghostexec
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Ghostexec environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.3
|
| 7 |
+
Provides-Extra: dev
|
| 8 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 9 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
| 10 |
+
Requires-Dist: pyyaml>=6.0.0; extra == "dev"
|
| 11 |
+
Requires-Dist: matplotlib>=3.8.0; extra == "dev"
|
| 12 |
+
Provides-Extra: constrained
|
| 13 |
+
Requires-Dist: lm-format-enforcer>=0.10; extra == "constrained"
|
| 14 |
+
Provides-Extra: constrained-outlines
|
| 15 |
+
Requires-Dist: outlines>=0.1; extra == "constrained-outlines"
|
openenv_ghostexec.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
__init__.py
|
| 3 |
+
client.py
|
| 4 |
+
conftest.py
|
| 5 |
+
graders.py
|
| 6 |
+
models.py
|
| 7 |
+
pyproject.toml
|
| 8 |
+
./__init__.py
|
| 9 |
+
./client.py
|
| 10 |
+
./conftest.py
|
| 11 |
+
./graders.py
|
| 12 |
+
./models.py
|
| 13 |
+
./scenarios/dinner_disaster.json
|
| 14 |
+
./scenarios/monday_morning.json
|
| 15 |
+
./scenarios/phase2_core.json
|
| 16 |
+
./scenarios/schema_drift_test.json
|
| 17 |
+
./scenarios/vip_meltdown.json
|
| 18 |
+
./scenarios/vip_meltdown_drift.json
|
| 19 |
+
openenv_ghostexec.egg-info/PKG-INFO
|
| 20 |
+
openenv_ghostexec.egg-info/SOURCES.txt
|
| 21 |
+
openenv_ghostexec.egg-info/dependency_links.txt
|
| 22 |
+
openenv_ghostexec.egg-info/entry_points.txt
|
| 23 |
+
openenv_ghostexec.egg-info/requires.txt
|
| 24 |
+
openenv_ghostexec.egg-info/top_level.txt
|
| 25 |
+
scenarios/dinner_disaster.json
|
| 26 |
+
scenarios/monday_morning.json
|
| 27 |
+
scenarios/phase2_core.json
|
| 28 |
+
scenarios/schema_drift_test.json
|
| 29 |
+
scenarios/vip_meltdown.json
|
| 30 |
+
scenarios/vip_meltdown_drift.json
|
| 31 |
+
server/__init__.py
|
| 32 |
+
server/app.py
|
| 33 |
+
server/ghostexec_environment.py
|
| 34 |
+
server/reward.py
|
| 35 |
+
tests/test_api_reward_dead_500.py
|
| 36 |
+
tests/test_complete_integration.py
|
| 37 |
+
tests/test_docker_build.py
|
| 38 |
+
tests/test_env.py
|
| 39 |
+
tests/test_live_server_exhaustive.py
|
| 40 |
+
tests/test_phase1.py
|
| 41 |
+
tests/test_phase2.py
|
| 42 |
+
tests/test_phase3.py
|
| 43 |
+
tests/test_phase4.py
|
| 44 |
+
tests/test_reward_dead_suite.py
|
| 45 |
+
tests/test_submission_plots_committed.py
|
openenv_ghostexec.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_ghostexec.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = ghostexec.server.app:main
|
openenv_ghostexec.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.3
|
| 2 |
+
|
| 3 |
+
[constrained]
|
| 4 |
+
lm-format-enforcer>=0.10
|
| 5 |
+
|
| 6 |
+
[constrained-outlines]
|
| 7 |
+
outlines>=0.1
|
| 8 |
+
|
| 9 |
+
[dev]
|
| 10 |
+
pytest>=8.0.0
|
| 11 |
+
pytest-cov>=4.0.0
|
| 12 |
+
pyyaml>=6.0.0
|
| 13 |
+
matplotlib>=3.8.0
|
openenv_ghostexec.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ghostexec
|
outputs/logs/api_dead_live_500.jsonl
ADDED
|
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"idx": 0, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 2 |
+
{"idx": 1, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 3 |
+
{"idx": 2, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 4 |
+
{"idx": 3, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 5 |
+
{"idx": 4, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 6 |
+
{"idx": 5, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 7 |
+
{"idx": 6, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 8 |
+
{"idx": 7, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 9 |
+
{"idx": 8, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 10 |
+
{"idx": 9, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 11 |
+
{"idx": 10, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 12 |
+
{"idx": 11, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 13 |
+
{"idx": 12, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 14 |
+
{"idx": 13, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 15 |
+
{"idx": 14, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 16 |
+
{"idx": 15, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 17 |
+
{"idx": 16, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 18 |
+
{"idx": 17, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 19 |
+
{"idx": 18, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 20 |
+
{"idx": 19, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 21 |
+
{"idx": 20, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 22 |
+
{"idx": 21, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 23 |
+
{"idx": 22, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 24 |
+
{"idx": 23, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 25 |
+
{"idx": 24, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 26 |
+
{"idx": 25, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 27 |
+
{"idx": 26, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 28 |
+
{"idx": 27, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 29 |
+
{"idx": 28, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 30 |
+
{"idx": 29, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 31 |
+
{"idx": 30, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 32 |
+
{"idx": 31, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 33 |
+
{"idx": 32, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 34 |
+
{"idx": 33, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 35 |
+
{"idx": 34, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 36 |
+
{"idx": 35, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 37 |
+
{"idx": 36, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 38 |
+
{"idx": 37, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 39 |
+
{"idx": 38, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 40 |
+
{"idx": 39, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 41 |
+
{"idx": 40, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 42 |
+
{"idx": 41, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 43 |
+
{"idx": 42, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 44 |
+
{"idx": 43, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 45 |
+
{"idx": 44, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 46 |
+
{"idx": 45, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 47 |
+
{"idx": 46, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 48 |
+
{"idx": 47, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 49 |
+
{"idx": 48, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 50 |
+
{"idx": 49, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 51 |
+
{"idx": 50, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 52 |
+
{"idx": 51, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 53 |
+
{"idx": 52, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 54 |
+
{"idx": 53, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 55 |
+
{"idx": 54, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 56 |
+
{"idx": 55, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 57 |
+
{"idx": 56, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 58 |
+
{"idx": 57, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 59 |
+
{"idx": 58, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 60 |
+
{"idx": 59, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 61 |
+
{"idx": 60, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 62 |
+
{"idx": 61, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 63 |
+
{"idx": 62, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 64 |
+
{"idx": 63, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 65 |
+
{"idx": 64, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 66 |
+
{"idx": 65, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 67 |
+
{"idx": 66, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 68 |
+
{"idx": 67, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 69 |
+
{"idx": 68, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 70 |
+
{"idx": 69, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 71 |
+
{"idx": 70, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 72 |
+
{"idx": 71, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 73 |
+
{"idx": 72, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 74 |
+
{"idx": 73, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 75 |
+
{"idx": 74, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 76 |
+
{"idx": 75, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 77 |
+
{"idx": 76, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 78 |
+
{"idx": 77, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 79 |
+
{"idx": 78, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 80 |
+
{"idx": 79, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 81 |
+
{"idx": 80, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 82 |
+
{"idx": 81, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 83 |
+
{"idx": 82, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 84 |
+
{"idx": 83, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 85 |
+
{"idx": 84, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 86 |
+
{"idx": 85, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 87 |
+
{"idx": 86, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 88 |
+
{"idx": 87, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 89 |
+
{"idx": 88, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 90 |
+
{"idx": 89, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 91 |
+
{"idx": 90, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 92 |
+
{"idx": 91, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 93 |
+
{"idx": 92, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 94 |
+
{"idx": 93, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 95 |
+
{"idx": 94, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 96 |
+
{"idx": 95, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 97 |
+
{"idx": 96, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 98 |
+
{"idx": 97, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 99 |
+
{"idx": 98, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 100 |
+
{"idx": 99, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 101 |
+
{"idx": 100, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 102 |
+
{"idx": 101, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 103 |
+
{"idx": 102, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 104 |
+
{"idx": 103, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 105 |
+
{"idx": 104, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 106 |
+
{"idx": 105, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 107 |
+
{"idx": 106, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 108 |
+
{"idx": 107, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 109 |
+
{"idx": 108, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 110 |
+
{"idx": 109, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 111 |
+
{"idx": 110, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 112 |
+
{"idx": 111, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 113 |
+
{"idx": 112, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 114 |
+
{"idx": 113, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 115 |
+
{"idx": 114, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 116 |
+
{"idx": 115, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 117 |
+
{"idx": 116, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 118 |
+
{"idx": 117, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 119 |
+
{"idx": 118, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 120 |
+
{"idx": 119, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 121 |
+
{"idx": 120, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 122 |
+
{"idx": 121, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 123 |
+
{"idx": 122, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 124 |
+
{"idx": 123, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 125 |
+
{"idx": 124, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 126 |
+
{"idx": 125, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 127 |
+
{"idx": 126, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 128 |
+
{"idx": 127, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 129 |
+
{"idx": 128, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 130 |
+
{"idx": 129, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 131 |
+
{"idx": 130, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 132 |
+
{"idx": 131, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 133 |
+
{"idx": 132, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 134 |
+
{"idx": 133, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 135 |
+
{"idx": 134, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 136 |
+
{"idx": 135, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 137 |
+
{"idx": 136, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 138 |
+
{"idx": 137, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 139 |
+
{"idx": 138, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 140 |
+
{"idx": 139, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 141 |
+
{"idx": 140, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 142 |
+
{"idx": 141, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 143 |
+
{"idx": 142, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 144 |
+
{"idx": 143, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 145 |
+
{"idx": 144, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 146 |
+
{"idx": 145, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 147 |
+
{"idx": 146, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 148 |
+
{"idx": 147, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 149 |
+
{"idx": 148, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 150 |
+
{"idx": 149, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 151 |
+
{"idx": 150, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 152 |
+
{"idx": 151, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 153 |
+
{"idx": 152, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 154 |
+
{"idx": 153, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 155 |
+
{"idx": 154, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 156 |
+
{"idx": 155, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 157 |
+
{"idx": 156, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 158 |
+
{"idx": 157, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 159 |
+
{"idx": 158, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 160 |
+
{"idx": 159, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 161 |
+
{"idx": 160, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 162 |
+
{"idx": 161, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 163 |
+
{"idx": 162, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 164 |
+
{"idx": 163, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 165 |
+
{"idx": 164, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 166 |
+
{"idx": 165, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 167 |
+
{"idx": 166, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 168 |
+
{"idx": 167, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 169 |
+
{"idx": 168, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 170 |
+
{"idx": 169, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 171 |
+
{"idx": 170, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 172 |
+
{"idx": 171, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 173 |
+
{"idx": 172, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 174 |
+
{"idx": 173, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 175 |
+
{"idx": 174, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 176 |
+
{"idx": 175, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 177 |
+
{"idx": 176, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 178 |
+
{"idx": 177, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 179 |
+
{"idx": 178, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 180 |
+
{"idx": 179, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 181 |
+
{"idx": 180, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 182 |
+
{"idx": 181, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 183 |
+
{"idx": 182, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 184 |
+
{"idx": 183, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 185 |
+
{"idx": 184, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 186 |
+
{"idx": 185, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 187 |
+
{"idx": 186, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 188 |
+
{"idx": 187, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 189 |
+
{"idx": 188, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 190 |
+
{"idx": 189, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 191 |
+
{"idx": 190, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 192 |
+
{"idx": 191, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 193 |
+
{"idx": 192, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 194 |
+
{"idx": 193, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 195 |
+
{"idx": 194, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 196 |
+
{"idx": 195, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 197 |
+
{"idx": 196, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 198 |
+
{"idx": 197, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 199 |
+
{"idx": 198, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 200 |
+
{"idx": 199, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 201 |
+
{"idx": 200, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 202 |
+
{"idx": 201, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 203 |
+
{"idx": 202, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 204 |
+
{"idx": 203, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 205 |
+
{"idx": 204, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 206 |
+
{"idx": 205, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 207 |
+
{"idx": 206, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 208 |
+
{"idx": 207, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 209 |
+
{"idx": 208, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 210 |
+
{"idx": 209, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 211 |
+
{"idx": 210, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 212 |
+
{"idx": 211, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 213 |
+
{"idx": 212, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 214 |
+
{"idx": 213, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 215 |
+
{"idx": 214, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 216 |
+
{"idx": 215, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 217 |
+
{"idx": 216, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 218 |
+
{"idx": 217, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 219 |
+
{"idx": 218, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 220 |
+
{"idx": 219, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 221 |
+
{"idx": 220, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 222 |
+
{"idx": 221, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 223 |
+
{"idx": 222, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 224 |
+
{"idx": 223, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 225 |
+
{"idx": 224, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 226 |
+
{"idx": 225, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 227 |
+
{"idx": 226, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 228 |
+
{"idx": 227, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 229 |
+
{"idx": 228, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 230 |
+
{"idx": 229, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 231 |
+
{"idx": 230, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 232 |
+
{"idx": 231, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 233 |
+
{"idx": 232, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 234 |
+
{"idx": 233, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 235 |
+
{"idx": 234, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 236 |
+
{"idx": 235, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 237 |
+
{"idx": 236, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 238 |
+
{"idx": 237, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 239 |
+
{"idx": 238, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 240 |
+
{"idx": 239, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 241 |
+
{"idx": 240, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 242 |
+
{"idx": 241, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 243 |
+
{"idx": 242, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 244 |
+
{"idx": 243, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 245 |
+
{"idx": 244, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 246 |
+
{"idx": 245, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 247 |
+
{"idx": 246, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 248 |
+
{"idx": 247, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 249 |
+
{"idx": 248, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 250 |
+
{"idx": 249, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 251 |
+
{"idx": 250, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 252 |
+
{"idx": 251, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 253 |
+
{"idx": 252, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 254 |
+
{"idx": 253, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 255 |
+
{"idx": 254, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 256 |
+
{"idx": 255, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 257 |
+
{"idx": 256, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 258 |
+
{"idx": 257, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 259 |
+
{"idx": 258, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 260 |
+
{"idx": 259, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 261 |
+
{"idx": 260, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 262 |
+
{"idx": 261, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 263 |
+
{"idx": 262, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 264 |
+
{"idx": 263, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 265 |
+
{"idx": 264, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 266 |
+
{"idx": 265, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 267 |
+
{"idx": 266, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 268 |
+
{"idx": 267, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 269 |
+
{"idx": 268, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 270 |
+
{"idx": 269, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 271 |
+
{"idx": 270, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 272 |
+
{"idx": 271, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 273 |
+
{"idx": 272, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 274 |
+
{"idx": 273, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 275 |
+
{"idx": 274, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 276 |
+
{"idx": 275, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 277 |
+
{"idx": 276, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 278 |
+
{"idx": 277, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 279 |
+
{"idx": 278, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 280 |
+
{"idx": 279, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 281 |
+
{"idx": 280, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 282 |
+
{"idx": 281, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 283 |
+
{"idx": 282, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 284 |
+
{"idx": 283, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 285 |
+
{"idx": 284, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 286 |
+
{"idx": 285, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 287 |
+
{"idx": 286, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 288 |
+
{"idx": 287, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 289 |
+
{"idx": 288, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 290 |
+
{"idx": 289, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 291 |
+
{"idx": 290, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 292 |
+
{"idx": 291, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 293 |
+
{"idx": 292, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 294 |
+
{"idx": 293, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 295 |
+
{"idx": 294, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 296 |
+
{"idx": 295, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 297 |
+
{"idx": 296, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 298 |
+
{"idx": 297, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 299 |
+
{"idx": 298, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 300 |
+
{"idx": 299, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 301 |
+
{"idx": 300, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 302 |
+
{"idx": 301, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 303 |
+
{"idx": 302, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 304 |
+
{"idx": 303, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 305 |
+
{"idx": 304, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 306 |
+
{"idx": 305, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 307 |
+
{"idx": 306, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 308 |
+
{"idx": 307, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 309 |
+
{"idx": 308, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 310 |
+
{"idx": 309, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 311 |
+
{"idx": 310, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 312 |
+
{"idx": 311, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 313 |
+
{"idx": 312, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 314 |
+
{"idx": 313, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 315 |
+
{"idx": 314, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 316 |
+
{"idx": 315, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 317 |
+
{"idx": 316, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 318 |
+
{"idx": 317, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 319 |
+
{"idx": 318, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 320 |
+
{"idx": 319, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 321 |
+
{"idx": 320, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 322 |
+
{"idx": 321, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 323 |
+
{"idx": 322, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 324 |
+
{"idx": 323, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 325 |
+
{"idx": 324, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 326 |
+
{"idx": 325, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 327 |
+
{"idx": 326, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 328 |
+
{"idx": 327, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 329 |
+
{"idx": 328, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 330 |
+
{"idx": 329, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 331 |
+
{"idx": 330, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 332 |
+
{"idx": 331, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 333 |
+
{"idx": 332, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 334 |
+
{"idx": 333, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 335 |
+
{"idx": 334, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 336 |
+
{"idx": 335, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 337 |
+
{"idx": 336, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 338 |
+
{"idx": 337, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 339 |
+
{"idx": 338, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 340 |
+
{"idx": 339, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 341 |
+
{"idx": 340, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 342 |
+
{"idx": 341, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 343 |
+
{"idx": 342, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 344 |
+
{"idx": 343, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 345 |
+
{"idx": 344, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 346 |
+
{"idx": 345, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 347 |
+
{"idx": 346, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 348 |
+
{"idx": 347, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 349 |
+
{"idx": 348, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 350 |
+
{"idx": 349, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 351 |
+
{"idx": 350, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 352 |
+
{"idx": 351, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 353 |
+
{"idx": 352, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 354 |
+
{"idx": 353, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 355 |
+
{"idx": 354, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 356 |
+
{"idx": 355, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 357 |
+
{"idx": 356, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 358 |
+
{"idx": 357, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 359 |
+
{"idx": 358, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 360 |
+
{"idx": 359, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 361 |
+
{"idx": 360, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 362 |
+
{"idx": 361, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 363 |
+
{"idx": 362, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 364 |
+
{"idx": 363, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 365 |
+
{"idx": 364, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 366 |
+
{"idx": 365, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 367 |
+
{"idx": 366, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 368 |
+
{"idx": 367, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 369 |
+
{"idx": 368, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 370 |
+
{"idx": 369, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 371 |
+
{"idx": 370, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 372 |
+
{"idx": 371, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 373 |
+
{"idx": 372, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 374 |
+
{"idx": 373, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 375 |
+
{"idx": 374, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 376 |
+
{"idx": 375, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 377 |
+
{"idx": 376, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 378 |
+
{"idx": 377, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 379 |
+
{"idx": 378, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 380 |
+
{"idx": 379, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 381 |
+
{"idx": 380, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 382 |
+
{"idx": 381, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 383 |
+
{"idx": 382, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 384 |
+
{"idx": 383, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 385 |
+
{"idx": 384, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 386 |
+
{"idx": 385, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 387 |
+
{"idx": 386, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 388 |
+
{"idx": 387, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 389 |
+
{"idx": 388, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 390 |
+
{"idx": 389, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 391 |
+
{"idx": 390, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 392 |
+
{"idx": 391, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 393 |
+
{"idx": 392, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 394 |
+
{"idx": 393, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 395 |
+
{"idx": 394, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 396 |
+
{"idx": 395, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 397 |
+
{"idx": 396, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 398 |
+
{"idx": 397, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 399 |
+
{"idx": 398, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 400 |
+
{"idx": 399, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 401 |
+
{"idx": 400, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 402 |
+
{"idx": 401, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 403 |
+
{"idx": 402, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 404 |
+
{"idx": 403, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 405 |
+
{"idx": 404, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 406 |
+
{"idx": 405, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 407 |
+
{"idx": 406, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 408 |
+
{"idx": 407, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 409 |
+
{"idx": 408, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 410 |
+
{"idx": 409, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 411 |
+
{"idx": 410, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 412 |
+
{"idx": 411, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 413 |
+
{"idx": 412, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 414 |
+
{"idx": 413, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 415 |
+
{"idx": 414, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 416 |
+
{"idx": 415, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 417 |
+
{"idx": 416, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 418 |
+
{"idx": 417, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 419 |
+
{"idx": 418, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 420 |
+
{"idx": 419, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 421 |
+
{"idx": 420, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 422 |
+
{"idx": 421, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 423 |
+
{"idx": 422, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 424 |
+
{"idx": 423, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 425 |
+
{"idx": 424, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 426 |
+
{"idx": 425, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 427 |
+
{"idx": 426, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 428 |
+
{"idx": 427, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 429 |
+
{"idx": 428, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 430 |
+
{"idx": 429, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 431 |
+
{"idx": 430, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 432 |
+
{"idx": 431, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 433 |
+
{"idx": 432, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 434 |
+
{"idx": 433, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 435 |
+
{"idx": 434, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 436 |
+
{"idx": 435, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 437 |
+
{"idx": 436, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 438 |
+
{"idx": 437, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 439 |
+
{"idx": 438, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 440 |
+
{"idx": 439, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 441 |
+
{"idx": 440, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 442 |
+
{"idx": 441, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 443 |
+
{"idx": 442, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 444 |
+
{"idx": 443, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 445 |
+
{"idx": 444, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 446 |
+
{"idx": 445, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 447 |
+
{"idx": 446, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 448 |
+
{"idx": 447, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 449 |
+
{"idx": 448, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 450 |
+
{"idx": 449, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 451 |
+
{"idx": 450, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 452 |
+
{"idx": 451, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 453 |
+
{"idx": 452, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 454 |
+
{"idx": 453, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 455 |
+
{"idx": 454, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 456 |
+
{"idx": 455, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 457 |
+
{"idx": 456, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 458 |
+
{"idx": 457, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 459 |
+
{"idx": 458, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 460 |
+
{"idx": 459, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 461 |
+
{"idx": 460, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 462 |
+
{"idx": 461, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 463 |
+
{"idx": 462, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 464 |
+
{"idx": 463, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 465 |
+
{"idx": 464, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 466 |
+
{"idx": 465, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 467 |
+
{"idx": 466, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 468 |
+
{"idx": 467, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 469 |
+
{"idx": 468, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 470 |
+
{"idx": 469, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 471 |
+
{"idx": 470, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 472 |
+
{"idx": 471, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 473 |
+
{"idx": 472, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 474 |
+
{"idx": 473, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 475 |
+
{"idx": 474, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 476 |
+
{"idx": 475, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 477 |
+
{"idx": 476, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 478 |
+
{"idx": 477, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 479 |
+
{"idx": 478, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 480 |
+
{"idx": 479, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 481 |
+
{"idx": 480, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 482 |
+
{"idx": 481, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 483 |
+
{"idx": 482, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 484 |
+
{"idx": 483, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
| 485 |
+
{"idx": 484, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "e09"}, "reward": 0.0, "step_ok": true}
|
| 486 |
+
{"idx": 485, "ok": true, "error": null, "action": {"action_type": "archive_email", "email_id": "bad_id"}, "reward": -0.25, "step_ok": false}
|
| 487 |
+
{"idx": 486, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m02", "new_time": "2026-04-21T18:00:00"}, "reward": 0.37296, "step_ok": true}
|
| 488 |
+
{"idx": 487, "ok": true, "error": null, "action": {"action_type": "reschedule_meeting", "meeting_id": "m03", "new_time": "2026-04-21T09:30:00"}, "reward": -0.25, "step_ok": false}
|
| 489 |
+
{"idx": 488, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}, "reward": 0.35616, "step_ok": true}
|
| 490 |
+
{"idx": 489, "ok": true, "error": null, "action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}, "reward": -0.25, "step_ok": false}
|
| 491 |
+
{"idx": 490, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t07"}, "reward": 0.29663999999999996, "step_ok": true}
|
| 492 |
+
{"idx": 491, "ok": true, "error": null, "action": {"action_type": "complete_task", "task_id": "t09"}, "reward": -0.25, "step_ok": false}
|
| 493 |
+
{"idx": 492, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}, "reward": 0.1584, "step_ok": true}
|
| 494 |
+
{"idx": 493, "ok": true, "error": null, "action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}, "reward": -0.25, "step_ok": false}
|
| 495 |
+
{"idx": 494, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Jamie Liu", "message_body": "Quick sync please."}, "reward": 0.013439999999999999, "step_ok": true}
|
| 496 |
+
{"idx": 495, "ok": true, "error": null, "action": {"action_type": "send_message", "contact_name": "Nobody", "message_body": "hello"}, "reward": -0.25, "step_ok": false}
|
| 497 |
+
{"idx": 496, "ok": true, "error": null, "action": {"action_type": "do_nothing"}, "reward": -0.15, "step_ok": true}
|
| 498 |
+
{"idx": 497, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}, "reward": 0.13776, "step_ok": true}
|
| 499 |
+
{"idx": 498, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}, "reward": 0.006719999999999999, "step_ok": true}
|
| 500 |
+
{"idx": 499, "ok": true, "error": null, "action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}, "reward": -0.25, "step_ok": false}
|
outputs/logs/episode_rewards.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputs/training/_integration_ckpt/run_summary.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"episodes": 5,
|
| 3 |
+
"log_path": "D:\\Scalar Final\\Final\\ghostexec\\outputs\\training\\_integration_train_smoke.jsonl",
|
| 4 |
+
"first_episode_first_action": {
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"action_type": "reply_email",
|
| 7 |
+
"email_id": "e01",
|
| 8 |
+
"message_body": "On it \u2014 drafting a response and owners now.",
|
| 9 |
+
"meeting_id": "",
|
| 10 |
+
"new_time": "",
|
| 11 |
+
"reason": "",
|
| 12 |
+
"task_id": "",
|
| 13 |
+
"contact_name": "",
|
| 14 |
+
"message": ""
|
| 15 |
+
},
|
| 16 |
+
"last_episode_first_action": {
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"action_type": "reply_email",
|
| 19 |
+
"email_id": "e01",
|
| 20 |
+
"message_body": "On it \u2014 drafting a response and owners now.",
|
| 21 |
+
"meeting_id": "",
|
| 22 |
+
"new_time": "",
|
| 23 |
+
"reason": "",
|
| 24 |
+
"task_id": "",
|
| 25 |
+
"contact_name": "",
|
| 26 |
+
"message": ""
|
| 27 |
+
}
|
| 28 |
+
}
|
outputs/training/checkpoints/run_summary.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"episodes": 5,
|
| 3 |
+
"log_path": "D:\\Scalar Final\\Final\\ghostexec\\outputs\\training\\episode_returns.jsonl",
|
| 4 |
+
"first_episode_first_action": {
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"action_type": "reply_email",
|
| 7 |
+
"email_id": "e01",
|
| 8 |
+
"message_body": "On it \u2014 drafting a response and owners now.",
|
| 9 |
+
"meeting_id": "",
|
| 10 |
+
"new_time": "",
|
| 11 |
+
"reason": "",
|
| 12 |
+
"task_id": "",
|
| 13 |
+
"contact_name": "",
|
| 14 |
+
"message": ""
|
| 15 |
+
},
|
| 16 |
+
"last_episode_first_action": {
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"action_type": "reply_email",
|
| 19 |
+
"email_id": "e01",
|
| 20 |
+
"message_body": "On it \u2014 drafting a response and owners now.",
|
| 21 |
+
"meeting_id": "",
|
| 22 |
+
"new_time": "",
|
| 23 |
+
"reason": "",
|
| 24 |
+
"task_id": "",
|
| 25 |
+
"contact_name": "",
|
| 26 |
+
"message": ""
|
| 27 |
+
}
|
| 28 |
+
}
|
outputs/training/episode_returns.jsonl
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
| 2 |
+
{"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
| 3 |
+
{"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
| 4 |
+
{"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
| 5 |
+
{"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
| 6 |
+
{"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
| 7 |
+
{"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
| 8 |
+
{"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
| 9 |
+
{"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
| 10 |
+
{"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "smart", "return": -6.347039999999998, "length": 12, "mean_step_reward": -0.5289199999999998}
|
outputs/training/smoke/checkpoints/run_summary.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"episodes": 48,
|
| 3 |
+
"log_path": "D:\\Scalar Final\\Final\\ghostexec\\outputs\\training\\smoke\\reinforce_returns.jsonl",
|
| 4 |
+
"first_episode_first_action": {
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"action_type": "reply_email",
|
| 7 |
+
"email_id": "e01",
|
| 8 |
+
"message_body": "Acknowledged \u2014 working the thread now.",
|
| 9 |
+
"meeting_id": "",
|
| 10 |
+
"new_time": "",
|
| 11 |
+
"reason": "",
|
| 12 |
+
"task_id": "",
|
| 13 |
+
"contact_name": "",
|
| 14 |
+
"message": ""
|
| 15 |
+
},
|
| 16 |
+
"last_episode_first_action": {
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"action_type": "archive_email",
|
| 19 |
+
"email_id": "e06",
|
| 20 |
+
"message_body": "",
|
| 21 |
+
"meeting_id": "",
|
| 22 |
+
"new_time": "",
|
| 23 |
+
"reason": "",
|
| 24 |
+
"task_id": "",
|
| 25 |
+
"contact_name": "",
|
| 26 |
+
"message": ""
|
| 27 |
+
}
|
| 28 |
+
}
|
outputs/training/smoke/reinforce_returns.jsonl
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.391999999999998, "length": 14, "mean_step_reward": -0.5279999999999998}
|
| 2 |
+
{"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.297919999999997, "length": 14, "mean_step_reward": -0.5212799999999997}
|
| 3 |
+
{"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -9.334079999999998, "length": 14, "mean_step_reward": -0.6667199999999999}
|
| 4 |
+
{"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.641919999999997, "length": 14, "mean_step_reward": -0.6172799999999998}
|
| 5 |
+
{"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.924959999999997, "length": 14, "mean_step_reward": -0.4946399999999998}
|
| 6 |
+
{"episode": 5, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.465919999999997, "length": 14, "mean_step_reward": -0.5332799999999998}
|
| 7 |
+
{"episode": 6, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
|
| 8 |
+
{"episode": 7, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.633919999999996, "length": 14, "mean_step_reward": -0.5452799999999998}
|
| 9 |
+
{"episode": 8, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.998879999999997, "length": 14, "mean_step_reward": -0.4999199999999998}
|
| 10 |
+
{"episode": 9, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
|
| 11 |
+
{"episode": 10, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.998879999999997, "length": 14, "mean_step_reward": -0.4999199999999998}
|
| 12 |
+
{"episode": 11, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.428959999999997, "length": 14, "mean_step_reward": -0.5306399999999998}
|
| 13 |
+
{"episode": 12, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
|
| 14 |
+
{"episode": 13, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
|
| 15 |
+
{"episode": 14, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.297919999999998, "length": 14, "mean_step_reward": -0.5212799999999999}
|
| 16 |
+
{"episode": 15, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.756959999999998, "length": 14, "mean_step_reward": -0.48263999999999985}
|
| 17 |
+
{"episode": 16, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.670879999999997, "length": 14, "mean_step_reward": -0.5479199999999997}
|
| 18 |
+
{"episode": 17, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.174879999999996, "length": 14, "mean_step_reward": -0.5839199999999998}
|
| 19 |
+
{"episode": 18, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
|
| 20 |
+
{"episode": 19, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
|
| 21 |
+
{"episode": 20, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8678399999999975, "length": 14, "mean_step_reward": -0.49055999999999983}
|
| 22 |
+
{"episode": 21, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.551999999999998, "length": 14, "mean_step_reward": -0.46799999999999986}
|
| 23 |
+
{"episode": 22, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
|
| 24 |
+
{"episode": 23, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
|
| 25 |
+
{"episode": 24, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.5889599999999975, "length": 14, "mean_step_reward": -0.47063999999999984}
|
| 26 |
+
{"episode": 25, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
|
| 27 |
+
{"episode": 26, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999997, "length": 14, "mean_step_reward": -0.4852799999999998}
|
| 28 |
+
{"episode": 27, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.5889599999999975, "length": 14, "mean_step_reward": -0.47063999999999984}
|
| 29 |
+
{"episode": 28, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
|
| 30 |
+
{"episode": 29, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
|
| 31 |
+
{"episode": 30, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.924959999999998, "length": 14, "mean_step_reward": -0.49463999999999986}
|
| 32 |
+
{"episode": 31, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
|
| 33 |
+
{"episode": 32, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999997, "length": 14, "mean_step_reward": -0.4852799999999998}
|
| 34 |
+
{"episode": 33, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.830879999999997, "length": 14, "mean_step_reward": -0.4879199999999998}
|
| 35 |
+
{"episode": 34, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
|
| 36 |
+
{"episode": 35, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.559999999999997, "length": 14, "mean_step_reward": -0.5399999999999998}
|
| 37 |
+
{"episode": 36, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.334879999999997, "length": 14, "mean_step_reward": -0.5239199999999998}
|
| 38 |
+
{"episode": 37, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
|
| 39 |
+
{"episode": 38, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.924959999999997, "length": 14, "mean_step_reward": -0.4946399999999998}
|
| 40 |
+
{"episode": 39, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
|
| 41 |
+
{"episode": 40, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
|
| 42 |
+
{"episode": 41, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999998, "length": 14, "mean_step_reward": -0.47591999999999984}
|
| 43 |
+
{"episode": 42, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
|
| 44 |
+
{"episode": 43, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8678399999999975, "length": 14, "mean_step_reward": -0.49055999999999983}
|
| 45 |
+
{"episode": 44, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.699839999999997, "length": 14, "mean_step_reward": -0.4785599999999998}
|
| 46 |
+
{"episode": 45, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.830879999999997, "length": 14, "mean_step_reward": -0.4879199999999998}
|
| 47 |
+
{"episode": 46, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.551999999999998, "length": 14, "mean_step_reward": -0.46799999999999986}
|
| 48 |
+
{"episode": 47, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
|
outputs/training/test_returns.jsonl
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"episode": 0, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
|
| 2 |
+
{"episode": 1, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.8019199999999955, "length": 14, "mean_step_reward": -0.5572799999999997}
|
| 3 |
+
{"episode": 2, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.641919999999999, "length": 14, "mean_step_reward": -0.6172799999999999}
|
| 4 |
+
{"episode": 3, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.801919999999996, "length": 14, "mean_step_reward": -0.5572799999999998}
|
| 5 |
+
{"episode": 4, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.735999999999997, "length": 14, "mean_step_reward": -0.6239999999999998}
|
| 6 |
+
{"episode": 5, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.961919999999997, "length": 14, "mean_step_reward": -0.49727999999999983}
|
| 7 |
+
{"episode": 6, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -8.137919999999998, "length": 14, "mean_step_reward": -0.5812799999999998}
|
| 8 |
+
{"episode": 7, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.428959999999997, "length": 14, "mean_step_reward": -0.5306399999999998}
|
| 9 |
+
{"episode": 8, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.830879999999997, "length": 14, "mean_step_reward": -0.4879199999999998}
|
| 10 |
+
{"episode": 9, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.334879999999997, "length": 14, "mean_step_reward": -0.5239199999999998}
|
| 11 |
+
{"episode": 10, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.588959999999998, "length": 14, "mean_step_reward": -0.4706399999999999}
|
| 12 |
+
{"episode": 11, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
|
| 13 |
+
{"episode": 12, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999997, "length": 14, "mean_step_reward": -0.4852799999999998}
|
| 14 |
+
{"episode": 13, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.166879999999997, "length": 14, "mean_step_reward": -0.5119199999999998}
|
| 15 |
+
{"episode": 14, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.793919999999998, "length": 14, "mean_step_reward": -0.4852799999999999}
|
| 16 |
+
{"episode": 15, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.625919999999997, "length": 14, "mean_step_reward": -0.4732799999999998}
|
| 17 |
+
{"episode": 16, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.187039999999998, "length": 14, "mean_step_reward": -0.5133599999999998}
|
| 18 |
+
{"episode": 17, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.719999999999998, "length": 14, "mean_step_reward": -0.47999999999999987}
|
| 19 |
+
{"episode": 18, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.719999999999998, "length": 14, "mean_step_reward": -0.47999999999999987}
|
| 20 |
+
{"episode": 19, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8678399999999975, "length": 14, "mean_step_reward": -0.49055999999999983}
|
| 21 |
+
{"episode": 20, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.8510399999999985, "length": 14, "mean_step_reward": -0.4893599999999999}
|
| 22 |
+
{"episode": 21, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.719999999999998, "length": 14, "mean_step_reward": -0.47999999999999987}
|
| 23 |
+
{"episode": 22, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -7.092959999999998, "length": 14, "mean_step_reward": -0.5066399999999999}
|
| 24 |
+
{"episode": 23, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.887999999999998, "length": 14, "mean_step_reward": -0.4919999999999999}
|
| 25 |
+
{"episode": 24, "scenario": "phase2_core.json", "backend": "local", "agent": "reinforce", "return": -6.662879999999997, "length": 14, "mean_step_reward": -0.4759199999999998}
|
pyproject.toml
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-ghostexec"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Ghostexec environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# To install from GitHub instead of a released version, uncomment the line below:
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.3",
|
| 21 |
+
# Environment-specific dependencies
|
| 22 |
+
# Add all dependencies needed for your environment here
|
| 23 |
+
# Examples:
|
| 24 |
+
# "numpy>=1.19.0",
|
| 25 |
+
# "torch>=2.0.0",
|
| 26 |
+
# "gymnasium>=0.29.0",
|
| 27 |
+
# "openspiel>=1.0.0",
|
| 28 |
+
# "smolagents>=1.22.0,<2",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
[project.optional-dependencies]
|
| 32 |
+
dev = [
|
| 33 |
+
"pytest>=8.0.0",
|
| 34 |
+
"pytest-cov>=4.0.0",
|
| 35 |
+
"pyyaml>=6.0.0",
|
| 36 |
+
"matplotlib>=3.8.0",
|
| 37 |
+
]
|
| 38 |
+
# Optional JSON-schema-constrained decoding backends (pick one).
|
| 39 |
+
constrained = [
|
| 40 |
+
"lm-format-enforcer>=0.10",
|
| 41 |
+
]
|
| 42 |
+
constrained-outlines = [
|
| 43 |
+
"outlines>=0.1",
|
| 44 |
+
]
|
| 45 |
+
|
| 46 |
+
[project.scripts]
|
| 47 |
+
# Server entry point - enables running via: uv run --project . server
|
| 48 |
+
# or: python -m ghostexec.server.app
|
| 49 |
+
server = "ghostexec.server.app:main"
|
| 50 |
+
|
| 51 |
+
[tool.setuptools]
|
| 52 |
+
include-package-data = true
|
| 53 |
+
packages = ["ghostexec", "ghostexec.server"]
|
| 54 |
+
package-dir = { "ghostexec" = ".", "ghostexec.server" = "server" }
|
| 55 |
+
|
| 56 |
+
[tool.setuptools.package-data]
|
| 57 |
+
ghostexec = ["scenarios/*.json"]
|
| 58 |
+
|
| 59 |
+
[tool.pytest.ini_options]
|
| 60 |
+
testpaths = ["tests"]
|
| 61 |
+
pythonpath = ["."]
|
scenarios/dinner_disaster.json
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"simulation_time": "2026-04-21T18:45:00",
|
| 3 |
+
"stress": 61,
|
| 4 |
+
"active_conflicts": [],
|
| 5 |
+
"action_log": [
|
| 6 |
+
"Client call ran long; dinner reservation at 19:00."
|
| 7 |
+
],
|
| 8 |
+
"episode_active": true,
|
| 9 |
+
"episode_end_reason": null,
|
| 10 |
+
"max_episode_steps": 40,
|
| 11 |
+
"emails": [
|
| 12 |
+
{
|
| 13 |
+
"id": "d1",
|
| 14 |
+
"sender": "Sarah Chen",
|
| 15 |
+
"subject": "Dinner \u2014 I am at the restaurant",
|
| 16 |
+
"body": "We have held the table until 7:15. Please leave the call.",
|
| 17 |
+
"read": false,
|
| 18 |
+
"replied": false,
|
| 19 |
+
"priority": "high",
|
| 20 |
+
"sender_relationship": "personal"
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"id": "d2",
|
| 24 |
+
"sender": "Taylor Brooks",
|
| 25 |
+
"subject": "Need sign-off tonight",
|
| 26 |
+
"body": "Board-adjacent ask: one paragraph on risk posture before morning.",
|
| 27 |
+
"read": false,
|
| 28 |
+
"replied": false,
|
| 29 |
+
"priority": "critical",
|
| 30 |
+
"sender_relationship": "VIP"
|
| 31 |
+
}
|
| 32 |
+
],
|
| 33 |
+
"meetings": [
|
| 34 |
+
{
|
| 35 |
+
"id": "dc1",
|
| 36 |
+
"title": "Client escalation call",
|
| 37 |
+
"start": "2026-04-21T17:30:00",
|
| 38 |
+
"duration_minutes": 90,
|
| 39 |
+
"attendees": [
|
| 40 |
+
"David Okonkwo"
|
| 41 |
+
],
|
| 42 |
+
"location": "virtual",
|
| 43 |
+
"priority": "high",
|
| 44 |
+
"cancelled": false
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"id": "dc2",
|
| 48 |
+
"title": "Dinner reservation window",
|
| 49 |
+
"start": "2026-04-21T19:00:00",
|
| 50 |
+
"duration_minutes": 120,
|
| 51 |
+
"attendees": [
|
| 52 |
+
"Sarah Chen"
|
| 53 |
+
],
|
| 54 |
+
"location": "Osteria",
|
| 55 |
+
"priority": "normal",
|
| 56 |
+
"cancelled": false
|
| 57 |
+
}
|
| 58 |
+
],
|
| 59 |
+
"contacts": [
|
| 60 |
+
{
|
| 61 |
+
"name": "Sarah Chen",
|
| 62 |
+
"relationship_type": "spouse",
|
| 63 |
+
"communication_preference": "text",
|
| 64 |
+
"importance": 5,
|
| 65 |
+
"mood": "annoyed"
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"name": "Taylor Brooks",
|
| 69 |
+
"relationship_type": "investor",
|
| 70 |
+
"communication_preference": "call",
|
| 71 |
+
"importance": 4,
|
| 72 |
+
"mood": "neutral"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "David Okonkwo",
|
| 76 |
+
"relationship_type": "client",
|
| 77 |
+
"communication_preference": "email",
|
| 78 |
+
"importance": 4,
|
| 79 |
+
"mood": "angry"
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"name": "Jordan Lee",
|
| 83 |
+
"relationship_type": "direct_report",
|
| 84 |
+
"communication_preference": "call",
|
| 85 |
+
"importance": 3,
|
| 86 |
+
"mood": "neutral"
|
| 87 |
+
}
|
| 88 |
+
],
|
| 89 |
+
"tasks": [
|
| 90 |
+
{
|
| 91 |
+
"id": "dt1",
|
| 92 |
+
"description": "Text Sarah ETA for dinner",
|
| 93 |
+
"deadline": "2026-04-21T18:50:00",
|
| 94 |
+
"owner": "Self",
|
| 95 |
+
"status": "pending",
|
| 96 |
+
"effort": "low"
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"id": "dt2",
|
| 100 |
+
"description": "Send Taylor the risk paragraph",
|
| 101 |
+
"deadline": "2026-04-21T23:00:00",
|
| 102 |
+
"owner": "Self",
|
| 103 |
+
"status": "pending",
|
| 104 |
+
"effort": "medium"
|
| 105 |
+
}
|
| 106 |
+
]
|
| 107 |
+
}
|
scenarios/monday_morning.json
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"simulation_time": "2026-04-22T07:00:00",
|
| 3 |
+
"stress": 84,
|
| 4 |
+
"active_conflicts": [],
|
| 5 |
+
"action_log": [
|
| 6 |
+
"Monday: board + investor travel + home commitments collide."
|
| 7 |
+
],
|
| 8 |
+
"episode_active": true,
|
| 9 |
+
"episode_end_reason": null,
|
| 10 |
+
"max_episode_steps": 56,
|
| 11 |
+
"emails": [
|
| 12 |
+
{
|
| 13 |
+
"id": "mb1",
|
| 14 |
+
"sender": "Marcus Webb",
|
| 15 |
+
"subject": "Board deck \u2014 need numbers before market open",
|
| 16 |
+
"body": "We are missing revised Q3 figures. This is blocking the packet.",
|
| 17 |
+
"read": false,
|
| 18 |
+
"replied": false,
|
| 19 |
+
"priority": "critical",
|
| 20 |
+
"sender_relationship": "VIP"
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"id": "mb2",
|
| 24 |
+
"sender": "Marcus Webb",
|
| 25 |
+
"subject": "Re: investor flight \u2014 your call",
|
| 26 |
+
"body": "Taylor is airborne in 4 hours. We need a decision on the alternate slot.",
|
| 27 |
+
"read": false,
|
| 28 |
+
"replied": false,
|
| 29 |
+
"priority": "critical",
|
| 30 |
+
"sender_relationship": "VIP"
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"id": "mb3",
|
| 34 |
+
"sender": "Marcus Webb",
|
| 35 |
+
"subject": "Confidential \u2014 board sentiment",
|
| 36 |
+
"body": "Side channel: two directors are rattled. Your visible handling today matters.",
|
| 37 |
+
"read": false,
|
| 38 |
+
"replied": false,
|
| 39 |
+
"priority": "critical",
|
| 40 |
+
"sender_relationship": "VIP"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"id": "sp1",
|
| 44 |
+
"sender": "Sarah Chen",
|
| 45 |
+
"subject": "Dinner 7pm \u2014 please confirm",
|
| 46 |
+
"body": "Reservation is 7pm. If you are going to be late again, tell me now.",
|
| 47 |
+
"read": false,
|
| 48 |
+
"replied": false,
|
| 49 |
+
"priority": "high",
|
| 50 |
+
"sender_relationship": "personal"
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"id": "fl1",
|
| 54 |
+
"sender": "Alex Rivera",
|
| 55 |
+
"subject": "Flight cancelled \u2014 investor dinner at risk",
|
| 56 |
+
"body": "Airline put me on a 6am tomorrow. Investor dinner prep is slipping.",
|
| 57 |
+
"read": false,
|
| 58 |
+
"replied": false,
|
| 59 |
+
"priority": "high",
|
| 60 |
+
"sender_relationship": "professional"
|
| 61 |
+
}
|
| 62 |
+
],
|
| 63 |
+
"meetings": [
|
| 64 |
+
{
|
| 65 |
+
"id": "mx1",
|
| 66 |
+
"title": "Commitment block 1",
|
| 67 |
+
"start": "2026-04-22T07:00:00",
|
| 68 |
+
"duration_minutes": 60,
|
| 69 |
+
"attendees": [
|
| 70 |
+
"Marcus Webb"
|
| 71 |
+
],
|
| 72 |
+
"location": "virtual",
|
| 73 |
+
"priority": "critical",
|
| 74 |
+
"cancelled": false
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"id": "mx2",
|
| 78 |
+
"title": "Commitment block 2",
|
| 79 |
+
"start": "2026-04-22T07:00:00",
|
| 80 |
+
"duration_minutes": 60,
|
| 81 |
+
"attendees": [
|
| 82 |
+
"Taylor Brooks"
|
| 83 |
+
],
|
| 84 |
+
"location": "virtual",
|
| 85 |
+
"priority": "critical",
|
| 86 |
+
"cancelled": false
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"id": "mx3",
|
| 90 |
+
"title": "Commitment block 3",
|
| 91 |
+
"start": "2026-04-22T07:00:00",
|
| 92 |
+
"duration_minutes": 60,
|
| 93 |
+
"attendees": [
|
| 94 |
+
"Marcus Webb"
|
| 95 |
+
],
|
| 96 |
+
"location": "virtual",
|
| 97 |
+
"priority": "high",
|
| 98 |
+
"cancelled": false
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"id": "mx4",
|
| 102 |
+
"title": "Commitment block 4",
|
| 103 |
+
"start": "2026-04-22T07:00:00",
|
| 104 |
+
"duration_minutes": 60,
|
| 105 |
+
"attendees": [
|
| 106 |
+
"Taylor Brooks"
|
| 107 |
+
],
|
| 108 |
+
"location": "virtual",
|
| 109 |
+
"priority": "high",
|
| 110 |
+
"cancelled": false
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"id": "mx5",
|
| 114 |
+
"title": "Commitment block 5",
|
| 115 |
+
"start": "2026-04-22T07:00:00",
|
| 116 |
+
"duration_minutes": 60,
|
| 117 |
+
"attendees": [
|
| 118 |
+
"Marcus Webb"
|
| 119 |
+
],
|
| 120 |
+
"location": "virtual",
|
| 121 |
+
"priority": "high",
|
| 122 |
+
"cancelled": false
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"id": "mx6",
|
| 126 |
+
"title": "Commitment block 6",
|
| 127 |
+
"start": "2026-04-22T07:00:00",
|
| 128 |
+
"duration_minutes": 60,
|
| 129 |
+
"attendees": [
|
| 130 |
+
"Taylor Brooks"
|
| 131 |
+
],
|
| 132 |
+
"location": "virtual",
|
| 133 |
+
"priority": "high",
|
| 134 |
+
"cancelled": false
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"id": "m_pm",
|
| 138 |
+
"title": "Afternoon sync",
|
| 139 |
+
"start": "2026-04-22T15:00:00",
|
| 140 |
+
"duration_minutes": 30,
|
| 141 |
+
"attendees": [
|
| 142 |
+
"Jordan Lee"
|
| 143 |
+
],
|
| 144 |
+
"location": "virtual",
|
| 145 |
+
"priority": "normal",
|
| 146 |
+
"cancelled": false
|
| 147 |
+
}
|
| 148 |
+
],
|
| 149 |
+
"contacts": [
|
| 150 |
+
{
|
| 151 |
+
"name": "Marcus Webb",
|
| 152 |
+
"relationship_type": "board_member",
|
| 153 |
+
"communication_preference": "email",
|
| 154 |
+
"importance": 5,
|
| 155 |
+
"mood": "angry"
|
| 156 |
+
},
|
| 157 |
+
{
|
| 158 |
+
"name": "Sarah Chen",
|
| 159 |
+
"relationship_type": "spouse",
|
| 160 |
+
"communication_preference": "text",
|
| 161 |
+
"importance": 5,
|
| 162 |
+
"mood": "annoyed"
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"name": "Taylor Brooks",
|
| 166 |
+
"relationship_type": "investor",
|
| 167 |
+
"communication_preference": "call",
|
| 168 |
+
"importance": 4,
|
| 169 |
+
"mood": "neutral"
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
"name": "Alex Rivera",
|
| 173 |
+
"relationship_type": "direct_report",
|
| 174 |
+
"communication_preference": "text",
|
| 175 |
+
"importance": 3,
|
| 176 |
+
"mood": "annoyed"
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"name": "Jordan Lee",
|
| 180 |
+
"relationship_type": "direct_report",
|
| 181 |
+
"communication_preference": "call",
|
| 182 |
+
"importance": 3,
|
| 183 |
+
"mood": "happy"
|
| 184 |
+
},
|
| 185 |
+
{
|
| 186 |
+
"name": "Priya Sharma",
|
| 187 |
+
"relationship_type": "investor",
|
| 188 |
+
"communication_preference": "email",
|
| 189 |
+
"importance": 5,
|
| 190 |
+
"mood": "annoyed"
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"name": "Elena Vogt",
|
| 194 |
+
"relationship_type": "team_member",
|
| 195 |
+
"communication_preference": "email",
|
| 196 |
+
"importance": 3,
|
| 197 |
+
"mood": "neutral"
|
| 198 |
+
},
|
| 199 |
+
{
|
| 200 |
+
"name": "David Okonkwo",
|
| 201 |
+
"relationship_type": "client",
|
| 202 |
+
"communication_preference": "email",
|
| 203 |
+
"importance": 4,
|
| 204 |
+
"mood": "neutral"
|
| 205 |
+
}
|
| 206 |
+
],
|
| 207 |
+
"tasks": [
|
| 208 |
+
{
|
| 209 |
+
"id": "ov1",
|
| 210 |
+
"description": "Finalize board packet figures",
|
| 211 |
+
"deadline": "2026-04-22T06:00:00",
|
| 212 |
+
"owner": "Self",
|
| 213 |
+
"status": "pending",
|
| 214 |
+
"effort": "high"
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
"id": "ov2",
|
| 218 |
+
"description": "Callback legal on redlines",
|
| 219 |
+
"deadline": "2026-04-22T05:30:00",
|
| 220 |
+
"owner": "Self",
|
| 221 |
+
"status": "pending",
|
| 222 |
+
"effort": "medium"
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"id": "ov3",
|
| 226 |
+
"description": "Approve investor comms draft",
|
| 227 |
+
"deadline": "2026-04-22T06:15:00",
|
| 228 |
+
"owner": "Self",
|
| 229 |
+
"status": "pending",
|
| 230 |
+
"effort": "high"
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"id": "ov4",
|
| 234 |
+
"description": "Expense report sign-off",
|
| 235 |
+
"deadline": "2026-04-22T04:00:00",
|
| 236 |
+
"owner": "Self",
|
| 237 |
+
"status": "overdue",
|
| 238 |
+
"effort": "low"
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
"id": "ov5",
|
| 242 |
+
"description": "Brief EA on calendar triage",
|
| 243 |
+
"deadline": "2026-04-22T05:00:00",
|
| 244 |
+
"owner": "Self",
|
| 245 |
+
"status": "overdue",
|
| 246 |
+
"effort": "low"
|
| 247 |
+
},
|
| 248 |
+
{
|
| 249 |
+
"id": "fu1",
|
| 250 |
+
"description": "Team social RSVP",
|
| 251 |
+
"deadline": "2026-04-25T12:00:00",
|
| 252 |
+
"owner": "Self",
|
| 253 |
+
"status": "pending",
|
| 254 |
+
"effort": "low"
|
| 255 |
+
}
|
| 256 |
+
]
|
| 257 |
+
}
|
scenarios/phase2_core.json
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"simulation_time": "2026-04-21T08:00:00",
|
| 3 |
+
"stress": 52,
|
| 4 |
+
"active_conflicts": [],
|
| 5 |
+
"action_log": [],
|
| 6 |
+
"episode_active": true,
|
| 7 |
+
"episode_end_reason": null,
|
| 8 |
+
"emails": [
|
| 9 |
+
{"id": "e01", "sender": "Marcus Webb", "subject": "RE: Q3 Numbers — we need to talk", "body": "Board expects revised figures before noon. This is urgent.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
|
| 10 |
+
{"id": "e02", "sender": "Sarah Chen", "subject": "Dinner tonight", "body": "Reservation at 7pm — please confirm you will make it.", "read": false, "replied": false, "priority": "high", "sender_relationship": "personal"},
|
| 11 |
+
{"id": "e03", "sender": "Priya Sharma", "subject": "Still waiting on the deck", "body": "Investor read-through is tomorrow. Where is the version you promised?", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
|
| 12 |
+
{"id": "e04", "sender": "Legal", "subject": "Contract redlines due", "body": "Please return comments on the MSA by EOD.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
|
| 13 |
+
{"id": "e05", "sender": "Jordan Lee", "subject": "Quick question on roadmap", "body": "Can we grab 10 minutes before standup?", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
|
| 14 |
+
{"id": "e06", "sender": "Alex Rivera", "subject": "Flight cancelled — options?", "body": "Airline moved me to a 6am tomorrow. Need guidance.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
|
| 15 |
+
{"id": "e07", "sender": "HR Benefits", "subject": "Open enrollment reminder", "body": "Friendly reminder window closes Friday.", "read": true, "replied": false, "priority": "low", "sender_relationship": "professional"},
|
| 16 |
+
{"id": "e08", "sender": "David Okonkwo", "subject": "Angry about last meeting", "body": "We were not heard. Expect a follow-up call.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
|
| 17 |
+
{"id": "e09", "sender": "Newsletter", "subject": "Your weekly digest", "body": "Top stories in tech leadership.", "read": false, "replied": false, "priority": "low", "sender_relationship": "unknown"},
|
| 18 |
+
{"id": "e10", "sender": "Elena Vogt", "subject": "Board prep materials", "body": "Slides uploaded to the secure folder.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
|
| 19 |
+
{"id": "e11", "sender": "Chris Park", "subject": "Lunch?", "body": "Want to grab something casual near the office?", "read": false, "replied": false, "priority": "low", "sender_relationship": "personal"},
|
| 20 |
+
{"id": "e12", "sender": "Morgan Blake", "subject": "RE: Budget variance", "body": "Finance needs sign-off today or we slip the quarter.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
|
| 21 |
+
{"id": "e13", "sender": "IT Security", "subject": "Password rotation", "body": "Your account expires in 48 hours.", "read": true, "replied": true, "priority": "normal", "sender_relationship": "professional"},
|
| 22 |
+
{"id": "e14", "sender": "Jamie Liu", "subject": "Sprint demo feedback", "body": "Mostly positive — a few UX nits to track.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
|
| 23 |
+
{"id": "e15", "sender": "Taylor Brooks", "subject": "Investor dinner follow-up", "body": "Thanks for last night — next steps attached.", "read": false, "replied": false, "priority": "high", "sender_relationship": "VIP"},
|
| 24 |
+
{"id": "e16", "sender": "Operations", "subject": "Incident report #4421", "body": "Minor outage resolved; postmortem scheduled.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
|
| 25 |
+
{"id": "e17", "sender": "Riley Santos", "subject": "Can you approve PTO?", "body": "Team coverage looks fine for next week.", "read": false, "replied": false, "priority": "low", "sender_relationship": "professional"},
|
| 26 |
+
{"id": "e18", "sender": "Noah Patel", "subject": "Vendor pricing", "body": "They moved numbers again — need a decision.", "read": false, "replied": false, "priority": "high", "sender_relationship": "professional"},
|
| 27 |
+
{"id": "e19", "sender": "Calendar Bot", "subject": "You have 12 conflicts today", "body": "Automated summary of overlapping meetings.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "unknown"},
|
| 28 |
+
{"id": "e20", "sender": "Casey Nguyen", "subject": "Design review moved", "body": "We shifted to 3pm — hope that works.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
|
| 29 |
+
{"id": "e21", "sender": "Samira Haddad", "subject": "Formal complaint received", "body": "Please acknowledge receipt per policy.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "professional"},
|
| 30 |
+
{"id": "e22", "sender": "Spouse", "subject": "Kids pickup", "body": "I have a dentist appointment — can you cover 4pm?", "read": false, "replied": false, "priority": "high", "sender_relationship": "personal"},
|
| 31 |
+
{"id": "e23", "sender": "Marketing", "subject": "Launch blog draft", "body": "Casual tone OK? LMK by 2pm.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
|
| 32 |
+
{"id": "e24", "sender": "Vikram Singh", "subject": "Partnership term sheet", "body": "Legal asked for your eyes on section 4 only.", "read": false, "replied": false, "priority": "high", "sender_relationship": "VIP"},
|
| 33 |
+
{"id": "e25", "sender": "Facilities", "subject": "Office move checklist", "body": "Please label your boxes by Friday.", "read": true, "replied": false, "priority": "low", "sender_relationship": "professional"},
|
| 34 |
+
{"id": "e26", "sender": "Quinn Murphy", "subject": "Sorry for the tone earlier", "body": "Rough morning — can we reset?", "read": false, "replied": false, "priority": "normal", "sender_relationship": "personal"},
|
| 35 |
+
{"id": "e27", "sender": "Board Secretary", "subject": "Confidential — agenda", "body": "Materials under embargo until 5pm.", "read": false, "replied": false, "priority": "critical", "sender_relationship": "VIP"},
|
| 36 |
+
{"id": "e28", "sender": "Recruiting", "subject": "VP Eng loop feedback", "body": "Candidate availability Thursday.", "read": false, "replied": false, "priority": "normal", "sender_relationship": "professional"},
|
| 37 |
+
{"id": "e29", "sender": "Avery Cole", "subject": "Weekend golf?", "body": "Totally casual — no pressure.", "read": false, "replied": false, "priority": "low", "sender_relationship": "personal"},
|
| 38 |
+
{"id": "e30", "sender": "Support", "subject": "Ticket #9982 closed", "body": "Your laptop swap is complete.", "read": true, "replied": false, "priority": "low", "sender_relationship": "unknown"}
|
| 39 |
+
],
|
| 40 |
+
"meetings": [
|
| 41 |
+
{"id": "m01", "title": "Board Call", "start": "2026-04-21T09:00:00", "duration_minutes": 60, "attendees": ["Marcus Webb", "Elena Vogt"], "location": "virtual", "priority": "critical", "cancelled": false},
|
| 42 |
+
{"id": "m02", "title": "Client Demo", "start": "2026-04-21T09:00:00", "duration_minutes": 60, "attendees": ["David Okonkwo"], "location": "virtual", "priority": "high", "cancelled": false},
|
| 43 |
+
{"id": "m03", "title": "Coffee with Jordan", "start": "2026-04-21T10:30:00", "duration_minutes": 30, "attendees": ["Jordan Lee"], "location": "Cafe North", "priority": "low", "cancelled": false},
|
| 44 |
+
{"id": "m04", "title": "Team Standup", "start": "2026-04-21T11:00:00", "duration_minutes": 30, "attendees": ["Jordan Lee", "Jamie Liu"], "location": "virtual", "priority": "normal", "cancelled": false},
|
| 45 |
+
{"id": "m05", "title": "Lunch with Priya", "start": "2026-04-21T11:00:00", "duration_minutes": 90, "attendees": ["Priya Sharma"], "location": "Osteria", "priority": "high", "cancelled": false},
|
| 46 |
+
{"id": "m06", "title": "1:1 Avery", "start": "2026-04-21T13:00:00", "duration_minutes": 60, "attendees": ["Avery Cole"], "location": "Office 12B", "priority": "normal", "cancelled": false},
|
| 47 |
+
{"id": "m07", "title": "Investor Update", "start": "2026-04-21T14:00:00", "duration_minutes": 60, "attendees": ["Taylor Brooks"], "location": "virtual", "priority": "critical", "cancelled": false},
|
| 48 |
+
{"id": "m08", "title": "Legal Review", "start": "2026-04-21T14:00:00", "duration_minutes": 60, "attendees": ["Legal"], "location": "virtual", "priority": "high", "cancelled": false},
|
| 49 |
+
{"id": "m09", "title": "Sales QBR", "start": "2026-04-21T15:00:00", "duration_minutes": 60, "attendees": ["Noah Patel"], "location": "virtual", "priority": "normal", "cancelled": false},
|
| 50 |
+
{"id": "m10", "title": "Ops Incident Review", "start": "2026-04-21T15:30:00", "duration_minutes": 60, "attendees": ["Operations"], "location": "virtual", "priority": "high", "cancelled": false}
|
| 51 |
+
],
|
| 52 |
+
"contacts": [
|
| 53 |
+
{"name": "Marcus Webb", "relationship_type": "board_member", "communication_preference": "email", "importance": 5, "mood": "angry"},
|
| 54 |
+
{"name": "Sarah Chen", "relationship_type": "spouse", "communication_preference": "text", "importance": 5, "mood": "neutral"},
|
| 55 |
+
{"name": "Priya Sharma", "relationship_type": "investor", "communication_preference": "email", "importance": 5, "mood": "annoyed"},
|
| 56 |
+
{"name": "Jordan Lee", "relationship_type": "direct_report", "communication_preference": "call", "importance": 3, "mood": "happy"},
|
| 57 |
+
{"name": "David Okonkwo", "relationship_type": "client", "communication_preference": "email", "importance": 4, "mood": "angry"},
|
| 58 |
+
{"name": "Elena Vogt", "relationship_type": "team_member", "communication_preference": "email", "importance": 3, "mood": "neutral"},
|
| 59 |
+
{"name": "Taylor Brooks", "relationship_type": "investor", "communication_preference": "call", "importance": 4, "mood": "neutral"},
|
| 60 |
+
{"name": "Alex Rivera", "relationship_type": "direct_report", "communication_preference": "text", "importance": 2, "mood": "neutral"},
|
| 61 |
+
{"name": "Jamie Liu", "relationship_type": "team_member", "communication_preference": "email", "importance": 2, "mood": "happy"},
|
| 62 |
+
{"name": "Chris Park", "relationship_type": "friend", "communication_preference": "text", "importance": 2, "mood": "happy"},
|
| 63 |
+
{"name": "Morgan Blake", "relationship_type": "board_member", "communication_preference": "email", "importance": 5, "mood": "neutral"},
|
| 64 |
+
{"name": "Riley Santos", "relationship_type": "direct_report", "communication_preference": "email", "importance": 3, "mood": "neutral"},
|
| 65 |
+
{"name": "Noah Patel", "relationship_type": "client", "communication_preference": "email", "importance": 4, "mood": "annoyed"},
|
| 66 |
+
{"name": "Casey Nguyen", "relationship_type": "team_member", "communication_preference": "email", "importance": 2, "mood": "neutral"},
|
| 67 |
+
{"name": "Vikram Singh", "relationship_type": "investor", "communication_preference": "email", "importance": 4, "mood": "neutral"}
|
| 68 |
+
],
|
| 69 |
+
"tasks": [
|
| 70 |
+
{"id": "t01", "description": "Send Q3 deck to Marcus", "deadline": "2026-04-21T09:30:00", "owner": "Marcus Webb", "status": "pending", "effort": "high"},
|
| 71 |
+
{"id": "t02", "description": "Confirm dinner reservation", "deadline": "2026-04-21T10:00:00", "owner": "Sarah Chen", "status": "pending", "effort": "low"},
|
| 72 |
+
{"id": "t03", "description": "Rebook investor flight", "deadline": "2026-04-21T11:00:00", "owner": "Alex Rivera", "status": "pending", "effort": "medium"},
|
| 73 |
+
{"id": "t04", "description": "Legal MSA redlines", "deadline": "2026-04-21T17:00:00", "owner": "Legal", "status": "in-progress", "effort": "high"},
|
| 74 |
+
{"id": "t05", "description": "Approve vendor SOW", "deadline": "2026-04-22T12:00:00", "owner": "Noah Patel", "status": "pending", "effort": "medium"},
|
| 75 |
+
{"id": "t06", "description": "Prep board talking points", "deadline": "2026-04-21T08:30:00", "owner": "Self", "status": "pending", "effort": "high"},
|
| 76 |
+
{"id": "t07", "description": "Expense report Q1", "deadline": "2026-04-25T23:59:59", "owner": "Finance", "status": "pending", "effort": "low"},
|
| 77 |
+
{"id": "t08", "description": "Callback David Okonkwo", "deadline": "2026-04-21T12:00:00", "owner": "Self", "status": "pending", "effort": "low"},
|
| 78 |
+
{"id": "t09", "description": "Review design mocks", "deadline": "2026-04-21T15:00:00", "owner": "Casey Nguyen", "status": "done", "effort": "medium"},
|
| 79 |
+
{"id": "t10", "description": "Submit benefits election", "deadline": "2026-04-28T23:59:59", "owner": "HR", "status": "pending", "effort": "low"},
|
| 80 |
+
{"id": "t11", "description": "Brief PR on launch timing", "deadline": "2026-04-21T14:00:00", "owner": "Marketing", "status": "pending", "effort": "medium"},
|
| 81 |
+
{"id": "t12", "description": "Sign birthday card for Avery", "deadline": "2026-04-21T16:00:00", "owner": "Jamie Liu", "status": "pending", "effort": "low"}
|
| 82 |
+
]
|
| 83 |
+
}
|
scenarios/schema_drift_test.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "Patronus-style schema drift bundle (three mid-episode rule changes).",
|
| 3 |
+
"events": [
|
| 4 |
+
{
|
| 5 |
+
"after_step": 1,
|
| 6 |
+
"shift_all_meetings_hours": 1,
|
| 7 |
+
"comment": "Scenario 1: calendar system shifts all local meeting times by +1 hour."
|
| 8 |
+
},
|
| 9 |
+
{
|
| 10 |
+
"after_step": 2,
|
| 11 |
+
"set_contact_preference": {
|
| 12 |
+
"name": "Sarah Chen",
|
| 13 |
+
"communication_preference": "text"
|
| 14 |
+
},
|
| 15 |
+
"comment": "Scenario 2: VIP/personal contact switches to text-only preference."
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"after_step": 3,
|
| 19 |
+
"set_task_deadline": {
|
| 20 |
+
"task_id": "t02",
|
| 21 |
+
"deadline": "2026-04-21T07:00:00"
|
| 22 |
+
},
|
| 23 |
+
"suppress_reply_relationship_for_senders": ["Marcus Webb"],
|
| 24 |
+
"comment": "Scenario 3: task deadline moved earlier; Marcus email replies yield zero relationship score."
|
| 25 |
+
}
|
| 26 |
+
]
|
| 27 |
+
}
|
scenarios/vip_meltdown.json
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"simulation_time": "2026-04-21T09:00:00",
|
| 3 |
+
"stress": 44,
|
| 4 |
+
"active_conflicts": [],
|
| 5 |
+
"action_log": [
|
| 6 |
+
"VIP meltdown demo: external pressure escalates if ignored."
|
| 7 |
+
],
|
| 8 |
+
"episode_active": true,
|
| 9 |
+
"episode_end_reason": null,
|
| 10 |
+
"max_episode_steps": 24,
|
| 11 |
+
"emails": [
|
| 12 |
+
{
|
| 13 |
+
"id": "v1",
|
| 14 |
+
"sender": "Taylor Brooks",
|
| 15 |
+
"subject": "We need alignment now",
|
| 16 |
+
"body": "Neutral opening: waiting on your stance before we brief others.",
|
| 17 |
+
"read": false,
|
| 18 |
+
"replied": false,
|
| 19 |
+
"priority": "critical",
|
| 20 |
+
"sender_relationship": "VIP"
|
| 21 |
+
}
|
| 22 |
+
],
|
| 23 |
+
"meetings": [
|
| 24 |
+
{
|
| 25 |
+
"id": "vm1",
|
| 26 |
+
"title": "Investor sync",
|
| 27 |
+
"start": "2026-04-21T10:00:00",
|
| 28 |
+
"duration_minutes": 45,
|
| 29 |
+
"attendees": [
|
| 30 |
+
"Taylor Brooks"
|
| 31 |
+
],
|
| 32 |
+
"location": "virtual",
|
| 33 |
+
"priority": "high",
|
| 34 |
+
"cancelled": false
|
| 35 |
+
}
|
| 36 |
+
],
|
| 37 |
+
"contacts": [
|
| 38 |
+
{
|
| 39 |
+
"name": "Taylor Brooks",
|
| 40 |
+
"relationship_type": "investor",
|
| 41 |
+
"communication_preference": "call",
|
| 42 |
+
"importance": 4,
|
| 43 |
+
"mood": "neutral"
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"name": "Jordan Lee",
|
| 47 |
+
"relationship_type": "direct_report",
|
| 48 |
+
"communication_preference": "email",
|
| 49 |
+
"importance": 3,
|
| 50 |
+
"mood": "happy"
|
| 51 |
+
}
|
| 52 |
+
],
|
| 53 |
+
"tasks": [
|
| 54 |
+
{
|
| 55 |
+
"id": "vt1",
|
| 56 |
+
"description": "Prep one-pager for Taylor",
|
| 57 |
+
"deadline": "2026-04-21T11:00:00",
|
| 58 |
+
"owner": "Self",
|
| 59 |
+
"status": "pending",
|
| 60 |
+
"effort": "medium"
|
| 61 |
+
}
|
| 62 |
+
]
|
| 63 |
+
}
|
scenarios/vip_meltdown_drift.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"events": [
|
| 3 |
+
{
|
| 4 |
+
"after_step": 1,
|
| 5 |
+
"set_contact_mood": {
|
| 6 |
+
"name": "Taylor Brooks",
|
| 7 |
+
"mood": "annoyed"
|
| 8 |
+
}
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"after_step": 2,
|
| 12 |
+
"set_contact_mood": {
|
| 13 |
+
"name": "Taylor Brooks",
|
| 14 |
+
"mood": "angry"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"after_step": 3,
|
| 19 |
+
"set_contact_mood": {
|
| 20 |
+
"name": "Taylor Brooks",
|
| 21 |
+
"mood": "furious"
|
| 22 |
+
}
|
| 23 |
+
}
|
| 24 |
+
]
|
| 25 |
+
}
|
scripts/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Makes ``scripts.*`` importable when repo root is on PYTHONPATH (pytest).
|
scripts/http_endpoint_smoke.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 3 |
+
#
|
| 4 |
+
# CLI: hit GhostExec HTTP endpoints (live URL or --local in-process app).
|
| 5 |
+
#
|
| 6 |
+
# uv run python scripts/http_endpoint_smoke.py --local
|
| 7 |
+
# uv run python scripts/http_endpoint_smoke.py --url http://127.0.0.1:8000
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import json
|
| 13 |
+
import sys
|
| 14 |
+
import urllib.error
|
| 15 |
+
import urllib.request
|
| 16 |
+
from typing import Any
|
| 17 |
+
from urllib.parse import urljoin
|
| 18 |
+
|
| 19 |
+
ROOT = __import__("pathlib").Path(__file__).resolve().parents[1]
|
| 20 |
+
if str(ROOT) not in sys.path:
|
| 21 |
+
sys.path.insert(0, str(ROOT))
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _print_curl(base: str) -> None:
    """Print copy/paste-ready curl commands for the HTTP endpoints.

    Emits one GET per read-only endpoint, a POST for ``/reset`` and ``/step``,
    and a closing reminder about HTTP vs. WebSocket episodes.

    Args:
        base: Server base URL; trailing slashes are stripped before use.
    """
    root = base.rstrip("/")
    json_header = "-H 'Content-Type: application/json'"
    step_body = '{"action":{"action_type":"do_nothing"}}'

    lines = ["# --- copy/paste (bash) ---"]
    lines.extend(
        f"curl -sS -X GET '{root}{endpoint}' | head -c 200 && echo"
        for endpoint in ("/health", "/metadata", "/state", "/schema", "/openapi.json")
    )
    lines.append(
        f"curl -sS -X POST '{root}/reset' {json_header} -d '{{}}' | head -c 300 && echo"
    )
    lines.append(
        f"curl -sS -X POST '{root}/step' {json_header} -d '{step_body}' | head -c 300 && echo"
    )
    lines.append(
        "# Note: HTTP uses a new env per request — not one multi-step episode; use WebSocket /ws for that."
    )

    for line in lines:
        print(line)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class LiveClient:
|
| 49 |
+
def __init__(self, base: str) -> None:
|
| 50 |
+
self.base = base.rstrip("/")
|
| 51 |
+
|
| 52 |
+
def request(
|
| 53 |
+
self,
|
| 54 |
+
method: str,
|
| 55 |
+
path: str,
|
| 56 |
+
*,
|
| 57 |
+
data: bytes | None = None,
|
| 58 |
+
headers: dict[str, str] | None = None,
|
| 59 |
+
) -> tuple[int, str]:
|
| 60 |
+
url = urljoin(self.base + "/", path.lstrip("/"))
|
| 61 |
+
req = urllib.request.Request(url, data=data, headers=headers or {}, method=method)
|
| 62 |
+
try:
|
| 63 |
+
with urllib.request.urlopen(req, timeout=20) as resp:
|
| 64 |
+
return resp.status, resp.read().decode(errors="replace")
|
| 65 |
+
except urllib.error.HTTPError as e:
|
| 66 |
+
return e.code, e.read().decode(errors="replace")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class LocalClient:
|
| 70 |
+
def __init__(self) -> None:
|
| 71 |
+
from fastapi.testclient import TestClient
|
| 72 |
+
|
| 73 |
+
from ghostexec.server.app import app
|
| 74 |
+
|
| 75 |
+
self._client = TestClient(app, raise_server_exceptions=True)
|
| 76 |
+
|
| 77 |
+
def request(
|
| 78 |
+
self,
|
| 79 |
+
method: str,
|
| 80 |
+
path: str,
|
| 81 |
+
*,
|
| 82 |
+
data: bytes | None = None,
|
| 83 |
+
headers: dict[str, str] | None = None,
|
| 84 |
+
) -> tuple[int, str]:
|
| 85 |
+
hdrs = headers or {}
|
| 86 |
+
kwargs: dict[str, Any] = {}
|
| 87 |
+
if data is not None:
|
| 88 |
+
kwargs["content"] = data
|
| 89 |
+
kwargs["headers"] = hdrs
|
| 90 |
+
r = self._client.request(method, path, **kwargs)
|
| 91 |
+
return r.status_code, r.text
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def main() -> int:
    """Run the HTTP smoke checks and return the process exit code.

    With ``--print-curl`` only the example curl commands are printed.
    Otherwise every GET endpoint, ``POST /reset``, ``POST /step``,
    ``POST /mcp`` and the ``GET /reset`` rejection are checked in order; the
    first failing check exits with status 1 via ``SystemExit``.
    """
    parser = argparse.ArgumentParser(description="GhostExec HTTP endpoint smoke (CLI).")
    parser.add_argument(
        "--url",
        default="http://127.0.0.1:8000",
        help="Live server base URL (ignored with --local).",
    )
    parser.add_argument(
        "--local",
        action="store_true",
        help="Use in-process FastAPI TestClient (no server required).",
    )
    parser.add_argument(
        "--print-curl",
        action="store_true",
        help="Print example curl commands and exit 0.",
    )
    opts = parser.parse_args()

    if opts.print_curl:
        _print_curl(opts.url)
        return 0

    client: LiveClient | LocalClient = LocalClient() if opts.local else LiveClient(opts.url)
    label: str = "local TestClient" if opts.local else opts.url

    def verdict(passed: bool, description: str) -> None:
        # Print one result line; abort the run on the first failure.
        print(f"[{'OK' if passed else 'FAIL'}] {description}")
        if not passed:
            raise SystemExit(1)

    print(f"GhostExec HTTP smoke ({label})\n")

    get_paths = (
        "/health",
        "/metadata",
        "/state",
        "/schema",
        "/openapi.json",
        "/docs",
        "/redoc",
    )
    for endpoint in get_paths:
        status, text = client.request("GET", endpoint)
        verdict(
            200 <= status < 300,
            f"GET {endpoint} -> HTTP {status} (body ~{len(text)} chars)",
        )

    json_headers = {"Content-Type": "application/json"}

    reset_status, reset_text = client.request(
        "POST", "/reset", data=json.dumps({}).encode(), headers=json_headers
    )
    verdict(reset_status == 200, f"POST /reset -> HTTP {reset_status}")
    reset_json = json.loads(reset_text)
    observation = reset_json.get("observation") or {}
    prefix = observation.get("echoed_message", "")[:50]
    print(f" briefing prefix: {prefix!r}")

    step_bytes = json.dumps({"action": {"action_type": "do_nothing"}}).encode()
    step_status, _ = client.request("POST", "/step", data=step_bytes, headers=json_headers)
    verdict(step_status == 200, f"POST /step do_nothing -> HTTP {step_status}")

    print(
        "\nNote: OpenEnv HTTP may use a new env per request, so separate POSTs do not advance "
        "one long episode; each POST /step runs a single action on a fresh instance. "
        "Multi-step learning on one episode: WebSocket /ws (see ghostexec/README.md)."
    )

    mcp_bytes = json.dumps(
        {"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}}
    ).encode()
    mcp_status, _ = client.request("POST", "/mcp", data=mcp_bytes, headers=json_headers)
    verdict(mcp_status == 200, f"POST /mcp tools/list -> HTTP {mcp_status}")

    # /reset is POST-only, so a GET must be rejected with 405.
    wrong_verb_status, _ = client.request("GET", "/reset")
    verdict(
        wrong_verb_status == 405,
        f"GET /reset (expect 405) -> HTTP {wrong_verb_status}",
    )

    print("\nAll checks passed.")
    return 0
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
if __name__ == "__main__":
|
| 184 |
+
raise SystemExit(main())
|
scripts/run_live_api_dead_500.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Run 500+ LIVE HTTP API reward dead-tests against a running GhostExec server.
|
| 2 |
+
|
| 3 |
+
Usage:
|
| 4 |
+
uv run python scripts/run_live_api_dead_500.py --url http://127.0.0.1:8002 --cases 500
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import argparse
|
| 10 |
+
import json
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Any
|
| 13 |
+
from urllib.parse import urljoin
|
| 14 |
+
import urllib.error
|
| 15 |
+
import urllib.request
|
| 16 |
+
|
| 17 |
+
W_CONFLICT = 0.35
|
| 18 |
+
W_REL = 0.35
|
| 19 |
+
W_TASK = 0.30
|
| 20 |
+
OUTPUT_SCALE = 0.48
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _request(
    base_url: str,
    method: str,
    path: str,
    *,
    body: dict[str, Any] | None = None,
    timeout: float = 20.0,
) -> tuple[int, str]:
    """Issue one HTTP request and return ``(status_code, decoded_body)``.

    Args:
        base_url: Server base URL, with or without a trailing slash.
        method: HTTP verb.
        path: Endpoint path, with or without a leading slash.
        body: Optional payload, sent JSON-encoded with a JSON content type.
        timeout: Socket timeout in seconds.

    HTTP error statuses are returned rather than raised; other urllib errors
    propagate to the caller.
    """
    send_headers = {"Accept": "application/json"}
    encoded = None
    if body is not None:
        encoded = json.dumps(body).encode()
        send_headers["Content-Type"] = "application/json"

    target = urljoin(f"{base_url.rstrip('/')}/", path.lstrip("/"))
    outgoing = urllib.request.Request(
        target,
        data=encoded,
        headers=send_headers,
        method=method,
    )
    try:
        with urllib.request.urlopen(outgoing, timeout=timeout) as response:
            text = response.read().decode(errors="replace")
            return response.status, text
    except urllib.error.HTTPError as err:
        # Error responses still carry a status and body the caller inspects.
        return err.code, err.read().decode(errors="replace")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _step_payload_for(i: int) -> dict[str, Any]:
    """Return the ``/step`` request payload for case index ``i``.

    Cycles through a fixed rota of 16 actions (``i`` is taken modulo the rota
    length). The rota pairs each action type with ids that look valid and ids
    that look deliberately bogus (e.g. ``"nope_999"``, ``"Nobody"``).
    """

    def wrap(action_type: str, **fields: Any) -> dict[str, Any]:
        # Keep "action_type" as the first key, followed by the action's fields.
        return {"action": {"action_type": action_type, **fields}}

    rota = (
        wrap("do_nothing"),
        wrap("reply_email", email_id="e01", message_body="On it now."),
        wrap("reply_email", email_id="e14", message_body="Acknowledged."),
        wrap("reply_email", email_id="nope_999", message_body="x"),
        wrap("archive_email", email_id="e09"),
        wrap("archive_email", email_id="bad_id"),
        wrap("reschedule_meeting", meeting_id="m02", new_time="2026-04-21T18:00:00"),
        # overlap -> invalid semantic
        wrap("reschedule_meeting", meeting_id="m03", new_time="2026-04-21T09:30:00"),
        wrap("cancel_meeting", meeting_id="m10", reason="dead test"),
        wrap("cancel_meeting", meeting_id="m99", reason="dead test"),
        wrap("complete_task", task_id="t07"),
        wrap("complete_task", task_id="t09"),  # already done
        wrap("delegate_task", task_id="t08", contact_name="Jordan Lee"),
        wrap("delegate_task", task_id="t08", contact_name="Nobody"),
        wrap("send_message", contact_name="Jamie Liu", message_body="Quick sync please."),
        wrap("send_message", contact_name="Nobody", message_body="hello"),
    )
    return rota[i % len(rota)]
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _assert_api_surface(base_url: str) -> None:
    """Verify the server exposes the expected endpoints before running cases.

    Checks that every read-only endpoint answers 200, that ``/reset`` and
    ``/step`` reject GET with 405, that an unknown path yields 404, and that
    ``POST /mcp`` answers a ``tools/list`` call with 200.

    Args:
        base_url: Base URL of the running server.

    Raises:
        AssertionError: On the first unexpected status code. Raised explicitly
            (not via ``assert``) so the checks still run under ``python -O``.
    """

    def require(ok: bool, message: str) -> None:
        if not ok:
            raise AssertionError(message)

    for path in ("/health", "/metadata", "/state", "/schema", "/openapi.json", "/docs", "/redoc"):
        code, _ = _request(base_url, "GET", path)
        require(code == 200, f"{path} -> {code}")

    for method, path, expected in (
        ("GET", "/reset", 405),
        ("GET", "/step", 405),
        ("GET", "/this-path-should-not-exist-ghostexec", 404),
    ):
        code, _ = _request(base_url, method, path)
        require(code == expected, f"{method} {path} -> {code} (expected {expected})")

    code, _ = _request(
        base_url,
        "POST",
        "/mcp",
        body={"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}},
    )
    require(code == 200, f"POST /mcp tools/list -> {code} (expected 200)")
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def main() -> int:
    """Run the live reward dead-tests and write a JSONL report.

    For each case the server is reset, one templated action is stepped, and
    the returned reward is checked against its ``reward_breakdown`` metadata:
    the reward equals ``final``, ``weighted_base`` matches the weighted sum of
    the components, ``final`` equals the sum of its parts, and the
    ``do_nothing`` / invalid-step adjustments have their expected values.

    One record per case is written to ``outputs/logs/api_dead_live_<N>.jsonl``
    (relative to the current working directory).

    Returns:
        0 when every case passed, 1 otherwise.
    """

    def require(ok: bool, message: str) -> None:
        # Explicit raise instead of ``assert``: under ``python -O`` asserts are
        # stripped, which would report every case as passed without checking.
        if not ok:
            raise AssertionError(message)

    p = argparse.ArgumentParser(description="Run live 500+ reward dead-tests.")
    p.add_argument("--url", default="http://127.0.0.1:8002", help="Base server URL")
    p.add_argument("--cases", type=int, default=500, help="Number of /reset+/step cases")
    args = p.parse_args()

    base_url = args.url.rstrip("/")
    cases = max(1, args.cases)

    _assert_api_surface(base_url)

    out_dir = Path("outputs") / "logs"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"api_dead_live_{cases}.jsonl"

    passed = 0
    failed = 0
    failures: list[str] = []

    with out_path.open("w", encoding="utf-8") as f:
        for idx in range(cases):
            rec: dict[str, Any] = {"idx": idx, "ok": False, "error": None}
            try:
                rc, _ = _request(
                    base_url,
                    "POST",
                    "/reset",
                    body={"episode_id": f"live-dead-{idx:04d}", "seed": 42},
                )
                require(rc == 200, f"reset status {rc}")

                payload = _step_payload_for(idx)
                rec["action"] = payload["action"]
                sc, sb = _request(base_url, "POST", "/step", body=payload)
                require(sc == 200, f"step status {sc}")
                body = json.loads(sb)

                obs = body["observation"]
                meta = obs.get("metadata") or {}
                bd = meta.get("reward_breakdown") or {}

                reward = float(body["reward"])
                final = float(bd["final"])
                require(reward == final, "reward != breakdown.final")

                c = float(bd.get("conflict", 0.0))
                r = float(bd.get("relationship", 0.0))
                t = float(bd.get("task", 0.0))
                expected_weighted = OUTPUT_SCALE * (W_CONFLICT * c + W_REL * r + W_TASK * t)
                require(float(bd["weighted_base"]) == expected_weighted, "weighted_base mismatch")

                expected_final = (
                    float(bd.get("weighted_base", 0.0))
                    + float(bd.get("invalid_step_adjustment", 0.0))
                    + float(bd.get("episode_completion_bonus", 0.0))
                    + float(bd.get("catastrophic_penalty", 0.0))
                    + float(bd.get("do_nothing_floor", 0.0))
                )
                require(final == expected_final, "final aggregation mismatch")

                if payload["action"]["action_type"] == "do_nothing":
                    require(
                        float(bd.get("do_nothing_floor", 0.0)) == -0.15,
                        "do_nothing floor mismatch",
                    )
                    require(reward < 0, "do_nothing should be negative")

                if meta.get("step_ok") is False:
                    require(
                        float(bd.get("invalid_step_adjustment", 0.0)) == -0.25,
                        "invalid penalty mismatch",
                    )

                rec["ok"] = True
                rec["reward"] = reward
                rec["step_ok"] = meta.get("step_ok")
                passed += 1
            except Exception as e:  # noqa: BLE001
                # Any failure (HTTP, JSON, missing key, failed check) marks the
                # case failed; the run continues so the report stays complete.
                rec["ok"] = False
                rec["error"] = str(e)
                failed += 1
                if len(failures) < 10:
                    failures.append(f"idx={idx}: {e}")
            finally:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"Live API dead-test complete: passed={passed} failed={failed} total={cases}")
    print(f"Report: {out_path}")
    if failures:
        print("First failures:")
        for row in failures:
            print(" -", row)
    return 0 if failed == 0 else 1
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
if __name__ == "__main__":
|
| 195 |
+
raise SystemExit(main())
|
| 196 |
+
|
server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Ghostexec environment server components."""
|
| 8 |
+
|
| 9 |
+
from .ghostexec_environment import GhostexecEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["GhostexecEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
FastAPI application for the Ghostexec Environment.
|
| 9 |
+
|
| 10 |
+
This module creates an HTTP server that exposes the GhostexecEnvironment
|
| 11 |
+
over HTTP and WebSocket endpoints, compatible with EnvClient.
|
| 12 |
+
|
| 13 |
+
Endpoints:
|
| 14 |
+
- POST /reset: Reset the environment
|
| 15 |
+
- POST /step: Execute an action
|
| 16 |
+
- GET /state: Get current environment state
|
| 17 |
+
- GET /schema: Get action/observation schemas
|
| 18 |
+
- WS /ws: WebSocket endpoint for persistent sessions
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
# Development (with auto-reload):
|
| 22 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 23 |
+
|
| 24 |
+
# Production:
|
| 25 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 26 |
+
|
| 27 |
+
# Or run directly:
|
| 28 |
+
python -m server.app
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
try:
|
| 32 |
+
import openenv.core.env_server.http_server as _openenv_http
|
| 33 |
+
except Exception as e: # pragma: no cover
|
| 34 |
+
raise ImportError(
|
| 35 |
+
"openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
|
| 36 |
+
) from e
|
| 37 |
+
|
| 38 |
+
# OpenEnv's serialize_observation drops `metadata` from the JSON body; Ghostexec
|
| 39 |
+
# trainers and live tests rely on step_ok / ids inside observation.metadata.
|
| 40 |
+
_orig_serialize_observation = _openenv_http.serialize_observation
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _ghostexec_serialize_observation(observation):  # type: ignore[no-untyped-def]
    """Serialize via the original OpenEnv function, then re-attach metadata.

    When the serialized payload has a dict under ``"observation"``, the
    observation's ``metadata`` attribute (or ``{}`` when missing/empty) is
    stored there under ``"metadata"`` in JSON-serializable form.
    """
    result = _orig_serialize_observation(observation)
    nested = result.get("observation")
    if not isinstance(nested, dict):
        return result
    raw_meta = getattr(observation, "metadata", None) or {}
    nested["metadata"] = _openenv_http._make_json_serializable(raw_meta)
    return result
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
_openenv_http.serialize_observation = _ghostexec_serialize_observation
|
| 53 |
+
|
| 54 |
+
from openenv.core.env_server.http_server import create_app # noqa: E402
|
| 55 |
+
|
| 56 |
+
try:
|
| 57 |
+
# Editable / normal install (package name `ghostexec`).
|
| 58 |
+
from ghostexec.models import GhostexecAction, GhostexecObservation
|
| 59 |
+
from ghostexec.server.ghostexec_environment import GhostexecEnvironment
|
| 60 |
+
except ImportError:
|
| 61 |
+
# Plain `uvicorn server.app:app` from repo root: top-level `models` + `server` package.
|
| 62 |
+
from models import GhostexecAction, GhostexecObservation
|
| 63 |
+
from server.ghostexec_environment import GhostexecEnvironment
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Create the app with web interface and README integration
|
| 67 |
+
app = create_app(
|
| 68 |
+
GhostexecEnvironment,
|
| 69 |
+
GhostexecAction,
|
| 70 |
+
GhostexecObservation,
|
| 71 |
+
env_name="ghostexec",
|
| 72 |
+
max_concurrent_envs=1, # increase this number to allow more concurrent WebSocket sessions
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _patch_openapi_ghostexec_examples(schema: dict) -> None:
    """Replace OpenEnv's generic observation examples with GhostExec's plain-text briefing shape.

    Sets the ``example`` of the 200 ``application/json`` response for
    ``POST /reset`` and ``POST /step`` in place. A path whose nested structure
    is missing or not made of dicts is skipped rather than raising, so a
    differently shaped schema cannot break OpenAPI generation.

    Args:
        schema: OpenAPI schema dict; mutated in place.
    """
    briefing = (
        "=== GHOSTEXEC BRIEFING — Tue 21 Apr 2026 08:00 ===\n\n"
        "UNREAD EMAILS (…): …\n\n"
        "CALENDAR CONFLICTS IN NEXT 4 HOURS: …\n\n"
        "CONTACTS TO WATCH: …\n\n"
        "OVERDUE OR DUE-SOON TASKS: …\n\n"
        "EXEC STRESS LEVEL: 52/100\n"
        "STEPS REMAINING: 48"
    )
    obs = {"echoed_message": briefing, "message_length": len(briefing)}
    reset_ex = {"observation": obs, "reward": 0.0, "done": False}
    step_ex = {"observation": obs, "reward": -0.42, "done": False}
    for path, example in (("/reset", reset_ex), ("/step", step_ex)):
        try:
            cell = (
                schema["paths"][path]["post"]["responses"]["200"]["content"]["application/json"]
            )
        except (KeyError, TypeError):
            # KeyError: a level is absent. TypeError: a level is not
            # subscriptable (e.g. None). Either way there is nothing to patch.
            continue
        if isinstance(cell, dict):
            cell["example"] = example
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
_OPENAPI_HTTP_EPISODE_SENTINEL = "Ghostexec / OpenEnv HTTP"
|
| 102 |
+
|
| 103 |
+
_OPENAPI_HTTP_EPISODE_NOTE = f"""
|
| 104 |
+
---
|
| 105 |
+
## {_OPENAPI_HTTP_EPISODE_SENTINEL}
|
| 106 |
+
|
| 107 |
+
Each `POST /reset` and `POST /step` may run on a **new** environment instance, so
|
| 108 |
+
separate HTTP requests do **not** share one in-memory episode across calls. A lone
|
| 109 |
+
`POST /step` still applies your action once (after internal scenario load). For
|
| 110 |
+
**many steps on the same episode**, use **WebSocket `/ws`**: open a connection,
|
| 111 |
+
reset once, then send many step messages on that same socket. See **ghostexec/README.md**
|
| 112 |
+
for details.
|
| 113 |
+
"""
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _patch_openapi_ghostexec_http_note(schema: dict) -> None:
    """Document HTTP statelessness vs /ws so Swagger and OpenAPI clients see it.

    Appends the episode note to ``schema["info"]["description"]`` unless the
    sentinel heading is already present, so repeated calls add it only once.
    Does nothing when ``info`` is not a dict; ``TypeError``/``KeyError`` raised
    along the way are swallowed.
    """
    try:
        info = schema.get("info")
        if isinstance(info, dict):
            current = info.get("description") or ""
            if _OPENAPI_HTTP_EPISODE_SENTINEL not in current:
                info["description"] = current + _OPENAPI_HTTP_EPISODE_NOTE
    except (TypeError, KeyError):
        return
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
_fastapi_openapi = type(app).openapi.__get__(app, type(app))
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _ghostexec_openapi() -> dict:
    """Return the app's OpenAPI schema with the GhostExec patches applied.

    Generates the schema through FastAPI's own ``openapi`` method on first
    use, then applies the example and HTTP-note patches to the cached schema.
    """
    needs_build = app.openapi_schema is None
    if needs_build:
        # Populates app.openapi_schema as a side effect.
        _fastapi_openapi()
    for apply_patch in (_patch_openapi_ghostexec_examples, _patch_openapi_ghostexec_http_note):
        apply_patch(app.openapi_schema)
    return app.openapi_schema  # type: ignore[return-value]
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
app.openapi = _ghostexec_openapi # type: ignore[method-assign]
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def main() -> None:
    """
    Start the GhostExec server with uvicorn (entry point for uv run / python -m).

    Runs the server without Docker:
        uv run --project . server
        uv run --project . server --port 8001
        python -m ghostexec.server.app

    Accepts ``--host`` (default ``0.0.0.0``) and ``--port`` (default ``8000``).

    For production deployments, consider invoking uvicorn directly with
    multiple workers:
        uvicorn ghostexec.server.app:app --workers 4
    """
    import argparse

    import uvicorn

    cli = argparse.ArgumentParser(description="GhostExec OpenEnv HTTP server")
    cli.add_argument("--host", type=str, default="0.0.0.0", help="Bind address")
    cli.add_argument("--port", type=int, default=8000, help="Listen port")
    opts = cli.parse_args()

    uvicorn.run(app, host=opts.host, port=opts.port)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
if __name__ == '__main__':
|
| 169 |
+
main()
|
server/ghostexec_environment.py
ADDED
|
@@ -0,0 +1,706 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
GhostExec simulated world, agent step (Phases 2–3), and reward (Phase 4).
|
| 9 |
+
|
| 10 |
+
Scenario payloads load from scenarios/*.json. Observations are plain-text briefings.
|
| 11 |
+
Invalid actions return a structured error in observation metadata without raising.
|
| 12 |
+
Rewards aggregate conflict / relationship / task scores and log each step to outputs/logs/.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
from datetime import datetime, timedelta, timezone
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Any
|
| 21 |
+
from uuid import uuid4
|
| 22 |
+
|
| 23 |
+
from openenv.core.env_server.interfaces import Environment
|
| 24 |
+
from openenv.core.env_server.types import State
|
| 25 |
+
|
| 26 |
+
try:
|
| 27 |
+
from ..models import (
|
| 28 |
+
Contact,
|
| 29 |
+
Email,
|
| 30 |
+
GhostexecAction,
|
| 31 |
+
GhostexecObservation,
|
| 32 |
+
Meeting,
|
| 33 |
+
Mood,
|
| 34 |
+
RewardBreakdown,
|
| 35 |
+
Task,
|
| 36 |
+
TaskStatus,
|
| 37 |
+
WorldState,
|
| 38 |
+
)
|
| 39 |
+
except ImportError:
|
| 40 |
+
from models import (
|
| 41 |
+
Contact,
|
| 42 |
+
Email,
|
| 43 |
+
GhostexecAction,
|
| 44 |
+
GhostexecObservation,
|
| 45 |
+
Meeting,
|
| 46 |
+
Mood,
|
| 47 |
+
RewardBreakdown,
|
| 48 |
+
Task,
|
| 49 |
+
TaskStatus,
|
| 50 |
+
WorldState,
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
try:
|
| 54 |
+
from . import reward as _reward
|
| 55 |
+
except ImportError:
|
| 56 |
+
try:
|
| 57 |
+
from server import reward as _reward
|
| 58 |
+
except ImportError:
|
| 59 |
+
import reward as _reward # type: ignore[no-redef]
|
| 60 |
+
|
| 61 |
+
# Sort rank for email priorities (lower sorts first). Consumers look values up
# with a fallback of 99, so unknown priorities sort last.
_PRIORITY_RANK: dict[str, int] = {"critical": 0, "high": 1, "normal": 2, "low": 3}
# Display labels for relationship keys used in the briefing text; keys missing
# from this table are shown verbatim by the briefing code.
_REL_DISPLAY: dict[str, str] = {
    "board_member": "Board",
    "spouse": "Spouse",
    "investor": "Investor",
    "direct_report": "Direct report",
    "client": "Client",
    "friend": "Friend",
    "team_member": "Team",
}

# Reward returned by step() when it is called after the episode has finished.
_INVALID_ACTION_REWARD = -0.25
# Reward attached to the initial observation produced by reset().
_DEFAULT_STEP_REWARD = 0.0
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _default_scenario_path() -> Path:
    """Return the path of the bundled default scenario file.

    The file is ``scenarios/phase2_core.json`` located one directory above the
    directory that contains this module.
    """
    module_dir = Path(__file__).resolve().parent
    project_root = module_dir.parent
    return project_root / "scenarios" / "phase2_core.json"
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _parse_dt(value: str) -> datetime:
    """Parse an ISO 8601 string into a timezone-aware ``datetime``.

    A trailing ``"Z"`` is stripped before parsing and the result is stamped as
    UTC. Strings without any offset are also treated as UTC. Strings carrying
    an explicit offset keep it.

    Raises:
        ValueError: if ``datetime.fromisoformat`` rejects the text.
    """
    zulu = value.endswith("Z")
    parsed = datetime.fromisoformat(value[:-1] if zulu else value)
    if zulu or parsed.tzinfo is None:
        return parsed.replace(tzinfo=timezone.utc)
    return parsed
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _meeting_end(m: Meeting) -> datetime:
    """Return when meeting ``m`` ends: its parsed start plus ``duration_minutes``."""
    return _parse_dt(m.start) + timedelta(minutes=m.duration_minutes)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _windows_overlap(a_start: datetime, a_end: datetime, b_start: datetime, b_end: datetime) -> bool:
    """Tell whether two time windows intersect.

    The comparison is strict, so windows that merely touch (one ends exactly
    when the other starts) do not count as overlapping.
    """
    disjoint = a_start >= b_end or b_start >= a_end
    return not disjoint
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class GhostexecEnvironment(Environment):
|
| 99 |
+
"""Inbox, calendar, contacts, tasks, actions, briefings, and Phase 4 rewards."""
|
| 100 |
+
|
| 101 |
+
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 102 |
+
|
| 103 |
+
def __init__(
    self,
    scenario_path: str | Path | None = None,
    schema_drift_events_path: str | Path | None = None,
) -> None:
    """Configure the environment; no world is loaded until ``reset()``.

    Args:
        scenario_path: Scenario JSON to load on reset. A falsy value selects
            the default scenario path.
        schema_drift_events_path: Optional JSON file whose ``"events"`` list
            is read here (only if the path points at an existing file).
    """
    if scenario_path:
        self._scenario_path = Path(scenario_path)
    else:
        self._scenario_path = _default_scenario_path()

    drift_path = None if schema_drift_events_path is None else Path(schema_drift_events_path)
    self._drift_events_path = drift_path

    # Drift events are optional: a missing file simply leaves the list empty.
    loaded_events: list[dict[str, Any]] = []
    if drift_path and drift_path.is_file():
        payload = json.loads(drift_path.read_text(encoding="utf-8"))
        loaded_events = list(payload.get("events", []))
    self._drift_events: list[dict[str, Any]] = loaded_events

    self._reply_relationship_suppressed: set[str] = set()

    # Per-step reward records go to <project root>/outputs/logs/episode_rewards.jsonl.
    project_root = Path(__file__).resolve().parent.parent
    self._reward_log_path = project_root / "outputs" / "logs" / "episode_rewards.jsonl"

    self._world: WorldState | None = None
    self._base_stress: int = 0
    self._state = State(episode_id=str(uuid4()), step_count=0)

    # Outcome of the most recent action, surfaced in briefings and metadata.
    self._last_step_ok: bool = True
    self._last_step_error: str | None = None
    self._last_step_detail: str = ""
    self._last_reward_breakdown: RewardBreakdown | None = None
|
| 127 |
+
|
| 128 |
+
# --- lifecycle ---
|
| 129 |
+
|
| 130 |
+
def reset(self) -> GhostexecObservation:  # type: ignore[override]
    """Start a new episode and return its first observation.

    Reloads the world from the scenario file, records its stress value as the
    baseline, rebuilds the conflict list, issues a fresh episode id with a zero
    step counter, clears per-episode bookkeeping and makes sure the reward-log
    directory exists.
    """
    fresh_world = self.load_world_from_json(self._scenario_path)
    self._world = fresh_world
    self._base_stress = fresh_world.stress
    self._rebuild_conflict_list()

    self._state = State(episode_id=str(uuid4()), step_count=0)
    self._last_step_ok, self._last_step_error = True, None
    self._last_step_detail = "Episode started."
    self._reply_relationship_suppressed.clear()
    self._last_reward_breakdown = None
    self._ensure_reward_log_dir()

    return self._observation_from_briefing(
        self.build_briefing_text(),
        reward=_DEFAULT_STEP_REWARD,
        done=False,
        reward_breakdown=None,
    )
|
| 148 |
+
|
| 149 |
+
def step(self, action: GhostexecAction) -> GhostexecObservation:  # type: ignore[override]
    """Apply one agent action and return the resulting observation.

    Order of operations: lazily reset if no world is loaded; reject the call
    if the episode has already finished; bump the step counter; apply any
    schema-drift events scheduled for this step; log the optional free-text
    note; snapshot the world; apply the action; rebuild conflicts; check the
    step limit; compute and log the reward; build the briefing.

    Args:
        action: The action to execute. Invalid actions do not raise here; they
            are reported through ``_apply_action``'s failure path.

    Returns:
        The observation carrying the briefing text, the step reward and the
        ``done`` flag.
    """
    if self._world is None:
        # OpenEnv HTTP uses a new env per request; prime the world so this step still
        # runs the requested action (invalid actions get step_ok False, rewards apply).
        self.reset()

    assert self._world is not None
    if not self._world.episode_active:
        # Stepping a finished episode: fixed penalty, no state change, and no
        # entry is written to the reward log on this path.
        self._last_step_ok = False
        self._last_step_error = "Episode is already finished."
        bd = RewardBreakdown(
            final=_INVALID_ACTION_REWARD,
            invalid_step_adjustment=_INVALID_ACTION_REWARD,
        )
        self._last_reward_breakdown = bd
        return self._observation_from_briefing(
            self.build_briefing_text(),
            reward=bd.final,
            done=True,
            reward_breakdown=bd,
        )

    self._state.step_count += 1
    # Drift events are matched against the already-incremented step counter.
    self._maybe_apply_schema_drift_events()

    if action.message.strip():
        self._world.action_log.append(f"note: {action.message.strip()}")

    # Deep snapshot taken after drift/note but before the action, so the reward
    # function compares pre-action and post-action worlds.
    before = self.world.model_copy(deep=True)
    action_ok = self._apply_action(action)
    self._rebuild_conflict_list()

    episode_done = False
    if self._state.step_count >= self._world.max_episode_steps:
        episode_done = True
        self._world.episode_active = False
        # Keep an already-recorded end reason; otherwise mark the step limit.
        self._world.episode_end_reason = self._world.episode_end_reason or "step_limit"

    breakdown = _reward.compute_step_reward(
        before,
        self.world,
        action,
        action_ok=action_ok,
        episode_done=episode_done,
        relationship_suppressed_for_email_to=frozenset(self._reply_relationship_suppressed),
    )
    self._last_reward_breakdown = breakdown
    self._append_reward_log(breakdown, episode_done, action)

    briefing = self.build_briefing_text()
    return self._observation_from_briefing(
        briefing,
        reward=breakdown.final,
        done=episode_done,
        reward_breakdown=breakdown,
    )
|
| 205 |
+
|
| 206 |
+
@property
def state(self) -> State:
    """Return the current episode ``State`` (episode id and step counter)."""
    return self._state
|
| 209 |
+
|
| 210 |
+
@property
def world(self) -> WorldState:
    """Return the loaded world.

    Raises:
        RuntimeError: if no world has been loaded yet (``reset()`` not called).
    """
    loaded = self._world
    if loaded is not None:
        return loaded
    raise RuntimeError("World not initialised; call reset() first.")
|
| 215 |
+
|
| 216 |
+
# --- Phase 3 briefing (plain text for LLM) ---
|
| 217 |
+
|
| 218 |
+
def build_briefing_text(self) -> str:
    """Render the current world as a plain-text briefing.

    Sections, in order: header with the simulation time, unread emails (at most
    20 listed, priority-sorted), calendar conflicts whose overlap intersects
    the next 4 hours, the 5 most important contacts, tasks that are overdue or
    due within 24 hours (at most 15 listed), stress level, steps remaining, and
    the outcome of the last action.

    Raises:
        RuntimeError: via ``self.world`` if no world has been loaded.
    """
    w = self.world
    now = _parse_dt(w.simulation_time)
    header = now.strftime("=== GHOSTEXEC BRIEFING — %a %d %b %Y %H:%M ===")

    unread = self.get_unread_emails_sorted()
    # Preview is the first 100 characters of the body (ellipsis if truncated)
    # with newlines (chr(10)) flattened to spaces.
    email_lines = [
        f"- [{e.priority.upper()}] From: {e.sender} ({_REL_DISPLAY.get(e.sender_relationship, e.sender_relationship)}) — "
        f'"{e.subject}"\n Preview: {(e.body[:100] + ("…" if len(e.body) > 100 else "")).replace(chr(10), " ")}'
        for e in unread[:20]
    ]
    email_block = "\n".join(email_lines) if email_lines else "(none)"

    horizon = now + timedelta(hours=4)
    conflict_lines: list[str] = []
    for row in self.detect_meeting_conflicts():
        o0 = _parse_dt(row["overlap_start"])
        o1 = _parse_dt(row["overlap_end"])
        # Skip overlaps that are already over or start at/after the 4h horizon.
        if o1 <= now or o0 >= horizon:
            continue
        ma = self._meeting_by_id(row["meeting_a"])
        mb = self._meeting_by_id(row["meeting_b"])
        if not ma or not mb or ma.cancelled or mb.cancelled:
            continue
        conflict_lines.append(
            f"- {_fmt_meeting_line(ma)} CLASHES WITH -> {_fmt_meeting_line(mb)}"
        )
    conflict_block = "\n".join(conflict_lines) if conflict_lines else "(none in next 4 hours)"

    # Highest importance first; ties broken alphabetically by name.
    top_contacts = sorted(w.contacts, key=lambda c: (-c.importance, c.name))[:5]
    contact_lines = [
        f"- {c.name}: {c.mood.upper()} — {_REL_DISPLAY.get(c.relationship_type, c.relationship_type)}; "
        f"prefers {c.communication_preference}"
        for c in top_contacts
    ]
    contact_block = "\n".join(contact_lines) if contact_lines else "(none)"

    soon = now + timedelta(hours=24)
    task_lines: list[str] = []
    for t in w.tasks:
        if t.status == "done":
            continue
        dl = _parse_dt(t.deadline)
        # Include tasks whose deadline has passed or falls within the next 24h.
        if dl < now or (now <= dl <= soon):
            flag = "OVERDUE" if dl < now else "due soon"
            task_lines.append(f"- [{flag}] {t.description} (deadline {t.deadline}, owner {t.owner})")
    task_block = "\n".join(task_lines[:15]) if task_lines else "(none)"

    remaining = max(0, w.max_episode_steps - self._state.step_count)

    parts = [
        header,
        "",
        f"UNREAD EMAILS ({len(unread)} unread):",
        email_block,
        "",
        "CALENDAR CONFLICTS IN NEXT 4 HOURS:",
        conflict_block,
        "",
        "CONTACTS TO WATCH (top 5 by importance):",
        contact_block,
        "",
        "OVERDUE OR DUE-SOON TASKS (next 24h window):",
        task_block,
        "",
        f"EXEC STRESS LEVEL: {w.stress}/100",
        f"STEPS REMAINING: {remaining}",
    ]
    # An error takes precedence over a success detail when both are set.
    if self._last_step_error:
        parts += ["", f"LAST ACTION: ERROR — {self._last_step_error}"]
    elif self._last_step_detail:
        parts += ["", f"LAST ACTION: OK — {self._last_step_detail}"]

    return "\n".join(parts)
|
| 292 |
+
|
| 293 |
+
def _meeting_by_id(self, mid: str) -> Meeting | None:
    """Return the first meeting whose ``id`` equals ``mid``, or ``None``."""
    return next((entry for entry in self.world.meetings if entry.id == mid), None)
|
| 298 |
+
|
| 299 |
+
# --- scenario IO ---
|
| 300 |
+
|
| 301 |
+
@staticmethod
def load_world_from_json(path: str | Path) -> WorldState:
    """Read the UTF-8 JSON file at ``path`` and validate it into a ``WorldState``."""
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    return WorldState.model_validate(payload)
|
| 306 |
+
|
| 307 |
+
@staticmethod
def world_to_json(world: WorldState) -> str:
    """Serialise ``world`` to a JSON string via its ``model_dump_json()``."""
    return world.model_dump_json()
|
| 310 |
+
|
| 311 |
+
@staticmethod
def world_from_json(blob: str) -> WorldState:
    """Rebuild a ``WorldState`` from a JSON string (inverse of ``world_to_json``)."""
    return WorldState.model_validate_json(blob)
|
| 314 |
+
|
| 315 |
+
# --- inbox ---
|
| 316 |
+
|
| 317 |
+
def get_unread_emails_sorted(self) -> list[Email]:
    """Return unread emails ordered by priority rank, then by email id.

    Priorities missing from ``_PRIORITY_RANK`` get rank 99 and sort last.
    """

    def sort_key(mail: Email) -> tuple[int, str]:
        return (_PRIORITY_RANK.get(mail.priority, 99), mail.id)

    pending = [mail for mail in self.world.emails if not mail.read]
    pending.sort(key=sort_key)
    return pending
|
| 324 |
+
|
| 325 |
+
def mark_email_read(self, email_id: str) -> bool:
    """Flag the first email with id ``email_id`` as read.

    The stored email is replaced by an updated copy. Returns ``True`` when a
    matching email was found, ``False`` otherwise.
    """
    inbox = self.world.emails
    for pos, mail in enumerate(inbox):
        if mail.id != email_id:
            continue
        inbox[pos] = mail.model_copy(update={"read": True})
        return True
    return False
|
| 331 |
+
|
| 332 |
+
def mark_email_replied(self, email_id: str) -> bool:
    """Flag the first email with id ``email_id`` as both read and replied.

    The stored email is replaced by an updated copy. Returns ``True`` when a
    matching email was found, ``False`` otherwise.
    """
    inbox = self.world.emails
    for pos, mail in enumerate(inbox):
        if mail.id != email_id:
            continue
        inbox[pos] = mail.model_copy(update={"read": True, "replied": True})
        return True
    return False
|
| 338 |
+
|
| 339 |
+
# --- calendar ---
|
| 340 |
+
|
| 341 |
+
def detect_meeting_conflicts(self) -> list[dict[str, Any]]:
    """List every pair of non-cancelled meetings whose time windows overlap.

    Returns:
        One dict per overlapping pair, in calendar order, with keys
        ``meeting_a`` / ``meeting_b`` (meeting ids) and ``overlap_start`` /
        ``overlap_end`` (ISO strings of the intersection). Windows that only
        touch end-to-start are not reported.
    """
    # Parse each active meeting's window exactly once; the pair loop below then
    # only compares datetimes instead of re-parsing strings for every pair.
    windows = [
        (m.id, _parse_dt(m.start), _meeting_end(m))
        for m in self.world.meetings
        if not m.cancelled
    ]
    out: list[dict[str, Any]] = []
    for i, (a_id, a_start, a_end) in enumerate(windows):
        for b_id, b_start, b_end in windows[i + 1 :]:
            if not _windows_overlap(a_start, a_end, b_start, b_end):
                continue
            out.append(
                {
                    "meeting_a": a_id,
                    "meeting_b": b_id,
                    "overlap_start": max(a_start, b_start).isoformat(),
                    "overlap_end": min(a_end, b_end).isoformat(),
                }
            )
    return out
|
| 362 |
+
|
| 363 |
+
def _reschedule_causes_overlap(self, meeting_id: str, new_start_iso: str) -> bool:
    """Check whether moving ``meeting_id`` to ``new_start_iso`` would clash.

    The meeting keeps its duration. It is compared against every other
    non-cancelled meeting. An unknown ``meeting_id`` is reported as a clash
    (returns ``True``).
    """
    target = next((m for m in self.world.meetings if m.id == meeting_id), None)
    if target is None:
        return True

    moved = target.model_copy(update={"start": new_start_iso})
    window_start = _parse_dt(moved.start)
    window_end = _meeting_end(moved)

    return any(
        _windows_overlap(window_start, window_end, _parse_dt(other.start), _meeting_end(other))
        for other in self.world.meetings
        if not other.cancelled and other.id != meeting_id
    )
|
| 376 |
+
|
| 377 |
+
def reschedule_meeting(self, meeting_id: str, new_start_iso: str) -> bool:
    """Move the first active meeting with id ``meeting_id`` to ``new_start_iso``.

    Cancelled meetings are never moved. On success the conflict list is
    rebuilt and ``True`` is returned; otherwise ``False``. No overlap check is
    performed here.
    """
    calendar = self.world.meetings
    for pos, meeting in enumerate(calendar):
        if meeting.id != meeting_id or meeting.cancelled:
            continue
        calendar[pos] = meeting.model_copy(update={"start": new_start_iso})
        self._rebuild_conflict_list()
        return True
    return False
|
| 384 |
+
|
| 385 |
+
def cancel_meeting(self, meeting_id: str) -> bool:
    """Mark the first meeting with id ``meeting_id`` as cancelled.

    On success the conflict list is rebuilt and ``True`` is returned (also
    when the meeting was already cancelled); ``False`` if no meeting matches.
    """
    calendar = self.world.meetings
    for pos, meeting in enumerate(calendar):
        if meeting.id != meeting_id:
            continue
        calendar[pos] = meeting.model_copy(update={"cancelled": True})
        self._rebuild_conflict_list()
        return True
    return False
|
| 392 |
+
|
| 393 |
+
def add_meeting(self, meeting: Meeting) -> None:
    """Append ``meeting`` to the calendar and rebuild the conflict list.

    No overlap or duplicate-id validation is performed here.
    """
    self.world.meetings.append(meeting)
    self._rebuild_conflict_list()
|
| 396 |
+
|
| 397 |
+
# --- contacts ---
|
| 398 |
+
|
| 399 |
+
def get_contact(self, name: str) -> Contact | None:
    """Return the first contact whose ``name`` matches exactly, or ``None``."""
    return next((person for person in self.world.contacts if person.name == name), None)
|
| 404 |
+
|
| 405 |
+
def update_contact_mood(self, name: str, mood: Mood) -> bool:
    """Set the mood of the first contact named ``name``.

    The stored contact is replaced by an updated copy. Returns ``True`` when a
    contact matched, ``False`` otherwise.
    """
    people = self.world.contacts
    for pos, person in enumerate(people):
        if person.name != name:
            continue
        people[pos] = person.model_copy(update={"mood": mood})
        return True
    return False
|
| 411 |
+
|
| 412 |
+
# --- tasks ---
|
| 413 |
+
|
| 414 |
+
def update_task_status(self, task_id: str, status: TaskStatus) -> bool:
    """Set the status of the first task with id ``task_id``.

    The stored task is replaced by an updated copy. Returns ``True`` when a
    task matched, ``False`` otherwise.
    """
    todo = self.world.tasks
    for pos, task in enumerate(todo):
        if task.id != task_id:
            continue
        todo[pos] = task.model_copy(update={"status": status})
        return True
    return False
|
| 420 |
+
|
| 421 |
+
def overdue_tasks_at(self, simulation_iso: str) -> list[Task]:
    """Return tasks that are not done and whose deadline is strictly before ``simulation_iso``."""
    cutoff = _parse_dt(simulation_iso)
    return [
        task
        for task in self.world.tasks
        if task.status != "done" and _parse_dt(task.deadline) < cutoff
    ]
|
| 430 |
+
|
| 431 |
+
def set_simulation_time(self, simulation_iso: str) -> None:
    """Set the world's simulation clock to ``simulation_iso``.

    Afterwards, tasks whose deadline is now in the past are re-flagged as
    overdue and the conflict list is rebuilt.
    """
    self.world.simulation_time = simulation_iso
    self._reapply_task_overdue_flags()
    self._rebuild_conflict_list()
|
| 435 |
+
|
| 436 |
+
# --- Phase 3 action execution ---
|
| 437 |
+
|
| 438 |
+
def _apply_action(self, action: GhostexecAction) -> bool:
    """Validate and execute one action against the world.

    Resets the last-step status fields, then dispatches on
    ``action.action_type``. Every validation failure goes through
    ``self._fail`` (which records the error and returns ``False``) instead of
    raising. That includes a ``new_time`` that is not a parseable ISO
    datetime, which previously surfaced as an exception from the overlap check.

    Args:
        action: The action to run.

    Returns:
        ``True`` if the action was applied, ``False`` if it was rejected.
    """
    self._last_step_ok = True
    self._last_step_error = None
    self._last_step_detail = ""
    at = action.action_type

    if at == "do_nothing":
        self._last_step_detail = "No action taken."
        return True

    if at == "reply_email":
        if not action.email_id:
            return self._fail("reply_email requires email_id")
        if not any(e.id == action.email_id for e in self.world.emails):
            return self._fail(f"Unknown email_id {action.email_id!r}")
        if not action.message_body.strip():
            return self._fail("reply_email requires non-empty message_body")
        self.mark_email_replied(action.email_id)
        self._last_step_detail = f"Replied to email {action.email_id}."
        return True

    if at == "archive_email":
        if not action.email_id:
            return self._fail("archive_email requires email_id")
        if not self.mark_email_read(action.email_id):
            return self._fail(f"Unknown email_id {action.email_id!r}")
        self._last_step_detail = f"Archived (read) email {action.email_id}."
        return True

    if at == "reschedule_meeting":
        if not action.meeting_id or not action.new_time:
            return self._fail("reschedule_meeting requires meeting_id and new_time")
        if not any(m.id == action.meeting_id for m in self.world.meetings):
            return self._fail(f"Unknown meeting_id {action.meeting_id!r}")
        # Reject unparseable timestamps up front: the overlap check below parses
        # new_time and would otherwise raise out of step().
        try:
            _parse_dt(action.new_time)
        except (TypeError, ValueError):
            return self._fail(
                f"Invalid new_time {action.new_time!r}; expected an ISO 8601 datetime."
            )
        if self._reschedule_causes_overlap(action.meeting_id, action.new_time):
            return self._fail("Target time overlaps another active meeting.")
        if not self.reschedule_meeting(action.meeting_id, action.new_time):
            return self._fail("Could not reschedule meeting.")
        self._last_step_detail = f"Rescheduled {action.meeting_id} to {action.new_time}."
        return True

    if at == "cancel_meeting":
        if not action.meeting_id:
            return self._fail("cancel_meeting requires meeting_id")
        if not any(m.id == action.meeting_id for m in self.world.meetings):
            return self._fail(f"Unknown meeting_id {action.meeting_id!r}")
        if not self.cancel_meeting(action.meeting_id):
            return self._fail("Could not cancel meeting.")
        reason = action.reason.strip() or "(no reason given)"
        self._world.action_log.append(f"cancelled {action.meeting_id}: {reason}")
        self._last_step_detail = f"Cancelled meeting {action.meeting_id}."
        return True

    if at == "complete_task":
        if not action.task_id:
            return self._fail("complete_task requires task_id")
        t = next((x for x in self.world.tasks if x.id == action.task_id), None)
        if not t:
            return self._fail(f"Unknown task_id {action.task_id!r}")
        if t.status == "done":
            return self._fail("Task is already done.")
        self.update_task_status(action.task_id, "done")
        self._last_step_detail = f"Completed task {action.task_id}."
        return True

    if at == "delegate_task":
        if not action.task_id or not action.contact_name.strip():
            return self._fail("delegate_task requires task_id and contact_name")
        if not any(t.id == action.task_id for t in self.world.tasks):
            return self._fail(f"Unknown task_id {action.task_id!r}")
        if not self.get_contact(action.contact_name.strip()):
            return self._fail(f"Unknown contact {action.contact_name.strip()!r}")
        # Delegation records the assignee and moves the task to in-progress.
        for i, t in enumerate(self.world.tasks):
            if t.id == action.task_id:
                self.world.tasks[i] = t.model_copy(
                    update={
                        "delegated_to": action.contact_name.strip(),
                        "status": "in-progress",
                    }
                )
                break
        self._last_step_detail = f"Delegated {action.task_id} to {action.contact_name.strip()}."
        return True

    if at == "send_message":
        name = action.contact_name.strip()
        if not name:
            return self._fail("send_message requires contact_name")
        if not self.get_contact(name):
            return self._fail(f"Unknown contact {name!r}")
        if not action.message_body.strip():
            return self._fail("send_message requires non-empty message_body")
        # Only the first 500 characters of the message are kept in the log.
        self._world.action_log.append(f"message to {name}: {action.message_body.strip()[:500]}")
        self._last_step_detail = f"Message sent to {name}."
        return True

    return self._fail(f"Unsupported action_type {at!r}")
|
| 535 |
+
|
| 536 |
+
def _fail(self, msg: str) -> bool:
    """Record a rejected action and return ``False``.

    Sets the last-step status to failed with ``msg`` as the error, clears the
    success detail, and appends ``"error: <msg>"`` to the world's action log.
    Always returns ``False`` so callers can write ``return self._fail(...)``.
    """
    self._last_step_ok = False
    self._last_step_error = msg
    self._last_step_detail = ""
    self._world.action_log.append(f"error: {msg}")
    return False
|
| 542 |
+
|
| 543 |
+
def _ensure_reward_log_dir(self) -> None:
    """Create the reward log's parent directory (and ancestors) if missing."""
    self._reward_log_path.parent.mkdir(parents=True, exist_ok=True)
|
| 545 |
+
|
| 546 |
+
def _append_reward_log(
    self,
    breakdown: RewardBreakdown,
    episode_done: bool,
    action: GhostexecAction,
) -> None:
    """Append one JSON line describing this step's reward to the reward log.

    The record holds the episode id and step number, the action type and its
    success flag, each component of ``breakdown``, and three world counters:
    overlapping meeting pairs, unreplied critical emails and overdue tasks.
    """
    self._ensure_reward_log_dir()
    world = self.world
    unreplied_critical = sum(
        1 for mail in world.emails if mail.priority == "critical" and not mail.replied
    )
    overdue_count = len(self.overdue_tasks_at(world.simulation_time))

    record = {
        "episode_id": self._state.episode_id,
        "step": self._state.step_count,
        "action_type": action.action_type,
        "step_ok": self._last_step_ok,
        "reward": breakdown.final,
        "conflict_raw": breakdown.conflict_raw,
        "critical_queue_bonus": breakdown.critical_queue_bonus,
        "conflict": breakdown.conflict,
        "relationship": breakdown.relationship,
        "task": breakdown.task,
        "weighted_base": breakdown.weighted_base,
        "output_scale": breakdown.output_scale,
        "invalid_step_adjustment": breakdown.invalid_step_adjustment,
        "episode_completion_bonus": breakdown.episode_completion_bonus,
        "catastrophic_penalty": breakdown.catastrophic_penalty,
        "episode_done": episode_done,
        "calendar_overlap_pairs": len(self.detect_meeting_conflicts()),
        "critical_unreplied": unreplied_critical,
        "overdue_tasks": overdue_count,
    }
    serialized = json.dumps(record) + "\n"
    with self._reward_log_path.open("a", encoding="utf-8") as log_file:
        log_file.write(serialized)
|
| 579 |
+
|
| 580 |
+
def _maybe_apply_schema_drift_events(self) -> None:
    """Apply every configured drift event whose ``after_step`` equals the current step.

    Supported event keys (any combination per event):
      * ``shift_all_meetings_hours``: shift every meeting start by N hours.
      * ``set_contact_preference``: ``{"name", "communication_preference"}``.
      * ``set_task_deadline``: ``{"task_id", "deadline"}``.
      * ``suppress_reply_relationship_for_senders``: list of sender names.
      * ``set_contact_mood``: ``{"name", "mood"}``; ignored unless the mood is
        one of the allowed values and the contact exists.

    Each applied change is recorded in the world's action log. The conflict
    list is rebuilt once at the end if at least one event matched. Does nothing
    when no world is loaded or no drift events are configured.
    """
    if not self._world or not self._drift_events:
        return
    step = self._state.step_count
    applied = False
    for ev in self._drift_events:
        if ev.get("after_step") != step:
            continue
        applied = True
        if "shift_all_meetings_hours" in ev:
            delta = int(ev["shift_all_meetings_hours"])
            for i, m in enumerate(self._world.meetings):
                shifted = _parse_dt(m.start) + timedelta(hours=delta)
                # Starts are stored without an offset and later read back as UTC,
                # so normalise to UTC before dropping tzinfo; otherwise a start
                # that carried a non-UTC offset would silently change instant.
                new_start = shifted.astimezone(timezone.utc).replace(tzinfo=None)
                self._world.meetings[i] = m.model_copy(
                    update={"start": new_start.isoformat(timespec="seconds")}
                )
            self._world.action_log.append(
                f"schema drift: shifted all meeting starts by {delta:+d} hour(s) (calendar TZ policy)."
            )
        pref = ev.get("set_contact_preference")
        if isinstance(pref, dict):
            name = str(pref.get("name", ""))
            comm = str(pref.get("communication_preference", "text"))
            for i, c in enumerate(self._world.contacts):
                if c.name == name:
                    self._world.contacts[i] = c.model_copy(
                        update={"communication_preference": comm}  # type: ignore[arg-type]
                    )
                    break
            # NOTE(review): this line is logged even when no contact matched.
            self._world.action_log.append(
                f"schema drift: contact {name!r} now prefers {comm} only (relationship channel change)."
            )
        td = ev.get("set_task_deadline")
        if isinstance(td, dict):
            tid = str(td.get("task_id", ""))
            dl = str(td.get("deadline", ""))
            for i, t in enumerate(self._world.tasks):
                if t.id == tid:
                    self._world.tasks[i] = t.model_copy(update={"deadline": dl})
                    break
            # NOTE(review): this line is logged even when no task matched.
            self._world.action_log.append(
                f"schema drift: task {tid!r} deadline moved earlier to {dl!r}."
            )
        for name in ev.get("suppress_reply_relationship_for_senders", []) or []:
            self._reply_relationship_suppressed.add(str(name))
            self._world.action_log.append(
                f"schema drift: replies to emails from {name!r} yield zero relationship score this episode."
            )
        scm = ev.get("set_contact_mood")
        if isinstance(scm, dict):
            cname = str(scm.get("name", ""))
            mood_raw = str(scm.get("mood", "neutral"))
            allowed: tuple[Mood, ...] = ("happy", "neutral", "annoyed", "angry", "furious")
            if cname and mood_raw in allowed and self.update_contact_mood(cname, mood_raw):
                self._world.action_log.append(
                    f"schema drift: stakeholder {cname!r} mood is now {mood_raw} (external pressure)."
                )
    if applied:
        self._rebuild_conflict_list()
|
| 637 |
+
|
| 638 |
+
# --- internals ---
|
| 639 |
+
|
| 640 |
+
def _reapply_task_overdue_flags(self) -> None:
    """Mark every unfinished task whose deadline precedes the simulation time as overdue.

    Tasks already done are left untouched; tasks already flagged overdue are
    not rewritten.
    """
    cutoff = _parse_dt(self.world.simulation_time)
    todo = self.world.tasks
    for pos, task in enumerate(todo):
        if task.status == "done":
            continue
        past_due = _parse_dt(task.deadline) < cutoff
        if past_due and task.status != "overdue":
            todo[pos] = task.model_copy(update={"status": "overdue"})
|
| 647 |
+
|
| 648 |
+
def _rebuild_conflict_list(self) -> None:
|
| 649 |
+
lines: list[str] = []
|
| 650 |
+
for row in self.detect_meeting_conflicts():
|
| 651 |
+
lines.append(
|
| 652 |
+
f"Calendar overlap: {row['meeting_a']} vs {row['meeting_b']} "
|
| 653 |
+
f"({row['overlap_start']} – {row['overlap_end']})"
|
| 654 |
+
)
|
| 655 |
+
for e in self.world.emails:
|
| 656 |
+
if e.priority == "critical" and not e.replied:
|
| 657 |
+
lines.append(f"Unanswered critical email {e.id}: {e.subject}")
|
| 658 |
+
bump = min(35, len(lines) * 2)
|
| 659 |
+
self.world.active_conflicts = lines
|
| 660 |
+
self.world.stress = min(100, self._base_stress + bump)
|
| 661 |
+
|
| 662 |
+
def _observation_from_briefing(
    self,
    briefing: str,
    reward: float,
    done: bool,
    reward_breakdown: RewardBreakdown | None = None,
) -> GhostexecObservation:
    """Package a briefing text and step bookkeeping into a ``GhostexecObservation``.

    The metadata dict carries world counters, the last step's ok/error/detail
    fields, and truncated id lists. When ``reward_breakdown`` is given, its
    ``model_dump()`` is attached under ``"reward_breakdown"``. Briefings longer
    than 48,000 characters are cut and terminated with an ellipsis.
    """
    world = self.world
    unread_listing = self.get_unread_emails_sorted()
    metadata: dict[str, Any] = dict(
        simulation_time=world.simulation_time,
        stress=world.stress,
        unread_email_count=len([mail for mail in world.emails if not mail.read]),
        calendar_conflict_pairs=len(self.detect_meeting_conflicts()),
        episode_step=self._state.step_count,
        max_episode_steps=world.max_episode_steps,
        episode_active=world.episode_active,
        episode_end_reason=world.episode_end_reason,
        step_ok=self._last_step_ok,
        step_error=self._last_step_error,
        step_detail=self._last_step_detail,
    )
    # Compact ids for remote trainers / Colab (briefing stays plain text).
    metadata["critical_unreplied_email_ids"] = [
        mail.id for mail in world.emails if mail.priority == "critical" and not mail.replied
    ][:12]
    metadata["unread_email_ids"] = [mail.id for mail in unread_listing[:15]]
    metadata["overdue_task_ids"] = [
        task.id for task in self.overdue_tasks_at(world.simulation_time)
    ][:12]
    metadata["active_meeting_ids"] = [
        meeting.id for meeting in world.meetings if not meeting.cancelled
    ][:20]
    if reward_breakdown is not None:
        metadata["reward_breakdown"] = reward_breakdown.model_dump()

    limit = 48_000
    if len(briefing) > limit:
        # Keep the total length at ``limit`` including the trailing ellipsis.
        text = briefing[: limit - 1] + "…"
    else:
        text = briefing
    return GhostexecObservation(
        echoed_message=text,
        message_length=len(text),
        done=done,
        reward=reward,
        metadata=metadata,
    )
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
def _fmt_meeting_line(m: Meeting) -> str:
    """Render a meeting as ``HH:MM: title (Nmin)`` using its parsed start time."""
    start_clock = _parse_dt(m.start).strftime('%H:%M')
    return f"{start_clock}: {m.title} ({m.duration_minutes}min)"
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv[core]>=0.2.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
server/reward.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Phase 4 reward: weighted (0.35 / 0.35 / 0.30) with potential-style deltas, critical-queue
|
| 9 |
+
shaping, full sub-scores even on invalid steps (+ explicit invalid penalty), and mild output
|
| 10 |
+
scaling.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
from datetime import datetime, timedelta, timezone
|
| 16 |
+
from typing import Any
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from ..models import GhostexecAction, RewardBreakdown, WorldState
|
| 20 |
+
except ImportError:
|
| 21 |
+
from models import GhostexecAction, RewardBreakdown, WorldState
|
| 22 |
+
|
| 23 |
+
W_CONFLICT = 0.35
|
| 24 |
+
W_REL = 0.35
|
| 25 |
+
W_TASK = 0.30
|
| 26 |
+
|
| 27 |
+
# Raw conflict units (pre-weight) are clamped to keep invalid / idle steps from exploding.
|
| 28 |
+
CONFLICT_RAW_CAP: float = 6.0
|
| 29 |
+
|
| 30 |
+
# Scales the weighted sum of the three channels (weights stay fixed per hackathon rules).
|
| 31 |
+
WEIGHTED_OUTPUT_SCALE: float = 0.48
|
| 32 |
+
|
| 33 |
+
# Tone misfit penalties kept small vs outcome terms (~<20% of a strong +2 conflict step after weights).
|
| 34 |
+
TONE_PENALTY_CASUAL_ANGRY_BOARD: float = 0.35
|
| 35 |
+
TONE_PENALTY_FORMAL_PERSONAL: float = 0.08
|
| 36 |
+
|
| 37 |
+
_RESOLVE_MICRO_BONUS: float = 0.12
|
| 38 |
+
_CRITICAL_PER_EMAIL_BONUS: float = 0.22
|
| 39 |
+
_RESCHEDULE_VALID_MICRO_BONUS: float = 0.10
|
| 40 |
+
_SEND_MESSAGE_VALID_MICRO_BONUS: float = 0.08
|
| 41 |
+
_COMPLETE_TASK_VALID_MICRO_BONUS: float = 0.06
|
| 42 |
+
_DELEGATE_TASK_VALID_MICRO_BONUS: float = 0.10
|
| 43 |
+
_DO_NOTHING_STRICT_PENALTY: float = -0.15
|
| 44 |
+
_REPLY_PRIORITY_MICRO_BONUS: dict[str, float] = {
|
| 45 |
+
"critical": 0.30,
|
| 46 |
+
"high": 0.15,
|
| 47 |
+
"normal": 0.04,
|
| 48 |
+
"low": 0.02,
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
_MOOD_RANK: dict[str, int] = {
|
| 52 |
+
"happy": 4,
|
| 53 |
+
"neutral": 3,
|
| 54 |
+
"annoyed": 2,
|
| 55 |
+
"angry": 1,
|
| 56 |
+
"furious": 0,
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _parse_dt(value: str) -> datetime:
|
| 61 |
+
if value.endswith("Z"):
|
| 62 |
+
return datetime.fromisoformat(value[:-1]).replace(tzinfo=timezone.utc)
|
| 63 |
+
dt = datetime.fromisoformat(value)
|
| 64 |
+
if dt.tzinfo is None:
|
| 65 |
+
return dt.replace(tzinfo=timezone.utc)
|
| 66 |
+
return dt
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _meeting_end(m: Any) -> datetime:
    """Return the meeting's end time: parsed ``start`` plus ``duration_minutes``."""
    length = timedelta(minutes=m.duration_minutes)
    return _parse_dt(m.start) + length
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _overlap(a0: datetime, a1: datetime, b0: datetime, b1: datetime) -> bool:
|
| 75 |
+
return a0 < b1 and b0 < a1
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def meeting_conflicts(world: WorldState) -> list[dict[str, Any]]:
    """List every pair of non-cancelled meetings whose time spans overlap.

    Each row holds the two meeting ids (``meeting_a`` comes first in
    ``world.meetings``) and the ISO-formatted start and end of the shared window.
    """
    live = [meeting for meeting in world.meetings if not meeting.cancelled]
    spans = [(meeting.id, _parse_dt(meeting.start), _meeting_end(meeting)) for meeting in live]
    rows: list[dict[str, Any]] = []
    for position, (first_id, first_start, first_end) in enumerate(spans):
        # Only look at later meetings so each unordered pair is reported once.
        for second_id, second_start, second_end in spans[position + 1 :]:
            if not _overlap(first_start, first_end, second_start, second_end):
                continue
            window_start = max(first_start, second_start)
            window_end = min(first_end, second_end)
            rows.append(
                {
                    "meeting_a": first_id,
                    "meeting_b": second_id,
                    "overlap_start": window_start.isoformat(),
                    "overlap_end": window_end.isoformat(),
                }
            )
    return rows
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _pair_set(rows: list[dict[str, Any]]) -> set[frozenset[str]]:
|
| 99 |
+
return {frozenset((r["meeting_a"], r["meeting_b"])) for r in rows}
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _attendee_moods_ok(world: WorldState, pair: frozenset[str]) -> bool:
|
| 103 |
+
names: set[str] = set()
|
| 104 |
+
for mid in pair:
|
| 105 |
+
m = next((x for x in world.meetings if x.id == mid), None)
|
| 106 |
+
if m:
|
| 107 |
+
names.update(m.attendees)
|
| 108 |
+
for n in names:
|
| 109 |
+
c = next((x for x in world.contacts if x.name == n), None)
|
| 110 |
+
if c is None:
|
| 111 |
+
continue
|
| 112 |
+
if c.mood not in ("happy", "neutral"):
|
| 113 |
+
return False
|
| 114 |
+
return True
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def score_conflict_resolution(
    before: WorldState,
    after: WorldState,
    action: GhostexecAction,
    *,
    action_ok: bool,
) -> float:
    """Score how the set of calendar conflicts changed across one step.

    Each conflict pair present before but not after earns
    ``2.0 + _RESOLVE_MICRO_BONUS``, plus ``1.0`` when its attendees' moods are
    acceptable afterwards. Each newly introduced pair costs ``3.0``. A valid
    ``reschedule_meeting`` action adds a small fixed bonus.
    """
    pairs_before = _pair_set(meeting_conflicts(before))
    pairs_after = _pair_set(meeting_conflicts(after))
    resolved = pairs_before - pairs_after
    introduced = pairs_after - pairs_before

    total = 0.0
    for pair in resolved:
        total += 2.0 + _RESOLVE_MICRO_BONUS
        if _attendee_moods_ok(after, pair):
            total += 1.0
    for _unused in introduced:
        total -= 3.0

    valid_reschedule = action_ok and action.action_type == "reschedule_meeting"
    if valid_reschedule:
        total += _RESCHEDULE_VALID_MICRO_BONUS
    return total
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def critical_unreplied_count(world: WorldState) -> int:
    """Count emails with priority ``"critical"`` that have not been replied to."""
    pending = [mail for mail in world.emails if mail.priority == "critical" and not mail.replied]
    return len(pending)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def score_critical_queue_bonus(before: WorldState, after: WorldState) -> float:
    """Reward each critical email cleared from the unreplied queue during the step.

    A growing queue yields zero rather than a negative value.
    """
    cleared = critical_unreplied_count(before) - critical_unreplied_count(after)
    if cleared < 0:
        cleared = 0
    return _CRITICAL_PER_EMAIL_BONUS * cleared
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _classify_tone(text: str) -> str:
|
| 148 |
+
t = text.lower()
|
| 149 |
+
if any(w in t for w in ("sorry", "apologize", "apologies", "my mistake")):
|
| 150 |
+
return "apologetic"
|
| 151 |
+
if any(w in t for w in ("dear ", "sincerely", "best regards", "respectfully", "cordially")):
|
| 152 |
+
return "formal"
|
| 153 |
+
if any(w in t for w in ("hey", "lol", "haha", "👋", "no worries", "cheers")):
|
| 154 |
+
return "casual"
|
| 155 |
+
if any(w in t for w in ("must", "immediately", "asap", "non-negotiable", "demand")):
|
| 156 |
+
return "assertive"
|
| 157 |
+
return "neutral"
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def score_relationship(
    before: WorldState,
    after: WorldState,
    action: GhostexecAction,
    *,
    action_ok: bool,
    relationship_suppressed_for_email_to: frozenset[str] | None = None,
) -> float:
    """Score relationship effects of one step.

    Three parts are summed:

    * Mood deltas for contacts present both before and after: an improvement
      earns ``3.0`` for importance >= 4 contacts and ``1.0`` otherwise; a
      decline costs ``4.0`` / ``2.0`` respectively.
    * For ``reply_email`` actions targeting an email that exists in ``before``:
      a priority-based micro bonus (doubled for ``"VIP"`` senders) when the
      step was valid and the body is non-blank, minus tone-misfit penalties.
    * A small bonus for a valid ``send_message`` to a known contact with a
      non-blank body.

    If the replied-to email's sender is in
    ``relationship_suppressed_for_email_to`` the whole score is ``0.0``.

    The message body is normalized to ``""`` once, so a ``None`` body cannot
    reach the tone classifier (the original guarded it only in the bonus check).
    """
    suppressed = relationship_suppressed_for_email_to or frozenset()
    s = 0.0

    before_map = {c.name: c for c in before.contacts}
    after_map = {c.name: c for c in after.contacts}
    for name, ca in after_map.items():
        cb = before_map.get(name)
        if not cb:
            continue
        ra, rb = _MOOD_RANK[ca.mood], _MOOD_RANK[cb.mood]
        vip = ca.importance >= 4
        if ra > rb:
            s += 3.0 if vip else 1.0
        elif ra < rb:
            s -= 4.0 if vip else 2.0

    if action.action_type == "reply_email" and action.email_id:
        em = next((e for e in before.emails if e.id == action.email_id), None)
        if em and em.sender in suppressed:
            # Suppression zeroes the entire relationship channel for this step.
            return 0.0
        if em:
            body = action.message_body or ""
            if action_ok and body.strip():
                pri = (em.priority or "").lower()
                micro = _REPLY_PRIORITY_MICRO_BONUS.get(pri, 0.0)
                if em.sender_relationship == "VIP":
                    micro *= 2.0
                s += micro
            # NOTE(review): tone penalties are applied for any reply whose target
            # email exists, independent of action_ok — confirm intended nesting.
            tone = _classify_tone(body)
            contact = next((c for c in before.contacts if c.name == em.sender), None)
            if (
                contact
                and contact.relationship_type == "board_member"
                and contact.mood in ("angry", "furious", "annoyed")
                and tone == "casual"
            ):
                s -= TONE_PENALTY_CASUAL_ANGRY_BOARD
            if em.sender_relationship == "personal" and tone == "formal":
                s -= TONE_PENALTY_FORMAL_PERSONAL

    if action_ok and action.action_type == "send_message" and action.contact_name:
        known_contact = any(c.name == action.contact_name for c in before.contacts)
        if known_contact and (action.message_body or "").strip():
            s += _SEND_MESSAGE_VALID_MICRO_BONUS
    return s
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def _overdue_tasks(world: WorldState) -> list[Any]:
    """Return tasks that are not done and whose deadline is before the simulation time."""
    clock = _parse_dt(world.simulation_time)
    return [
        task
        for task in world.tasks
        if task.status != "done" and _parse_dt(task.deadline) < clock
    ]
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def score_task_completion(
    before: WorldState,
    after: WorldState,
    action: GhostexecAction,
    *,
    action_ok: bool,
) -> float:
    """Score task-state changes across one step.

    Per task present in both snapshots: ``-2.0`` when it newly became overdue;
    ``+2.0`` when completed on or before its original deadline (``+0.5`` when
    completed late); ``+1.0`` when newly delegated to a contact whose
    importance is at most 3. Valid ``complete_task`` / ``delegate_task``
    actions add a small fixed bonus each.
    """
    total = 0.0
    clock = _parse_dt(after.simulation_time)

    earlier = {task.id: task for task in before.tasks}
    later = {task.id: task for task in after.tasks}
    for task_id, current in later.items():
        previous = earlier.get(task_id)
        if not previous:
            continue

        was_unfinished = previous.status != "done"
        if was_unfinished and previous.status != "overdue" and current.status == "overdue":
            total -= 2.0
        if was_unfinished and current.status == "done":
            # Timeliness is judged against the deadline the task had before the step.
            on_time = _parse_dt(previous.deadline) >= clock
            total += 2.0 if on_time else 0.5
        if (not previous.delegated_to) and current.delegated_to:
            delegate = next(
                (c for c in after.contacts if c.name == current.delegated_to), None
            )
            if delegate and delegate.importance <= 3:
                total += 1.0

    if action_ok and action.action_type == "complete_task":
        total += _COMPLETE_TASK_VALID_MICRO_BONUS
    if action_ok and action.action_type == "delegate_task":
        total += _DELEGATE_TASK_VALID_MICRO_BONUS
    return total
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def catastrophic(world: WorldState) -> bool:
    """Return True when the world is in a catastrophic state.

    That is: some contact with importance >= 4 is ``"furious"``, or more than
    three critical emails remain unreplied.
    """
    furious_vip = False
    for contact in world.contacts:
        if contact.importance >= 4 and contact.mood == "furious":
            furious_vip = True
            break
    open_critical = len(
        [mail for mail in world.emails if mail.priority == "critical" and not mail.replied]
    )
    return furious_vip or open_critical > 3
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def aggregate_scores(
    conflict: float,
    relationship: float,
    task: float,
    *,
    conflict_raw: float,
    critical_queue_bonus: float,
    weighted_inner: float,
    action_ok: bool,
    episode_done: bool,
    world_after: WorldState,
) -> RewardBreakdown:
    """Combine channel scores and step/episode adjustments into a ``RewardBreakdown``.

    ``final`` is the scaled weighted sum plus the invalid-step adjustment
    (``-0.25`` when ``action_ok`` is False), the end-of-episode bonus
    (``+10.0`` when stress is below 40) and the end-of-episode catastrophic
    penalty (``-15.0``). ``do_nothing_floor`` is always written as ``0.0`` here.
    """
    weighted = WEIGHTED_OUTPUT_SCALE * weighted_inner
    invalid_adjustment = 0.0 if action_ok else -0.25

    completion_bonus = 0.0
    catastrophe_penalty = 0.0
    if episode_done:
        # Terminal adjustments are only evaluated on the final step.
        if world_after.stress < 40:
            completion_bonus = 10.0
        if catastrophic(world_after):
            catastrophe_penalty = -15.0

    total = weighted + invalid_adjustment + completion_bonus + catastrophe_penalty
    return RewardBreakdown(
        conflict_raw=conflict_raw,
        critical_queue_bonus=critical_queue_bonus,
        conflict=conflict,
        relationship=relationship,
        task=task,
        weighted_base=weighted,
        output_scale=WEIGHTED_OUTPUT_SCALE,
        invalid_step_adjustment=invalid_adjustment,
        episode_completion_bonus=completion_bonus,
        catastrophic_penalty=catastrophe_penalty,
        do_nothing_floor=0.0,
        final=total,
    )
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
def apply_do_nothing_penalty_floor(
    action: GhostexecAction,
    breakdown: RewardBreakdown,
) -> RewardBreakdown:
    """Apply the fixed idle penalty to ``do_nothing`` steps.

    Any other action type returns ``breakdown`` unchanged. For ``do_nothing``
    a copy is returned with ``do_nothing_floor`` set to the penalty and
    ``final`` shifted by the same amount.
    """
    if action.action_type == "do_nothing":
        penalty = _DO_NOTHING_STRICT_PENALTY
        adjusted_final = breakdown.final + penalty
        return breakdown.model_copy(
            update={"do_nothing_floor": penalty, "final": adjusted_final},
        )
    return breakdown
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def compute_step_reward(
    before: WorldState,
    after: WorldState,
    action: GhostexecAction,
    *,
    action_ok: bool,
    episode_done: bool,
    relationship_suppressed_for_email_to: frozenset[str] | None = None,
) -> RewardBreakdown:
    """Compute the full reward breakdown for one environment step.

    The conflict channel is the conflict-resolution score plus the critical
    queue bonus, clamped to ``±CONFLICT_RAW_CAP``. It is combined with the
    relationship and task channels using the fixed weights, aggregated, and
    finally passed through the do-nothing penalty.
    """
    conflict_core = score_conflict_resolution(before, after, action, action_ok=action_ok)
    queue_bonus = score_critical_queue_bonus(before, after)
    conflict_unclamped = conflict_core + queue_bonus
    conflict_score = max(-CONFLICT_RAW_CAP, min(CONFLICT_RAW_CAP, conflict_unclamped))

    relationship_score = score_relationship(
        before,
        after,
        action,
        action_ok=action_ok,
        relationship_suppressed_for_email_to=relationship_suppressed_for_email_to,
    )
    task_score = score_task_completion(before, after, action, action_ok=action_ok)

    weighted_inner = (
        W_CONFLICT * conflict_score + W_REL * relationship_score + W_TASK * task_score
    )
    breakdown = aggregate_scores(
        conflict_score,
        relationship_score,
        task_score,
        conflict_raw=conflict_unclamped,
        critical_queue_bonus=queue_bonus,
        weighted_inner=weighted_inner,
        action_ok=action_ok,
        episode_done=episode_done,
        world_after=after,
    )
    return apply_do_nothing_penalty_floor(action, breakdown)
|
tests/test_api_reward_dead_500.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hard API dead-test: 500+ calls with reward-consistency checks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
from fastapi.testclient import TestClient
|
| 9 |
+
|
| 10 |
+
from ghostexec.server.app import app
|
| 11 |
+
|
| 12 |
+
W_CONFLICT = 0.35
|
| 13 |
+
W_REL = 0.35
|
| 14 |
+
W_TASK = 0.30
|
| 15 |
+
OUTPUT_SCALE = 0.48
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _step_payload_for(i: int) -> dict[str, Any]:
|
| 19 |
+
templates: list[dict[str, Any]] = [
|
| 20 |
+
{"action": {"action_type": "do_nothing"}},
|
| 21 |
+
{"action": {"action_type": "reply_email", "email_id": "e01", "message_body": "On it now."}},
|
| 22 |
+
{"action": {"action_type": "reply_email", "email_id": "e14", "message_body": "Acknowledged."}},
|
| 23 |
+
{"action": {"action_type": "reply_email", "email_id": "nope_999", "message_body": "x"}},
|
| 24 |
+
{"action": {"action_type": "archive_email", "email_id": "e09"}},
|
| 25 |
+
{"action": {"action_type": "archive_email", "email_id": "bad_id"}},
|
| 26 |
+
{
|
| 27 |
+
"action": {
|
| 28 |
+
"action_type": "reschedule_meeting",
|
| 29 |
+
"meeting_id": "m02",
|
| 30 |
+
"new_time": "2026-04-21T18:00:00",
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"action": {
|
| 35 |
+
"action_type": "reschedule_meeting",
|
| 36 |
+
"meeting_id": "m03",
|
| 37 |
+
"new_time": "2026-04-21T09:30:00", # overlap -> invalid semantic
|
| 38 |
+
}
|
| 39 |
+
},
|
| 40 |
+
{"action": {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "dead test"}},
|
| 41 |
+
{"action": {"action_type": "cancel_meeting", "meeting_id": "m99", "reason": "dead test"}},
|
| 42 |
+
{"action": {"action_type": "complete_task", "task_id": "t07"}},
|
| 43 |
+
{"action": {"action_type": "complete_task", "task_id": "t09"}}, # already done
|
| 44 |
+
{"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Jordan Lee"}},
|
| 45 |
+
{"action": {"action_type": "delegate_task", "task_id": "t08", "contact_name": "Nobody"}},
|
| 46 |
+
{
|
| 47 |
+
"action": {
|
| 48 |
+
"action_type": "send_message",
|
| 49 |
+
"contact_name": "Jamie Liu",
|
| 50 |
+
"message_body": "Quick sync please.",
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"action": {
|
| 55 |
+
"action_type": "send_message",
|
| 56 |
+
"contact_name": "Nobody",
|
| 57 |
+
"message_body": "hello",
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
]
|
| 61 |
+
return templates[i % len(templates)]
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@pytest.fixture(scope="module")
def client() -> TestClient:
    """Module-scoped ASGI test client that re-raises server-side exceptions."""
    test_client = TestClient(app, raise_server_exceptions=True)
    return test_client
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_api_surface_all_endpoints(client: TestClient) -> None:
    """Probe every HTTP route once: GET endpoints, method contracts, reset bodies, MCP."""
    # Core GET endpoints.
    readable_paths = (
        "/health",
        "/metadata",
        "/state",
        "/schema",
        "/openapi.json",
        "/docs",
        "/redoc",
    )
    for path in readable_paths:
        response = client.get(path)
        assert response.status_code == 200, f"{path} -> {response.status_code}"

    # Control routes: method contracts.
    assert client.get("/reset").status_code == 405
    assert client.get("/step").status_code == 405
    assert client.put("/reset", json={}).status_code in (405, 422)
    assert client.get("/this-path-should-not-exist-ghostexec").status_code == 404

    # Reset variants.
    reset_bodies = (
        {},
        {"seed": 42},
        {"episode_id": "dead-api-001"},
        {"seed": 1, "future_field": True},
    )
    for reset_body in reset_bodies:
        reset_response = client.post("/reset", json=reset_body)
        assert reset_response.status_code == 200
        parsed = reset_response.json()
        assert "observation" in parsed and "done" in parsed

    # MCP endpoint variants.
    tools_list_request = {"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}}
    mcp_ok = client.post("/mcp", json=tools_list_request)
    assert mcp_ok.status_code == 200
    # Malformed JSON is still answered with HTTP 200.
    mcp_bad_json = client.post("/mcp", content="{", headers={"Content-Type": "application/json"})
    assert mcp_bad_json.status_code == 200
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
@pytest.mark.parametrize("idx", range(520))
def test_api_reward_dead_520_cases(client: TestClient, idx: int) -> None:
    """Reset, take one templated step, and check the reward breakdown is self-consistent."""
    # Keep each case independent and deterministic.
    reset_response = client.post("/reset", json={"episode_id": f"dead-{idx:04d}", "seed": 42})
    assert reset_response.status_code == 200

    payload = _step_payload_for(idx)
    step_response = client.post("/step", json=payload)
    assert step_response.status_code == 200, (
        f"idx={idx} payload={payload} status={step_response.status_code}"
    )

    body = step_response.json()
    assert "observation" in body and "reward" in body and "done" in body
    obs = body["observation"]
    meta = obs.get("metadata") or {}
    bd = meta.get("reward_breakdown") or {}

    # Structural contracts.
    assert isinstance(obs.get("echoed_message", ""), str) and obs.get("echoed_message")
    for required_meta_key in ("step_ok", "step_detail"):
        assert required_meta_key in meta
    for required_breakdown_key in ("final", "weighted_base"):
        assert required_breakdown_key in bd

    # Reward identity: top-level reward must equal breakdown.final.
    reward = float(body["reward"])
    final = float(bd["final"])
    assert reward == pytest.approx(final, abs=1e-9)

    # Aggregation formula must hold exactly (within floating tolerance).
    conflict = float(bd.get("conflict", 0.0))
    relationship = float(bd.get("relationship", 0.0))
    task = float(bd.get("task", 0.0))
    weighted_inner = W_CONFLICT * conflict + W_REL * relationship + W_TASK * task
    expected_weighted = OUTPUT_SCALE * weighted_inner
    assert float(bd["weighted_base"]) == pytest.approx(expected_weighted, abs=1e-9)

    expected_final = (
        float(bd.get("weighted_base", 0.0))
        + float(bd.get("invalid_step_adjustment", 0.0))
        + float(bd.get("episode_completion_bonus", 0.0))
        + float(bd.get("catastrophic_penalty", 0.0))
        + float(bd.get("do_nothing_floor", 0.0))
    )
    assert final == pytest.approx(expected_final, abs=1e-9)

    # Idle steps carry the fixed floor and must end up negative.
    if payload["action"]["action_type"] == "do_nothing":
        assert float(bd.get("do_nothing_floor", 0.0)) == pytest.approx(-0.15, abs=1e-12)
        assert reward < 0

    # Rejected steps carry the fixed invalid-step adjustment.
    if meta.get("step_ok") is False:
        assert float(bd.get("invalid_step_adjustment", 0.0)) == pytest.approx(-0.25, abs=1e-12)
|
| 150 |
+
|
tests/test_complete_integration.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# End-to-end stack test: FastAPI/OpenEnv HTTP + WebSocket, GhostExec env,
|
| 4 |
+
# and (optionally) GhostexecEnv client over ASGI TestClient.
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import shutil
|
| 11 |
+
import subprocess
|
| 12 |
+
import sys
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import pytest
|
| 15 |
+
from fastapi.testclient import TestClient
|
| 16 |
+
|
| 17 |
+
from ghostexec.models import GhostexecAction
|
| 18 |
+
from ghostexec.server.app import app
|
| 19 |
+
from ghostexec.server.ghostexec_environment import GhostexecEnvironment
|
| 20 |
+
|
| 21 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 22 |
+
SCENARIO = ROOT / "scenarios" / "phase2_core.json"
|
| 23 |
+
MONDAY = ROOT / "scenarios" / "monday_morning.json"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _http_paths(client: TestClient) -> set[str]:
    """Collect the non-empty string ``path`` of every route registered on ``app``.

    ``client`` is accepted but not used; routes are read from ``app.routes``.
    """
    candidates = (getattr(route, "path", None) for route in app.routes)
    return {path for path in candidates if isinstance(path, str) and path}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_server_app_import_matches_uvicorn_server_string() -> None:
    """`uvicorn server.app:app` loads `server.app` with cwd on path (no `ghostexec.` prefix)."""
    probe = "import server.app; assert server.app.app is not None"
    completed = subprocess.run(
        [sys.executable, "-c", probe],
        cwd=str(ROOT),
        check=False,
    )
    assert completed.returncode == 0, "import server.app must work from ghostexec repo root"
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_openapi_docs_and_schema_discovery() -> None:
    """The OpenAPI spec is served with paths, and both doc UIs render a non-trivial page."""
    with TestClient(app, raise_server_exceptions=True) as client:
        spec_response = client.get("/openapi.json")
        assert spec_response.status_code == 200
        spec = spec_response.json()
        assert spec.get("openapi")
        assert "paths" in spec and spec["paths"]

        for docs_path in ("/docs", "/redoc"):
            page = client.get(docs_path)
            assert page.status_code == 200
            assert len(page.text) > 100
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def test_openapi_examples_match_ghostexec_observation_shape() -> None:
    """The 200-response examples for /reset and /step use the Ghostexec observation fields."""
    spec = app.openapi()
    for route in ("/reset", "/step"):
        ok_response = spec["paths"][route]["post"]["responses"]["200"]
        example = ok_response["content"]["application/json"]["example"]
        observation = example["observation"]
        assert "echoed_message" in observation and "message_length" in observation
        assert "status" not in observation and "data" not in observation
        assert "reward" in example and "done" in example
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_openapi_info_documents_http_vs_websocket_episode() -> None:
    """Runtime-visible API docs: HTTP reset/step are not one persistent episode; /ws is."""
    info = app.openapi().get("info", {})
    description = info.get("description") or ""
    assert "Ghostexec / OpenEnv HTTP" in description
    assert "/ws" in description and "WebSocket" in description
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_all_registered_get_post_routes_smoke() -> None:
    """Smoke every stable OpenEnv HTTP route (simulation mode, no Gradio /web)."""
    expected_routes = (
        "/health",
        "/metadata",
        "/schema",
        "/state",
        "/reset",
        "/step",
        "/ws",
        "/mcp",
    )
    with TestClient(app, raise_server_exceptions=True) as client:
        registered = _http_paths(client)
        for route in expected_routes:
            assert route in registered

        health = client.get("/health")
        assert health.status_code == 200
        assert health.json().get("status") == "healthy"

        metadata_response = client.get("/metadata")
        assert metadata_response.status_code == 200
        metadata = metadata_response.json()
        assert metadata.get("name") in ("ghostexec", "GhostexecEnvironment")
        assert "description" in metadata

        state_response = client.get("/state")
        assert state_response.status_code == 200
        assert "step_count" in state_response.json()

        schema_response = client.get("/schema")
        assert schema_response.status_code == 200
        schema = schema_response.json()
        assert "action" in schema and "observation" in schema and "state" in schema
        assert schema["action"].get("title") or schema["action"].get("properties")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def test_http_reset_and_step_return_valid_payloads() -> None:
    """Verify the payload shape of stateless ``POST /reset`` and ``POST /step``.

    HTTP is stateless by OpenEnv design: every request builds a fresh
    environment, so ``/step`` on a new instance loads the scenario first and
    then applies the action (a primed reset).
    """
    reply_action = {
        "action_type": "reply_email",
        "email_id": "e05",
        "message_body": "On it.",
    }
    with TestClient(app, raise_server_exceptions=True) as client:
        reset_response = client.post("/reset", json={})
        assert reset_response.status_code == 200
        reset_payload = reset_response.json()
        assert "observation" in reset_payload
        briefing = reset_payload["observation"]
        assert "echoed_message" in briefing
        assert "GHOSTEXEC BRIEFING" in (briefing.get("echoed_message") or "")

        step_response = client.post("/step", json={"action": reply_action})
        assert step_response.status_code == 200
        step_payload = step_response.json()
        assert "observation" in step_payload
        # The reward may be reported at the top level or inside the observation.
        assert not (
            step_payload.get("reward") is None
            and step_payload["observation"].get("reward") is None
        )
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def test_http_step_invalid_action_422() -> None:
    """A ``/step`` body whose ``action`` is a string must be rejected with 422."""
    malformed_body = {"action": "not-an-object"}
    with TestClient(app, raise_server_exceptions=True) as client:
        response = client.post("/step", json=malformed_body)
        assert response.status_code == 422
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def test_mcp_jsonrpc_tools_list() -> None:
    """``POST /mcp`` with a JSON-RPC ``tools/list`` call answers 200 with a result or error."""
    rpc_request = {"jsonrpc": "2.0", "id": 1, "method": "tools/list", "params": {}}
    with TestClient(app, raise_server_exceptions=True) as client:
        response = client.post("/mcp", json=rpc_request)
        assert response.status_code == 200
        rpc_reply = response.json()
        assert "result" in rpc_reply or "error" in rpc_reply
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def test_websocket_full_episode_reset_step_state_close() -> None:
    """Drive one episode over ``/ws``: reset, a single step, a state query, then close."""
    reschedule = {
        "action_type": "reschedule_meeting",
        "meeting_id": "m02",
        "new_time": "2026-04-21T18:00:00",
    }
    with TestClient(app, raise_server_exceptions=True) as client:
        with client.websocket_connect("/ws") as socket:
            # Reset: the reply wraps the briefing observation.
            socket.send_json({"type": "reset", "data": {}})
            reset_msg = socket.receive_json()
            assert reset_msg.get("type") == "observation"
            reset_data = reset_msg.get("data") or {}
            assert "observation" in reset_data
            briefing = reset_data["observation"]
            assert "echoed_message" in briefing
            assert "GHOSTEXEC BRIEFING" in briefing.get("echoed_message", "")

            # Step: the reply must carry a reward.
            socket.send_json({"type": "step", "data": reschedule})
            step_msg = socket.receive_json()
            assert step_msg.get("type") == "observation"
            step_data = step_msg.get("data") or {}
            assert step_data.get("reward") is not None

            # State: the step counter reflects the step taken on this connection.
            socket.send_json({"type": "state"})
            state_msg = socket.receive_json()
            assert state_msg.get("type") == "state", state_msg
            state_data = state_msg.get("data") or {}
            assert state_data.get("step_count", 0) >= 1

            socket.send_json({"type": "close", "data": {}})
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def test_inprocess_env_matches_ws_briefing_shape() -> None:
    """Reset and step an in-process environment and check briefing, reward and ``step_ok``."""
    environment = GhostexecEnvironment(SCENARIO)
    first_observation = environment.reset()
    assert "BRIEFING" in first_observation.echoed_message

    move_meeting = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    after_step = environment.step(move_meeting)
    assert after_step.reward is not None
    assert after_step.metadata.get("step_ok") is True
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def test_monday_morning_scenario_reward_signal() -> None:
    """The ``MONDAY`` scenario file exists and a ``do_nothing`` step yields a float reward."""
    assert MONDAY.is_file()
    environment = GhostexecEnvironment(MONDAY)
    environment.reset()
    idle_step = environment.step(GhostexecAction(action_type="do_nothing"))
    reward = idle_step.reward
    assert isinstance(reward, float)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def test_ghostexec_env_client_against_live_url_if_set() -> None:
    """Exercise the ``GhostexecEnv`` client against a live server, when one is configured.

    GhostexecEnv opens a real TCP WebSocket, while Starlette's TestClient uses
    the host ``testserver``, which does not resolve on some platforms. The test
    therefore runs only when ``GHOSTEXEC_WS_BASE_URL`` points at a live server
    (e.g. a local uvicorn) and is skipped otherwise.
    """
    base_url = os.environ.get("GHOSTEXEC_WS_BASE_URL", "").strip().rstrip("/")
    if base_url == "":
        pytest.skip("Set GHOSTEXEC_WS_BASE_URL (e.g. http://127.0.0.1:8000) to test GhostexecEnv client.")

    from ghostexec.client import GhostexecEnv

    session = GhostexecEnv(base_url=base_url).sync()
    with session:
        reset_result = session.reset()
        assert reset_result.observation.echoed_message
        step_result = session.step(GhostexecAction(action_type="do_nothing"))
        assert step_result.observation.echoed_message
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
|
tests/test_docker_build.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Opt-in Docker build smoke test for Phase 1 deployment readiness."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import shutil
|
| 7 |
+
import subprocess
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import pytest
|
| 11 |
+
|
| 12 |
+
# Directory one level above the folder containing this file; used as the cwd for docker commands.
ROOT = Path(__file__).resolve().parents[1]
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@pytest.mark.skipif(
    not (shutil.which("docker") is not None and os.environ.get("GHOSTEXEC_RUN_DOCKER_BUILD") == "1"),
    reason="Set GHOSTEXEC_RUN_DOCKER_BUILD=1 and ensure docker is installed to run this test.",
)
def test_server_dockerfile_builds():
    """Build the image from ``ROOT`` and confirm the resulting tag can be inspected.

    Skipped at collection time unless the ``docker`` executable is on PATH and
    ``GHOSTEXEC_RUN_DOCKER_BUILD`` equals ``"1"``; skipped at run time when
    ``docker version`` exits non-zero.
    """

    def run_docker(arguments, timeout_seconds):
        # Every docker call shares cwd and captures text output; failures are
        # checked by the caller through returncode rather than raised.
        return subprocess.run(
            ["docker", *arguments],
            cwd=str(ROOT),
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            check=False,
        )

    if run_docker(["version"], 60).returncode != 0:
        pytest.skip("Docker daemon is unavailable on this machine.")

    tag = "ghostexec-env:ci"

    build = run_docker(["build", "-t", tag, "."], 900)
    assert build.returncode == 0, (
        "docker build failed\n"
        f"stdout:\n{build.stdout}\n"
        f"stderr:\n{build.stderr}\n"
    )

    inspection = run_docker(["image", "inspect", tag], 120)
    assert inspection.returncode == 0, (
        f"image inspect failed for {tag}\n"
        f"stdout:\n{inspection.stdout}\n"
        f"stderr:\n{inspection.stderr}\n"
    )
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv Phase 2 submission guardrails (graders + manifest wiring)."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import importlib
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import pytest
|
| 9 |
+
|
| 10 |
+
# Put the directory above this file's folder first on sys.path
# (presumably so the `graders` import below resolves — confirm against the package layout).
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 11 |
+
|
| 12 |
+
from graders import (
|
| 13 |
+
dinner_disaster_grader,
|
| 14 |
+
monday_morning_grader,
|
| 15 |
+
phase2_core_grader,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
# Grader callables that the bounds test below is parametrized over.
PUBLIC_GRADERS = (phase2_core_grader, monday_morning_grader, dinner_disaster_grader)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@pytest.mark.parametrize("grader", PUBLIC_GRADERS)
def test_public_graders_are_strictly_bounded(grader):
    """Each public grader clamps its score to 0.01/0.99 and stays inside (0, 1) on empty input."""
    clamped_cases = (
        ({"rewards": [1.0]}, 0.99),
        ({"rewards": [0.0]}, 0.01),
        ({"rewards": [-5.0]}, 0.01),
        ({"score": 1.5}, 0.99),
        ({"score": -0.5}, 0.01),
        ({"reward": {"total": 1.0}}, 0.99),
    )
    for payload, expected in clamped_cases:
        assert grader(payload) == expected

    # Missing or empty trajectories must still give a strictly interior score.
    for degenerate in (None, {}):
        score = grader(degenerate)
        assert 0.0 < score < 1.0
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_openenv_yaml_declares_three_tasks_with_graders():
    """``openenv.yaml`` lists at least three tasks, each naming an importable, callable grader."""
    import yaml

    manifest_path = Path(__file__).resolve().parent.parent / "openenv.yaml"
    with manifest_path.open("r", encoding="utf-8") as handle:
        manifest = yaml.safe_load(handle)

    declared_tasks = manifest.get("tasks", [])
    assert len(declared_tasks) >= 3, "Phase 2 requires >= 3 tasks"
    for task in declared_tasks:
        assert "grader" in task, f"Task {task.get('id')} missing grader"
        dotted_path = task["grader"]
        # Split "pkg.module.func" at the last dot into module path and attribute.
        module_name, _separator, attribute = dotted_path.rpartition(".")
        module = importlib.import_module(module_name)
        assert callable(getattr(module, attribute)), f"{dotted_path} not callable"
|
tests/test_live_server_exhaustive.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# Exhaustive / adversarial probes against a RUNNING GhostExec HTTP server.
|
| 4 |
+
# Default: http://127.0.0.1:8000 (override with GHOSTEXEC_LIVE_BASE_URL).
|
| 5 |
+
# Skips all tests if /health is unreachable.
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import asyncio
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
import urllib.error
|
| 13 |
+
import urllib.request
|
| 14 |
+
from typing import Any
|
| 15 |
+
|
| 16 |
+
import pytest
|
| 17 |
+
|
| 18 |
+
# Base URL of the live server under test, trailing slashes stripped.
# Read from GHOSTEXEC_LIVE_BASE_URL; falls back to http://127.0.0.1:8000.
BASE = os.environ.get("GHOSTEXEC_LIVE_BASE_URL", "http://127.0.0.1:8000").rstrip("/")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _req(
    method: str,
    path: str,
    *,
    data: bytes | None = None,
    headers: dict[str, str] | None = None,
    timeout: float = 15.0,
) -> tuple[int, bytes]:
    """Send one HTTP request to ``BASE + path`` and return ``(status, body)``.

    An ``HTTPError`` is turned into a normal return value: its status code
    plus whatever body can be read (empty bytes if reading fails with a
    connection or OS error). Any other exception from ``urlopen`` propagates.
    """
    request = urllib.request.Request(
        BASE + path,
        data=data,
        headers=headers or {},
        method=method,
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.status, response.read()
    except urllib.error.HTTPError as http_error:
        try:
            error_body = http_error.read()
        except (ConnectionResetError, OSError):
            # The server may drop the connection before the error body is read.
            error_body = b""
        return http_error.code, error_body
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@pytest.fixture(scope="module")
def live() -> str:
    """Return ``BASE`` when ``GET /health`` answers 200; otherwise skip the requesting tests."""
    try:
        status, _body = _req("GET", "/health", timeout=3.0)
    except OSError as exc:
        pytest.skip(f"Live server not reachable at {BASE!r}: {exc}")
    if status == 200:
        return BASE
    pytest.skip(f"Live /health returned {status} at {BASE!r}")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_get_core_docs(live: str) -> None:
    """Every documentation/introspection GET route answers 200 with a body of at least a minimum size."""
    minimum_body_size = {
        "/health": 10,
        "/metadata": 20,
        "/state": 10,
        "/schema": 500,
        "/openapi.json": 1000,
        "/docs": 200,
        "/redoc": 200,
    }
    for path, min_len in minimum_body_size.items():
        status, payload = _req("GET", path)
        assert status == 200, f"{path} -> {status}"
        assert len(payload) >= min_len, f"{path} body tiny"
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def test_wrong_http_methods_on_control_routes(live: str) -> None:
    """Unsupported verbs and unknown paths are rejected with the expected status codes."""

    def status_of(method: str, path: str, **kwargs: Any) -> int:
        # Only the status code matters for these probes.
        return _req(method, path, **kwargs)[0]

    assert status_of("GET", "/reset") == 405
    assert status_of("GET", "/step") == 405
    assert status_of("PUT", "/reset", data=b"{}") in (405, 422)
    assert status_of("DELETE", "/health") in (405, 404)
    assert status_of("GET", "/this-path-should-not-exist-ghostexec") == 404
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_reset_payload_variants(live: str) -> None:
    """``POST /reset`` accepts several body shapes and always returns an observation and ``done``."""
    variants = {
        "empty": {},
        "seed": {"seed": 42},
        "episode_id": {"episode_id": "probe-episode-1"},
        "extra_ignored": {"seed": 1, "unknown_future_field_xyz": True},
    }
    json_headers = {"Content-Type": "application/json"}
    for label, payload in variants.items():
        status, raw = _req(
            "POST",
            "/reset",
            data=json.dumps(payload).encode(),
            headers=json_headers,
        )
        assert status == 200, f"reset {label}: {status}"
        reply = json.loads(raw.decode())
        assert "observation" in reply and "done" in reply
        assert "echoed_message" in reply["observation"]
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def test_step_valid_action_types(live: str) -> None:
    """Post one well-formed action of each type to ``/step`` and expect 200 plus ``step_ok`` metadata."""
    # Each case is labelled by its own action_type in the failure messages.
    actions: list[dict[str, Any]] = [
        {"action_type": "do_nothing"},
        {"action_type": "reply_email", "email_id": "e14", "message_body": "Live exhaustive probe."},
        {"action_type": "archive_email", "email_id": "e09"},
        {
            "action_type": "reschedule_meeting",
            "meeting_id": "m02",
            "new_time": "2026-04-21T18:00:00",
        },
        {"action_type": "cancel_meeting", "meeting_id": "m10", "reason": "probe cancel"},
        {"action_type": "complete_task", "task_id": "t07"},
        {
            "action_type": "delegate_task",
            "task_id": "t08",
            "contact_name": "Jordan Lee",
        },
        {
            "action_type": "send_message",
            "contact_name": "Jamie Liu",
            "message_body": "Exhaustive live test ping.",
        },
    ]
    json_headers = {"Content-Type": "application/json"}
    for action in actions:
        name = action["action_type"]
        status, raw = _req(
            "POST",
            "/step",
            data=json.dumps({"action": action}).encode(),
            headers=json_headers,
        )
        assert status == 200, f"step {name}: HTTP {status} {raw[:200]!r}"
        reply = json.loads(raw.decode())
        assert "observation" in reply
        metadata = (reply.get("observation") or {}).get("metadata") or {}
        assert "step_ok" in metadata, f"step {name}: missing step_ok"
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def test_step_invalid_contracts(live: str) -> None:
    """Check how ``POST /step`` reacts to malformed bodies and to unusable targets.

    Two failure classes are covered:

    * transport/schema errors (body is not JSON, or ``action`` is not an
      object) must be rejected with a 4xx status;
    * well-formed actions that cannot be applied (unknown email id, and
      ``complete_task`` on ``t09``) must return 200 with
      ``metadata["step_ok"] is False``.

    Each probe is sent exactly once and both the status code and the body of
    that single response are checked.
    """
    json_headers = {"Content-Type": "application/json"}

    def post_step(raw_body: bytes) -> tuple[int, bytes]:
        # Single place that knows how a /step probe is sent.
        return _req("POST", "/step", data=raw_body, headers=json_headers)

    status, _ = post_step(b"not-json")
    assert status in (400, 422)

    status, _ = post_step(json.dumps({"action": "not-a-dict"}).encode())
    assert status == 422

    rejected_actions = (
        {"action_type": "reply_email", "email_id": "nope", "message_body": "x"},
        {"action_type": "complete_task", "task_id": "t09"},
    )
    for action in rejected_actions:
        status, raw = post_step(json.dumps({"action": action}).encode())
        assert status == 200, f"step {action['action_type']}: HTTP {status} {raw[:200]!r}"
        reply = json.loads(raw.decode())
        assert reply["observation"]["metadata"].get("step_ok") is False
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def test_step_unicode_and_long_message(live: str) -> None:
    """A long, multi-line reply body containing non-ASCII text is accepted by ``/step``."""
    repeated_lines = "Line note.\n" * 80
    message = repeated_lines + " café naïve résumé 日本語"
    action = {"action_type": "reply_email", "email_id": "e05", "message_body": message}
    status, _raw = _req(
        "POST",
        "/step",
        data=json.dumps({"action": action}).encode(),
        headers={"Content-Type": "application/json"},
    )
    assert status == 200
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def test_step_wrong_content_type(live: str) -> None:
    """A form-encoded body sent to ``/step`` is rejected with 400, 415 or 422."""
    form_headers = {"Content-Type": "application/x-www-form-urlencoded"}
    status, _raw = _req("POST", "/step", data=b"action_type=do_nothing", headers=form_headers)
    assert status in (400, 415, 422)
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def test_reset_invalid_json(live: str) -> None:
    """A truncated JSON body sent to ``/reset`` is rejected with 400 or 422."""
    json_headers = {"Content-Type": "application/json"}
    status = _req("POST", "/reset", data=b"{", headers=json_headers)[0]
    assert status in (400, 422)
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def test_mcp_variants(live: str) -> None:
    """``/mcp`` answers 200 to broken JSON and a JSON-RPC envelope to an unknown method."""
    json_headers = {"Content-Type": "application/json"}

    broken_status, _ = _req("POST", "/mcp", data=b"{", headers=json_headers)
    assert broken_status == 200

    unknown_call = {"jsonrpc": "2.0", "id": 1, "method": "bogus/thing", "params": {}}
    _, raw = _req(
        "POST",
        "/mcp",
        data=json.dumps(unknown_call).encode(),
        headers=json_headers,
    )
    rpc_reply = json.loads(raw.decode())
    assert "error" in rpc_reply or "result" in rpc_reply
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def test_openapi_lists_expected_paths(live: str) -> None:
    """The served OpenAPI document lists every stable HTTP route.

    The status code is asserted first so that a failing ``/openapi.json``
    shows up as a clear status mismatch instead of a JSON decode error or a
    misleading "missing path" failure.
    """
    status, raw = _req("GET", "/openapi.json")
    assert status == 200, f"/openapi.json -> {status}"
    spec = json.loads(raw.decode())
    paths = spec.get("paths") or {}
    for route in ("/health", "/reset", "/step", "/schema", "/metadata", "/state", "/mcp"):
        assert route in paths, f"missing path {route} in OpenAPI"
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def test_websocket_dead_ends(live: str) -> None:
    """Feed the live ``/ws`` endpoint bad frames and check it answers instead of dropping.

    Invalid JSON and an unknown message type must each produce an ``error``
    frame; after a reset, a step that targets a missing email must produce an
    observation with ``step_ok`` false, and a state query must still work.
    Skipped when the ``websockets`` package is not installed.
    """
    try:
        import websockets
    except ImportError:
        pytest.skip("websockets not installed")

    async def _probe() -> None:
        url = live.replace("http://", "ws://").replace("https://", "wss://") + "/ws"
        async with websockets.connect(url, max_size=10_000_000) as connection:

            async def exchange(frame: str):
                # Send one text frame and decode the single reply frame.
                await connection.send(frame)
                return json.loads(await connection.recv())

            malformed_reply = await exchange("{ not json")
            assert malformed_reply.get("type") == "error"

            unknown_reply = await exchange(json.dumps({"type": "nosuch", "data": {}}))
            assert unknown_reply.get("type") == "error"

            reset_reply = await exchange(json.dumps({"type": "reset", "data": {}}))
            assert reset_reply.get("type") == "observation"

            failed_step = await exchange(
                json.dumps({"type": "step", "data": {"action_type": "reply_email", "email_id": "missing"}})
            )
            assert failed_step.get("type") == "observation"
            metadata = (failed_step.get("data") or {}).get("observation", {}).get("metadata") or {}
            assert metadata.get("step_ok") is False

            state_reply = await exchange(json.dumps({"type": "state"}))
            assert state_reply.get("type") == "state"

            # No reply is awaited for the close frame.
            await connection.send(json.dumps({"type": "close", "data": {}}))

    asyncio.run(_probe())
|
tests/test_phase1.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Phase 1: scaffold, OpenEnv manifest, layout, and HTTP health surface."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import yaml
|
| 6 |
+
from starlette.testclient import TestClient
|
| 7 |
+
|
| 8 |
+
# Directory one level above the folder containing this file; the layout checks below resolve paths against it.
ROOT = Path(__file__).resolve().parents[1]
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_openenv_yaml_exists_and_metadata():
    """``openenv.yaml`` exists at the project root and carries the expected manifest fields."""
    manifest_path = ROOT / "openenv.yaml"
    assert manifest_path.is_file(), "openenv.yaml must exist at project root"
    manifest = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))

    exact_fields = {
        "name": "ghostexec",
        "spec_version": 1,
        "type": "space",
        "runtime": "fastapi",
        "app": "server.app:app",
    }
    for key, expected in exact_fields.items():
        assert manifest.get(key) == expected

    # Free-text fields only need to be non-blank strings.
    for key in ("description", "version"):
        value = manifest.get(key)
        assert value and isinstance(value, str) and len(value.strip()) > 0
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_expected_folder_structure():
    """The scaffold files exist under ``ROOT``; the Dockerfile may live at the root or in ``server/``."""
    for relative in ("models.py", "client.py", "pyproject.toml"):
        assert (ROOT / relative).is_file()

    server_dir = ROOT / "server"
    for filename in ("app.py", "ghostexec_environment.py"):
        assert (server_dir / filename).is_file()

    assert (ROOT / "Dockerfile").is_file() or (server_dir / "Dockerfile").is_file()
    assert (server_dir / "requirements.txt").is_file()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def test_server_health_ping():
    """``GET /health`` on the FastAPI app returns 200 with status ``healthy``."""
    from ghostexec.server.app import app

    health = TestClient(app).get("/health")
    assert health.status_code == 200
    assert health.json().get("status") == "healthy"
|
tests/test_phase2.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Phase 2: world state, inbox, calendar, contacts, tasks (scenario-driven)."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from ghostexec.server.ghostexec_environment import GhostexecEnvironment
|
| 6 |
+
|
| 7 |
+
# Directory one level above the folder containing this file.
ROOT = Path(__file__).resolve().parents[1]
|
| 8 |
+
# Scenario JSON that every test in this module loads.
SCENARIO = ROOT / "scenarios" / "phase2_core.json"
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_scenario_file_exists():
    """The scenario file referenced by ``SCENARIO`` is present on disk."""
    scenario_present = SCENARIO.is_file()
    assert scenario_present
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_world_json_roundtrip():
    """Serialising a loaded world to JSON and back keeps its time, email count and meeting count."""
    original = GhostexecEnvironment.load_world_from_json(SCENARIO)
    serialised = GhostexecEnvironment.world_to_json(original)
    restored = GhostexecEnvironment.world_from_json(serialised)

    assert restored.simulation_time == original.simulation_time
    assert len(restored.emails) == len(original.emails)
    assert len(restored.meetings) == len(original.meetings)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_pool_sizes_from_scenario():
    """The scenario provides at least the minimum number of emails, meetings, contacts and tasks."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    email_count = len(world.emails)
    meeting_count = len(world.meetings)
    contact_count = len(world.contacts)
    task_count = len(world.tasks)

    assert email_count >= 30
    assert meeting_count >= 8
    assert contact_count >= 15
    assert task_count >= 10
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_inbox_unread_priority_order():
    """Unread emails come back ordered critical, high, normal, low, starting with a critical one."""
    environment = GhostexecEnvironment(SCENARIO)
    environment.reset()
    unread = environment.get_unread_emails_sorted()

    rank = {"critical": 0, "high": 1, "normal": 2, "low": 3}
    # Map each email to its numeric rank; the sequence must never decrease.
    observed_ranks = [rank[email.priority] for email in unread]
    assert observed_ranks == sorted(observed_ranks)
    assert unread[0].priority == "critical"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_calendar_detects_four_conflicts():
    """After reset the calendar reports at least four meeting conflicts."""
    environment = GhostexecEnvironment(SCENARIO)
    environment.reset()
    conflict_count = len(environment.detect_meeting_conflicts())
    assert conflict_count >= 4
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def test_contact_mood_update():
    """A contact that starts ``angry`` can be updated to ``neutral`` and reads back that way."""
    who = "David Okonkwo"
    environment = GhostexecEnvironment(SCENARIO)
    environment.reset()

    contact = environment.get_contact(who)
    assert contact is not None
    assert contact.mood == "angry"

    assert environment.update_contact_mood(who, "neutral")
    # Look the contact up again rather than trusting the earlier reference.
    assert environment.get_contact(who) is not None
    assert environment.get_contact(who).mood == "neutral"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_overdue_tasks_after_time_advance():
    """Moving simulation time forward leaves at least two tasks, all with status ``overdue``."""
    later = "2026-04-22T12:00:00"
    environment = GhostexecEnvironment(SCENARIO)
    environment.reset()
    environment.set_simulation_time(later)

    overdue = environment.overdue_tasks_at(later)
    assert len(overdue) >= 2
    assert not any(task.status != "overdue" for task in overdue)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def test_mark_email_read_and_reschedule_reduces_calendar_conflicts():
    """Rescheduling ``m02`` lowers the conflict count, and ``e01`` can be marked read."""
    environment = GhostexecEnvironment(SCENARIO)
    environment.reset()

    conflicts_before = len(environment.detect_meeting_conflicts())
    rescheduled = environment.reschedule_meeting("m02", "2026-04-21T18:00:00")
    assert rescheduled
    conflicts_after = len(environment.detect_meeting_conflicts())
    assert conflicts_after < conflicts_before

    assert environment.mark_email_read("e01")
|
tests/test_phase3.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Phase 3: plain-text briefing, eight legal actions, validation without crashes."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from ghostexec.models import GhostexecAction
|
| 8 |
+
from ghostexec.server.ghostexec_environment import GhostexecEnvironment
|
| 9 |
+
|
| 10 |
+
# Directory one level above the folder containing this file.
ROOT = Path(__file__).resolve().parents[1]
|
| 11 |
+
# Scenario JSON used to build the environment in the tests below.
SCENARIO = ROOT / "scenarios" / "phase2_core.json"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _env() -> GhostexecEnvironment:
    """Build an environment from ``SCENARIO`` and return it after one ``reset()``."""
    environment = GhostexecEnvironment(SCENARIO)
    environment.reset()
    return environment
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_briefing_is_plain_text_after_reset():
    """The reset briefing contains every section header and reports its own length."""
    observation = _env().reset()
    briefing = observation.echoed_message
    section_markers = (
        "=== GHOSTEXEC BRIEFING",
        "UNREAD EMAILS",
        "CALENDAR CONFLICTS IN NEXT 4 HOURS",
        "CONTACTS TO WATCH",
        "OVERDUE OR DUE-SOON TASKS",
        "EXEC STRESS LEVEL",
        "STEPS REMAINING",
    )
    for marker in section_markers:
        assert marker in briefing
    assert observation.message_length == len(briefing)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@pytest.mark.parametrize(
    "action,check",
    [
        (
            GhostexecAction(action_type="reply_email", email_id="e05", message_body="On it."),
            lambda env: next(mail for mail in env.world.emails if mail.id == "e05").replied is True,
        ),
        (
            GhostexecAction(action_type="archive_email", email_id="e09"),
            lambda env: next(mail for mail in env.world.emails if mail.id == "e09").read is True,
        ),
        (
            GhostexecAction(
                action_type="reschedule_meeting",
                meeting_id="m03",
                new_time="2026-04-21T18:00:00",
            ),
            lambda env: next(mtg for mtg in env.world.meetings if mtg.id == "m03").start
            == "2026-04-21T18:00:00",
        ),
        (
            GhostexecAction(
                action_type="cancel_meeting",
                meeting_id="m10",
                reason="Merged into ops review",
            ),
            lambda env: next(mtg for mtg in env.world.meetings if mtg.id == "m10").cancelled is True,
        ),
        (
            GhostexecAction(action_type="complete_task", task_id="t07"),
            lambda env: next(task for task in env.world.tasks if task.id == "t07").status == "done",
        ),
        (
            GhostexecAction(
                action_type="delegate_task",
                task_id="t08",
                contact_name="Jordan Lee",
            ),
            lambda env: next(task for task in env.world.tasks if task.id == "t08").delegated_to
            == "Jordan Lee",
        ),
        (
            GhostexecAction(
                action_type="send_message",
                contact_name="Jamie Liu",
                message_body="Thanks for the demo feedback.",
            ),
            lambda env: any("message to Jamie Liu" in entry for entry in env.world.action_log),
        ),
        (
            GhostexecAction(action_type="do_nothing"),
            lambda env: True,
        ),
    ],
)
def test_each_legal_action_runs_without_crash(action, check):
    """Apply one action to a fresh environment and verify its effect on the world.

    Each case pairs an action with a predicate over the environment. The step
    must return a non-empty ``echoed_message`` and the predicate must hold
    afterwards.
    """
    environment = _env()
    observation = environment.step(action)
    assert observation.echoed_message
    assert check(environment)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_reply_marks_email_handled():
    """Replying to the unread email ``e14`` leaves it both read and replied."""
    env = _env()

    def fetch_e14():
        # Looked up afresh on every call instead of reusing an earlier object.
        return next(mail for mail in env.world.emails if mail.id == "e14")

    assert not fetch_e14().read

    reply = GhostexecAction(action_type="reply_email", email_id="e14", message_body="Noted.")
    env.step(reply)

    handled = fetch_e14()
    assert handled.read and handled.replied
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def test_invalid_actions_return_error_metadata_not_exception():
    """Invalid actions report ``step_ok``/``step_error`` metadata instead of raising.

    Four invalid steps are issued on one environment: an unknown email id, the
    task ``t09``, an unknown contact, and a reschedule expected to overlap.
    """
    # Baseline: reward of a single do_nothing step on its own fresh environment.
    baseline_env = _env()
    idle_reward = baseline_env.step(GhostexecAction(action_type="do_nothing")).reward

    env = _env()

    unknown_email = env.step(
        GhostexecAction(action_type="reply_email", email_id="nope", message_body="x")
    )
    assert unknown_email.metadata.get("step_ok") is False
    assert unknown_email.metadata.get("step_error")
    # Same before→after sub-scores as do_nothing, plus explicit invalid add-on.
    # do_nothing has an additional strict additive floor (-0.15), so the delta is -0.10 here.
    expected_gap = 0.25 - 0.15
    assert unknown_email.reward == pytest.approx((idle_reward or 0) - expected_gap)

    finished_task = env.step(GhostexecAction(action_type="complete_task", task_id="t09"))
    assert finished_task.metadata.get("step_ok") is False
    finished_error = (finished_task.metadata.get("step_error") or "").lower()
    assert "already done" in finished_error

    unknown_contact = env.step(
        GhostexecAction(
            action_type="send_message",
            contact_name="Nobody By That Name",
            message_body="hello",
        )
    )
    assert unknown_contact.metadata.get("step_ok") is False

    clashing_move = env.step(
        GhostexecAction(
            action_type="reschedule_meeting",
            meeting_id="m03",
            new_time="2026-04-21T09:30:00",
        )
    )
    assert clashing_move.metadata.get("step_ok") is False
    clash_error = (clashing_move.metadata.get("step_error") or "").lower()
    assert "overlap" in clash_error
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def test_reschedule_resolves_prior_conflict_pair():
    """Moving ``m02`` to 18:00 removes the (m01, m02) pair from detected conflicts."""
    env = _env()

    def conflict_pairs():
        # Order-insensitive view of every conflict currently reported by the env.
        return {
            frozenset((row["meeting_a"], row["meeting_b"]))
            for row in env.detect_meeting_conflicts()
        }

    target_pair = frozenset(("m01", "m02"))
    assert target_pair in conflict_pairs()

    move = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    outcome = env.step(move)

    assert outcome.metadata.get("step_ok") is True
    assert target_pair not in conflict_pairs()
|
tests/test_phase4.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Phase 4: reward sub-scores, aggregation, logging, schema drift."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import random
|
| 5 |
+
import statistics
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import pytest
|
| 9 |
+
|
| 10 |
+
from ghostexec.models import GhostexecAction
|
| 11 |
+
from ghostexec.server import reward as reward_mod
|
| 12 |
+
from ghostexec.server.reward import aggregate_scores
|
| 13 |
+
from ghostexec.server.ghostexec_environment import GhostexecEnvironment
|
| 14 |
+
|
| 15 |
+
# Directory one level above the one containing this file.
ROOT = Path(__file__).resolve().parent.parent
# Scenario fixture used by every test in this module.
SCENARIO = ROOT.joinpath("scenarios", "phase2_core.json")
# Event file passed as ``schema_drift_events_path`` in the schema-drift test.
DRIFT = ROOT.joinpath("scenarios", "schema_drift_test.json")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_reward_weights_and_aggregator_helpers():
    """``aggregate_scores`` scales the weighted inner sum by ``WEIGHTED_OUTPUT_SCALE``."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    conflict_score = 1.0
    relationship_score = -1.0
    task_score = 2.5

    # Hand-computed weighted sum using the module's published channel weights.
    inner = (
        reward_mod.W_CONFLICT * conflict_score
        + reward_mod.W_REL * relationship_score
        + reward_mod.W_TASK * task_score
    )

    breakdown = aggregate_scores(
        conflict_score,
        relationship_score,
        task_score,
        conflict_raw=conflict_score,
        critical_queue_bonus=0.0,
        weighted_inner=inner,
        action_ok=True,
        episode_done=False,
        world_after=world,
    )

    expected_base = reward_mod.WEIGHTED_OUTPUT_SCALE * inner
    assert breakdown.weighted_base == pytest.approx(expected_base)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def test_catastrophic_and_completion_bonuses_only_when_episode_done():
    """End-of-episode terms are zero mid-episode and non-zero once the episode is done."""
    pristine = GhostexecEnvironment.load_world_from_json(SCENARIO)
    before = pristine.model_copy(deep=True)
    before.stress = 30
    unchanged_after = before.model_copy(deep=True)
    idle = GhostexecAction(action_type="do_nothing")

    mid_episode = reward_mod.compute_step_reward(
        before, unchanged_after, idle, action_ok=True, episode_done=False
    )
    assert mid_episode.episode_completion_bonus == 0.0
    assert mid_episode.catastrophic_penalty == 0.0

    # Build an "after" world in which the first contact named Marcus Webb is furious.
    furious_after = before.model_copy(deep=True)
    marcus_index = next(
        (
            position
            for position, contact in enumerate(furious_after.contacts)
            if contact.name == "Marcus Webb"
        ),
        None,
    )
    if marcus_index is not None:
        furious_after.contacts[marcus_index] = furious_after.contacts[marcus_index].model_copy(
            update={"mood": "furious"}
        )

    final_step = reward_mod.compute_step_reward(
        before, furious_after, idle, action_ok=True, episode_done=True
    )
    assert final_step.episode_completion_bonus == pytest.approx(10.0)
    assert final_step.catastrophic_penalty == pytest.approx(-15.0)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_invalid_step_matches_do_nothing_subscores_plus_invalid_addon():
    """An invalid step on an unchanged world scores 0.10 below a valid do_nothing."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    idle_action = GhostexecAction(action_type="do_nothing")
    invalid_action = GhostexecAction(
        action_type="reply_email", email_id="missing", message_body="x"
    )

    # The same world is passed as both "before" and "after" for both computations.
    idle_breakdown = reward_mod.compute_step_reward(
        world, world, idle_action, action_ok=True, episode_done=False
    )
    invalid_breakdown = reward_mod.compute_step_reward(
        world, world, invalid_action, action_ok=False, episode_done=False
    )

    assert invalid_breakdown.invalid_step_adjustment == pytest.approx(-0.25)
    # do_nothing carries an additional strict additive floor (-0.15) not applied to invalid non-idle actions.
    expected_gap = 0.25 - 0.15
    assert invalid_breakdown.final == pytest.approx(idle_breakdown.final - expected_gap)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_scripted_episode_reward_direction_and_log(tmp_path, monkeypatch):
    """A conflict-resolving reschedule out-scores idling, and both steps are logged."""
    log_file = tmp_path / "rewards.jsonl"
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    # Point the environment's reward log at pytest's temporary directory.
    monkeypatch.setattr(env, "_reward_log_path", log_file)

    reschedule = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    resolved = env.step(reschedule)
    idled = env.step(GhostexecAction(action_type="do_nothing"))

    assert resolved.metadata.get("step_ok") is True
    assert idled.metadata.get("step_ok") is True
    assert (resolved.reward or 0) > (idled.reward or 0)

    assert log_file.is_file()
    entries = log_file.read_text(encoding="utf-8").strip().splitlines()
    assert len(entries) >= 2

    # The first logged row must describe the reschedule step.
    first_row = json.loads(entries[0])
    assert "reward" in first_row and "episode_id" in first_row
    assert first_row.get("action_type") == "reschedule_meeting"
    assert "conflict_raw" in first_row and "step_ok" in first_row
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def test_schema_drift_events_mutate_world():
    """Three idle steps with a drift file attached each leave a visible world change."""
    env = GhostexecEnvironment(SCENARIO, schema_drift_events_path=DRIFT)
    env.reset()

    def idle_step():
        # A new do_nothing action is built for every step.
        return env.step(GhostexecAction(action_type="do_nothing"))

    # After step 1: the action log mentions a shift.
    assert idle_step().metadata.get("step_ok") is True
    assert any("schema drift: shifted" in entry for entry in env.world.action_log)

    # After step 2: Sarah Chen's communication preference is "text".
    idle_step()
    sarah = env.get_contact("Sarah Chen")
    assert sarah is not None
    assert sarah.communication_preference == "text"

    # After step 3: task t02 has the 07:00 deadline and Marcus Webb is suppressed.
    idle_step()
    task_t02 = next(task for task in env.world.tasks if task.id == "t02")
    assert task_t02.deadline == "2026-04-21T07:00:00"
    assert "Marcus Webb" in env._reply_relationship_suppressed  # noqa: SLF001
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def test_rewards_differ_between_helpful_and_idle_steps():
    """A reschedule step and a following do_nothing step yield different rewards."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()

    helpful_action = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    helpful_reward = env.step(helpful_action).reward
    idle_reward = env.step(GhostexecAction(action_type="do_nothing")).reward

    assert helpful_reward is not None and idle_reward is not None
    assert helpful_reward != idle_reward
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Whitelisted reschedules (known non-overlapping targets for phase2_core at 08:00).
# Each entry is (meeting id, new start time as an ISO-8601 string).
_SAFE_RESCHEDULES: list[tuple[str, str]] = list(
    zip(
        ("m02", "m03", "m06", "m09"),
        (
            "2026-04-21T18:00:00",
            "2026-04-21T18:30:00",
            "2026-04-21T20:00:00",
            "2026-04-21T21:00:00",
        ),
    )
)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def test_seeded_stochastic_policy_reward_spread():
    """A seeded random mix of 80 actions yields rewards with a minimum spread.

    Checks both the population standard deviation and the p5–p95 range.
    """
    random.seed(1234)
    total_steps = 80
    archive_ids = [f"e{number:02d}" for number in range(1, 31)]
    contacts = ["Jordan Lee", "Jamie Liu", "Marcus Webb", "Sarah Chen"]
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    rewards: list[float] = []

    # One counter is advanced by BOTH the archive and the send_message branches;
    # the reschedule branch has its own counter.
    shared_cursor = 0
    reschedule_cursor = 0

    for _ in range(total_steps):
        draw = random.random()
        if draw < 0.32:
            action = GhostexecAction(action_type="do_nothing")
        elif draw < 0.58:
            email_id = archive_ids[shared_cursor % len(archive_ids)]
            shared_cursor += 1
            action = GhostexecAction(action_type="archive_email", email_id=email_id)
        elif draw < 0.78:
            meeting_id, new_time = _SAFE_RESCHEDULES[reschedule_cursor % len(_SAFE_RESCHEDULES)]
            reschedule_cursor += 1
            action = GhostexecAction(
                action_type="reschedule_meeting", meeting_id=meeting_id, new_time=new_time
            )
        else:
            recipient = contacts[shared_cursor % len(contacts)]
            shared_cursor += 1
            action = GhostexecAction(
                action_type="send_message",
                contact_name=recipient,
                message_body="Quick sync on priorities.",
            )

        obs = env.step(action)
        assert obs.reward is not None
        rewards.append(float(obs.reward))

    spread = statistics.pstdev(rewards)
    ordered = sorted(rewards)
    last_index = len(ordered) - 1
    # Percentile indices are truncated and clamped to the list bounds.
    p5 = ordered[max(0, int(0.05 * last_index))]
    p95 = ordered[min(last_index, int(0.95 * last_index))]
    assert spread > 0.06
    assert (p95 - p5) > 0.09
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
def test_good_script_beats_do_nothing_spam_on_mean_reward():
    """Five scripted actions beat five do_nothing steps by more than 0.2 mean reward."""
    scripted_env = GhostexecEnvironment(SCENARIO)
    scripted_env.reset()
    script = [
        GhostexecAction(
            action_type="reschedule_meeting",
            meeting_id="m02",
            new_time="2026-04-21T18:00:00",
        ),
        GhostexecAction(
            action_type="reply_email",
            email_id="e01",
            message_body="Drafting revised figures now.",
        ),
        GhostexecAction(action_type="archive_email", email_id="e09"),
        GhostexecAction(
            action_type="send_message",
            contact_name="Jordan Lee",
            message_body="Standup notes attached.",
        ),
        GhostexecAction(action_type="complete_task", task_id="t06"),
    ]
    scripted_rewards = [scripted_env.step(action).reward for action in script]
    scripted_mean = sum(float(value) for value in scripted_rewards) / len(scripted_rewards)

    # Comparison run: a separate environment that only ever idles.
    idle_env = GhostexecEnvironment(SCENARIO)
    idle_env.reset()
    idle_rewards = [
        idle_env.step(GhostexecAction(action_type="do_nothing")).reward for _ in range(5)
    ]
    idle_mean = sum(float(value) for value in idle_rewards) / len(idle_rewards)

    assert scripted_mean > idle_mean + 0.2
|
tests/test_reward_dead_suite.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# Dead-test suite for Phase 4 step rewards: 100+ independent scenarios on
|
| 4 |
+
# phase2_core.json. Asserts penalization (do_nothing, invalid), priority
|
| 5 |
+
# ordering (VIP critical > normal), and legal-action signatures for GRPO-style
|
| 6 |
+
# post-training signal quality.
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import pytest
|
| 13 |
+
|
| 14 |
+
from ghostexec.models import GhostexecAction
|
| 15 |
+
from ghostexec.server import reward as reward_mod
|
| 16 |
+
from ghostexec.server.ghostexec_environment import GhostexecEnvironment
|
| 17 |
+
|
| 18 |
+
# Directory one level above the one containing this file.
ROOT = Path(__file__).resolve().parent.parent
SCENARIO = ROOT.joinpath("scenarios", "phase2_core.json")

# All inbox ids from phase2_core (e01–e30).
REPLY_EMAIL_IDS = ["e%02d" % number for number in range(1, 31)]

# Unread or replyable ids suitable for archive (skip if unknown — all exist).
ARCHIVE_EMAIL_IDS = ["e%02d" % number for number in range(1, 16)]

# Pending / in-progress tasks only (t09 is done in fixture).
COMPLETE_TASK_IDS = ["t%02d" % number for number in range(1, 13) if number != 9]

# Known non-overlapping reschedules for 08:00 sim time (from phase4 tests).
# Each entry is (meeting id, new start time as an ISO-8601 string).
_SAFE_RESCHEDULES: list[tuple[str, str]] = list(
    zip(
        ("m02", "m03", "m06", "m09", "m04", "m05", "m07", "m08", "m01", "m10"),
        (
            "2026-04-21T18:00:00",
            "2026-04-21T18:30:00",
            "2026-04-21T20:00:00",
            "2026-04-21T21:00:00",
            "2026-04-21T19:00:00",
            "2026-04-21T19:30:00",
            "2026-04-21T20:30:00",
            "2026-04-21T21:30:00",
            "2026-04-21T17:00:00",
            "2026-04-21T22:00:00",
        ),
    )
)

MEETING_IDS_CANCEL = ["m%02d" % number for number in range(1, 11)]

KNOWN_CONTACTS = [
    "Jordan Lee",
    "Jamie Liu",
    "Marcus Webb",
    "Sarah Chen",
    "Priya Sharma",
    "David Okonkwo",
]

# Default message body shared by the reply tests below.
_BODY = "Thanks — triaging and will follow up shortly."
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# --- 30 cases: reply every email id -------------------------------------------
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@pytest.mark.parametrize("email_id", REPLY_EMAIL_IDS)
def test_dead_reply_email_each_id_positive_or_neutral(email_id: str) -> None:
    """A legal reply to any inbox id succeeds, carries no penalty, and stays above -0.5."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    obs = env.step(reply)

    assert obs.metadata.get("step_ok") is True
    assert obs.reward is not None

    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    # Neither the invalid-step nor the do_nothing penalty may appear on a legal reply.
    assert breakdown.get("invalid_step_adjustment", 0) == pytest.approx(0.0)
    assert breakdown.get("do_nothing_floor", 0) == pytest.approx(0.0)
    # No snapshot -4 conflict tax: legal reply should not tank below -0.5
    assert float(obs.reward) > -0.5
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@pytest.mark.parametrize("email_id", ("e01", "e03", "e12", "e21", "e27"))
def test_dead_reply_vip_critical_queue_bonus(email_id: str) -> None:
    """Replies to the ids this suite treats as critical earn a positive queue bonus."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    obs = env.step(reply)

    assert obs.metadata.get("step_ok") is True
    # VIP+critical micro + critical_queue bonus; exact float varies slightly (0.48 scale).
    assert float(obs.reward or 0) > 0.06

    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    queue_bonus = float(breakdown.get("critical_queue_bonus") or 0)
    assert queue_bonus > 0
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@pytest.mark.parametrize("email_id", ("e02", "e04", "e06", "e14", "e23"))
def test_dead_reply_high_or_normal_small_positive(email_id: str) -> None:
    """Replies to the listed ids succeed with a strictly positive reward."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    reply = GhostexecAction(action_type="reply_email", email_id=email_id, message_body=_BODY)
    obs = env.step(reply)

    assert obs.metadata.get("step_ok") is True
    assert float(obs.reward or 0) > 0.0
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# --- 20 cases: do_nothing always penalized ------------------------------------
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@pytest.mark.parametrize("seed", range(20))
def test_dead_do_nothing_strict_penalty(seed: int) -> None:
    """A first-step do_nothing is legal, gets a negative reward, and pays the strict floor.

    ``seed`` is not read by the body; it only repeats the check twenty times.
    """
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    obs = env.step(GhostexecAction(action_type="do_nothing"))

    assert obs.metadata.get("step_ok") is True
    assert float(obs.reward or 0) < 0

    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    floor = float(breakdown.get("do_nothing_floor") or 0)
    assert floor == pytest.approx(reward_mod._DO_NOTHING_STRICT_PENALTY)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# --- 15 cases: archive --------------------------------------------------------
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
@pytest.mark.parametrize("email_id", ARCHIVE_EMAIL_IDS)
def test_dead_archive_email_step_ok(email_id: str) -> None:
    """Archiving each listed email on a fresh env succeeds and produces a reward."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    archive = GhostexecAction(action_type="archive_email", email_id=email_id)
    obs = env.step(archive)

    assert obs.metadata.get("step_ok") is True
    assert obs.reward is not None
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# --- 11 cases: complete pending task -----------------------------------------
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@pytest.mark.parametrize("task_id", COMPLETE_TASK_IDS)
def test_dead_complete_task_step_ok(task_id: str) -> None:
    """Completing each listed task succeeds and earns at least the task micro bonus."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    complete = GhostexecAction(action_type="complete_task", task_id=task_id)
    obs = env.step(complete)

    assert obs.metadata.get("step_ok") is True
    assert obs.reward is not None

    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    task_score = float(breakdown.get("task") or 0)
    assert task_score >= reward_mod._COMPLETE_TASK_VALID_MICRO_BONUS
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# --- 10 cases: reschedule safe slots -----------------------------------------
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@pytest.mark.parametrize("meeting_id,new_time", _SAFE_RESCHEDULES)
def test_dead_reschedule_meeting_resolves_or_micro(meeting_id: str, new_time: str) -> None:
    """Each whitelisted reschedule succeeds and out-scores idling on a fresh env."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    move = GhostexecAction(
        action_type="reschedule_meeting", meeting_id=meeting_id, new_time=new_time
    )
    obs = env.step(move)

    assert obs.metadata.get("step_ok") is True
    assert obs.reward is not None

    # Should beat idle do-nothing on same fresh env
    idle_env = GhostexecEnvironment(SCENARIO)
    idle_env.reset()
    idle_obs = idle_env.step(GhostexecAction(action_type="do_nothing"))

    moved_reward = float(obs.reward or 0)
    idle_reward = float(idle_obs.reward or 0)
    assert moved_reward > idle_reward
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# --- 10 cases: cancel meeting --------------------------------------------------
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@pytest.mark.parametrize("meeting_id", MEETING_IDS_CANCEL)
def test_dead_cancel_meeting_step_ok(meeting_id: str) -> None:
    """Cancelling each listed meeting on a fresh env succeeds and produces a reward."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    cancel = GhostexecAction(
        action_type="cancel_meeting", meeting_id=meeting_id, reason="dead test cancel"
    )
    obs = env.step(cancel)

    assert obs.metadata.get("step_ok") is True
    assert obs.reward is not None
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# --- 6 cases: send_message -----------------------------------------------------
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
@pytest.mark.parametrize("contact_name", KNOWN_CONTACTS)
def test_dead_send_message_known_contact(contact_name: str) -> None:
    """Messaging a known contact succeeds and credits the relationship channel."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    message = GhostexecAction(
        action_type="send_message",
        contact_name=contact_name,
        message_body="Quick sync on priorities.",
    )
    obs = env.step(message)

    assert obs.metadata.get("step_ok") is True

    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    relationship_score = float(breakdown.get("relationship") or 0)
    # The micro bonus is the lower bound, with a 0.01 tolerance.
    minimum = reward_mod._SEND_MESSAGE_VALID_MICRO_BONUS - 0.01
    assert relationship_score >= minimum
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# --- 5 cases: delegate_task ---------------------------------------------------
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
@pytest.mark.parametrize(
    "task_id,contact",
    [
        ("t08", "Jordan Lee"),
        ("t07", "Jamie Liu"),
        ("t01", "Marcus Webb"),
        ("t02", "Sarah Chen"),
        ("t11", "Casey Nguyen"),
    ],
)
def test_dead_delegate_task(task_id: str, contact: str) -> None:
    """Delegating each (task, contact) pair succeeds and credits the task channel."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    delegate = GhostexecAction(
        action_type="delegate_task", task_id=task_id, contact_name=contact
    )
    obs = env.step(delegate)

    assert obs.metadata.get("step_ok") is True

    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    task_score = float(breakdown.get("task") or 0)
    # The micro bonus is the lower bound, with a 0.01 tolerance.
    minimum = reward_mod._DELEGATE_TASK_VALID_MICRO_BONUS - 0.01
    assert task_score >= minimum
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
# --- 10 cases: invalid actions ------------------------------------------------
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
@pytest.mark.parametrize(
    "action,expect_ok",
    [
        # Unknown or already-finished targets.
        (GhostexecAction(action_type="reply_email", email_id="nope", message_body="x"), False),
        (GhostexecAction(action_type="complete_task", task_id="t09"), False),
        (GhostexecAction(action_type="archive_email", email_id="nope"), False),
        (
            GhostexecAction(
                action_type="reschedule_meeting",
                meeting_id="m99",
                new_time="2026-04-21T18:00:00",
            ),
            False,
        ),
        (GhostexecAction(action_type="cancel_meeting", meeting_id="m99", reason="x"), False),
        (GhostexecAction(action_type="delegate_task", task_id="t01", contact_name="Nobody"), False),
        (GhostexecAction(action_type="send_message", contact_name="Nobody", message_body="hi"), False),
        # Empty identifiers.
        (GhostexecAction(action_type="reply_email", email_id="", message_body="hi"), False),
        (GhostexecAction(action_type="complete_task", task_id=""), False),
        (GhostexecAction(action_type="archive_email", email_id=""), False),
    ],
)
def test_dead_invalid_action_step_ok_false(action: GhostexecAction, expect_ok: bool) -> None:
    """Every listed invalid action reports ``expect_ok`` and a -0.25 adjustment."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    obs = env.step(action)

    assert obs.metadata.get("step_ok") is expect_ok

    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    adjustment = float(breakdown.get("invalid_step_adjustment") or 0)
    assert adjustment == pytest.approx(-0.25)
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# --- Ordering: VIP critical reply >> do_nothing --------------------------------
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def test_dead_priority_ordering_vip_critical_over_normal_over_idle() -> None:
    """First-step rewards order as reply(e01) > reply(e14) on average, both above idle."""

    def first_step_reward(**action_fields) -> float:
        # Fresh environment per measurement; the action is built after reset.
        env = GhostexecEnvironment(SCENARIO)
        env.reset()
        return float(env.step(GhostexecAction(**action_fields)).reward or 0)

    vip_rewards: list[float] = []
    normal_rewards: list[float] = []
    idle_rewards: list[float] = []
    for _ in range(5):
        vip_rewards.append(
            first_step_reward(action_type="reply_email", email_id="e01", message_body=_BODY)
        )
        normal_rewards.append(
            first_step_reward(action_type="reply_email", email_id="e14", message_body=_BODY)
        )
        idle_rewards.append(first_step_reward(action_type="do_nothing"))

    best_idle = max(idle_rewards)
    assert min(vip_rewards) > best_idle
    assert min(normal_rewards) > best_idle

    vip_mean = sum(vip_rewards) / len(vip_rewards)
    normal_mean = sum(normal_rewards) / len(normal_rewards)
    assert vip_mean > normal_mean
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# --- Tone penalty: casual to angry board contact ------------------------------
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def test_dead_tone_penalty_casual_to_angry_board() -> None:
    """A formal reply to ``e01`` must out-score a casual one on a fresh env."""
    casual_env = GhostexecEnvironment(SCENARIO)
    casual_env.reset()
    # Marcus Webb is board; ensure angry mood in scenario or pick contact - phase2 has Marcus ANGRY in briefing
    casual_reply = GhostexecAction(
        action_type="reply_email",
        email_id="e01",
        message_body="hey lol no worries",
    )
    casual_obs = casual_env.step(casual_reply)
    assert casual_obs.metadata.get("step_ok") is True

    formal_env = GhostexecEnvironment(SCENARIO)
    formal_env.reset()
    formal_reply = GhostexecAction(
        action_type="reply_email",
        email_id="e01",
        message_body="Dear Marcus, sincerely addressing the board request now.",
    )
    formal_obs = formal_env.step(formal_reply)

    formal_reward = float(formal_obs.reward or 0)
    casual_reward = float(casual_obs.reward or 0)
    assert formal_reward > casual_reward
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
# --- Reschedule adds conflict channel micro even if overlap unchanged ---------
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def test_dead_reschedule_micro_in_breakdown() -> None:
    """Rescheduling ``m07`` records at least the reschedule micro bonus in ``conflict_raw``."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    move = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m07",
        new_time="2026-04-21T20:30:00",
    )
    obs = env.step(move)

    assert obs.metadata.get("step_ok") is True

    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    conflict_raw = float(breakdown.get("conflict_raw") or 0)
    # The micro bonus is the lower bound, with a 0.01 tolerance.
    minimum = reward_mod._RESCHEDULE_VALID_MICRO_BONUS - 0.01
    assert conflict_raw >= minimum
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
# --- Unit: compute_step_reward invalid vs noop delta matches contract ---------
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def test_dead_compute_reward_invalid_vs_noop_delta() -> None:
    """On an unchanged world an invalid step scores ``0.25 - 0.15`` below do_nothing."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    idle_action = GhostexecAction(action_type="do_nothing")
    invalid_action = GhostexecAction(
        action_type="reply_email", email_id="missing", message_body="x"
    )

    # The same world is passed as both "before" and "after" for both computations.
    idle_breakdown = reward_mod.compute_step_reward(
        world, world, idle_action, action_ok=True, episode_done=False
    )
    invalid_breakdown = reward_mod.compute_step_reward(
        world, world, invalid_action, action_ok=False, episode_done=False
    )

    expected_gap = 0.25 - 0.15
    assert invalid_breakdown.final == pytest.approx(idle_breakdown.final - expected_gap)
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def test_dead_vip_critical_reply_outscores_professional_critical() -> None:
    """Replying to ``e01`` must earn strictly more than replying to ``e21``.

    Per the suite's intent, ``e01`` is the VIP critical sender and ``e21`` the
    professional critical one.
    """
    vip_env = GhostexecEnvironment(SCENARIO)
    vip_env.reset()
    vip_reply = GhostexecAction(action_type="reply_email", email_id="e01", message_body=_BODY)
    vip_reward = float(vip_env.step(vip_reply).reward or 0)

    pro_env = GhostexecEnvironment(SCENARIO)
    pro_env.reset()
    pro_reply = GhostexecAction(action_type="reply_email", email_id="e21", message_body=_BODY)
    pro_reward = float(pro_env.step(pro_reply).reward or 0)

    assert vip_reward > pro_reward
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
validate-submission.sh
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# validate-submission.sh — Ghostexec OpenEnv Submission Validator
|
| 4 |
+
#
|
| 5 |
+
# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
|
| 6 |
+
|
| 7 |
+
# Unset variables are errors and pipelines report the first failing stage.
# Note: -e is not set; each step below checks command status itself.
set -uo pipefail

# Upper bound, in seconds, handed to run_with_timeout for the docker build.
DOCKER_BUILD_TIMEOUT=600

# Colours default to empty strings and are only switched on when stdout is a
# terminal, so redirected output contains no escape sequences.
RED='' GREEN='' YELLOW='' BOLD='' NC=''
if [ -t 1 ]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[1;33m'
  BOLD='\033[1m'
  NC='\033[0m'
fi
|
| 19 |
+
|
| 20 |
+
# run_with_timeout SECS CMD [ARGS...]
#
# Runs CMD with a time limit of SECS seconds and returns CMD's exit status.
# Uses `timeout` or `gtimeout` when one of them is on PATH; otherwise falls
# back to a background job plus a sleeping watchdog that kills it.
run_with_timeout() {
  local secs="$1"; shift

  local tool
  for tool in timeout gtimeout; do
    if command -v "$tool" &>/dev/null; then
      "$tool" "$secs" "$@"
      return $?
    fi
  done

  # No timeout binary available: emulate it with a watchdog subshell.
  "$@" &
  local cmd_pid=$!
  ( sleep "$secs" && kill "$cmd_pid" 2>/dev/null ) &
  local watchdog_pid=$!

  wait "$cmd_pid" 2>/dev/null
  local exit_code=$?

  # The command is done (or was killed); the watchdog is no longer needed.
  kill "$watchdog_pid" 2>/dev/null
  wait "$watchdog_pid" 2>/dev/null
  return $exit_code
}
|
| 38 |
+
|
| 39 |
+
# portable_mktemp [PREFIX]
#
# Prints the path of a new temporary file. First tries an explicit template
# under ${TMPDIR:-/tmp} named "<PREFIX>-XXXXXX" (PREFIX defaults to
# "validate"); if that invocation fails, falls back to a bare `mktemp`.
portable_mktemp() {
  local prefix="${1:-validate}"
  local template="${TMPDIR:-/tmp}/${prefix}-XXXXXX"

  if ! mktemp "$template" 2>/dev/null; then
    mktemp
  fi
}
|
| 43 |
+
|
| 44 |
+
# Paths registered here are removed when the script exits.
CLEANUP_FILES=()

# Delete every registered temp file.
cleanup() {
  # The ${arr[@]+...} form expands to nothing when the array is empty, which
  # avoids an "unbound variable" error under `set -u` on older bash releases.
  rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"
}

trap cleanup EXIT
|
| 47 |
+
|
| 48 |
+
# Positional arguments: <ping_url> is required, [repo_dir] defaults to ".".
PING_URL="${1:-}"
REPO_DIR="${2:-.}"

if [ -z "$PING_URL" ]; then
  printf "Usage: %s <ping_url> [repo_dir]\n\n" "$0"
  printf "%s\n" \
    " ping_url Your HuggingFace Space URL (e.g. https://modelbuilderhq-ghostexec.hf.space)" \
    " repo_dir Path to your repo (default: current directory)"
  exit 1
fi

# Resolve repo_dir to an absolute path; bail out if it cannot be entered.
REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)" || {
  printf "Error: directory '%s' not found\n" "${2:-.}"
  exit 1
}

# Drop one trailing slash so "$PING_URL/reset" never contains "//".
PING_URL="${PING_URL%/}"

# Number of checks that have passed so far.
PASS=0
|
| 65 |
+
|
| 66 |
+
# log MESSAGE...  -- print a UTC-timestamped line; backslash escapes in the
# message (such as the colour codes) are interpreted via printf's %b.
log() {
  local stamp
  stamp="$(date -u +%H:%M:%S)"
  printf "[%s] %b\n" "$stamp" "$*"
}

# pass MESSAGE  -- log a PASSED line and bump the PASS counter.
pass() {
  log "${GREEN}PASSED${NC} -- $1"
  PASS=$((PASS + 1))
}

# fail MESSAGE  -- log a FAILED line. Does not exit; callers use stop_at.
fail() {
  log "${RED}FAILED${NC} -- $1"
}

# hint MESSAGE  -- print an untimestamped "Hint:" line.
hint() {
  printf " ${YELLOW}Hint:${NC} %b\n" "$1"
}
|
| 70 |
+
# stop_at STEP_NAME
#
# Prints a blank line followed by a "Validation stopped at ..." message and
# terminates the script with exit status 1.
stop_at() {
  local step_name="$1"
  printf "\n${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$step_name"
  exit 1
}
|
| 75 |
+
|
| 76 |
+
# Opening banner: printf reuses its format for each argument, so all three
# banner lines share the same bold styling.
printf "\n"
printf "${BOLD}%s${NC}\n" \
  "========================================" \
  " Ghostexec OpenEnv Validator" \
  "========================================"
log "Repo: $REPO_DIR"
log "Ping URL: $PING_URL"
printf "\n"
|
| 83 |
+
|
| 84 |
+
# ---------------------------------------------------------------------------
# Step 1/3: POST an empty JSON body to <ping_url>/reset and require HTTP 200.
# ---------------------------------------------------------------------------
log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."

# The response body goes to a temp file that is removed on exit.
CURL_OUTPUT=$(portable_mktemp "validate-curl")
CLEANUP_FILES+=("$CURL_OUTPUT")

# curl's -w "%{http_code}" still prints "000" when the transfer fails, so a
# fallback printed inside the same substitution would produce "000000" and the
# "not reachable" branch below would never match. Assign the fallback
# separately. stderr goes to /dev/null rather than onto the -o file.
if ! HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
  -H "Content-Type: application/json" -d '{}' \
  "$PING_URL/reset" --max-time 30 2>/dev/null); then
  HTTP_CODE="000"
fi

if [ "$HTTP_CODE" = "200" ]; then
  pass "HF Space is live and responds to /reset"
elif [ "$HTTP_CODE" = "000" ]; then
  fail "HF Space not reachable (connection failed or timed out)"
  hint "Check your network connection and that the Space is running."
  # hint prints its argument through %b, so a single % is shown as-is.
  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
  stop_at "Step 1"
else
  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
  hint "Make sure your Space is running and the URL is correct."
  hint "Try opening $PING_URL in your browser first."
  stop_at "Step 1"
fi
|
| 105 |
+
|
| 106 |
+
# ---------------------------------------------------------------------------
# Step 2/3: build the Docker image from the repo root or server/ directory.
# ---------------------------------------------------------------------------
log "${BOLD}Step 2/3: Running docker build${NC} ..."

command -v docker &>/dev/null || {
  fail "docker command not found"
  hint "Install Docker: https://docs.docker.com/get-docker/"
  stop_at "Step 2"
}

# A Dockerfile in the repo root wins over one in server/.
if [ -f "$REPO_DIR/Dockerfile" ]; then
  DOCKER_CONTEXT="$REPO_DIR"
elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
  DOCKER_CONTEXT="$REPO_DIR/server"
else
  fail "No Dockerfile found in repo root or server/ directory"
  stop_at "Step 2"
fi

log " Found Dockerfile in $DOCKER_CONTEXT"

# Capture stdout and stderr together so the tail can be shown on failure.
if BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1); then
  BUILD_OK=true
else
  BUILD_OK=false
fi

if [ "$BUILD_OK" != true ]; then
  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
  printf "%s\n" "$BUILD_OUTPUT" | tail -20
  stop_at "Step 2"
fi
pass "Docker build succeeded"
|
| 135 |
+
|
| 136 |
+
# ---------------------------------------------------------------------------
# Step 3/3: run `openenv validate` from inside the repo directory.
# ---------------------------------------------------------------------------
log "${BOLD}Step 3/3: Running openenv validate${NC} ..."

command -v openenv &>/dev/null || {
  fail "openenv command not found"
  hint "Install it with your project env, e.g.: uv run pip install openenv-core"
  stop_at "Step 3"
}

# The cd happens inside the command substitution, so the script's own working
# directory is left untouched.
if VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1); then
  VALIDATE_OK=true
else
  VALIDATE_OK=false
fi

if [ "$VALIDATE_OK" != true ]; then
  fail "openenv validate failed"
  printf "%s\n" "$VALIDATE_OUTPUT"
  stop_at "Step 3"
fi

pass "openenv validate passed"
if [ -n "$VALIDATE_OUTPUT" ]; then
  log " $VALIDATE_OUTPUT"
fi
|
| 155 |
+
|
| 156 |
+
# Closing banner. This point is only reached when no step called stop_at.
printf "\n${BOLD}========================================${NC}\n"
printf "${GREEN}${BOLD}%s${NC}\n" \
  " All 3/3 checks passed!" \
  " Ghostexec is ready for submission."
printf "${BOLD}========================================${NC}\n\n"

exit 0
|