Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- README.md +410 -355
- environment-innovation/README.md +147 -0
- server/app.py +25 -0
README.md
CHANGED
|
@@ -1,355 +1,410 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Ghostexec Environment Server
|
| 3 |
-
emoji: 📢
|
| 4 |
-
colorFrom: pink
|
| 5 |
-
colorTo: yellow
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
app_port: 7860
|
| 9 |
-
base_path: /web
|
| 10 |
-
tags:
|
| 11 |
-
- openenv
|
| 12 |
-
---
|
| 13 |
-
|
| 14 |
-
# Ghostexec
|
| 15 |
-
|
| 16 |
-
**Ghostexec** is an [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible environment
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
**
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
uv run
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
```
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
```
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
uv run server --port 8000
|
| 213 |
-
#
|
| 214 |
-
|
| 215 |
-
uv run pytest tests/
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
uv run
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
```
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
```
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
---
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
```
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
**
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
-
|
| 338 |
-
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
-
|
| 343 |
-
-
|
| 344 |
-
-
|
| 345 |
-
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Ghostexec Environment Server
|
| 3 |
+
emoji: 📢
|
| 4 |
+
colorFrom: pink
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
app_port: 7860
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Ghostexec
|
| 15 |
+
|
| 16 |
+
**Ghostexec** is an [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible environment: a busy **executive chief-of-staff** simulator with inbox, calendar, contacts, tasks, and stakeholder moods. The agent must read a **plain-text briefing**, then emit **one structured action per step** (`reply_email`, `reschedule_meeting`, …). The server returns rewards shaped around **conflict**, **relationships**, and **tasks**—plus trajectory **graders** for hackathon validation. All episode **content** lives in `scenarios/*.json`; the engine is in `server/ghostexec_environment.py` and `server/reward.py`.
|
| 17 |
+
|
| 18 |
+
| Item | Value |
|
| 19 |
+
|------|--------|
|
| 20 |
+
| **HF Space name / manifest** | `ghostexec` in [`openenv.yaml`](openenv.yaml) |
|
| 21 |
+
| **Python package** | `openenv-ghostexec` in [`pyproject.toml`](pyproject.toml) (import `ghostexec`) |
|
| 22 |
+
| **Public Space** | [modelbuilderhq/ghostexec](https://huggingface.co/spaces/modelbuilderhq/ghostexec) |
|
| 23 |
+
| **Deeper innovation-only brief** | [`environment-innovation/README.md`](environment-innovation/README.md) |
|
| 24 |
+
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
## Deliverables (fill before freeze)
|
| 28 |
+
|
| 29 |
+
| Deliverable | URL |
|
| 30 |
+
|-------------|-----|
|
| 31 |
+
| Public HF Space (required) | [https://huggingface.co/spaces/modelbuilderhq/ghostexec](https://huggingface.co/spaces/modelbuilderhq/ghostexec) |
|
| 32 |
+
| Write-up / blog (HF post preferred) | `TODO: paste your post URL` |
|
| 33 |
+
| Short demo video (<2 min) | `TODO: paste your video URL` |
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Contents
|
| 38 |
+
|
| 39 |
+
**Judging criteria (this README is organized around them)**
|
| 40 |
+
|
| 41 |
+
1. [Criterion: Environment Innovation (40%)](#ghostexec-env-innovation)
|
| 42 |
+
2. [Criterion: Storytelling & Presentation (30%)](#ghostexec-storytelling)
|
| 43 |
+
3. [Criterion: Showing Improvement in Rewards (20%)](#ghostexec-reward-improvement)
|
| 44 |
+
4. [Criterion: Reward & Training Pipeline (10%)](#ghostexec-reward-pipeline)
|
| 45 |
+
|
| 46 |
+
**Reference**
|
| 47 |
+
|
| 48 |
+
5. [Hackathon themes & checklist](#openenv-hackathon-themes--checklist)
|
| 49 |
+
6. [Quick start](#quick-start-python-client)
|
| 50 |
+
7. [Actions](#actions-and-fields)
|
| 51 |
+
8. [Observation](#observation)
|
| 52 |
+
9. [Reward (formula summary)](#reward-formula-summary)
|
| 53 |
+
10. [HTTP vs WebSocket](#http-vs-websocket-episode-state)
|
| 54 |
+
11. [Running and testing locally](#running-and-testing-locally)
|
| 55 |
+
12. [Hugging Face Spaces](#hugging-face-spaces)
|
| 56 |
+
13. [Scenarios](#scenarios)
|
| 57 |
+
14. [Project layout](#project-layout)
|
| 58 |
+
15. [Resources & references](#resources--references)
|
| 59 |
+
16. [License](#license)
|
| 60 |
+
|
| 61 |
+
---
|
| 62 |
+
|
| 63 |
+
## Criterion: Environment Innovation (40%)
|
| 64 |
+
|
| 65 |
+
<a id="ghostexec-env-innovation"></a>
|
| 66 |
+
|
| 67 |
+
**Weight:** 40%
|
| 68 |
+
|
| 69 |
+
**What it means:**
|
| 70 |
+
|
| 71 |
+
- Is the environment novel, creative, or genuinely challenging?
|
| 72 |
+
- Does it meaningfully test agent behavior in a way that hasn't been done before?
|
| 73 |
+
|
| 74 |
+
### How Ghostexec answers this
|
| 75 |
+
|
| 76 |
+
**Challenging world.** The policy sees **one dense natural-language briefing** per step (emails, calendar overlaps, contacts with mood, overdue tasks, stress, steps remaining)—not a JSON dump of the world. It must **ground** decisions in real ids from that text, return **valid typed actions**, and accept **time pressure** and **social fallout** when meetings move or mail goes unanswered. Invalid actions **do not crash** the server; they return structured errors so learning signals stay intact.
|
| 77 |
+
|
| 78 |
+
**Meaningful behavior, not a toy Q&A.** Success needs **comprehension + tool discipline**: legal JSON schema, multi-step **sequences** (WebSocket sessions for real episodes), and **tradeoffs** across channels (mail vs calendar vs tasks vs relationships). **`do_nothing` is penalised** so “safe” idleness is costly when fires are burning.
|
| 79 |
+
|
| 80 |
+
**Dynamics, not a static paragraph.** After each valid action, the simulation **advances the clock**, updates **moods**, rebuilds **conflicts**, and can apply **scenario-driven drift** (`after_step` events in JSON): shifted meetings, new deadlines, preference changes—so the agent is tested on **adaptation**, not memorizing the first screen.
|
| 81 |
+
|
| 82 |
+
**Dual evaluation.** **Dense step rewards** in `server/reward.py` teach fine structure; **trajectory graders** in `graders.py` return scores strictly in **`(0.01, 0.99)`** per OpenEnv task wiring in `openenv.yaml`. Agents learn from the dense signal; judges get bounded certification scores.
|
| 83 |
+
|
| 84 |
+
**Honest novelty claim.** Inboxes and calendars are familiar **ingredients**. What is less common is the **composition**: OpenEnv-native packaging, **plain-text-only** observations, **data-defined** scenarios, live dynamics + drift, dual reward/grader stack, and a **transactional** action API in one trainable, hostable environment.
|
| 85 |
+
|
| 86 |
+
### Task ladder (difficulty in data)
|
| 87 |
+
|
| 88 |
+
| Task id | Difficulty | Scenario | What gets harder |
|
| 89 |
+
|---------|------------|----------|------------------|
|
| 90 |
+
| `phase2_core` | easy | `scenarios/phase2_core.json` | Dense triage: VIP mail, calendar relief, overlapping work. |
|
| 91 |
+
| `monday_morning` | medium | `scenarios/monday_morning.json` | Stacked Monday rush, less slack. |
|
| 92 |
+
| `dinner_disaster` | hard | `scenarios/dinner_disaster.json` | Personal vs professional collision, escalation risk. |
|
| 93 |
+
|
| 94 |
+
### 5-minute verification checklist
|
| 95 |
+
|
| 96 |
+
1. **`openenv.yaml`** — three tasks, `max_steps`, `app: server.app:app`, `name: ghostexec`, grader paths.
|
| 97 |
+
2. **`scenarios/*.json`** — world content is **data**, not hardcoded lore in Python.
|
| 98 |
+
3. **`server/ghostexec_environment.py`** — `build_briefing_text`, `_apply_action`, post-step dynamics, schema drift hooks.
|
| 99 |
+
4. **`server/reward.py`** — fixed 0.35 / 0.35 / 0.30 core, invalid / idle handling, shaping caps.
|
| 100 |
+
5. **`graders.py`** — bounded grader outputs, trajectory consumption.
|
| 101 |
+
6. **Live Space** — `/docs` or `POST /reset` + `POST /step`: legal steps change state; illegal steps return errors, not stack traces.
|
| 102 |
+
|
| 103 |
+
For a **standalone** walkthrough of the innovation angle only, see **[environment-innovation/README.md](environment-innovation/README.md)**.
|
| 104 |
+
|
| 105 |
+
---
|
| 106 |
+
|
| 107 |
+
## Criterion: Storytelling & Presentation (30%)
|
| 108 |
+
|
| 109 |
+
<a id="ghostexec-storytelling"></a>
|
| 110 |
+
|
| 111 |
+
**Weight:** 30%
|
| 112 |
+
|
| 113 |
+
**What it means:**
|
| 114 |
+
|
| 115 |
+
- Can you clearly explain the problem, the environment, and what the agent learned?
|
| 116 |
+
- Is the demo engaging and easy to follow for a non-technical audience?
|
| 117 |
+
|
| 118 |
+
### The problem (plain language)
|
| 119 |
+
|
| 120 |
+
An executive’s day is **messy**: urgent email from a board member, a double-booked calendar, a spouse texting about dinner, a report due at noon, and every choice **ripples**—someone feels heard or ignored, a conflict gets better or worse, a task slips or gets done. Ghostexec turns that into a **small simulator** the model must **run**, not a single paragraph to summarize.
|
| 121 |
+
|
| 122 |
+
### The environment (one sentence)
|
| 123 |
+
|
| 124 |
+
**You read a realistic staff briefing; you pick one legal “move” (reply, reschedule, delegate, …); the world updates; you get a score that reflects tension across work, people, and tasks.**
|
| 125 |
+
|
| 126 |
+
### What the agent is supposed to learn
|
| 127 |
+
|
| 128 |
+
- **Read carefully** — wrong `email_id` / `meeting_id` / `task_id` fails cleanly with feedback.
|
| 129 |
+
- **Act under pressure** — clock, `max_steps`, and stress push toward decisions, not endless analysis.
|
| 130 |
+
- **Balance competing goals** — improving relationships can conflict with clearing the calendar or finishing tasks; rewards encode that tradeoff.
|
| 131 |
+
- **Recover from change** — drift events mean the “right” plan from step 1 may not stay right at step 8.
|
| 132 |
+
|
| 133 |
+
### Demo tips for a non-technical audience
|
| 134 |
+
|
| 135 |
+
1. **Show the briefing first** — let viewers see the same wall of text the model sees (relatable chaos).
|
| 136 |
+
2. **Show one good step vs one bad step** — e.g. thoughtful reply vs invalid id or `do_nothing` while critical mail waits (mood / reward visibly differ).
|
| 137 |
+
3. **Name the three “channels”** — calmer calendar, happier stakeholders, tasks moving forward—without math jargon.
|
| 138 |
+
4. **End on “what improved”** — after training, pick the same scenario and show fewer invalid steps, higher rewards, or a grader curve (ties to the 20% section below).
|
| 139 |
+
|
| 140 |
+
### Hackathon alignment (themes)
|
| 141 |
+
|
| 142 |
+
**Theme fit (examples):** Ghostexec fits **Theme 3.2 — Personalized tasks** (executive-style inbox, calendar, delegation). **Theme 4** is partially supported via `GHOSTEXEC_CURRICULUM`, `GHOSTEXEC_PERTURB`, and diverse `scenarios/`.
|
| 143 |
+
|
| 144 |
+
---
|
| 145 |
+
|
| 146 |
+
## Criterion: Showing Improvement in Rewards (20%)
|
| 147 |
+
|
| 148 |
+
<a id="ghostexec-reward-improvement"></a>
|
| 149 |
+
|
| 150 |
+
**Weight:** 20%
|
| 151 |
+
|
| 152 |
+
**What it means:**
|
| 153 |
+
|
| 154 |
+
- Is there observable evidence of training progress? Reward curves, before/after behavior, comparison against a baseline—anything that proves the agent learned something.
|
| 155 |
+
|
| 156 |
+
### Where evidence lives in this repo
|
| 157 |
+
|
| 158 |
+
| Artifact | Role |
|
| 159 |
+
|----------|------|
|
| 160 |
+
| `outputs/logs/episode_rewards.jsonl` | Per-step reward trace (gitignored); use for **reward curves** and component debugging. |
|
| 161 |
+
| `outputs/trainer_state.json` / training logs | Produced by training scripts when configured; feed into plotting. |
|
| 162 |
+
| `outputs/reward_log.csv` | Optional CSV companion for plotting pipelines. |
|
| 163 |
+
| `outputs/compliance_manifest.json` | Baseline / compliance metadata for **comparison** charts. |
|
| 164 |
+
| `outputs/plots/*.png` | Generated report figures (see command below). |
|
| 165 |
+
|
| 166 |
+
**Plot pack (loss + reward + components + baseline bar):**
|
| 167 |
+
|
| 168 |
+
```bash
|
| 169 |
+
uv run python scripts/plot_training_report.py \
|
| 170 |
+
--trainer-history outputs/trainer_state.json \
|
| 171 |
+
--reward-csv outputs/reward_log.csv \
|
| 172 |
+
--baselines-json outputs/compliance_manifest.json \
|
| 173 |
+
--out-dir outputs/plots
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
Writes `loss_curve.png`, `reward_curve.png`, `components_curve.png`, `baseline_comparison.png` under `outputs/plots/`.
|
| 177 |
+
|
| 178 |
+
**End-to-end notebook:** [`notebooks/ghostexec_unsloth_grpo_hf_api.ipynb`](notebooks/ghostexec_unsloth_grpo_hf_api.ipynb) is intended to **Run All** without manual steps (per project convention).
|
| 179 |
+
|
| 180 |
+
**Before / after narrative for judges:** same `task_id` and seed—show **lower invalid rate**, **higher mean step reward**, or **clearer grader trajectory** after finetuning. Pair numbers with **one short clip** of two runs side by side on the Space or local server.
|
| 181 |
+
|
| 182 |
+
---
|
| 183 |
+
|
| 184 |
+
## Criterion: Reward & Training Pipeline (10%)
|
| 185 |
+
|
| 186 |
+
<a id="ghostexec-reward-pipeline"></a>
|
| 187 |
+
|
| 188 |
+
**Weight:** 10%
|
| 189 |
+
|
| 190 |
+
**What it means:**
|
| 191 |
+
|
| 192 |
+
- Is the reward logic coherent?
|
| 193 |
+
- Does the pipeline produce meaningful improvement in the trained agent's behavior?
|
| 194 |
+
|
| 195 |
+
### Reward logic (coherent and inspectable)
|
| 196 |
+
|
| 197 |
+
Phase-4 scoring in `server/reward.py` uses a **fixed** core blend:
|
| 198 |
+
|
| 199 |
+
\[
|
| 200 |
+
\text{weighted base} = 0.35 \cdot \text{conflict} + 0.35 \cdot \text{relationship} + 0.30 \cdot \text{task}
|
| 201 |
+
\]
|
| 202 |
+
|
| 203 |
+
Then bounded shaping, invalid-step handling, and explicit penalties (including **`do_nothing`**). Components surface on `RewardBreakdown` and in observation **metadata** where configured—so “why did this step score X?” is **auditable**, not a black box.
|
| 204 |
+
|
| 205 |
+
Design rationale is aligned with dense reward-shaping practice (see [arXiv:2408.10215](https://arxiv.org/abs/2408.10215))—fixed channel weights, bounded shaping magnitudes, and dense per-step signal; sparse end-of-episode-only rewards are deliberately avoided for training.
|
| 206 |
+
|
| 207 |
+
### Training pipeline (entrypoints)
|
| 208 |
+
|
| 209 |
+
| Step | Command / artifact |
|
| 210 |
+
|------|---------------------|
|
| 211 |
+
| Install | `uv sync` (from repo root) |
|
| 212 |
+
| Server (matches Dockerfile) | `uv run server --port 8000` |
|
| 213 |
+
| SFT → GRPO script | `uv run python scripts/train_sft_then_grpo.py` (see [Running and testing locally](#running-and-testing-locally) for a full example invocation) |
|
| 214 |
+
| Tests | `uv run pytest tests/ -q` |
|
| 215 |
+
| Docker build gate | `GHOSTEXEC_RUN_DOCKER_BUILD=1 uv run pytest tests/test_docker_build.py -q` |
|
| 216 |
+
|
| 217 |
+
The pipeline is **meaningful** when tied to the **20% evidence** above: same env URL, logged rewards, and plots that move in the right direction over training—not when loss alone decreases.
|
| 218 |
+
|
| 219 |
+
---
|
| 220 |
+
|
| 221 |
+
## OpenEnv Hackathon themes & checklist
|
| 222 |
+
|
| 223 |
+
| Item | Status |
|
| 224 |
+
|------|--------|
|
| 225 |
+
| OpenEnv-based env + `openenv.yaml` | In-repo (`openenv-core[core]>=0.2.3`). |
|
| 226 |
+
| Short write-up or <2 min video | **You:** publish and paste URLs in [Deliverables](#deliverables-fill-before-freeze). |
|
| 227 |
+
| Public HF Space | [Deliverables](#deliverables-fill-before-freeze); deploy with `openenv push --repo-id <your>/ghostexec`. |
|
| 228 |
+
|
| 229 |
+
---
|
| 230 |
+
|
| 231 |
+
## Quick start (Python client)
|
| 232 |
+
|
| 233 |
+
From the repo root (where `pyproject.toml` lives):
|
| 234 |
+
|
| 235 |
+
```bash
|
| 236 |
+
uv sync
|
| 237 |
+
uv run server --port 8000
|
| 238 |
+
```
|
| 239 |
+
|
| 240 |
+
```python
|
| 241 |
+
from ghostexec import GhostexecAction, GhostexecEnv
|
| 242 |
+
|
| 243 |
+
with GhostexecEnv(base_url="http://127.0.0.1:8000") as env:
|
| 244 |
+
out = env.reset()
|
| 245 |
+
print(out.observation.echoed_message[:500], "…")
|
| 246 |
+
|
| 247 |
+
step = env.step(
|
| 248 |
+
GhostexecAction(
|
| 249 |
+
action_type="reply_email",
|
| 250 |
+
email_id="e01",
|
| 251 |
+
message_body=(
|
| 252 |
+
"Marcus — acknowledged. Revised figures and short rationale "
|
| 253 |
+
"before noon. — Exec"
|
| 254 |
+
),
|
| 255 |
+
)
|
| 256 |
+
)
|
| 257 |
+
print("reward:", step.reward)
|
| 258 |
+
print("metadata keys:", sorted((step.observation.metadata or {}).keys()))
|
| 259 |
+
```
|
| 260 |
+
|
| 261 |
+
**Docker (optional):**
|
| 262 |
+
|
| 263 |
+
```bash
|
| 264 |
+
docker build -t ghostexec-env:latest .
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
---
|
| 268 |
+
|
| 269 |
+
## Actions and fields
|
| 270 |
+
|
| 271 |
+
`GhostexecAction` (`models.py`):
|
| 272 |
+
|
| 273 |
+
| `action_type` | Typical fields |
|
| 274 |
+
|---------------|----------------|
|
| 275 |
+
| `reply_email` | `email_id`, `message_body` |
|
| 276 |
+
| `archive_email` | `email_id` |
|
| 277 |
+
| `reschedule_meeting` | `meeting_id`, `new_time`, `reason` |
|
| 278 |
+
| `cancel_meeting` | `meeting_id`, `reason` |
|
| 279 |
+
| `complete_task` | `task_id` |
|
| 280 |
+
| `delegate_task` | `task_id`, `contact_name` |
|
| 281 |
+
| `send_message` | `contact_name`, `message` |
|
| 282 |
+
| `do_nothing` | — (penalised path) |
|
| 283 |
+
|
| 284 |
+
Malformed HTTP payloads are handled safely so clients do not crash the server.
|
| 285 |
+
|
| 286 |
+
---
|
| 287 |
+
|
| 288 |
+
## Observation
|
| 289 |
+
|
| 290 |
+
- **`echoed_message`** — Full plain-text briefing.
|
| 291 |
+
- **`message_length`** — Length of briefing.
|
| 292 |
+
- **`reward`**, **`done`**, **`metadata`** — Step outcome; metadata includes `step_ok`, reward breakdown fields, and debug ids.
|
| 293 |
+
|
| 294 |
+
---
|
| 295 |
+
|
| 296 |
+
## Reward (formula summary)
|
| 297 |
+
|
| 298 |
+
Full detail is under [Criterion: Reward & Training Pipeline (10%)](#criterion-reward--training-pipeline-10). Episode logs: `outputs/logs/episode_rewards.jsonl` (gitignored).
|
| 299 |
+
|
| 300 |
+
---
|
| 301 |
+
|
| 302 |
+
## HTTP vs WebSocket (episode state)
|
| 303 |
+
|
| 304 |
+
- **HTTP** `POST /reset` and `POST /step` may use **short-lived** instances; consecutive HTTP calls might not share one in-memory episode.
|
| 305 |
+
- **WebSocket `/ws`** (or `GhostexecEnv`) — use for **multi-step episodes** on one session.
|
| 306 |
+
|
| 307 |
+
Endpoints: **`/web`**, **`/docs`**, **`/health`**, **`/ws`**.
|
| 308 |
+
|
| 309 |
+
---
|
| 310 |
+
|
| 311 |
+
## Running and testing locally
|
| 312 |
+
|
| 313 |
+
```bash
|
| 314 |
+
uv run uvicorn ghostexec.server.app:app --reload --host 0.0.0.0 --port 8000
|
| 315 |
+
# or
|
| 316 |
+
uv run server --port 8000
|
| 317 |
+
```
|
| 318 |
+
|
| 319 |
+
**HTTP smoke:**
|
| 320 |
+
|
| 321 |
+
```bash
|
| 322 |
+
uv run python scripts/http_endpoint_smoke.py --local
|
| 323 |
+
```
|
| 324 |
+
|
| 325 |
+
**Tests:**
|
| 326 |
+
|
| 327 |
+
```bash
|
| 328 |
+
uv run pytest tests/ -q
|
| 329 |
+
GHOSTEXEC_RUN_DOCKER_BUILD=1 uv run pytest tests/test_docker_build.py -q
|
| 330 |
+
uv run pytest tests/test_live_server_exhaustive.py -v --tb=short # server on :8000
|
| 331 |
+
```
|
| 332 |
+
|
| 333 |
+
**SFT → GRPO (example):**
|
| 334 |
+
|
| 335 |
+
```bash
|
| 336 |
+
uv run python scripts/train_sft_then_grpo.py \
|
| 337 |
+
--model-preset small_iter_fast \
|
| 338 |
+
--training-preset hackathon_turbo \
|
| 339 |
+
--env-url http://127.0.0.1:8000 \
|
| 340 |
+
--generate-sft-from-env \
|
| 341 |
+
--sft-samples 120 \
|
| 342 |
+
--max-sft-steps 60 \
|
| 343 |
+
--max-grpo-steps 120 \
|
| 344 |
+
--env-reward-scale 1.0 \
|
| 345 |
+
--local-reward-scale 0.35 \
|
| 346 |
+
--complexity-curriculum easy_to_full \
|
| 347 |
+
--curriculum-ramp-ratio 0.60
|
| 348 |
+
```
|
| 349 |
+
|
| 350 |
+
---
|
| 351 |
+
|
| 352 |
+
## Hugging Face Spaces
|
| 353 |
+
|
| 354 |
+
```bash
|
| 355 |
+
openenv serve
|
| 356 |
+
openenv build
|
| 357 |
+
openenv validate --verbose
|
| 358 |
+
openenv push
|
| 359 |
+
# openenv push --repo-id your-username/ghostexec
|
| 360 |
+
```
|
| 361 |
+
|
| 362 |
+
Use a **public** Space for the default hackathon flow. `openenv.yaml` carries **name**, **version**, and **description** for metadata—keep them in sync with submission needs.
|
| 363 |
+
|
| 364 |
+
---
|
| 365 |
+
|
| 366 |
+
## Scenarios
|
| 367 |
+
|
| 368 |
+
| File | Role |
|
| 369 |
+
|------|------|
|
| 370 |
+
| `scenarios/phase2_core.json` | Default dense fixture |
|
| 371 |
+
| `scenarios/monday_morning.json`, `dinner_disaster.json`, `vip_meltdown.json` | Narrative pressure |
|
| 372 |
+
| `scenarios/vip_meltdown_drift.json` | Mood / escalation drift |
|
| 373 |
+
| `scenarios/schema_drift_test.json` | Drift-event harness |
|
| 374 |
+
|
| 375 |
+
---
|
| 376 |
+
|
| 377 |
+
## Project layout
|
| 378 |
+
|
| 379 |
+
```
|
| 380 |
+
ghostexec/
|
| 381 |
+
├── openenv.yaml
|
| 382 |
+
├── pyproject.toml
|
| 383 |
+
├── models.py
|
| 384 |
+
├── client.py
|
| 385 |
+
├── graders.py
|
| 386 |
+
├── scenarios/
|
| 387 |
+
├── scripts/
|
| 388 |
+
├── notebooks/
|
| 389 |
+
├── tests/
|
| 390 |
+
└── server/
|
| 391 |
+
├── app.py
|
| 392 |
+
├── ghostexec_environment.py
|
| 393 |
+
├── reward.py
|
| 394 |
+
└── Dockerfile
|
| 395 |
+
```
|
| 396 |
+
|
| 397 |
+
---
|
| 398 |
+
|
| 399 |
+
## Resources & references
|
| 400 |
+
|
| 401 |
+
- [meta-pytorch/OpenEnv](https://github.com/meta-pytorch/OpenEnv) — core stack
|
| 402 |
+
- [Packaging & Deploying](https://meta-pytorch.org/OpenEnv/auto_getting_started/environment-builder.html)
|
| 403 |
+
- [OpenEnv Hub](https://huggingface.co/openenv)
|
| 404 |
+
- [Building RL Environments with OpenEnv](https://www.youtube.com/watch?v=0airz7BhBiA) (and related talks linked in prior README iterations)
|
| 405 |
+
|
| 406 |
+
---
|
| 407 |
+
|
| 408 |
+
## License
|
| 409 |
+
|
| 410 |
+
BSD-style — see license notices in source files (Meta / OpenEnv lineage).
|
environment-innovation/README.md
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ghostexec — innovation brief (for reviewers)
|
| 2 |
+
|
| 3 |
+
**Repository:** [Ghostexec (OpenEnv)](../README.md)
|
| 4 |
+
**Public Space:** https://huggingface.co/spaces/modelbuilderhq/ghostexec
|
| 5 |
+
|
| 6 |
+
This README is a **standalone** walkthrough for reviewers: why the environment is hard, what agent capabilities it stresses, how to verify claims in code and on the live Space. You can read it **without** opening the rest of the repo narrative.
|
| 7 |
+
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## Contents
|
| 11 |
+
|
| 12 |
+
1. [How to read this document](#how-to-read-this-document)
|
| 13 |
+
2. [Short answers](#short-answers-so-nothing-is-buried)
|
| 14 |
+
3. [What Ghostexec is](#1-what-ghostexec-is-one-paragraph)
|
| 15 |
+
4. [What the agent observes](#2-what-the-agent-observes-and-why-that-matters)
|
| 16 |
+
5. [What the agent can do](#3-what-the-agent-can-do-actions-and-legality)
|
| 17 |
+
6. [What changes between steps](#4-what-changes-between-steps-dynamics-and-drift)
|
| 18 |
+
7. [How success is scored](#5-how-success-is-scored-two-layers-on-purpose)
|
| 19 |
+
8. [Task ladder](#6-the-public-task-ladder-difficulty-in-data-not-vibes)
|
| 20 |
+
9. [Reviewer checklist](#7-how-a-reviewer-can-verify-5-minute-checklist)
|
| 21 |
+
10. [Closing](#8-closing)
|
| 22 |
+
11. [Key files (from repo root)](#key-files-from-repo-root)
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
## How to read this document
|
| 27 |
+
|
| 28 |
+
We group the argument under **two angles** reviewers typically care about. Everything below maps to one or both:
|
| 29 |
+
|
| 30 |
+
| Angle | Sections that answer it |
|
| 31 |
+
|-------|-------------------------|
|
| 32 |
+
| Is the **world** itself interesting and genuinely hard? | [Short answers](#short-answers-so-nothing-is-buried), [§1–§4](#1-what-ghostexec-is-one-paragraph) |
|
| 33 |
+
| Does it **stress-test agents** in a way a toy demo would not? | [Short answers](#short-answers-so-nothing-is-buried), [§3–§6](#3-what-the-agent-can-do-actions-and-legality), [§8](#8-closing) |
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
## Short answers (so nothing is buried)
|
| 38 |
+
|
| 39 |
+
**Is it genuinely challenging?** Yes. The agent must survive **dense natural-language state**, emit **strict structured actions** that **mutate** a multi-entity world, and accept **time pressure**, **social consequences**, and **invalid-action economics** without crashing the server. “Easy” wins are rare because channels **compete**: mail, calendar, tasks, and relationships all pull in different directions.
|
| 40 |
+
|
| 41 |
+
**Is it a meaningful test of behavior?** Yes. Success requires **grounded parsing** (real ids from the briefing), **tool discipline** (legal JSON schema), **sequencing** over multiple steps (WebSocket sessions for real episodes; HTTP for resets and single steps), and **tradeoffs** reflected in a **multi-channel** reward—not a single template answer.
|
| 42 |
+
|
| 43 |
+
**Is every ingredient globally novel?** No—and we do not claim otherwise. Inboxes and calendars are familiar. What *is* uncommon is the **composition**: OpenEnv-first packaging, **plain-text-only** observations, **data-driven** scenarios, **live dynamics** and **timed drift**, **dual** evaluation (**dense step rewards** + **trajectory graders** in strict `(0.01, 0.99)`), and a **production-shaped** action API—together—in one environment you can train and ship.
|
| 44 |
+
|
| 45 |
+
---
|
| 46 |
+
|
| 47 |
+
### 1. What Ghostexec is (one paragraph)
|
| 48 |
+
|
| 49 |
+
Ghostexec is an **executive chief-of-staff simulator**. Each episode starts from JSON scenario data under `../scenarios/`, selected by **task id** in `../openenv.yaml`. The **engine** lives in `../server/ghostexec_environment.py` and `../server/reward.py`; the **deployment contract** for Hugging Face / OpenEnv is `../openenv.yaml` (name **`ghostexec`**, FastAPI `server.app:app`, port **8000**). The model never sees raw scenario JSON as its primary observation: it sees a **rendered briefing**—the same class of messy, overlapping information a human would scan under time pressure.
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
### 2. What the agent observes (and why that matters)
|
| 54 |
+
|
| 55 |
+
After `reset` (or the WebSocket equivalent), the policy receives `GhostexecObservation.echoed_message`: a **single plain-text** block that includes, at minimum:
|
| 56 |
+
|
| 57 |
+
- A **timestamped header** (simulated “now”).
|
| 58 |
+
- **Unread emails** with priority, sender, relationship, subject, and a short preview.
|
| 59 |
+
- **Calendar conflicts** in a rolling horizon (overlaps the agent could resolve or worsen).
|
| 60 |
+
- **Top contacts** with **mood**, relationship type, and communication preference.
|
| 61 |
+
- **Tasks** that are overdue or due soon.
|
| 62 |
+
- **Executive stress** and **steps remaining** toward `max_steps` (see `../openenv.yaml`, default **20**).
|
| 63 |
+
|
| 64 |
+
**Why this matters for “challenging”:** many demos hide structure in JSON observations or tool schemas. Here, the **only** narrative state the model is supposed to “read” like a user is **natural language**, while the **law** of the world is still **typed actions**. That forces **comprehension + compliance** together—hallucinated ids and “vibes-only” plans fail in ways you can measure.
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
### 3. What the agent can do (actions and legality)
|
| 69 |
+
|
| 70 |
+
Each step the agent returns **exactly one** `GhostexecAction` (`../models.py`): `reply_email`, `archive_email`, `reschedule_meeting`, `cancel_meeting`, `complete_task`, `delegate_task`, `send_message`, or `do_nothing`.
|
| 71 |
+
|
| 72 |
+
**Validity is enforced against the live world:** wrong `email_id` / `meeting_id` / `task_id`, missing required fields, or impossible combinations produce an **invalid step**. The server **does not throw**; it returns structured metadata (`step_ok`, error text) so RL and HTTP clients can learn from mistakes instead of dying.
|
| 73 |
+
|
| 74 |
+
**Valid actions mutate state:** mail can be replied or archived; meetings moved or cancelled; tasks completed or delegated; direct messages sent. The episode is therefore a **small transactional simulation**, not a static Q&A.
|
| 75 |
+
|
| 76 |
+
---
|
| 77 |
+
|
| 78 |
+
### 4. What changes between steps (dynamics and drift)
|
| 79 |
+
|
| 80 |
+
Ghostexec is **not** a static paragraph with a hidden answer key. After actions, the environment runs **post-step dynamics** (see `../server/ghostexec_environment.py`):
|
| 81 |
+
|
| 82 |
+
- **Clock:** simulation time advances (default **20 minutes** per step), which can push tasks into overdue status and change what “urgent” means.
|
| 83 |
+
- **Mood:** stakeholders move along a mood ladder after real actions (e.g. a thoughtful reply can improve a sender’s mood; cancelling a meeting can upset attendees).
|
| 84 |
+
- **Pressure on idle / invalid behavior:** if the agent chooses **`do_nothing`** or submits an **invalid** action while **critical** mail is still unanswered, mood pressure can concentrate on the sender who is actually waiting—so “safe” inaction is not safe in the social graph.
|
| 85 |
+
- **Stress and conflicts:** the world rebuilds an **active conflict list** (overlaps, unanswered critical mail) and maps that into the **stress** value surfaced in the briefing—so calendar debt is not cosmetic.
|
| 86 |
+
|
| 87 |
+
**Scenario-driven schema drift:** harder JSON can schedule **`after_step`** events that reshuffle the world mid-episode: shift meetings, move deadlines, change communication preferences, **suppress relationship credit** for certain reply paths, or force moods. That tests **adaptation**, not memorization of the first screen.
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
### 5. How success is scored (two layers, on purpose)
|
| 92 |
+
|
| 93 |
+
**A. Dense step reward (training and fine-grained analysis)** — `../server/reward.py`
|
| 94 |
+
A **fixed** weighted core (**0.35 conflict + 0.35 relationship + 0.30 task**) plus **bounded** shaping terms (synergy, tradeoffs, progress-style shaping, scaffold, quality separation). Invalid steps and **`do_nothing`** are handled explicitly (idle is **penalised**, not neutral). Rich `RewardBreakdown` fields can be logged to `outputs/logs/episode_rewards.jsonl` (gitignored) for auditing *why* a step moved.
|
| 95 |
+
|
| 96 |
+
**B. Trajectory graders (OpenEnv / hackathon validation)** — `../graders.py`
|
| 97 |
+
Each public task in `../openenv.yaml` binds a grader (`graders.phase2_core_grader`, etc.). Graders read **trajectory-shaped** payloads (e.g. lists of rewards) and return scores **strictly inside `(0.01, 0.99)`**—the validator-facing layer—while the step engine remains the **dense teaching signal**.
|
| 98 |
+
|
| 99 |
+
That split is deliberate: **agents learn from fine structure**, **judges certify** with stable bounded scores.
|
| 100 |
+
|
| 101 |
+
---
|
| 102 |
+
|
| 103 |
+
### 6. The public task ladder (difficulty in *data*, not vibes)
|
| 104 |
+
|
| 105 |
+
| Task id | Difficulty | Scenario file | What gets harder |
|
| 106 |
+
|---------|------------|----------------|------------------|
|
| 107 |
+
| `phase2_core` | easy | `../scenarios/phase2_core.json` | Dense default triage: VIP mail, calendar relief, overlapping obligations. |
|
| 108 |
+
| `monday_morning` | medium | `../scenarios/monday_morning.json` | Stacked Monday rush: more concurrent fires, less slack. |
|
| 109 |
+
| `dinner_disaster` | hard | `../scenarios/dinner_disaster.json` | Personal vs professional collision with **escalation risk**. |
|
| 110 |
+
|
| 111 |
+
All of this is declared in **`../openenv.yaml`** so the Space, CLI, and notebooks agree on **names**, **ports**, and **grader wiring** without a second source of truth.
|
| 112 |
+
|
| 113 |
+
---
|
| 114 |
+
|
| 115 |
+
### 7. How a reviewer can verify (5-minute checklist)
|
| 116 |
+
|
| 117 |
+
1. Open **`../openenv.yaml`** — confirm three tasks, `max_steps`, `app: server.app:app`, **`name: ghostexec`**.
|
| 118 |
+
2. Open **`../scenarios/*.json`** — confirm episodes are **data**, not hardcoded Python lore.
|
| 119 |
+
3. Skim **`../server/ghostexec_environment.py`** — `build_briefing_text`, `_apply_action`, `_apply_post_action_dynamics`, `_maybe_apply_schema_drift_events`.
|
| 120 |
+
4. Skim **`../server/reward.py`** — fixed weights, invalid / idle handling, shaping caps.
|
| 121 |
+
5. Open **`../graders.py`** — strict output bounds and trajectory consumption.
|
| 122 |
+
6. Open the **public Space**: https://huggingface.co/spaces/modelbuilderhq/ghostexec — use `/docs` or `POST /reset` + `POST /step`: legal actions change state; illegal actions return errors, **not** stack traces.
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
### 8. Closing
|
| 127 |
+
|
| 128 |
+
**World quality.** The challenge is **interactional and operational**: overlapping human-style goals, strict tool use, evolving social signals, and mid-episode drift—**not** a single binary “did you answer correctly.”
|
| 129 |
+
|
| 130 |
+
**What this stack proves.** If you strip Ghostexec to one bullet, it is: **plain-text situational awareness + legal structured world edits + multi-channel rewards + timed scenario pressure + OpenEnv-native deployment and graders**—in one coherent package you can train, log, and host.
|
| 131 |
+
|
| 132 |
+
That is the **innovation case** this repository is built to defend.
|
| 133 |
+
|
| 134 |
+
---
|
| 135 |
+
|
| 136 |
+
## Key files (from repo root)
|
| 137 |
+
|
| 138 |
+
| Path | Role |
|
| 139 |
+
|------|------|
|
| 140 |
+
| `openenv.yaml` | Space name, port, tasks, graders, `max_steps` |
|
| 141 |
+
| `scenarios/*.json` | Episode **data** (world content, drift hooks) |
|
| 142 |
+
| `server/ghostexec_environment.py` | Briefing text, actions, dynamics, drift |
|
| 143 |
+
| `server/reward.py` | Step reward, fixed 0.35 / 0.35 / 0.30 core + shaping |
|
| 144 |
+
| `graders.py` | Trajectory scores in `(0.01, 0.99)` per task |
|
| 145 |
+
| `models.py` | `GhostexecAction`, `GhostexecObservation`, `RewardBreakdown` |
|
| 146 |
+
|
| 147 |
+
For install, tests, training scripts, and the rest of the hackathon submission, see the [main project README](../README.md).
|
server/app.py
CHANGED
|
@@ -28,6 +28,9 @@ Usage:
|
|
| 28 |
python -m server.app
|
| 29 |
"""
|
| 30 |
|
|
|
|
|
|
|
|
|
|
| 31 |
try:
|
| 32 |
import openenv.core.env_server.http_server as _openenv_http
|
| 33 |
except Exception as e: # pragma: no cover
|
|
@@ -53,6 +56,28 @@ _openenv_http.serialize_observation = _ghostexec_serialize_observation
|
|
| 53 |
|
| 54 |
from openenv.core.env_server.http_server import create_app # noqa: E402
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
try:
|
| 57 |
# Editable / normal install (package name `ghostexec`).
|
| 58 |
from ghostexec.models import GhostexecAction, GhostexecObservation
|
|
|
|
| 28 |
python -m server.app
|
| 29 |
"""
|
| 30 |
|
| 31 |
+
import os
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
|
| 34 |
try:
|
| 35 |
import openenv.core.env_server.http_server as _openenv_http
|
| 36 |
except Exception as e: # pragma: no cover
|
|
|
|
| 56 |
|
| 57 |
from openenv.core.env_server.http_server import create_app # noqa: E402
|
| 58 |
|
| 59 |
+
|
| 60 |
+
def _configure_openenv_readme_path() -> None:
|
| 61 |
+
"""OpenEnv Gradio sidebar loads README from /app/README.md or ENV_README_PATH only.
|
| 62 |
+
|
| 63 |
+
Our Docker layout copies the repo to /app/env/, so README.md lives at
|
| 64 |
+
/app/env/README.md. Set ENV_README_PATH before create_app so the Playground
|
| 65 |
+
shows the README instead of "No README available."
|
| 66 |
+
"""
|
| 67 |
+
if os.environ.get("ENV_README_PATH"):
|
| 68 |
+
return
|
| 69 |
+
_here = Path(__file__).resolve()
|
| 70 |
+
for candidate in (
|
| 71 |
+
Path("/app/env/README.md"), # HF Space / openenv Docker layout
|
| 72 |
+
_here.parent.parent / "README.md", # repo root when running from source
|
| 73 |
+
):
|
| 74 |
+
if candidate.is_file():
|
| 75 |
+
os.environ["ENV_README_PATH"] = str(candidate)
|
| 76 |
+
return
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
_configure_openenv_readme_path()
|
| 80 |
+
|
| 81 |
try:
|
| 82 |
# Editable / normal install (package name `ghostexec`).
|
| 83 |
from ghostexec.models import GhostexecAction, GhostexecObservation
|