modelbuilderhq committed on
Commit ee21104 · verified · 1 Parent(s): f59df3f

Upload folder using huggingface_hub

README.md CHANGED
@@ -215,6 +215,55 @@ set GHOSTEXEC_WS_BASE_URL=http://127.0.0.1:8000
 
215
  uv run pytest tests/test_complete_integration.py::test_ghostexec_env_client_against_live_url_if_set -q
216
  ```
217
 
218
+ Post-training plot pack (loss + reward + components + baseline bar):
219
+
220
+ ```bash
221
+ uv run python scripts/plot_training_report.py \
222
+ --trainer-history outputs/trainer_state.json \
223
+ --reward-csv outputs/reward_log.csv \
224
+ --baselines-json outputs/compliance_manifest.json \
225
+ --out-dir outputs/plots
226
+ ```
227
+
228
+ The script writes:
229
+ - `outputs/plots/loss_curve.png`
230
+ - `outputs/plots/reward_curve.png`
231
+ - `outputs/plots/components_curve.png`
232
+ - `outputs/plots/baseline_comparison.png`
233
+
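The plotting script only reads artifacts the notebook and training scripts already emit. A minimal sketch of the `reward_log.csv` shape it expects (column names taken from the plotting code; the rows here are purely illustrative):

```python
# Illustrative only: reward_log.csv needs a `global_step` column plus reward columns.
# Component columns such as `env`, `fmt`, `semantic`, `idle` enable the components plot;
# `env` (or `reward` / `mean_reward`) enables the reward curve.
from pathlib import Path
import pandas as pd

Path("outputs").mkdir(exist_ok=True)
rows = [
    {"global_step": 1, "env": -0.12, "fmt": 0.05, "semantic": -0.10, "idle": 0.02},
    {"global_step": 2, "env": 0.04, "fmt": 0.07, "semantic": 0.12, "idle": 0.02},
]
pd.DataFrame(rows).to_csv("outputs/reward_log.csv", index=False)
```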
234
+ SFT warm-up before GRPO (the live env is used for SFT data generation and, together with local shaping, for GRPO rewards):
235
+
236
+ ```bash
237
+ uv run python scripts/train_sft_then_grpo.py \
238
+ --model-preset small_iter_fast \
239
+ --training-preset hackathon_turbo \
240
+ --env-url http://127.0.0.1:8000 \
241
+ --generate-sft-from-env \
242
+ --sft-samples 120 \
243
+ --max-sft-steps 60 \
244
+ --max-grpo-steps 120 \
245
+ --env-reward-scale 1.0 \
246
+ --local-reward-scale 0.35 \
247
+ --complexity-curriculum easy_to_full \
248
+ --curriculum-ramp-ratio 0.60
249
+ ```
250
+
251
+ This performs:
252
+ - SFT warm-start on JSONL (`prompt` + `completion`) generated from live `/reset` briefings.
253
+ - GRPO continuation from the SFT adapter.
254
+ - Mixed reward shaping where env-derived reward remains active and local shaping can be down-weighted or up-weighted via scales (see the sketch after this list).
255
+ - Optional complexity curriculum (`easy_to_full`) that starts with stronger scaffold/local signals and anneals to env-dominant reward later.
256
+ - Stability-first optimization defaults (cosine schedule + warmup + grad clipping + higher GRPO KL beta) and optional guardrails:
257
+ - `--reward-ema-decay 0..1` smooths the *env* reward channel (defaults come from `--training-preset`).
258
+ - omit `--no-stability-tripwire` to enable early stopping when logs show repeated “env reward down + loss up” (GRPO) or repeated loss blow-ups (SFT).
259
+
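The mixing above is easiest to see as a weighted sum of the two channels. A minimal sketch, assuming the scales map directly onto `--env-reward-scale` / `--local-reward-scale` and that the `easy_to_full` curriculum simply anneals them (the exact schedule lives in `scripts/train_sft_then_grpo.py`):

```python
# Hypothetical illustration of mixed reward shaping with an easy -> full curriculum.
def mixed_reward(env_r: float, local_r: float,
                 env_scale: float = 1.0, local_scale: float = 0.35) -> float:
    # env-derived reward stays active; local shaping is scaled up or down.
    return env_scale * env_r + local_scale * local_r

# Early in the ramp the local scaffold carries more weight; later the env channel dominates.
print(mixed_reward(0.4, 0.3, env_scale=0.85, local_scale=0.60))  # easy phase
print(mixed_reward(0.4, 0.3, env_scale=1.00, local_scale=0.25))  # full (env-dominant) phase
```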
260
+ Recommended model strategy for hackathon iteration speed:
261
+ - Start with `--model-preset small_iter_fast` (`unsloth/Qwen2.5-3B-Instruct`) + QLoRA.
262
+ - Run many short SFT->GRPO loops, improve reward signals, then scale model size only after curves stabilize.
263
+ - Use larger presets only when memory + runtime are consistently stable.
264
+ - Use `--training-preset hackathon_turbo` to apply stable aggressive defaults for iterative win-rate.
265
+ - The script prints SFT/GRPO LoRA delta checks and stops if the deltas are near zero, so you never mistake a no-op run for real finetuning.
266
+
267
  ---
268
 
269
  ## Hugging Face Spaces
graders.py CHANGED
@@ -7,28 +7,57 @@ rewards in `server/reward.py`. The hackathon validator reads `openenv.yaml`
7
  """
8
  from __future__ import annotations
9
 
10
- from typing import Iterable, List
 
11
 
12
  STRICT_MIN = 0.01
13
  STRICT_MAX = 0.99
14
 
15
 
16
  def _bounded(value: float) -> float:
17
- return min(max(round(float(value), 4), STRICT_MIN), STRICT_MAX)
18
 
19
 
20
  def _as_reward_list(trajectory: dict | None) -> List[float]:
21
  payload = trajectory or {}
22
  rewards = payload.get("rewards")
23
  if isinstance(rewards, list) and rewards:
24
- return [float(r) for r in rewards]
25
  if "score" in payload:
26
- return [float(payload["score"])]
27
  reward = payload.get("reward")
28
  if isinstance(reward, dict) and "total" in reward:
29
- return [float(reward["total"])]
30
  if reward is not None:
31
- return [float(reward)]
32
  return []
33
 
34
 
 
7
  """
8
  from __future__ import annotations
9
 
10
+ import math
11
+ from typing import List
12
 
13
  STRICT_MIN = 0.01
14
  STRICT_MAX = 0.99
15
 
16
 
17
  def _bounded(value: float) -> float:
18
+ try:
19
+ v = round(float(value), 4)
20
+ except (TypeError, ValueError):
21
+ return 0.5
22
+ if not math.isfinite(v):
23
+ return 0.5
24
+ return min(max(v, STRICT_MIN), STRICT_MAX)
25
 
26
 
27
  def _as_reward_list(trajectory: dict | None) -> List[float]:
28
  payload = trajectory or {}
29
+ if not isinstance(payload, dict):
30
+ return []
31
  rewards = payload.get("rewards")
32
  if isinstance(rewards, list) and rewards:
33
+ out: List[float] = []
34
+ for r in rewards:
35
+ try:
36
+ rv = float(r)
37
+ except (TypeError, ValueError):
38
+ continue
39
+ if math.isfinite(rv):
40
+ out.append(rv)
41
+ return out
42
  if "score" in payload:
43
+ try:
44
+ v = float(payload["score"])
45
+ return [v] if math.isfinite(v) else []
46
+ except (TypeError, ValueError):
47
+ return []
48
  reward = payload.get("reward")
49
  if isinstance(reward, dict) and "total" in reward:
50
+ try:
51
+ v = float(reward["total"])
52
+ return [v] if math.isfinite(v) else []
53
+ except (TypeError, ValueError):
54
+ return []
55
  if reward is not None:
56
+ try:
57
+ v = float(reward)
58
+ return [v] if math.isfinite(v) else []
59
+ except (TypeError, ValueError):
60
+ return []
61
  return []
62
 
63
 
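For reference, a quick sketch of how the hardened `graders.py` helpers behave on the payload shapes they now accept (values illustrative; assumes `graders.py` is importable from the repo root):

```python
# Illustrative only — exercises the trajectory shapes _as_reward_list accepts
# and the non-finite / non-numeric fallbacks added to _bounded.
from graders import _as_reward_list, _bounded

print(_as_reward_list({"rewards": [0.2, "bad", float("nan"), 0.9]}))  # [0.2, 0.9]
print(_as_reward_list({"score": 0.73}))                               # [0.73]
print(_as_reward_list({"reward": {"total": 0.5}}))                    # [0.5]
print(_as_reward_list(None))                                          # []
print(_bounded(1.7))      # clamped to STRICT_MAX -> 0.99
print(_bounded("oops"))   # non-numeric -> 0.5 fallback
```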
models.py CHANGED
@@ -195,6 +195,13 @@ class RewardBreakdown(BaseModel):
195
  conflict: float = 0.0
196
  relationship: float = 0.0
197
  task: float = 0.0
198
+ shaping_synergy: float = 0.0
199
+ shaping_tradeoff: float = 0.0
200
+ shaping_potential: float = 0.0
201
+ shaping_scaffold: float = 0.0
202
+ shaping_quality: float = 0.0
203
+ shaping_total: float = 0.0
204
+ shaping_to_base_ratio: float = 0.0
205
  weighted_base: float = 0.0
206
  output_scale: float = 1.0
207
  invalid_step_adjustment: float = 0.0
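A quick sketch of the extended breakdown (hypothetical values; how `shaping_total` and `shaping_to_base_ratio` are actually computed belongs to `server/reward.py`, and this assumes the remaining `RewardBreakdown` fields keep their defaults):

```python
# Hypothetical values — demonstrates the new shaping_* fields on RewardBreakdown.
from models import RewardBreakdown

rb = RewardBreakdown(
    task=0.4,
    weighted_base=0.42,
    shaping_scaffold=0.05,
    shaping_quality=0.03,
    shaping_total=0.08,
    shaping_to_base_ratio=0.08 / 0.42,  # if the ratio is shaping vs. weighted base
)
print(rb.shaping_total, round(rb.shaping_to_base_ratio, 3))
```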
notebooks/ghostexec_unsloth_grpo_hf_api.ipynb CHANGED
@@ -4,15 +4,15 @@
4
  "cell_type": "markdown",
5
  "metadata": {},
6
  "source": [
7
- "# Ghostexec — Unsloth + TRL GRPO against the deployed HF Space API\n",
8
  "\n",
9
- "Post-train `unsloth/Llama-3.2-3B-Instruct` with GRPO where every reward is fetched over HTTP from the **live** Ghostexec OpenEnv Space.\n",
10
  "\n",
11
  "- Live endpoint: `https://modelbuilderhq-ghostexec.hf.space`\n",
12
- "- Algorithm: TRL `0.22.2` `GRPOTrainer` (no vLLM — HF `generate()` path)\n",
13
- "- Base: `unsloth/Llama-3.2-3B-Instruct` (4-bit) + LoRA r=16 + bf16\n",
14
- "- Curriculum: exploration schedule across three stages (T=1.0 0.7 0.5)\n",
15
- "- Rewards: three **independent** functions — `env_reward` (live Space) / `format_reward` / `anti_idle_reward`\n",
16
  "\n",
17
  "### Help Guide phase map (notebook sections mirror `[Participant Help Guide] §18`)\n",
18
  "| Phase | Where |\n",
@@ -21,12 +21,13 @@
21
  "| 2 Build the environment | section 2 (already deployed; health check here) |\n",
22
  "| 3 Build rewards | section 3 |\n",
23
  "| 4 Deploy | section 4 (confirm) |\n",
24
- "| 5 Train small | section 5 (Stage B) |\n",
25
  "| 6 Inspect for hacking | section 6 |\n",
26
  "| 7 Add curriculum | section 7 (Stages C + D) |\n",
27
  "| 8 Train bigger | section 8 (knobs, not action) |\n",
28
  "| 9 Save and demo | section 9 |"
29
- ]
 
30
  },
31
  {
32
  "cell_type": "markdown",
@@ -90,7 +91,8 @@
90
  "from typing import Any\n",
91
  "\n",
92
  "GHOSTEXEC_ENV_URL = os.environ.get(\"GHOSTEXEC_ENV_URL\", \"https://modelbuilderhq-ghostexec.hf.space\")\n",
93
- "MODEL_ID = os.environ.get(\"MODEL_ID\", \"unsloth/Llama-3.2-3B-Instruct\")\n",
 
94
  "RUN_NAME = os.environ.get(\"RUN_NAME\", \"ghostexec-unsloth-grpo\")\n",
95
  "HUB_REPO_ID = os.environ.get(\"HUB_REPO_ID\", \"\")\n",
96
  "OUT = pathlib.Path(\"/content/ghostexec_out\") if os.path.exists(\"/content\") else pathlib.Path(\"./ghostexec_out\")\n",
@@ -175,8 +177,13 @@
175
  "source": [
176
  "### 2.2 Verifier sanity check (Help Guide §8)\n",
177
  "\n",
178
- "Fire every legal `action_type` once against the deployed Space. If rewards are all identical or `do_nothing` is not a floor, abort GRPO cannot learn from a degenerate verifier."
179
- ]
180
  },
181
  {
182
  "cell_type": "code",
@@ -188,33 +195,75 @@
188
  "]\n",
189
  "\n",
190
  "def _smoke_action(action_type: str) -> dict:\n",
191
  " return {\n",
192
- " \"action_type\": action_type,\n",
193
- " \"email_id\": \"email_01\" if \"email\" in action_type else \"\",\n",
194
- " \"message_body\": \"Acknowledged. Will follow up shortly.\",\n",
195
- " \"meeting_id\": \"meeting_01\" if \"meeting\" in action_type else \"\",\n",
196
- " \"new_time\": \"2025-01-02T15:00:00\" if action_type == \"reschedule_meeting\" else \"\",\n",
197
- " \"reason\": \"scheduling conflict\",\n",
198
- " \"task_id\": \"task_01\" if \"task\" in action_type else \"\",\n",
199
- " \"contact_name\": \"Alex\",\n",
200
- " \"message\": \"\",\n",
201
  " }\n",
202
  "\n",
203
  "rewards_by_action: dict[str, float] = {}\n",
204
  "for at in LEGAL_ACTION_TYPES:\n",
205
  " env.reset()\n",
206
- " r, _ = env.step(_smoke_action(at))\n",
207
  " rewards_by_action[at] = round(r, 4)\n",
208
- "print(json.dumps(rewards_by_action, indent=2))\n",
209
  "\n",
210
  "uniq = set(rewards_by_action.values())\n",
211
  "assert len(uniq) > 1, \"Verifier is constant across actions — env can't teach anything.\"\n",
212
- "assert rewards_by_action[\"do_nothing\"] <= min(rewards_by_action.values()) + 1e-6, \\\n",
213
- " \"do_nothing is not the worst/floor reward shape probably broken.\"\n",
214
- "print(\"\\nverifier OK rewards are discriminating and do_nothing is the floor.\")"
215
  ],
216
  "execution_count": null,
217
- "outputs": []
 
218
  },
219
  {
220
  "cell_type": "markdown",
@@ -264,6 +313,39 @@
264
  " try: return parse_action_strict(text)\n",
265
  " except Exception: return {\"action_type\": \"do_nothing\"}\n",
266
  "\n",
267
  "assert parse_action_strict('```json\\n{\"action_type\":\"archive_email\",\"email_id\":\"email_01\"}\\n```')[\"action_type\"] == \"archive_email\"\n",
268
  "assert parse_action(\"garbage\")[\"action_type\"] == \"do_nothing\"\n",
269
  "print(\"parser OK\")"
@@ -280,43 +362,115 @@
280
  " return c[0].get(\"content\", \"\")\n",
281
  " return c if isinstance(c, str) else str(c)\n",
282
  "\n",
283
  "def env_reward(completions, prompts=None, **_) -> list[float]:\n",
284
  " out: list[float] = []\n",
285
  " for c in completions:\n",
286
  " text = _completion_text(c)\n",
287
- " action = parse_action(text)\n",
288
  " try:\n",
289
  " env.reset()\n",
290
  " r, _ = env.step(action)\n",
291
  " except Exception:\n",
292
  " r = -1.0\n",
293
- " out.append(float(r))\n",
294
  " return out\n",
295
  "\n",
 
296
  "def format_reward(completions, **_) -> list[float]:\n",
297
  " out: list[float] = []\n",
298
  " for c in completions:\n",
299
  " text = _completion_text(c)\n",
300
  " try:\n",
301
- " parse_action_strict(text); out.append(0.1)\n",
 
302
  " except Exception:\n",
303
- " out.append(-0.1)\n",
304
  " return out\n",
305
  "\n",
 
306
  "def anti_idle_reward(completions, **_) -> list[float]:\n",
307
  " out: list[float] = []\n",
308
  " for c in completions:\n",
309
  " text = _completion_text(c)\n",
310
  " act = parse_action(text)\n",
311
- " out.append(-0.05 if act.get(\"action_type\") == \"do_nothing\" else 0.0)\n",
312
  " return out\n",
313
  "\n",
 
314
  "_dummy = '{\"action_type\":\"archive_email\",\"email_id\":\"email_01\"}'\n",
 
315
  "print(\"format :\", format_reward([_dummy]))\n",
 
316
  "print(\"anti_idle:\", anti_idle_reward([_dummy]))"
317
  ],
318
  "execution_count": null,
319
- "outputs": []
 
320
  },
321
  {
322
  "cell_type": "code",
@@ -325,24 +479,32 @@
325
  "from transformers import TrainerCallback\n",
326
  "\n",
327
  "class HackingTripwire(TrainerCallback):\n",
328
- " \"\"\"Stop training on mode collapse or reward-format divergence (Help Guide §8).\"\"\"\n",
329
- " def __init__(self, min_unique_ratio: float = 0.2):\n",
330
  " self.min_unique_ratio = min_unique_ratio\n",
 
331
  "\n",
332
  " def on_log(self, args, state, control, logs=None, **kw):\n",
333
  " logs = logs or {}\n",
334
  " uniq = logs.get(\"completions/unique_ratio\") or logs.get(\"completions/mean_unique\")\n",
335
  " env_r = logs.get(\"rewards/env_reward/mean\")\n",
336
  " fmt_r = logs.get(\"rewards/format_reward/mean\")\n",
 
337
  " if uniq is not None and uniq < self.min_unique_ratio:\n",
338
  " print(f\"[TRIPWIRE] unique_ratio={uniq:.2f} < {self.min_unique_ratio} — stopping.\")\n",
339
  " control.should_training_stop = True\n",
340
  " if env_r is not None and fmt_r is not None and env_r > 0.8 and fmt_r < 0.0:\n",
341
  " print(f\"[TRIPWIRE] env_r={env_r:.2f} but fmt_r={fmt_r:.2f} — possible hack. stopping.\")\n",
342
  " control.should_training_stop = True"
343
  ],
344
  "execution_count": null,
345
- "outputs": []
 
346
  },
347
  {
348
  "cell_type": "markdown",
@@ -357,10 +519,11 @@
357
  "cell_type": "markdown",
358
  "metadata": {},
359
  "source": [
360
- "## Phase 5 — Train small\n",
361
  "\n",
362
- "Load `unsloth/Llama-3.2-3B-Instruct` in 4-bit with Unsloth, attach LoRA, then run one **short** GRPO stage to prove the loop works end-to-end. vLLM is not used anywhere in this notebook — rollouts go through the standard HF `generate()` path inside `GRPOTrainer`."
363
- ]
 
364
  },
365
  {
366
  "cell_type": "code",
@@ -408,11 +571,17 @@
408
  " \"Legal action_type values: reply_email, archive_email, reschedule_meeting, cancel_meeting, \"\n",
409
  " \"complete_task, delegate_task, send_message, do_nothing.\\n\\n\"\n",
410
  " \"Output ONLY a compact JSON object with these keys (no prose, no code fences):\\n\"\n",
411
- " \"{\\\"action_type\\\": <one of the legal values>, \\\"email_id\\\": \\\"\\\", \\\"message_body\\\": \\\"\\\", \"\n",
412
  " \"\\\"meeting_id\\\": \\\"\\\", \\\"new_time\\\": \\\"\\\", \\\"reason\\\": \\\"\\\", \\\"task_id\\\": \\\"\\\", \"\n",
413
  " \"\\\"contact_name\\\": \\\"\\\", \\\"message\\\": \\\"\\\"}.\\n\\n\"\n",
414
- " \"Rules: prioritise VIP/board/critical items, match tone to sender mood, never choose do_nothing \"\n",
415
- " \"if any critical item is unresolved.\"\n",
416
  ")\n",
417
  "\n",
418
  "def build_prompt(briefing: str) -> list[dict]:\n",
@@ -425,7 +594,8 @@
425
  " return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)"
426
  ],
427
  "execution_count": null,
428
- "outputs": []
 
429
  },
430
  {
431
  "cell_type": "code",
@@ -492,9 +662,12 @@
492
  " pad_token_id=tokenizer.pad_token_id,\n",
493
  " )\n",
494
  " completion = tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n",
495
- " action = parse_action(completion)\n",
496
  " env.reset()\n",
497
- " r, _ = env.step(action)\n",
  " rs.append(r)\n",
499
  " FastLanguageModel.for_training(model)\n",
500
  " return rs\n",
@@ -510,26 +683,71 @@
510
  "baselines = {\"random\": random_rewards, \"frozen\": frozen_rewards}"
511
  ],
512
  "execution_count": null,
513
- "outputs": []
 
514
  },
515
  {
516
  "cell_type": "markdown",
517
  "metadata": {},
518
  "source": [
519
- "### 5.2 Stage B — first GRPO stage (broad exploration, short)\n",
520
  "\n",
521
- "T=1.0, num_generations=2, max_steps=20. Purpose: prove the training loop runs, the Space is reachable from the training process, and rewards move."
522
- ]
 
  },
524
  {
525
  "cell_type": "code",
526
  "metadata": {},
527
  "source": [
528
- "from trl import GRPOConfig, GRPOTrainer\n",
529
  "\n",
530
- "reward_funcs = [env_reward, format_reward, anti_idle_reward]\n",
531
  "stage_logs: dict[str, list[dict]] = {}\n",
532
  "\n",
 
533
  "def grpo_config(name: str, *, temperature: float, num_generations: int, max_steps: int, lr: float) -> GRPOConfig:\n",
534
  " return GRPOConfig(\n",
535
  " output_dir=str(OUT / f\"stage_{name}\"),\n",
@@ -537,20 +755,38 @@
537
  " gradient_accumulation_steps=4,\n",
538
  " num_generations=num_generations,\n",
539
  " max_prompt_length=1920,\n",
540
- " max_completion_length=128,\n",
541
  " temperature=temperature,\n",
542
  " learning_rate=lr,\n",
543
  " beta=0.04,\n",
544
  " max_steps=max_steps,\n",
545
  " logging_steps=1,\n",
546
- " bf16=True,\n",
 
547
  " report_to=\"none\",\n",
548
  " save_strategy=\"no\",\n",
549
  " remove_unused_columns=False,\n",
550
  " log_completions=True,\n",
551
  " )\n",
552
  "\n",
553
  "def run_stage(name: str, **kw) -> None:\n",
 
554
  " print(f\"\\n=== Stage {name} → {kw} ===\")\n",
555
  " trainer = GRPOTrainer(\n",
556
  " model=policy,\n",
@@ -567,10 +803,12 @@
567
  " tokenizer.save_pretrained(adapter_dir)\n",
568
  " print(f\"stage {name} adapter → {adapter_dir}\")\n",
569
  "\n",
570
- "run_stage(\"B\", temperature=1.0, num_generations=2, max_steps=20, lr=5e-6)"
 
571
  ],
572
  "execution_count": null,
573
- "outputs": []
 
574
  },
575
  {
576
  "cell_type": "markdown",
@@ -610,8 +848,15 @@
610
  "source": [
611
  "## Phase 7 — Add curriculum\n",
612
  "\n",
613
- "The deployed Space scenario is fixed, so the curriculum is an **exploration schedule**: Stage C exploits what Stage B found (T=0.7) and Stage D hardens (T=0.5, lower lr)."
614
- ]
  },
616
  {
617
  "cell_type": "code",
@@ -682,20 +927,38 @@
682
  "plt.show()\n",
683
  "\n",
684
  "rows = []\n",
 
685
  "step_counter = 0\n",
686
  "for name, log in stage_logs.items():\n",
687
  " for entry in log:\n",
688
  " r = entry.get(\"rewards/env_reward/mean\", entry.get(\"reward\"))\n",
689
- " if r is None: continue\n",
  " step_counter += 1\n",
691
  " rows.append({\n",
692
- " \"stage\": name, \"global_step\": step_counter, \"env\": r,\n",
 
  " \"fmt\": entry.get(\"rewards/format_reward/mean\", 0.0),\n",
 
694
  " \"idle\": entry.get(\"rewards/anti_idle_reward/mean\", 0.0),\n",
695
  " })\n",
 
696
  "df = pd.DataFrame(rows)\n",
697
  "df.to_csv(OUT / \"reward_log.csv\", index=False)\n",
698
  "\n",
 
  "if not df.empty:\n",
700
  " plt.figure(figsize=(8, 4))\n",
701
  " for name, sub in df.groupby(\"stage\"):\n",
@@ -708,6 +971,7 @@
708
  " plt.figure(figsize=(8, 4))\n",
709
  " plt.plot(df[\"global_step\"], df[\"env\"], label=\"env_reward\")\n",
710
  " plt.plot(df[\"global_step\"], df[\"fmt\"], label=\"format_reward\")\n",
 
711
  " plt.plot(df[\"global_step\"], df[\"idle\"], label=\"anti_idle_reward\")\n",
712
  " plt.xlabel(\"global step\"); plt.ylabel(\"mean component reward\")\n",
713
  " plt.title(\"Reward components — hacking-watch\")\n",
@@ -717,7 +981,8 @@
717
  " print(\"No numeric reward log found — skipping curve plots.\")"
718
  ],
719
  "execution_count": null,
720
- "outputs": []
 
721
  },
722
  {
723
  "cell_type": "code",
@@ -753,16 +1018,23 @@
753
  " \"env_url\": GHOSTEXEC_ENV_URL,\n",
754
  " \"model\": MODEL_ID,\n",
755
  " \"run\": RUN_NAME,\n",
756
- " \"stack\": {\"unsloth\": True, \"trl\": \"0.22.2\"},\n",
757
  " \"rewards\": {\n",
758
  " \"random_mean\": summary[\"random\"],\n",
759
  " \"frozen_mean\": summary[\"frozen\"],\n",
760
  " \"trained_mean\": summary[\"trained\"],\n",
761
  " \"improvement_vs_frozen\": summary[\"trained\"] - summary[\"frozen\"],\n",
762
  " },\n",
763
- " \"stages\": list(stage_logs.keys()),\n",
764
- " \"reward_fns\": [\"env_reward\", \"format_reward\", \"anti_idle_reward\"],\n",
765
- " \"curriculum\": \"exploration schedule (T=1.0→0.7→0.5)\",\n",
 
  " \"tripwire\": \"HackingTripwire (unique_ratio<0.2 or env↑/fmt↓)\",\n",
767
  " \"adapter_path\": str(final_adapter),\n",
768
  " \"mean_space_latency_ms\": round(sum(env.latency_ms) / max(len(env.latency_ms), 1), 1),\n",
@@ -773,7 +1045,8 @@
773
  "print(\"\\nmanifest →\", OUT / \"manifest.json\")"
774
  ],
775
  "execution_count": null,
776
- "outputs": []
 
777
  }
778
  ],
779
  "metadata": {
 
4
  "cell_type": "markdown",
5
  "metadata": {},
6
  "source": [
7
+ "# Ghostexec — Unsloth + TRL SFT -> GRPO against the deployed HF Space API\n",
8
  "\n",
9
+ "Post-train `unsloth/Llama-3.2-3B-Instruct` with **SFT warmup first** and then GRPO, where rewards are fetched over HTTP from the **live** Ghostexec OpenEnv Space.\n",
10
  "\n",
11
  "- Live endpoint: `https://modelbuilderhq-ghostexec.hf.space`\n",
12
+ "- Algorithm: TRL `0.22.2` `SFTTrainer` -> `GRPOTrainer` (no vLLM — HF `generate()` path)\n",
13
+ "- Base (recommended for fast winning iterations): `unsloth/Qwen2.5-3B-Instruct` (4-bit) + LoRA r=16 + bf16\n",
14
+ "- Curriculum: **easy -> full** annealing (strong local scaffold early, env-dominant later)\n",
15
+ "- Rewards: four **independent** functions — `env_reward` (live Space) / `format_reward` / `semantic_action_reward` / `anti_idle_reward`\n",
16
  "\n",
17
  "### Help Guide phase map (notebook sections mirror `[Participant Help Guide] §18`)\n",
18
  "| Phase | Where |\n",
 
21
  "| 2 Build the environment | section 2 (already deployed; health check here) |\n",
22
  "| 3 Build rewards | section 3 |\n",
23
  "| 4 Deploy | section 4 (confirm) |\n",
24
+ "| 5 Train small | section 5 (SFT + Stage B) |\n",
25
  "| 6 Inspect for hacking | section 6 |\n",
26
  "| 7 Add curriculum | section 7 (Stages C + D) |\n",
27
  "| 8 Train bigger | section 8 (knobs, not action) |\n",
28
  "| 9 Save and demo | section 9 |"
29
+ ],
30
+ "id": "33566e3d"
31
  },
32
  {
33
  "cell_type": "markdown",
 
91
  "from typing import Any\n",
92
  "\n",
93
  "GHOSTEXEC_ENV_URL = os.environ.get(\"GHOSTEXEC_ENV_URL\", \"https://modelbuilderhq-ghostexec.hf.space\")\n",
94
+ "# Small-model-first default for rapid iteration and higher success probability.\n",
95
+ "MODEL_ID = os.environ.get(\"MODEL_ID\", \"unsloth/Qwen2.5-3B-Instruct\")\n",
96
  "RUN_NAME = os.environ.get(\"RUN_NAME\", \"ghostexec-unsloth-grpo\")\n",
97
  "HUB_REPO_ID = os.environ.get(\"HUB_REPO_ID\", \"\")\n",
98
  "OUT = pathlib.Path(\"/content/ghostexec_out\") if os.path.exists(\"/content\") else pathlib.Path(\"./ghostexec_out\")\n",
 
177
  "source": [
178
  "### 2.2 Verifier sanity check (Help Guide §8)\n",
179
  "\n",
180
+ "**Colab / stale cells:** If the traceback mentions **`do_nothing is not the worst/floor`** on **line ~28**, you are running **old cached notebook code** (that assert was removed). Use **Runtime → Disconnect and delete runtime**, then **re-clone** the repo or **re-download** this notebook from GitHub and run from the top.\n",
181
+ "\n",
182
+ "**If every proactive action prints `-0.25` and only `do_nothing` is `-0.15`:** every non-idle smoke is an **invalid step** (wrong ids like `email_01`, or an outdated `_smoke_action`). This cell expects **real `phase2_core` ids** (`e01`, `e09`, `m02`, …) — see `_smoke_action` below.\n",
183
+ "\n",
184
+ "Fire every legal `action_type` once with **semantically valid** payloads (real ids from `scenarios/phase2_core.json`). Fake ids deserialize but fail validation (−0.25 invalid-step) and are not a fair probe. Also: **`do_nothing` is not guaranteed to be the lowest reward** — a valid but harmful action (e.g. cancelling an important meeting) can push the weighted score below the idle penalty. We instead assert **non-idle smokes are `step_ok=True`** and **`do_nothing` scores below a benign `reply_email` on `e01`**. If rewards are all identical, abort — GRPO cannot learn from a degenerate verifier."
185
+ ],
186
+ "id": "b747bc4e"
187
  },
188
  {
189
  "cell_type": "code",
 
195
  "]\n",
196
  "\n",
197
  "def _smoke_action(action_type: str) -> dict:\n",
198
+ " # Real IDs from phase2_core scenario\n",
199
+ " base = {\"action_type\": action_type, \"message\": \"\"}\n",
200
+ "\n",
201
+ " if action_type == \"reply_email\":\n",
202
+ " return {**base, \"email_id\": \"e01\", \"message_body\": \"Acknowledged — on it now.\"}\n",
203
+ " if action_type == \"archive_email\":\n",
204
+ " return {**base, \"email_id\": \"e09\"}\n",
205
+ " if action_type == \"reschedule_meeting\":\n",
206
+ " return {\n",
207
+ " **base,\n",
208
+ " \"meeting_id\": \"m02\",\n",
209
+ " \"new_time\": \"2026-04-21T18:00:00\",\n",
210
+ " \"reason\": \"freeing the morning block\",\n",
211
+ " }\n",
212
+ " if action_type == \"cancel_meeting\":\n",
213
+ " return {**base, \"meeting_id\": \"m10\", \"reason\": \"smoke test cancel\"}\n",
214
+ " if action_type == \"complete_task\":\n",
215
+ " return {**base, \"task_id\": \"t07\"}\n",
216
+ " if action_type == \"delegate_task\":\n",
217
+ " return {**base, \"task_id\": \"t08\", \"contact_name\": \"Jordan Lee\"}\n",
218
+ " if action_type == \"send_message\":\n",
219
+ " return {\n",
220
+ " **base,\n",
221
+ " \"contact_name\": \"Jamie Liu\",\n",
222
+ " \"message_body\": \"Quick sync when you have a minute.\",\n",
223
+ " }\n",
224
+ "\n",
225
+ " # do_nothing\n",
226
  " return {\n",
227
+ " **base,\n",
228
+ " \"email_id\": \"\",\n",
229
+ " \"message_body\": \"\",\n",
230
+ " \"meeting_id\": \"\",\n",
231
+ " \"new_time\": \"\",\n",
232
+ " \"reason\": \"\",\n",
233
+ " \"task_id\": \"\",\n",
234
+ " \"contact_name\": \"\",\n",
 
235
  " }\n",
236
  "\n",
237
  "rewards_by_action: dict[str, float] = {}\n",
238
+ "step_ok_by_action: dict[str, bool | None] = {}\n",
239
+ "\n",
240
  "for at in LEGAL_ACTION_TYPES:\n",
241
  " env.reset()\n",
242
+ " r, raw = env.step(_smoke_action(at))\n",
243
  " rewards_by_action[at] = round(r, 4)\n",
244
+ " obs = raw.get(\"observation\") or {}\n",
245
+ " step_ok_by_action[at] = (obs.get(\"metadata\") or {}).get(\"step_ok\")\n",
246
+ "\n",
247
+ "print(json.dumps({\"reward\": rewards_by_action, \"step_ok\": step_ok_by_action}, indent=2))\n",
248
  "\n",
249
  "uniq = set(rewards_by_action.values())\n",
250
  "assert len(uniq) > 1, \"Verifier is constant across actions — env can't teach anything.\"\n",
251
+ "\n",
252
+ "# All non-idle smokes must be valid\n",
253
+ "for at in LEGAL_ACTION_TYPES:\n",
254
+ " if at == \"do_nothing\":\n",
255
+ " continue\n",
256
+ " assert step_ok_by_action.get(at) is True, f\"{at} smoke is invalid; check IDs.\"\n",
257
+ "\n",
258
+ "# Idle should be worse than benign good action\n",
259
+ "assert rewards_by_action[\"do_nothing\"] < rewards_by_action[\"reply_email\"] - 1e-6, \\\n",
260
+ " \"do_nothing should score below reply_email(e01).\"\n",
261
+ "\n",
262
+ "print(\"\\nverifier OK — rewards vary, smokes are valid, do_nothing < reply_email(e01).\")"
263
  ],
264
  "execution_count": null,
265
+ "outputs": [],
266
+ "id": "5ed1a9bc"
267
  },
268
  {
269
  "cell_type": "markdown",
 
313
  " try: return parse_action_strict(text)\n",
314
  " except Exception: return {\"action_type\": \"do_nothing\"}\n",
315
  "\n",
316
+ "LEGAL_ACTION_TYPES = {\n",
317
+ " \"reply_email\", \"archive_email\", \"reschedule_meeting\", \"cancel_meeting\",\n",
318
+ " \"complete_task\", \"delegate_task\", \"send_message\", \"do_nothing\",\n",
319
+ "}\n",
320
+ "LEGAL_ACTION_KEYS = {\n",
321
+ " \"action_type\", \"email_id\", \"message_body\", \"meeting_id\",\n",
322
+ " \"new_time\", \"reason\", \"task_id\", \"contact_name\", \"message\",\n",
323
+ "}\n",
324
+ "\n",
325
+ "\n",
326
+ "def sanitize_action(raw: dict) -> dict:\n",
327
+ " \"\"\"Keep only legal Ghostexec fields and coerce malformed IDs/actions safely.\"\"\"\n",
328
+ " action = {k: v for k, v in (raw or {}).items() if k in LEGAL_ACTION_KEYS}\n",
329
+ "\n",
330
+ " at = str(action.get(\"action_type\", \"do_nothing\"))\n",
331
+ " if at not in LEGAL_ACTION_TYPES:\n",
332
+ " at = \"do_nothing\"\n",
333
+ " action[\"action_type\"] = at\n",
334
+ "\n",
335
+ " # Common model mistake: writes message text into `message` instead of `message_body`.\n",
336
+ " if at in {\"reply_email\", \"send_message\"}:\n",
337
+ " if not action.get(\"message_body\") and action.get(\"message\"):\n",
338
+ " action[\"message_body\"] = action[\"message\"]\n",
339
+ "\n",
340
+ " if \"email_id\" in action and not re.fullmatch(r\"e\\d{2}\", str(action[\"email_id\"])):\n",
341
+ " action[\"email_id\"] = \"\"\n",
342
+ " if \"meeting_id\" in action and not re.fullmatch(r\"m\\d{2}\", str(action[\"meeting_id\"])):\n",
343
+ " action[\"meeting_id\"] = \"\"\n",
344
+ " if \"task_id\" in action and not re.fullmatch(r\"t\\d{2}\", str(action[\"task_id\"])):\n",
345
+ " action[\"task_id\"] = \"\"\n",
346
+ "\n",
347
+ " return action\n",
348
+ "\n",
349
  "assert parse_action_strict('```json\\n{\"action_type\":\"archive_email\",\"email_id\":\"email_01\"}\\n```')[\"action_type\"] == \"archive_email\"\n",
350
  "assert parse_action(\"garbage\")[\"action_type\"] == \"do_nothing\"\n",
351
  "print(\"parser OK\")"
 
362
  " return c[0].get(\"content\", \"\")\n",
363
  " return c if isinstance(c, str) else str(c)\n",
364
  "\n",
365
+ "\n",
366
+ "def _prompt_to_text(p) -> str:\n",
367
+ " if isinstance(p, list) and p and isinstance(p[-1], dict):\n",
368
+ " return str(p[-1].get(\"content\", \"\"))\n",
369
+ " if isinstance(p, dict):\n",
370
+ " return str(p.get(\"content\", \"\"))\n",
371
+ " return str(p)\n",
372
+ "\n",
373
+ "\n",
374
+ "# Curriculum scalars are updated per stage: easy -> full.\n",
375
+ "CURRENT_ENV_SCALE = 0.85\n",
376
+ "CURRENT_LOCAL_SCALE = 0.60\n",
377
+ "\n",
378
+ "\n",
379
  "def env_reward(completions, prompts=None, **_) -> list[float]:\n",
380
  " out: list[float] = []\n",
381
  " for c in completions:\n",
382
  " text = _completion_text(c)\n",
383
+ " action = sanitize_action(parse_action(text))\n",
384
  " try:\n",
385
  " env.reset()\n",
386
  " r, _ = env.step(action)\n",
387
  " except Exception:\n",
388
  " r = -1.0\n",
389
+ " out.append(float(r) * CURRENT_ENV_SCALE)\n",
390
  " return out\n",
391
  "\n",
392
+ "\n",
393
  "def format_reward(completions, **_) -> list[float]:\n",
394
  " out: list[float] = []\n",
395
  " for c in completions:\n",
396
  " text = _completion_text(c)\n",
397
  " try:\n",
398
+ " parse_action_strict(text)\n",
399
+ " out.append(0.12 * CURRENT_LOCAL_SCALE)\n",
400
  " except Exception:\n",
401
+ " out.append(-0.20 * CURRENT_LOCAL_SCALE)\n",
402
+ " return out\n",
403
+ "\n",
404
+ "\n",
405
+ "def semantic_action_reward(completions, prompts=None, **_) -> list[float]:\n",
406
+ " \"\"\"\n",
407
+ " Reward canonical, briefing-grounded action payloads before env call.\n",
408
+ " Scaled by CURRENT_LOCAL_SCALE for easy->full curriculum annealing.\n",
409
+ " \"\"\"\n",
410
+ " out: list[float] = []\n",
411
+ " for i, c in enumerate(completions):\n",
412
+ " text = _completion_text(c)\n",
413
+ " act = parse_action(text)\n",
414
+ " at = act.get(\"action_type\", \"do_nothing\")\n",
415
+ "\n",
416
+ " prompt_text = \"\"\n",
417
+ " if prompts is not None and i < len(prompts):\n",
418
+ " prompt_text = _prompt_to_text(prompts[i])\n",
419
+ "\n",
420
+ " def present(tok: str) -> bool:\n",
421
+ " return bool(tok) and re.search(rf\"\\b{re.escape(tok)}\\b\", prompt_text) is not None\n",
422
+ "\n",
423
+ " r = -0.30\n",
424
+ " if at == \"do_nothing\":\n",
425
+ " r = -0.05\n",
426
+ " elif at == \"reply_email\":\n",
427
+ " eid = act.get(\"email_id\", \"\")\n",
428
+ " mb = (act.get(\"message_body\", \"\") or \"\").strip()\n",
429
+ " r = 0.30 if present(eid) and bool(re.fullmatch(r\"e\\d{2}\", eid)) and mb else -0.30\n",
430
+ " elif at == \"archive_email\":\n",
431
+ " eid = act.get(\"email_id\", \"\")\n",
432
+ " r = 0.30 if present(eid) and bool(re.fullmatch(r\"e\\d{2}\", eid)) else -0.30\n",
433
+ " elif at == \"reschedule_meeting\":\n",
434
+ " mid = act.get(\"meeting_id\", \"\")\n",
435
+ " nt = (act.get(\"new_time\", \"\") or \"\").strip()\n",
436
+ " r = 0.30 if present(mid) and bool(re.fullmatch(r\"m\\d{2}\", mid)) and nt else -0.30\n",
437
+ " elif at == \"cancel_meeting\":\n",
438
+ " mid = act.get(\"meeting_id\", \"\")\n",
439
+ " r = 0.30 if present(mid) and bool(re.fullmatch(r\"m\\d{2}\", mid)) else -0.30\n",
440
+ " elif at == \"complete_task\":\n",
441
+ " tid = act.get(\"task_id\", \"\")\n",
442
+ " r = 0.30 if present(tid) and bool(re.fullmatch(r\"t\\d{2}\", tid)) else -0.30\n",
443
+ " elif at == \"delegate_task\":\n",
444
+ " tid = act.get(\"task_id\", \"\")\n",
445
+ " cn = (act.get(\"contact_name\", \"\") or \"\").strip()\n",
446
+ " r = 0.30 if present(tid) and bool(re.fullmatch(r\"t\\d{2}\", tid)) and (cn in prompt_text) else -0.30\n",
447
+ " elif at == \"send_message\":\n",
448
+ " cn = (act.get(\"contact_name\", \"\") or \"\").strip()\n",
449
+ " mb = (act.get(\"message_body\", \"\") or \"\").strip()\n",
450
+ " r = 0.30 if cn and (cn in prompt_text) and mb else -0.30\n",
451
+ "\n",
452
+ " out.append(float(r) * CURRENT_LOCAL_SCALE)\n",
453
  " return out\n",
454
  "\n",
455
+ "\n",
456
  "def anti_idle_reward(completions, **_) -> list[float]:\n",
457
  " out: list[float] = []\n",
458
  " for c in completions:\n",
459
  " text = _completion_text(c)\n",
460
  " act = parse_action(text)\n",
461
+ " out.append((-0.28 if act.get(\"action_type\") == \"do_nothing\" else 0.03) * CURRENT_LOCAL_SCALE)\n",
462
  " return out\n",
463
  "\n",
464
+ "\n",
465
  "_dummy = '{\"action_type\":\"archive_email\",\"email_id\":\"email_01\"}'\n",
466
+ "print(\"env :\", env_reward([_dummy]))\n",
467
  "print(\"format :\", format_reward([_dummy]))\n",
468
+ "print(\"semantic :\", semantic_action_reward([_dummy], prompts=[\"... e01 e09 t07 m02 Jamie Liu ...\"]))\n",
469
  "print(\"anti_idle:\", anti_idle_reward([_dummy]))"
470
  ],
471
  "execution_count": null,
472
+ "outputs": [],
473
+ "id": "3bd66b49"
474
  },
475
  {
476
  "cell_type": "code",
 
479
  "from transformers import TrainerCallback\n",
480
  "\n",
481
  "class HackingTripwire(TrainerCallback):\n",
482
+ " \"\"\"Stop training on mode collapse, invalid-action collapse, or reward-format divergence.\"\"\"\n",
483
+ " def __init__(self, min_unique_ratio: float = 0.2, invalid_floor: float = -0.24):\n",
484
  " self.min_unique_ratio = min_unique_ratio\n",
485
+ " self.invalid_floor = invalid_floor\n",
486
  "\n",
487
  " def on_log(self, args, state, control, logs=None, **kw):\n",
488
  " logs = logs or {}\n",
489
  " uniq = logs.get(\"completions/unique_ratio\") or logs.get(\"completions/mean_unique\")\n",
490
  " env_r = logs.get(\"rewards/env_reward/mean\")\n",
491
  " fmt_r = logs.get(\"rewards/format_reward/mean\")\n",
492
+ "\n",
493
  " if uniq is not None and uniq < self.min_unique_ratio:\n",
494
  " print(f\"[TRIPWIRE] unique_ratio={uniq:.2f} < {self.min_unique_ratio} — stopping.\")\n",
495
  " control.should_training_stop = True\n",
496
+ "\n",
497
+ " if env_r is not None and env_r <= self.invalid_floor:\n",
498
+ " print(f\"[TRIPWIRE] env_reward mean stuck at {env_r:.2f} (invalid-action collapse). stopping.\")\n",
499
+ " control.should_training_stop = True\n",
500
+ "\n",
501
  " if env_r is not None and fmt_r is not None and env_r > 0.8 and fmt_r < 0.0:\n",
502
  " print(f\"[TRIPWIRE] env_r={env_r:.2f} but fmt_r={fmt_r:.2f} — possible hack. stopping.\")\n",
503
  " control.should_training_stop = True"
504
  ],
505
  "execution_count": null,
506
+ "outputs": [],
507
+ "id": "a6a37dad"
508
  },
509
  {
510
  "cell_type": "markdown",
 
519
  "cell_type": "markdown",
520
  "metadata": {},
521
  "source": [
522
+ "## Phase 5 — Train small (SFT warmup -> GRPO)\n",
523
  "\n",
524
+ "Load `unsloth/Llama-3.2-3B-Instruct` in 4-bit with Unsloth, attach LoRA, run a **short SFT warmup first**, then run GRPO. vLLM is not used anywhere in this notebook — rollouts go through the standard HF `generate()` path inside `GRPOTrainer`."
525
+ ],
526
+ "id": "428d6377"
527
  },
528
  {
529
  "cell_type": "code",
 
571
  " \"Legal action_type values: reply_email, archive_email, reschedule_meeting, cancel_meeting, \"\n",
572
  " \"complete_task, delegate_task, send_message, do_nothing.\\n\\n\"\n",
573
  " \"Output ONLY a compact JSON object with these keys (no prose, no code fences):\\n\"\n",
574
+ " \"{\\\"action_type\\\": \\\"\\\", \\\"email_id\\\": \\\"\\\", \\\"message_body\\\": \\\"\\\", \"\n",
575
  " \"\\\"meeting_id\\\": \\\"\\\", \\\"new_time\\\": \\\"\\\", \\\"reason\\\": \\\"\\\", \\\"task_id\\\": \\\"\\\", \"\n",
576
  " \"\\\"contact_name\\\": \\\"\\\", \\\"message\\\": \\\"\\\"}.\\n\\n\"\n",
577
+ " \"ID RULES:\\n\"\n",
578
+ " \"- email_id must be an email token from briefing like e01, e02, ...\\n\"\n",
579
+ " \"- meeting_id must be a meeting token like m01, m02, ...\\n\"\n",
580
+ " \"- task_id must be a task token like t01, t02, ...\\n\"\n",
581
+ " \"- contact_name must exactly match a contact shown in briefing.\\n\"\n",
582
+ " \"- Never use subject/body/description text as an ID.\\n\"\n",
583
+ " \"- If you cannot find a valid ID for your chosen action, output {\\\"action_type\\\":\\\"do_nothing\\\"}.\\n\\n\"\n",
584
+ " \"Prefer high-impact valid actions; avoid do_nothing when critical items are unresolved.\"\n",
585
  ")\n",
586
  "\n",
587
  "def build_prompt(briefing: str) -> list[dict]:\n",
 
594
  " return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)"
595
  ],
596
  "execution_count": null,
597
+ "outputs": [],
598
+ "id": "883dce70"
599
  },
600
  {
601
  "cell_type": "code",
 
662
  " pad_token_id=tokenizer.pad_token_id,\n",
663
  " )\n",
664
  " completion = tokenizer.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True)\n",
665
+ " action = sanitize_action(parse_action(completion))\n",
666
  " env.reset()\n",
667
+ " try:\n",
668
+ " r, _ = env.step(action)\n",
669
+ " except RuntimeError:\n",
670
+ " r, _ = env.step({\"action_type\": \"do_nothing\"})\n",
671
  " rs.append(r)\n",
672
  " FastLanguageModel.for_training(model)\n",
673
  " return rs\n",
 
683
  "baselines = {\"random\": random_rewards, \"frozen\": frozen_rewards}"
684
  ],
685
  "execution_count": null,
686
+ "outputs": [],
687
+ "id": "9c2ff7d6"
688
  },
689
  {
690
  "cell_type": "markdown",
691
  "metadata": {},
692
  "source": [
693
+ "### 5.2 Stage B — first GRPO stage (easy->full curriculum starts here)\n",
694
  "\n",
695
+ "We run a short SFT warmup first, then GRPO Stage B with stronger local scaffold weights (`CURRENT_LOCAL_SCALE`) and slightly lower env scale (`CURRENT_ENV_SCALE`).\n",
696
+ "\n",
697
+ "As stages progress (B -> C -> D), the notebook anneals toward full env-dominant training."
698
+ ],
699
+ "id": "018d2c7c"
700
  },
701
  {
702
  "cell_type": "code",
703
  "metadata": {},
704
  "source": [
705
+ "from trl import GRPOConfig, GRPOTrainer, SFTConfig, SFTTrainer\n",
706
  "\n",
707
+ "reward_funcs = [env_reward, format_reward, semantic_action_reward, anti_idle_reward]\n",
708
  "stage_logs: dict[str, list[dict]] = {}\n",
709
  "\n",
710
+ "# -------- SFT warmup --------\n",
711
+ "def _heuristic_action_for_sft(briefing: str) -> dict:\n",
712
+ " b = briefing.lower()\n",
713
+ " if \"e01\" in b:\n",
714
+ " return {\"action_type\": \"reply_email\", \"email_id\": \"e01\", \"message_body\": \"Acknowledged, sharing an update shortly.\"}\n",
715
+ " if \"m02\" in b:\n",
716
+ " return {\"action_type\": \"reschedule_meeting\", \"meeting_id\": \"m02\", \"new_time\": \"2026-04-21T18:00:00\", \"reason\": \"resolve overlap\"}\n",
717
+ " if \"t06\" in b:\n",
718
+ " return {\"action_type\": \"complete_task\", \"task_id\": \"t06\"}\n",
719
+ " return {\"action_type\": \"do_nothing\"}\n",
720
+ "\n",
721
+ "sft_rows = []\n",
722
+ "for b in briefings:\n",
723
+ " msgs = build_prompt(b)\n",
724
+ " prompt_txt = render_chat(msgs)\n",
725
+ " completion_txt = json.dumps(_heuristic_action_for_sft(b), ensure_ascii=True)\n",
726
+ " sft_rows.append({\"prompt_text\": prompt_txt, \"completion_text\": completion_txt})\n",
727
+ "\n",
728
+ "sft_ds = Dataset.from_list(sft_rows)\n",
729
+ "sft_cfg = SFTConfig(\n",
730
+ " output_dir=str(OUT / \"sft_warmup\"),\n",
731
+ " max_steps=30,\n",
732
+ " per_device_train_batch_size=1,\n",
733
+ " gradient_accumulation_steps=4,\n",
734
+ " learning_rate=2e-5,\n",
735
+ " logging_steps=5,\n",
736
+ " report_to=\"none\",\n",
737
+ ")\n",
738
+ "sft_trainer = SFTTrainer(\n",
739
+ " model=policy,\n",
740
+ " processing_class=tokenizer,\n",
741
+ " train_dataset=sft_ds,\n",
742
+ " args=sft_cfg,\n",
743
+ " dataset_text_field=\"prompt_text\",\n",
744
+ " formatting_func=lambda ex: [f\"{p}{c}\" for p, c in zip(ex[\"prompt_text\"], ex[\"completion_text\"])],\n",
745
+ ")\n",
746
+ "print(\"\\n=== SFT warmup ===\")\n",
747
+ "sft_trainer.train()\n",
748
+ "policy = sft_trainer.model\n",
749
+ "\n",
750
+ "\n",
751
  "def grpo_config(name: str, *, temperature: float, num_generations: int, max_steps: int, lr: float) -> GRPOConfig:\n",
752
  " return GRPOConfig(\n",
753
  " output_dir=str(OUT / f\"stage_{name}\"),\n",
 
755
  " gradient_accumulation_steps=4,\n",
756
  " num_generations=num_generations,\n",
757
  " max_prompt_length=1920,\n",
758
+ " max_completion_length=48,\n",
759
  " temperature=temperature,\n",
760
  " learning_rate=lr,\n",
761
  " beta=0.04,\n",
762
  " max_steps=max_steps,\n",
763
  " logging_steps=1,\n",
764
+ " bf16=False,\n",
765
+ " fp16=True,\n",
766
  " report_to=\"none\",\n",
767
  " save_strategy=\"no\",\n",
768
  " remove_unused_columns=False,\n",
769
  " log_completions=True,\n",
770
  " )\n",
771
  "\n",
772
+ "\n",
773
+ "def set_curriculum_scales(stage_name: str) -> None:\n",
774
+ " global CURRENT_ENV_SCALE, CURRENT_LOCAL_SCALE\n",
775
+ " # easy -> full complexity curriculum\n",
776
+ " if stage_name == \"B\":\n",
777
+ " CURRENT_ENV_SCALE = 0.85\n",
778
+ " CURRENT_LOCAL_SCALE = 0.60\n",
779
+ " elif stage_name == \"C\":\n",
780
+ " CURRENT_ENV_SCALE = 0.95\n",
781
+ " CURRENT_LOCAL_SCALE = 0.40\n",
782
+ " else:\n",
783
+ " CURRENT_ENV_SCALE = 1.00\n",
784
+ " CURRENT_LOCAL_SCALE = 0.25\n",
785
+ " print(f\"curriculum[{stage_name}] env={CURRENT_ENV_SCALE:.2f} local={CURRENT_LOCAL_SCALE:.2f}\")\n",
786
+ "\n",
787
+ "\n",
788
  "def run_stage(name: str, **kw) -> None:\n",
789
+ " set_curriculum_scales(name)\n",
790
  " print(f\"\\n=== Stage {name} → {kw} ===\")\n",
791
  " trainer = GRPOTrainer(\n",
792
  " model=policy,\n",
 
803
  " tokenizer.save_pretrained(adapter_dir)\n",
804
  " print(f\"stage {name} adapter → {adapter_dir}\")\n",
805
  "\n",
806
+ "\n",
807
+ "run_stage(\"B\", temperature=0.8, num_generations=2, max_steps=20, lr=5e-6)"
808
  ],
809
  "execution_count": null,
810
+ "outputs": [],
811
+ "id": "10b073d0"
812
  },
813
  {
814
  "cell_type": "markdown",
 
848
  "source": [
849
  "## Phase 7 — Add curriculum\n",
850
  "\n",
851
+ "The deployed Space scenario is fixed, so curriculum is applied through both:\n",
852
+ "\n",
853
+ "1. **Exploration schedule** (temperature/lr across stages)\n",
854
+ "2. **Complexity curriculum (easy -> full)** via reward scales:\n",
855
+ " - Stage B: stronger local scaffold guidance\n",
856
+ " - Stage C: mixed guidance\n",
857
+ " - Stage D: env-dominant optimization"
858
+ ],
859
+ "id": "524f6691"
860
  },
861
  {
862
  "cell_type": "code",
 
927
  "plt.show()\n",
928
  "\n",
929
  "rows = []\n",
930
+ "loss_rows = []\n",
931
  "step_counter = 0\n",
932
  "for name, log in stage_logs.items():\n",
933
  " for entry in log:\n",
934
  " r = entry.get(\"rewards/env_reward/mean\", entry.get(\"reward\"))\n",
935
+ " if \"loss\" in entry:\n",
936
+ " loss_rows.append({\"stage\": name, \"global_step\": step_counter + 1, \"loss\": entry[\"loss\"]})\n",
937
+ " if r is None:\n",
938
+ " continue\n",
939
  " step_counter += 1\n",
940
  " rows.append({\n",
941
+ " \"stage\": name,\n",
942
+ " \"global_step\": step_counter,\n",
943
+ " \"env\": r,\n",
944
  " \"fmt\": entry.get(\"rewards/format_reward/mean\", 0.0),\n",
945
+ " \"semantic\": entry.get(\"rewards/semantic_action_reward/mean\", 0.0),\n",
946
  " \"idle\": entry.get(\"rewards/anti_idle_reward/mean\", 0.0),\n",
947
  " })\n",
948
+ "\n",
949
  "df = pd.DataFrame(rows)\n",
950
  "df.to_csv(OUT / \"reward_log.csv\", index=False)\n",
951
  "\n",
952
+ "loss_df = pd.DataFrame(loss_rows)\n",
953
+ "if not loss_df.empty:\n",
954
+ " plt.figure(figsize=(8, 4))\n",
955
+ " for name, sub in loss_df.groupby(\"stage\"):\n",
956
+ " plt.plot(sub[\"global_step\"], sub[\"loss\"], label=f\"stage {name}\")\n",
957
+ " plt.xlabel(\"global step\"); plt.ylabel(\"loss\")\n",
958
+ " plt.title(\"Ghostexec SFT+GRPO — loss vs step\")\n",
959
+ " plt.legend(); plt.tight_layout()\n",
960
+ " plt.savefig(OUT / \"loss_curve.png\", dpi=150); plt.show()\n",
961
+ "\n",
962
  "if not df.empty:\n",
963
  " plt.figure(figsize=(8, 4))\n",
964
  " for name, sub in df.groupby(\"stage\"):\n",
 
971
  " plt.figure(figsize=(8, 4))\n",
972
  " plt.plot(df[\"global_step\"], df[\"env\"], label=\"env_reward\")\n",
973
  " plt.plot(df[\"global_step\"], df[\"fmt\"], label=\"format_reward\")\n",
974
+ " plt.plot(df[\"global_step\"], df[\"semantic\"], label=\"semantic_action_reward\")\n",
975
  " plt.plot(df[\"global_step\"], df[\"idle\"], label=\"anti_idle_reward\")\n",
976
  " plt.xlabel(\"global step\"); plt.ylabel(\"mean component reward\")\n",
977
  " plt.title(\"Reward components — hacking-watch\")\n",
 
981
  " print(\"No numeric reward log found — skipping curve plots.\")"
982
  ],
983
  "execution_count": null,
984
+ "outputs": [],
985
+ "id": "5ccb3832"
986
  },
987
  {
988
  "cell_type": "code",
 
1018
  " \"env_url\": GHOSTEXEC_ENV_URL,\n",
1019
  " \"model\": MODEL_ID,\n",
1020
  " \"run\": RUN_NAME,\n",
1021
+ " \"stack\": {\"unsloth\": True, \"trl\": \"0.22.2\", \"pipeline\": \"SFT->GRPO\"},\n",
1022
  " \"rewards\": {\n",
1023
  " \"random_mean\": summary[\"random\"],\n",
1024
  " \"frozen_mean\": summary[\"frozen\"],\n",
1025
  " \"trained_mean\": summary[\"trained\"],\n",
1026
  " \"improvement_vs_frozen\": summary[\"trained\"] - summary[\"frozen\"],\n",
1027
  " },\n",
1028
+ " \"stages\": [\"SFT\"] + list(stage_logs.keys()),\n",
1029
+ " \"reward_fns\": [\"env_reward\", \"format_reward\", \"semantic_action_reward\", \"anti_idle_reward\"],\n",
1030
+ " \"curriculum\": {\n",
1031
+ " \"type\": \"easy_to_full\",\n",
1032
+ " \"stage_scales\": {\n",
1033
+ " \"B\": {\"env\": 0.85, \"local\": 0.60},\n",
1034
+ " \"C\": {\"env\": 0.95, \"local\": 0.40},\n",
1035
+ " \"D\": {\"env\": 1.00, \"local\": 0.25},\n",
1036
+ " },\n",
1037
+ " },\n",
1038
  " \"tripwire\": \"HackingTripwire (unique_ratio<0.2 or env↑/fmt↓)\",\n",
1039
  " \"adapter_path\": str(final_adapter),\n",
1040
  " \"mean_space_latency_ms\": round(sum(env.latency_ms) / max(len(env.latency_ms), 1), 1),\n",
 
1045
  "print(\"\\nmanifest →\", OUT / \"manifest.json\")"
1046
  ],
1047
  "execution_count": null,
1048
+ "outputs": [],
1049
+ "id": "81fdfca3"
1050
  }
1051
  ],
1052
  "metadata": {
openenv_ghostexec.egg-info/PKG-INFO CHANGED
@@ -13,3 +13,8 @@ Provides-Extra: constrained
13
  Requires-Dist: lm-format-enforcer>=0.10; extra == "constrained"
14
  Provides-Extra: constrained-outlines
15
  Requires-Dist: outlines>=0.1; extra == "constrained-outlines"
16
+ Provides-Extra: train
17
+ Requires-Dist: datasets>=2.20.0; extra == "train"
18
+ Requires-Dist: trl>=0.22.2; extra == "train"
19
+ Requires-Dist: transformers>=4.45.0; extra == "train"
20
+ Requires-Dist: accelerate>=0.34.0; extra == "train"
openenv_ghostexec.egg-info/SOURCES.txt CHANGED
@@ -9,6 +9,7 @@ pyproject.toml
9
  ./client.py
10
  ./conftest.py
11
  ./graders.py
 
12
  ./models.py
13
  ./scenarios/dinner_disaster.json
14
  ./scenarios/monday_morning.json
@@ -41,5 +42,4 @@ tests/test_phase1.py
41
  tests/test_phase2.py
42
  tests/test_phase3.py
43
  tests/test_phase4.py
44
- tests/test_reward_dead_suite.py
45
- tests/test_submission_plots_committed.py
 
9
  ./client.py
10
  ./conftest.py
11
  ./graders.py
12
+ ./inference.py
13
  ./models.py
14
  ./scenarios/dinner_disaster.json
15
  ./scenarios/monday_morning.json
 
42
  tests/test_phase2.py
43
  tests/test_phase3.py
44
  tests/test_phase4.py
45
+ tests/test_reward_dead_suite.py
 
openenv_ghostexec.egg-info/requires.txt CHANGED
@@ -11,3 +11,9 @@ pytest>=8.0.0
11
  pytest-cov>=4.0.0
12
  pyyaml>=6.0.0
13
  matplotlib>=3.8.0
14
+
15
+ [train]
16
+ datasets>=2.20.0
17
+ trl>=0.22.2
18
+ transformers>=4.45.0
19
+ accelerate>=0.34.0
outputs/logs/episode_rewards.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -42,6 +42,12 @@ constrained = [
42
  constrained-outlines = [
43
  "outlines>=0.1",
44
  ]
45
+ train = [
46
+ "datasets>=2.20.0",
47
+ "trl>=0.22.2",
48
+ "transformers>=4.45.0",
49
+ "accelerate>=0.34.0",
50
+ ]
51
 
52
  [project.scripts]
53
  # Server entry point - enables running via: uv run --project . server
scripts/eval_reward_ablation.py ADDED
@@ -0,0 +1,64 @@
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import statistics
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ ROOT = Path(__file__).resolve().parents[1]
9
+ sys.path.insert(0, str(ROOT.parent))
10
+
11
+ from ghostexec.models import GhostexecAction
12
+ from ghostexec.server.ghostexec_environment import GhostexecEnvironment
13
+
14
+
15
+ def _run_episode(mode: str, scenario: Path) -> float:
16
+ env = GhostexecEnvironment(scenario_path=scenario, reward_mode=mode)
17
+ env.reset()
18
+ actions = [
19
+ GhostexecAction(action_type="reschedule_meeting", meeting_id="m02", new_time="2026-04-21T18:00:00"),
20
+ GhostexecAction(action_type="reply_email", email_id="e01", message_body="Sharing revised numbers now."),
21
+ GhostexecAction(action_type="archive_email", email_id="e09"),
22
+ GhostexecAction(action_type="send_message", contact_name="Jordan Lee", message_body="Quick status sync."),
23
+ GhostexecAction(action_type="complete_task", task_id="t06"),
24
+ ]
25
+ rewards = [float(env.step(a).reward or 0.0) for a in actions]
26
+ return statistics.fmean(rewards)
27
+
28
+
29
+ def _run(mode: str, scenario: Path, episodes: int) -> dict[str, float]:
30
+ vals = [_run_episode(mode, scenario) for _ in range(episodes)]
31
+ return {
32
+ "mean": statistics.fmean(vals),
33
+ "std": statistics.pstdev(vals) if len(vals) > 1 else 0.0,
34
+ "min": min(vals),
35
+ "max": max(vals),
36
+ }
37
+
38
+
39
+ def main() -> None:
40
+ parser = argparse.ArgumentParser(description="Reward-mode ablation for Ghostexec.")
41
+ parser.add_argument("--episodes", type=int, default=30)
42
+ parser.add_argument(
43
+ "--scenario",
44
+ type=Path,
45
+ default=ROOT / "scenarios" / "phase2_core.json",
46
+ )
47
+ args = parser.parse_args()
48
+
49
+ modes = ("base", "full")
50
+ results = {m: _run(m, args.scenario, args.episodes) for m in modes}
51
+ print("Ghostexec reward ablation")
52
+ print(f"scenario={args.scenario} episodes={args.episodes}")
53
+ for m in modes:
54
+ r = results[m]
55
+ print(
56
+ f"{m:>5}: mean={r['mean']:.4f} std={r['std']:.4f} "
57
+ f"min={r['min']:.4f} max={r['max']:.4f}"
58
+ )
59
+ delta = results["full"]["mean"] - results["base"]["mean"]
60
+ print(f"delta(full-base)={delta:.4f}")
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()
scripts/plot_training_report.py ADDED
@@ -0,0 +1,170 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import matplotlib.pyplot as plt
9
+ import pandas as pd
10
+
11
+
12
+ def _load_trainer_history(path: Path) -> list[dict[str, Any]]:
13
+ if not path.exists():
14
+ return []
15
+ data = json.loads(path.read_text(encoding="utf-8"))
16
+ if isinstance(data, dict) and isinstance(data.get("log_history"), list):
17
+ return [x for x in data["log_history"] if isinstance(x, dict)]
18
+ if isinstance(data, list):
19
+ return [x for x in data if isinstance(x, dict)]
20
+ return []
21
+
22
+
23
+ def _load_baselines(path: Path) -> dict[str, float]:
24
+    if not path.exists():
+        return {}
+    data = json.loads(path.read_text(encoding="utf-8"))
+    if isinstance(data, dict) and "rewards" in data and isinstance(data["rewards"], dict):
+        data = data["rewards"]
+    out: dict[str, float] = {}
+    for k in ("random", "frozen", "trained", "random_mean", "frozen_mean", "trained_mean"):
+        if k in data:
+            v = data[k]
+            name = k.replace("_mean", "")
+            out[name] = float(v)
+    return out
+
+
+def _ensure_dir(path: Path) -> None:
+    path.mkdir(parents=True, exist_ok=True)
+
+
+def _plot_loss(history: list[dict[str, Any]], out_dir: Path) -> bool:
+    rows = []
+    for i, h in enumerate(history):
+        step = h.get("step", h.get("global_step", i))
+        if "loss" in h:
+            rows.append((float(step), float(h["loss"])))
+    if not rows:
+        return False
+    df = pd.DataFrame(rows, columns=["step", "loss"]).sort_values("step")
+    plt.figure(figsize=(9, 4.8))
+    plt.plot(df["step"], df["loss"], label="train_loss")
+    plt.xlabel("global step")
+    plt.ylabel("loss")
+    plt.title("Ghostexec training loss")
+    plt.grid(alpha=0.2)
+    plt.legend()
+    plt.tight_layout()
+    plt.savefig(out_dir / "loss_curve.png", dpi=150)
+    plt.close()
+    return True
+
+
+def _plot_reward_components(reward_csv: Path, out_dir: Path) -> tuple[bool, bool]:
+    if not reward_csv.exists():
+        return False, False
+    df = pd.read_csv(reward_csv)
+    if "global_step" not in df.columns:
+        return False, False
+
+    made_reward_curve = False
+    for col in ("env", "reward", "mean_reward"):
+        if col in df.columns:
+            plt.figure(figsize=(9, 4.8))
+            plt.plot(df["global_step"], df[col], label=col)
+            plt.xlabel("global step")
+            plt.ylabel("reward")
+            plt.title("Ghostexec reward vs step")
+            plt.grid(alpha=0.2)
+            plt.legend()
+            plt.tight_layout()
+            plt.savefig(out_dir / "reward_curve.png", dpi=150)
+            plt.close()
+            made_reward_curve = True
+            break
+
+    component_cols = [c for c in ("env", "fmt", "semantic", "idle") if c in df.columns]
+    if len(component_cols) >= 2:
+        plt.figure(figsize=(9, 4.8))
+        for c in component_cols:
+            plt.plot(df["global_step"], df[c], label=c)
+        plt.xlabel("global step")
+        plt.ylabel("mean component reward")
+        plt.title("Reward components vs step")
+        plt.grid(alpha=0.2)
+        plt.legend()
+        plt.tight_layout()
+        plt.savefig(out_dir / "components_curve.png", dpi=150)
+        plt.close()
+        return made_reward_curve, True
+    return made_reward_curve, False
+
+
+def _plot_baseline_bars(baselines: dict[str, float], out_dir: Path) -> bool:
+    needed = ("random", "frozen", "trained")
+    if not all(k in baselines for k in needed):
+        return False
+    names = list(needed)
+    vals = [baselines[n] for n in names]
+    colors = ["#888888", "#1f77b4", "#2ca02c"]
+    plt.figure(figsize=(8.2, 4.8))
+    plt.bar(names, vals, color=colors)
+    plt.ylabel("mean episode reward (higher is better)")
+    plt.title("Ghostexec: random vs frozen vs trained")
+    plt.tight_layout()
+    plt.savefig(out_dir / "baseline_comparison.png", dpi=150)
+    plt.close()
+    return True
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Generate post-training Ghostexec plots.")
+    parser.add_argument(
+        "--trainer-history",
+        type=Path,
+        default=Path("outputs/trainer_state.json"),
+        help="JSON with HF/Unsloth log history (trainer_state.json or list of logs).",
+    )
+    parser.add_argument(
+        "--reward-csv",
+        type=Path,
+        default=Path("outputs/reward_log.csv"),
+        help="CSV containing global_step and reward columns.",
+    )
+    parser.add_argument(
+        "--baselines-json",
+        type=Path,
+        default=Path("outputs/compliance_manifest.json"),
+        help="JSON containing random/frozen/trained means (or rewards object).",
+    )
+    parser.add_argument(
+        "--out-dir",
+        type=Path,
+        default=Path("outputs/plots"),
+        help="Directory to save plot PNGs.",
+    )
+    args = parser.parse_args()
+
+    _ensure_dir(args.out_dir)
+    history = _load_trainer_history(args.trainer_history)
+    baselines = _load_baselines(args.baselines_json)
+
+    made_loss = _plot_loss(history, args.out_dir)
+    made_reward, made_components = _plot_reward_components(args.reward_csv, args.out_dir)
+    made_bars = _plot_baseline_bars(baselines, args.out_dir)
+
+    print("Generated plots:")
+    print(f"- loss_curve.png: {'yes' if made_loss else 'no (missing loss history)'}")
+    print(f"- reward_curve.png: {'yes' if made_reward else 'no (missing reward csv columns)'}")
+    print(
+        f"- components_curve.png: {'yes' if made_components else 'no (missing component columns)'}"
+    )
+    print(
+        f"- baseline_comparison.png: {'yes' if made_bars else 'no (missing random/frozen/trained means)'}"
+    )
+    print(f"Output directory: {args.out_dir.resolve()}")
+
+
+if __name__ == "__main__":
+    main()
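Note (illustrative, not part of the commit): `_plot_reward_components` only needs a `global_step` column plus whichever reward columns the training loop logs; the component plot additionally looks for at least two of `env`, `fmt`, `semantic`, `idle`. A minimal sketch of a compatible `reward_log.csv`, with made-up values:

```python
# Illustrative only: builds a reward_log.csv that the plot script can read.
# Column names come from scripts/plot_training_report.py; the numbers are invented.
import pandas as pd

pd.DataFrame(
    {
        "global_step": [5, 10, 15],
        "env": [0.02, 0.11, 0.18],       # env-derived reward channel
        "fmt": [0.05, 0.08, 0.10],       # format/schema shaping
        "semantic": [0.00, 0.04, 0.06],  # semantic action shaping
        "idle": [-0.10, -0.02, 0.02],    # anti-idle shaping
    }
).to_csv("outputs/reward_log.csv", index=False)
```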
scripts/train_sft_then_grpo.py ADDED
@@ -0,0 +1,641 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import random
6
+ import re
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import requests
11
+ from transformers import TrainerCallback
12
+
13
+
14
+ LEGAL_ACTION_TYPES = [
15
+ "reply_email",
16
+ "archive_email",
17
+ "reschedule_meeting",
18
+ "cancel_meeting",
19
+ "complete_task",
20
+ "delegate_task",
21
+ "send_message",
22
+ "do_nothing",
23
+ ]
24
+
25
+ MODEL_PRESETS: dict[str, str] = {
26
+ # Fast iteration winner preset: small, strong instruction following, QLoRA-friendly.
27
+ "small_iter_fast": "unsloth/Qwen2.5-3B-Instruct",
28
+ # Existing baseline used in this repo.
29
+ "balanced_3b": "unsloth/Llama-3.2-3B-Instruct",
30
+ # Larger option when compute budget is stable.
31
+ "bigger_4b": "unsloth/Qwen3-4B-Instruct-2507",
32
+ }
33
+
34
+ TRAINING_PRESETS: dict[str, dict[str, float | int | str]] = {
35
+ "hackathon_turbo": {
36
+ "max_sft_steps": 80,
37
+ "max_grpo_steps": 180,
38
+ "env_reward_scale": 1.00,
39
+ "local_reward_scale": 0.45,
40
+ "complexity_curriculum": "easy_to_full",
41
+ "curriculum_ramp_ratio": 0.65,
42
+ "sft_samples": 180,
43
+ # Optimizer / schedule knobs (stability-first for iterative winning runs)
44
+ "sft_lr": 1.2e-5,
45
+ "sft_grad_accum": 8,
46
+ "grpo_lr": 3.0e-6,
47
+ "grpo_grad_accum": 8,
48
+ "grpo_beta": 0.08,
49
+ "reward_ema_decay": 0.35,
50
+ },
51
+ # Quicker loop for smoke iterations on weaker hardware.
52
+ "quick_smoke": {
53
+ "max_sft_steps": 30,
54
+ "max_grpo_steps": 80,
55
+ "env_reward_scale": 0.95,
56
+ "local_reward_scale": 0.35,
57
+ "complexity_curriculum": "easy_to_full",
58
+ "curriculum_ramp_ratio": 0.50,
59
+ "sft_samples": 90,
60
+ "sft_lr": 1.5e-5,
61
+ "sft_grad_accum": 4,
62
+ "grpo_lr": 4.0e-6,
63
+ "grpo_grad_accum": 4,
64
+ "grpo_beta": 0.06,
65
+ "reward_ema_decay": 0.25,
66
+ },
67
+ }
68
+
69
+
70
+ def _as_float(x: object | None) -> float | None:
71
+ if x is None:
72
+ return None
73
+ try:
74
+ return float(x)
75
+ except Exception:
76
+ return None
77
+
78
+
79
+ class StabilityTripwire(TrainerCallback):
80
+ """Stop training when logs show sustained reward collapse + loss blow-up."""
81
+
82
+ def __init__(
83
+ self,
84
+ *,
85
+ min_step: int,
86
+ reward_key: str,
87
+ loss_key: str,
88
+ reward_drop: float,
89
+ loss_spike: float,
90
+ bad_streak: int,
91
+ ) -> None:
92
+ self.min_step = min_step
93
+ self.reward_key = reward_key
94
+ self.loss_key = loss_key
95
+ self.reward_drop = reward_drop
96
+ self.loss_spike = loss_spike
97
+ self.bad_streak = bad_streak
98
+ self._best_reward: float | None = None
99
+ self._best_loss: float | None = None
100
+ self._streak = 0
101
+
102
+ def on_log(self, args, state, control, logs=None, **kw): # type: ignore[no-untyped-def]
103
+ logs = logs or {}
104
+ step = int(getattr(state, "global_step", 0) or 0)
105
+ if step < self.min_step:
106
+ return control
107
+
108
+ r = _as_float(logs.get(self.reward_key))
109
+ loss = _as_float(logs.get(self.loss_key))
110
+
111
+ reward_bad = False
112
+ loss_bad = False
113
+
114
+ if r is not None:
115
+ if self._best_reward is None or r > self._best_reward:
116
+ self._best_reward = r
117
+ elif self._best_reward is not None and self._best_reward - r >= self.reward_drop:
118
+ reward_bad = True
119
+
120
+ if loss is not None:
121
+ if self._best_loss is None or loss < self._best_loss:
122
+ self._best_loss = loss
123
+ elif self._best_loss is not None and loss - self._best_loss >= self.loss_spike:
124
+ loss_bad = True
125
+
126
+ bad = reward_bad and loss_bad and r is not None and loss is not None
127
+
128
+ if bad:
129
+ self._streak += 1
130
+ else:
131
+ self._streak = 0
132
+
133
+ if self._streak >= self.bad_streak:
134
+ print(
135
+ f"[STABILITY] stopping: sustained instability "
136
+ f"(best {self.reward_key}={self._best_reward}, best loss={self._best_loss}, streak={self._streak})."
137
+ )
138
+ control.should_training_stop = True
139
+ return control
140
+
141
+
142
+ class LossSpikeTripwire(TrainerCallback):
143
+ """SFT guardrail: stop if loss repeatedly blows up vs the best-so-far."""
144
+
145
+ def __init__(self, *, min_step: int, loss_key: str, loss_spike: float, bad_streak: int) -> None:
146
+ self.min_step = min_step
147
+ self.loss_key = loss_key
148
+ self.loss_spike = loss_spike
149
+ self.bad_streak = bad_streak
150
+ self._best_loss: float | None = None
151
+ self._streak = 0
152
+
153
+ def on_log(self, args, state, control, logs=None, **kw): # type: ignore[no-untyped-def]
154
+ logs = logs or {}
155
+ step = int(getattr(state, "global_step", 0) or 0)
156
+ if step < self.min_step:
157
+ return control
158
+
159
+ loss = _as_float(logs.get(self.loss_key))
160
+ if loss is None:
161
+ return control
162
+
163
+ if self._best_loss is None or loss < self._best_loss:
164
+ self._best_loss = loss
165
+ self._streak = 0
166
+ return control
167
+
168
+ if self._best_loss is not None and loss - self._best_loss >= self.loss_spike:
169
+ self._streak += 1
170
+ else:
171
+ self._streak = 0
172
+
173
+ if self._streak >= self.bad_streak:
174
+ print(f"[STABILITY] stopping SFT: repeated loss spikes (best={self._best_loss}, streak={self._streak}).")
175
+ control.should_training_stop = True
176
+ return control
177
+
178
+
179
+ def _extract_briefing(reset_payload: dict[str, Any]) -> str:
180
+ obs = reset_payload.get("observation", reset_payload)
181
+ if isinstance(obs, dict):
182
+ return str(obs.get("echoed_message", "")).strip()
183
+ return ""
184
+
185
+
186
+ def _legal_action_heuristic(briefing: str) -> dict[str, Any]:
187
+ # Minimal heuristic used only for SFT warm-start data generation.
188
+ # Keeps the action schema valid and non-idle-biased.
189
+ lower = briefing.lower()
190
+ if "e01" in lower:
191
+ return {
192
+ "action_type": "reply_email",
193
+ "email_id": "e01",
194
+ "message_body": "Acknowledged. Sharing a concise update shortly.",
195
+ }
196
+ if "m02" in lower:
197
+ return {
198
+ "action_type": "reschedule_meeting",
199
+ "meeting_id": "m02",
200
+ "new_time": "2026-04-21T18:00:00",
201
+ "reason": "Resolve overlap with higher priority commitments.",
202
+ }
203
+ if "t06" in lower:
204
+ return {"action_type": "complete_task", "task_id": "t06"}
205
+ return {"action_type": random.choice(LEGAL_ACTION_TYPES)}
206
+
207
+
208
+ def generate_sft_jsonl_from_env(
209
+ env_url: str,
210
+ out_jsonl: Path,
211
+ samples: int = 120,
212
+ task_id: str = "phase2_core",
213
+ ) -> None:
214
+ out_jsonl.parent.mkdir(parents=True, exist_ok=True)
215
+ rows: list[dict[str, str]] = []
216
+ for _ in range(samples):
217
+ r = requests.post(f"{env_url.rstrip('/')}/reset", json={"task_id": task_id}, timeout=30)
218
+ r.raise_for_status()
219
+ payload = r.json()
220
+ briefing = _extract_briefing(payload)
221
+ if not briefing:
222
+ continue
223
+ action = _legal_action_heuristic(briefing)
224
+ prompt = (
225
+ "You are Ghostexec AI Chief-of-Staff.\n"
226
+ "Output one valid GhostexecAction JSON only.\n\n"
227
+ f"{briefing}"
228
+ )
229
+ rows.append({"prompt": prompt, "completion": json.dumps(action, ensure_ascii=True)})
230
+ with out_jsonl.open("w", encoding="utf-8") as fh:
231
+ for row in rows:
232
+ fh.write(json.dumps(row, ensure_ascii=True) + "\n")
233
+ print(f"Wrote {len(rows)} SFT rows to {out_jsonl}")
234
+
235
+
236
+ def run_sft_then_grpo(
237
+ model_name: str,
238
+ env_url: str,
239
+ sft_jsonl: Path,
240
+ out_dir: Path,
241
+ env_reward_scale: float,
242
+ local_reward_scale: float,
243
+ max_sft_steps: int,
244
+ max_grpo_steps: int,
245
+ complexity_curriculum: str,
246
+ curriculum_ramp_ratio: float,
247
+ *,
248
+ sft_lr: float,
249
+ sft_grad_accum: int,
250
+ grpo_lr: float,
251
+ grpo_grad_accum: int,
252
+ grpo_beta: float,
253
+ reward_ema_decay: float,
254
+ stability_tripwire: bool,
255
+ ) -> None:
256
+ try:
257
+ from datasets import load_dataset
258
+ from trl import GRPOConfig, GRPOTrainer, SFTConfig, SFTTrainer
259
+ from unsloth import FastLanguageModel
260
+ except Exception as exc: # pragma: no cover
261
+ raise RuntimeError(
262
+ "Missing training deps. Install unsloth, trl, datasets, transformers before running."
263
+ ) from exc
264
+
265
+ out_dir.mkdir(parents=True, exist_ok=True)
266
+
267
+ def _trainable_lora_sum_abs(model) -> float:
268
+ total = 0.0
269
+ for n, p in model.named_parameters():
270
+ if not p.requires_grad:
271
+ continue
272
+ if "lora" not in n.lower():
273
+ continue
274
+ total += float(p.detach().abs().sum().item())
275
+ return total
276
+
277
+ policy, tokenizer = FastLanguageModel.from_pretrained(
278
+ model_name=model_name,
279
+ max_seq_length=2048,
280
+ dtype=None,
281
+ load_in_4bit=True,
282
+ )
283
+ policy = FastLanguageModel.get_peft_model(
284
+ policy,
285
+ r=16,
286
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
287
+ lora_alpha=16,
288
+ lora_dropout=0.0,
289
+ bias="none",
290
+ use_gradient_checkpointing="unsloth",
291
+ random_state=3407,
292
+ )
293
+
294
+ ds = load_dataset("json", data_files=str(sft_jsonl), split="train")
295
+ sft_cfg = SFTConfig(
296
+ output_dir=str(out_dir / "sft"),
297
+ max_steps=max_sft_steps,
298
+ per_device_train_batch_size=1,
299
+ gradient_accumulation_steps=sft_grad_accum,
300
+ learning_rate=sft_lr,
301
+ lr_scheduler_type="cosine",
302
+ warmup_ratio=0.06,
303
+ max_grad_norm=1.0,
304
+ adam_beta1=0.9,
305
+ adam_beta2=0.95,
306
+ logging_steps=5,
307
+ save_steps=max(10, max_sft_steps),
308
+ report_to=[],
309
+ )
310
+ sft_trainer = SFTTrainer(
311
+ model=policy,
312
+ tokenizer=tokenizer,
313
+ train_dataset=ds,
314
+ args=sft_cfg,
315
+ dataset_text_field="prompt",
316
+ formatting_func=lambda ex: [f"{p}\n\n{c}" for p, c in zip(ex["prompt"], ex["completion"])],
317
+ )
318
+ if stability_tripwire:
319
+ sft_trainer.add_callback(
320
+ LossSpikeTripwire(
321
+ min_step=max(10, max_sft_steps // 6),
322
+ loss_key="loss",
323
+ loss_spike=0.85,
324
+ bad_streak=4,
325
+ )
326
+ )
327
+
328
+ sft_before = _trainable_lora_sum_abs(policy)
329
+ sft_trainer.train()
330
+ sft_after = _trainable_lora_sum_abs(sft_trainer.model)
331
+ sft_delta = abs(sft_after - sft_before)
332
+ print(f"SFT LoRA delta(abs-sum): {sft_delta:.6f}")
333
+ if sft_delta <= 1e-6:
334
+ raise RuntimeError("SFT appears not to have updated LoRA weights (delta too small).")
335
+ sft_dir = out_dir / "sft_adapter"
336
+ sft_trainer.model.save_pretrained(sft_dir)
337
+ tokenizer.save_pretrained(sft_dir)
338
+ print(f"SFT complete. Adapter saved: {sft_dir}")
339
+
340
+ def _extract_json(text: str) -> dict[str, Any] | None:
341
+ m = re.search(r"\{.*\}", text, flags=re.S)
342
+ if not m:
343
+ return None
344
+ try:
345
+ obj = json.loads(m.group(0))
346
+ except Exception:
347
+ return None
348
+ return obj if isinstance(obj, dict) else None
349
+
350
+ def _env_step_reward_from_completion(text: str) -> float:
351
+ payload = _extract_json(text)
352
+ if payload is None:
353
+ return -0.25
354
+ payload.setdefault("action_type", "do_nothing")
355
+ try:
356
+ r = requests.post(f"{env_url.rstrip('/')}/reset", json={"task_id": "phase2_core"}, timeout=30)
357
+ r.raise_for_status()
358
+ s = requests.post(
359
+ f"{env_url.rstrip('/')}/step",
360
+ json={"action": payload},
361
+ timeout=30,
362
+ )
363
+ s.raise_for_status()
364
+ raw = s.json()
365
+ except Exception:
366
+ return 0.0
367
+ rew = raw.get("reward")
368
+ if rew is None and isinstance(raw.get("observation"), dict):
369
+ rew = raw["observation"].get("reward", 0.0)
370
+ try:
371
+ return float(rew)
372
+ except Exception:
373
+ return 0.0
374
+
375
+ progress = {"step": 0, "total": max(1, max_grpo_steps)}
376
+ reward_ema_state = {"env": None}
377
+
378
+ class _ProgressCallback(TrainerCallback):
379
+ def on_step_end(self, args, state, control, **kwargs): # type: ignore[override]
380
+ progress["step"] = int(getattr(state, "global_step", progress["step"]))
381
+ return control
382
+
383
+ def _progress_frac() -> float:
384
+ return min(1.0, progress["step"] / progress["total"])
385
+
386
+ def _curriculum_phase_weight() -> float:
387
+ frac = _progress_frac()
388
+ ramp = max(0.05, min(1.0, curriculum_ramp_ratio))
389
+ if complexity_curriculum == "off":
390
+ return 1.0
391
+ # easy_to_full: start with strong scaffold guidance, then smoothly
392
+ # transition to full env-dominant optimization.
393
+ if frac >= ramp:
394
+ return 0.0
395
+ return max(0.0, 1.0 - (frac / ramp))
396
+
397
+ def _annealed_local_scale() -> float:
398
+ frac = _progress_frac()
399
+ base = local_reward_scale * (1.20 - 0.70 * frac)
400
+ return base * (1.0 + 0.70 * _curriculum_phase_weight())
401
+
402
+ def _annealed_env_scale() -> float:
403
+ w = _curriculum_phase_weight()
404
+ # Slightly downweight env reward in early easy phase to reduce variance,
405
+ # then recover to full strength by the end of ramp.
406
+ return env_reward_scale * (1.0 - 0.30 * w)
407
+
408
+ def env_reward(completions, **_):
409
+ scale = _annealed_env_scale()
410
+ raw = [scale * _env_step_reward_from_completion(str(c)) for c in completions]
411
+ if reward_ema_decay <= 0.0:
412
+ return raw
413
+ batch_mean = sum(raw) / max(len(raw), 1)
414
+ prev = reward_ema_state["env"]
415
+ d = max(0.0, min(1.0, reward_ema_decay))
416
+ if prev is None:
417
+ smoothed_mean = batch_mean
418
+ else:
419
+ smoothed_mean = (1.0 - d) * prev + d * batch_mean
420
+ reward_ema_state["env"] = smoothed_mean
421
+ delta = smoothed_mean - batch_mean
422
+ return [r + delta for r in raw]
423
+
424
+ def format_reward(completions, **_):
425
+ scale = _annealed_local_scale()
426
+ outs: list[float] = []
427
+ for c in completions:
428
+ txt = str(c).strip()
429
+ obj = _extract_json(txt)
430
+ if obj is None:
431
+ outs.append(-0.20 * scale)
432
+ continue
433
+ if obj.get("action_type") not in LEGAL_ACTION_TYPES:
434
+ outs.append(-0.20 * scale)
435
+ continue
436
+ # Encourage concise, parseable schema-correct JSON.
437
+ length_pen = -0.04 * scale if len(txt) > 500 else 0.0
438
+ outs.append(0.12 * scale + length_pen)
439
+ return outs
440
+
441
+ def semantic_action_reward(completions, prompts=None, **_):
442
+ scale = _annealed_local_scale()
443
+ outs: list[float] = []
444
+ for i, c in enumerate(completions):
445
+ obj = _extract_json(str(c))
446
+ if obj is None:
447
+ outs.append(-0.10 * scale)
448
+ continue
449
+ at = str(obj.get("action_type", ""))
450
+ ptxt = str(prompts[i] if prompts and i < len(prompts) else "").lower()
451
+ bonus = 0.0
452
+ if "critical" in ptxt and at == "reply_email":
453
+ bonus += 0.08
454
+ if "clash" in ptxt and at in ("reschedule_meeting", "cancel_meeting"):
455
+ bonus += 0.08
456
+ if ("overdue" in ptxt or "due soon" in ptxt) and at in ("complete_task", "delegate_task"):
457
+ bonus += 0.08
458
+ outs.append(scale * bonus)
459
+ return outs
460
+
461
+ def anti_idle_reward(completions, **_):
462
+ scale = _annealed_local_scale()
463
+ outs = []
464
+ for c in completions:
465
+ txt = str(c).lower()
466
+ outs.append((-0.20 if "do_nothing" in txt else 0.02) * scale)
467
+ return outs
468
+
469
+ grpo_cfg = GRPOConfig(
470
+ output_dir=str(out_dir / "grpo"),
471
+ learning_rate=grpo_lr,
472
+ per_device_train_batch_size=1,
473
+ gradient_accumulation_steps=grpo_grad_accum,
474
+ max_steps=max_grpo_steps,
475
+ logging_steps=5,
476
+ num_generations=2,
477
+ beta=grpo_beta,
478
+ lr_scheduler_type="cosine",
479
+ warmup_ratio=0.06,
480
+ max_grad_norm=1.0,
481
+ adam_beta1=0.9,
482
+ adam_beta2=0.95,
483
+ report_to=[],
484
+ )
485
+ grpo_callbacks = [_ProgressCallback()]
486
+ if stability_tripwire:
487
+ grpo_callbacks.append(
488
+ StabilityTripwire(
489
+ min_step=max(15, max_grpo_steps // 8),
490
+ reward_key="rewards/env_reward/mean",
491
+ loss_key="loss",
492
+ reward_drop=0.12,
493
+ loss_spike=0.35,
494
+ bad_streak=3,
495
+ )
496
+ )
497
+ grpo_trainer = GRPOTrainer(
498
+ model=sft_trainer.model,
499
+ processing_class=tokenizer,
500
+ reward_funcs=[env_reward, format_reward, semantic_action_reward, anti_idle_reward],
501
+ train_dataset=ds,
502
+ args=grpo_cfg,
503
+ callbacks=grpo_callbacks,
504
+ )
505
+ grpo_before = _trainable_lora_sum_abs(sft_trainer.model)
506
+ grpo_trainer.train()
507
+ progress["step"] = progress["total"]
508
+ grpo_after = _trainable_lora_sum_abs(grpo_trainer.model)
509
+ grpo_delta = abs(grpo_after - grpo_before)
510
+ print(f"GRPO LoRA delta(abs-sum): {grpo_delta:.6f}")
511
+ if grpo_delta <= 1e-6:
512
+ raise RuntimeError("GRPO appears not to have updated LoRA weights (delta too small).")
513
+ final_dir = out_dir / "grpo_adapter"
514
+ grpo_trainer.model.save_pretrained(final_dir)
515
+ tokenizer.save_pretrained(final_dir)
516
+ print(f"GRPO complete. Adapter saved: {final_dir}")
517
+
518
+
519
+ def main() -> None:
520
+ parser = argparse.ArgumentParser(description="Run SFT warmup before GRPO.")
521
+ parser.add_argument(
522
+ "--model-name",
523
+ default="",
524
+ help="Optional explicit model id. If omitted, --model-preset is used.",
525
+ )
526
+ parser.add_argument(
527
+ "--model-preset",
528
+ choices=sorted(MODEL_PRESETS.keys()),
529
+ default="small_iter_fast",
530
+ help="Recommended compute-aware preset. small_iter_fast is best for iteration speed.",
531
+ )
532
+ parser.add_argument(
533
+ "--training-preset",
534
+ choices=sorted(TRAINING_PRESETS.keys()),
535
+ default="hackathon_turbo",
536
+ help="Compute-aware run preset. hackathon_turbo is best default for iterative winning loops.",
537
+ )
538
+ parser.add_argument("--env-url", default="http://127.0.0.1:8000")
539
+ parser.add_argument("--sft-jsonl", type=Path, default=Path("outputs/sft_from_env.jsonl"))
540
+ parser.add_argument("--out-dir", type=Path, default=Path("outputs/train_runs/sft_then_grpo"))
541
+ parser.add_argument("--generate-sft-from-env", action="store_true")
542
+ parser.add_argument("--sft-samples", type=int, default=120)
543
+ parser.add_argument("--max-sft-steps", type=int, default=60)
544
+ parser.add_argument("--max-grpo-steps", type=int, default=120)
545
+ parser.add_argument("--env-reward-scale", type=float, default=1.0)
546
+ parser.add_argument("--local-reward-scale", type=float, default=0.35)
547
+ parser.add_argument(
548
+ "--complexity-curriculum",
549
+ choices=["off", "easy_to_full"],
550
+ default="easy_to_full",
551
+ help="Reward curriculum: easy_to_full starts with stronger local scaffold and anneals to env-dominant.",
552
+ )
553
+ parser.add_argument(
554
+ "--curriculum-ramp-ratio",
555
+ type=float,
556
+ default=0.60,
557
+ help="Fraction of GRPO steps used to ramp from easy scaffold to full env weighting.",
558
+ )
559
+ parser.add_argument(
560
+ "--no-stability-tripwire",
561
+ action="store_true",
562
+ help="Disable oscillation/collapse early-stop guardrails (not recommended).",
563
+ )
564
+ parser.add_argument(
565
+ "--reward-ema-decay",
566
+ type=float,
567
+ default=-1.0,
568
+ help="EMA decay in [0,1] for env reward smoothing; -1 uses training preset default.",
569
+ )
570
+ args = parser.parse_args()
571
+ model_name = args.model_name.strip() or MODEL_PRESETS[args.model_preset]
572
+ p = TRAINING_PRESETS[args.training_preset]
573
+ max_sft_steps = int(p["max_sft_steps"])
574
+ max_grpo_steps = int(p["max_grpo_steps"])
575
+ env_reward_scale = float(p["env_reward_scale"])
576
+ local_reward_scale = float(p["local_reward_scale"])
577
+ complexity_curriculum = str(p["complexity_curriculum"])
578
+ curriculum_ramp_ratio = float(p["curriculum_ramp_ratio"])
579
+ sft_samples = int(p["sft_samples"])
580
+ sft_lr = float(p["sft_lr"])
581
+ sft_grad_accum = int(p["sft_grad_accum"])
582
+ grpo_lr = float(p["grpo_lr"])
583
+ grpo_grad_accum = int(p["grpo_grad_accum"])
584
+ grpo_beta = float(p["grpo_beta"])
585
+ reward_ema_decay = float(p["reward_ema_decay"])
586
+ if args.max_sft_steps != 60:
587
+ max_sft_steps = args.max_sft_steps
588
+ if args.max_grpo_steps != 120:
589
+ max_grpo_steps = args.max_grpo_steps
590
+ if args.env_reward_scale != 1.0:
591
+ env_reward_scale = args.env_reward_scale
592
+ if args.local_reward_scale != 0.35:
593
+ local_reward_scale = args.local_reward_scale
594
+ if args.complexity_curriculum != "easy_to_full":
595
+ complexity_curriculum = args.complexity_curriculum
596
+ if args.curriculum_ramp_ratio != 0.60:
597
+ curriculum_ramp_ratio = args.curriculum_ramp_ratio
598
+ if args.sft_samples != 120:
599
+ sft_samples = args.sft_samples
600
+ if args.reward_ema_decay >= 0.0:
601
+ reward_ema_decay = float(args.reward_ema_decay)
602
+ stability_tripwire = not args.no_stability_tripwire
603
+ print(f"Model preset: {args.model_preset} -> {model_name}")
604
+ print(
605
+ "Training preset:"
606
+ f" {args.training_preset} -> sft={max_sft_steps}, grpo={max_grpo_steps},"
607
+ f" env_scale={env_reward_scale}, local_scale={local_reward_scale},"
608
+ f" curriculum={complexity_curriculum}, ramp={curriculum_ramp_ratio}"
609
+ )
610
+
611
+ if args.generate_sft_from_env or not args.sft_jsonl.exists():
612
+ generate_sft_jsonl_from_env(
613
+ env_url=args.env_url,
614
+ out_jsonl=args.sft_jsonl,
615
+ samples=sft_samples,
616
+ task_id="phase2_core",
617
+ )
618
+
619
+ run_sft_then_grpo(
620
+ model_name=model_name,
621
+ env_url=args.env_url,
622
+ sft_jsonl=args.sft_jsonl,
623
+ out_dir=args.out_dir,
624
+ env_reward_scale=env_reward_scale,
625
+ local_reward_scale=local_reward_scale,
626
+ max_sft_steps=max_sft_steps,
627
+ max_grpo_steps=max_grpo_steps,
628
+ complexity_curriculum=complexity_curriculum,
629
+ curriculum_ramp_ratio=curriculum_ramp_ratio,
630
+ sft_lr=sft_lr,
631
+ sft_grad_accum=sft_grad_accum,
632
+ grpo_lr=grpo_lr,
633
+ grpo_grad_accum=grpo_grad_accum,
634
+ grpo_beta=grpo_beta,
635
+ reward_ema_decay=reward_ema_decay,
636
+ stability_tripwire=stability_tripwire,
637
+ )
638
+
639
+
640
+ if __name__ == "__main__":
641
+ main()
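Note (illustrative, not part of the commit): the `env_reward` function above smooths only the batch mean of the env channel; every sample in a batch is shifted by the same delta between the EMA of batch means and the current batch mean, so within-batch ranking is preserved. A small sketch of that update with invented numbers:

```python
# Sketch of the env-reward EMA shift used in env_reward (numbers are illustrative).
def ema_shift(raw: list[float], prev_ema: float | None, decay: float) -> tuple[list[float], float]:
    batch_mean = sum(raw) / max(len(raw), 1)
    ema = batch_mean if prev_ema is None else (1.0 - decay) * prev_ema + decay * batch_mean
    delta = ema - batch_mean  # the same shift is applied to every sample
    return [r + delta for r in raw], ema

smoothed, ema = ema_shift([0.6, 0.0], prev_ema=0.1, decay=0.35)
print(smoothed, ema)  # [0.47, -0.13] 0.17 -> mean moves toward the EMA, ordering is unchanged
```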
server/ghostexec_environment.py CHANGED
@@ -15,6 +15,7 @@ Rewards aggregate conflict / relationship / task scores and log each step to out
 from __future__ import annotations
 
 import json
+import os
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Any
@@ -71,6 +72,7 @@ _REL_DISPLAY: dict[str, str] = {
 
 _INVALID_ACTION_REWARD = -0.25
 _DEFAULT_STEP_REWARD = 0.0
+_MOOD_ORDER: tuple[Mood, ...] = ("furious", "angry", "annoyed", "neutral", "happy")
 
 
 def _default_scenario_path() -> Path:
@@ -104,6 +106,7 @@ class GhostexecEnvironment(Environment):
         self,
         scenario_path: str | Path | None = None,
         schema_drift_events_path: str | Path | None = None,
+        reward_mode: str | None = None,
     ) -> None:
         self._scenario_path = Path(scenario_path) if scenario_path else _default_scenario_path()
         self._drift_events_path = (
@@ -124,6 +127,9 @@ class GhostexecEnvironment(Environment):
         self._last_step_error: str | None = None
         self._last_step_detail: str = ""
         self._last_reward_breakdown: RewardBreakdown | None = None
+        self._reward_mode = (reward_mode or os.getenv("GHOSTEXEC_REWARD_MODE", "full")).strip().lower()
+        if self._reward_mode not in {"full", "base", "shaping"}:
+            self._reward_mode = "full"
 
     # --- lifecycle ---
 
@@ -176,6 +182,7 @@ class GhostexecEnvironment(Environment):
 
         before = self.world.model_copy(deep=True)
         action_ok = self._apply_action(action)
+        self._apply_post_action_dynamics(action, action_ok=action_ok)
        self._rebuild_conflict_list()
 
         episode_done = False
@@ -191,6 +198,9 @@ class GhostexecEnvironment(Environment):
            action_ok=action_ok,
            episode_done=episode_done,
            relationship_suppressed_for_email_to=frozenset(self._reply_relationship_suppressed),
+           reward_mode=self._reward_mode,
+           step_index=self._state.step_count,
+           max_steps=self.world.max_episode_steps,
         )
         self._last_reward_breakdown = breakdown
         self._append_reward_log(breakdown, episode_done, action)
@@ -540,6 +550,62 @@ class GhostexecEnvironment(Environment):
         self._world.action_log.append(f"error: {msg}")
         return False
 
+    def _advance_clock(self, minutes: int) -> None:
+        now = _parse_dt(self.world.simulation_time)
+        new_now = (now + timedelta(minutes=minutes)).replace(tzinfo=None)
+        self.world.simulation_time = new_now.isoformat(timespec="seconds")
+        self._reapply_task_overdue_flags()
+
+    def _shift_contact_mood(self, name: str, delta: int) -> None:
+        if delta == 0:
+            return
+        c = self.get_contact(name)
+        if c is None:
+            return
+        idx = _MOOD_ORDER.index(c.mood)
+        next_idx = max(0, min(len(_MOOD_ORDER) - 1, idx + delta))
+        if next_idx != idx:
+            self.update_contact_mood(name, _MOOD_ORDER[next_idx])
+
+    def _apply_post_action_dynamics(self, action: GhostexecAction, *, action_ok: bool) -> None:
+        # Step-level world progression adds realistic pressure dynamics while
+        # remaining deterministic and learnable for policy optimization.
+        self._advance_clock(minutes=20)
+        now = _parse_dt(self.world.simulation_time)
+
+        if action_ok and action.action_type == "reply_email" and action.email_id:
+            em = next((e for e in self.world.emails if e.id == action.email_id), None)
+            if em:
+                self._shift_contact_mood(em.sender, +1)
+
+        if action_ok and action.action_type == "send_message" and action.contact_name:
+            self._shift_contact_mood(action.contact_name.strip(), +1)
+
+        if action_ok and action.action_type == "cancel_meeting" and action.meeting_id:
+            mtg = next((m for m in self.world.meetings if m.id == action.meeting_id), None)
+            if mtg:
+                for attendee in mtg.attendees:
+                    self._shift_contact_mood(attendee, -1)
+
+        # Pressure escalation only on idle/invalid behavior to keep
+        # action-quality separation sharp for learning.
+        if (not action_ok) or action.action_type == "do_nothing":
+            critical_pending = [e for e in self.world.emails if e.priority == "critical" and not e.replied]
+            if critical_pending:
+                self._shift_contact_mood(critical_pending[0].sender, -1)
+
+        # Meetings that have already ended without cancellation and still overlap
+        # indicate unresolved calendar debt; this increases stress pressure.
+        unresolved_past_conflicts = 0
+        for row in self.detect_meeting_conflicts():
+            overlap_end = _parse_dt(row["overlap_end"])
+            if overlap_end <= now:
+                unresolved_past_conflicts += 1
+        if unresolved_past_conflicts > 0:
+            self.world.action_log.append(
+                f"pressure: {unresolved_past_conflicts} unresolved past overlap(s) increased stress pressure."
+            )
+
     def _ensure_reward_log_dir(self) -> None:
         self._reward_log_path.parent.mkdir(parents=True, exist_ok=True)
 
@@ -566,6 +632,13 @@ class GhostexecEnvironment(Environment):
             "task": breakdown.task,
             "weighted_base": breakdown.weighted_base,
             "output_scale": breakdown.output_scale,
+            "shaping_synergy": breakdown.shaping_synergy,
+            "shaping_tradeoff": breakdown.shaping_tradeoff,
+            "shaping_potential": breakdown.shaping_potential,
+            "shaping_scaffold": breakdown.shaping_scaffold,
+            "shaping_quality": breakdown.shaping_quality,
+            "shaping_total": breakdown.shaping_total,
+            "shaping_to_base_ratio": breakdown.shaping_to_base_ratio,
             "invalid_step_adjustment": breakdown.invalid_step_adjustment,
             "episode_completion_bonus": breakdown.episode_completion_bonus,
             "catastrophic_penalty": breakdown.catastrophic_penalty,
@@ -573,6 +646,7 @@ class GhostexecEnvironment(Environment):
             "calendar_overlap_pairs": len(self.detect_meeting_conflicts()),
             "critical_unreplied": crit_open,
             "overdue_tasks": overdue_n,
+            "reward_mode": self._reward_mode,
         }
         with self._reward_log_path.open("a", encoding="utf-8") as fh:
             fh.write(json.dumps(line) + "\n")
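Note (illustrative, not part of the commit): based on the constructor change above, the reward mode can apparently be selected per instance or through the `GHOSTEXEC_REWARD_MODE` environment variable, with the explicit argument taking precedence. A minimal sketch, assuming the class is importable as `server.ghostexec_environment` and using a placeholder scenario path:

```python
# Sketch: choosing the reward mode introduced in this commit.
# The import path and scenario file are assumptions, not guaranteed by this repo layout.
import os

from server.ghostexec_environment import GhostexecEnvironment

os.environ["GHOSTEXEC_REWARD_MODE"] = "shaping"  # process-wide default
env = GhostexecEnvironment("scenarios/example.json", reward_mode="base")  # explicit arg wins
env.reset()
```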
server/reward.py CHANGED
@@ -12,6 +12,7 @@ scaling.
 
 from __future__ import annotations
 
+import math
 from datetime import datetime, timedelta, timezone
 from typing import Any
 
@@ -41,6 +42,12 @@ _SEND_MESSAGE_VALID_MICRO_BONUS: float = 0.08
 _COMPLETE_TASK_VALID_MICRO_BONUS: float = 0.06
 _DELEGATE_TASK_VALID_MICRO_BONUS: float = 0.10
 _DO_NOTHING_STRICT_PENALTY: float = -0.15
+_SYNERGY_CAP: float = 0.40
+_TRADEOFF_CAP: float = 0.30
+_POTENTIAL_CAP: float = 0.25
+_SCAFFOLD_CAP: float = 0.35
+_SHAPING_TO_BASE_BUDGET: float = 1.25
+_QUALITY_CAP: float = 0.28
 _REPLY_PRIORITY_MICRO_BONUS: dict[str, float] = {
     "critical": 0.30,
     "high": 0.15,
@@ -261,6 +268,95 @@
     return vip_furious or critical_open > 3
 
 
+def _scaffold_learning_signal(
+    before: WorldState,
+    after: WorldState,
+    action: GhostexecAction,
+    *,
+    action_ok: bool,
+    step_index: int | None,
+    max_steps: int | None,
+) -> float:
+    if not action_ok:
+        return 0.0
+    if action.action_type == "do_nothing":
+        return 0.0
+    s = 0.0
+    critical_before = critical_unreplied_count(before)
+    critical_after = critical_unreplied_count(after)
+    conflict_before = len(meeting_conflicts(before))
+    conflict_after = len(meeting_conflicts(after))
+    overdue_before = len(_overdue_tasks(before))
+    overdue_after = len(_overdue_tasks(after))
+    if action.action_type == "reply_email":
+        if critical_after < critical_before:
+            s += 0.16
+        elif critical_before > 0:
+            s += 0.05
+    if action.action_type in ("reschedule_meeting", "cancel_meeting"):
+        if conflict_after < conflict_before:
+            s += 0.15
+        elif conflict_before > 0:
+            s += 0.04
+    if action.action_type in ("complete_task", "delegate_task"):
+        if overdue_after < overdue_before:
+            s += 0.12
+        elif overdue_before > 0:
+            s += 0.03
+    # Early episode shaping slightly amplified for better exploration guidance.
+    if step_index is not None and max_steps and max_steps > 0:
+        frac = max(0.0, min(1.0, step_index / max_steps))
+        if frac <= 0.33:
+            s *= 1.20
+        elif frac >= 0.85:
+            s *= 0.90
+
+    return max(-_SCAFFOLD_CAP, min(_SCAFFOLD_CAP, s))
+
+
+def _state_potential(world: WorldState) -> float:
+    conflicts = len(meeting_conflicts(world))
+    critical_open = critical_unreplied_count(world)
+    overdue = len(_overdue_tasks(world))
+    stress = float(world.stress)
+    # Lower operational pressure => higher potential.
+    return -(
+        1.15 * critical_open
+        + 0.90 * conflicts
+        + 0.55 * overdue
+        + 0.02 * stress
+    )
+
+
+def _budgeted_shaping_total(base_weighted_inner: float, shaping_total_inner: float) -> float:
+    # Keep shaping informative but bounded against the base objective to avoid exploit loops.
+    budget = _SHAPING_TO_BASE_BUDGET * (abs(base_weighted_inner) + 0.05)
+    return max(-budget, min(budget, shaping_total_inner))
+
+
+def _quality_separation_signal(
+    *,
+    c: float,
+    r: float,
+    t: float,
+    action: GhostexecAction,
+    action_ok: bool,
+) -> float:
+    # Amplify distance between clearly good vs clearly bad valid actions.
+    if not action_ok or action.action_type == "do_nothing":
+        return 0.0
+    base = W_CONFLICT * c + W_REL * r + W_TASK * t
+    if base >= 0.90:
+        return _QUALITY_CAP
+    if base >= 0.35:
+        return 0.12
+    if base <= -0.90:
+        return -_QUALITY_CAP
+    if base <= -0.35:
+        return -0.12
+    return 0.0
+
+
 def aggregate_scores(
     conflict: float,
     relationship: float,
@@ -269,11 +365,23 @@
     conflict_raw: float,
     critical_queue_bonus: float,
     weighted_inner: float,
+    weighted_base_only: float,
+    shaping_synergy: float,
+    shaping_tradeoff: float,
+    shaping_potential: float,
+    shaping_scaffold: float,
+    shaping_quality: float,
     action_ok: bool,
     episode_done: bool,
     world_after: WorldState,
 ) -> RewardBreakdown:
     weighted = WEIGHTED_OUTPUT_SCALE * weighted_inner
+    weighted_base_only_scaled = WEIGHTED_OUTPUT_SCALE * weighted_base_only
+    shaping_total = WEIGHTED_OUTPUT_SCALE * (
+        shaping_synergy + shaping_tradeoff + shaping_potential + shaping_scaffold + shaping_quality
+    )
+    denom = abs(weighted_base_only_scaled) + 1e-6
+    shaping_ratio = min(10.0, abs(shaping_total) / denom)
     inv = 0.0
     if not action_ok:
         inv = -0.25
@@ -291,6 +399,13 @@
         conflict=conflict,
         relationship=relationship,
         task=task,
+        shaping_synergy=WEIGHTED_OUTPUT_SCALE * shaping_synergy,
+        shaping_tradeoff=WEIGHTED_OUTPUT_SCALE * shaping_tradeoff,
+        shaping_potential=WEIGHTED_OUTPUT_SCALE * shaping_potential,
+        shaping_scaffold=WEIGHTED_OUTPUT_SCALE * shaping_scaffold,
+        shaping_quality=WEIGHTED_OUTPUT_SCALE * shaping_quality,
+        shaping_total=shaping_total,
+        shaping_to_base_ratio=shaping_ratio,
         weighted_base=weighted,
         output_scale=WEIGHTED_OUTPUT_SCALE,
         invalid_step_adjustment=inv,
@@ -322,6 +437,9 @@
     action_ok: bool,
     episode_done: bool,
     relationship_suppressed_for_email_to: frozenset[str] | None = None,
+    reward_mode: str = "full",
+    step_index: int | None = None,
+    max_steps: int | None = None,
 ) -> RewardBreakdown:
     c_core = score_conflict_resolution(before, after, action, action_ok=action_ok)
     crit_b = score_critical_queue_bonus(before, after)
@@ -335,7 +453,48 @@
         relationship_suppressed_for_email_to=relationship_suppressed_for_email_to,
     )
     t = score_task_completion(before, after, action, action_ok=action_ok)
-    weighted_inner = W_CONFLICT * c + W_REL * r + W_TASK * t
+    weighted_base_only = W_CONFLICT * c + W_REL * r + W_TASK * t
+    weighted_inner = weighted_base_only
+    synergy = 0.0
+    tradeoff_penalty = 0.0
+    potential_progress = 0.0
+    scaffold_signal = 0.0
+    quality_signal = 0.0
+    if reward_mode in ("full", "shaping"):
+        # Bounded nonlinear shaping to speed learning without overpowering base channels.
+        if c > 0.0 and r > 0.0:
+            synergy += min(_SYNERGY_CAP, 0.18 * math.tanh(0.35 * c * r))
+        if t > 0.0 and (c > 0.0 or r > 0.0):
+            bridge = max(c, 0.0) + max(r, 0.0)
+            synergy += min(_SYNERGY_CAP, 0.10 * math.tanh(0.25 * t * bridge))
+        if c < -0.5 and r < -0.5:
+            tradeoff_penalty -= min(_TRADEOFF_CAP, 0.12 * math.tanh(0.25 * abs(c * r)))
+        if t < -0.5 and (c < 0.0 or r < 0.0):
+            debt = abs(t) * (abs(min(c, 0.0)) + abs(min(r, 0.0)))
+            tradeoff_penalty -= min(_TRADEOFF_CAP, 0.08 * math.tanh(0.18 * debt))
+        potential_progress = max(
+            -_POTENTIAL_CAP,
+            min(_POTENTIAL_CAP, _state_potential(after) - _state_potential(before)),
+        )
+        scaffold_signal = _scaffold_learning_signal(
+            before,
+            after,
+            action,
+            action_ok=action_ok,
+            step_index=step_index,
+            max_steps=max_steps,
+        )
+        quality_signal = _quality_separation_signal(
+            c=c,
+            r=r,
+            t=t,
+            action=action,
+            action_ok=action_ok,
+        )
+        shaping_total_inner = (
+            synergy + tradeoff_penalty + potential_progress + scaffold_signal + quality_signal
+        )
+        weighted_inner += _budgeted_shaping_total(weighted_base_only, shaping_total_inner)
     bd = aggregate_scores(
         c,
         r,
@@ -343,6 +502,12 @@
         conflict_raw=c_raw,
         critical_queue_bonus=crit_b,
         weighted_inner=weighted_inner,
+        weighted_base_only=weighted_base_only,
+        shaping_synergy=synergy,
+        shaping_tradeoff=tradeoff_penalty,
+        shaping_potential=potential_progress,
+        shaping_scaffold=scaffold_signal,
+        shaping_quality=quality_signal,
         action_ok=action_ok,
         episode_done=episode_done,
         world_after=after,
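Note (illustrative, not part of the commit): with `_SHAPING_TO_BASE_BUDGET = 1.25`, a base weighted score of 0.2 allows at most 1.25 * (0.2 + 0.05) = 0.3125 of shaping in either direction, so shaping cannot dwarf the base channels. A quick numeric check of the clamp used by `_budgeted_shaping_total`, reimplemented standalone for illustration:

```python
# Standalone check of the shaping budget clamp (values are illustrative).
def budgeted(base: float, shaping: float, budget_ratio: float = 1.25) -> float:
    budget = budget_ratio * (abs(base) + 0.05)
    return max(-budget, min(budget, shaping))

print(budgeted(0.2, 0.9))  # 0.3125 -> large shaping is clamped to the budget
print(budgeted(0.2, 0.1))  # 0.1    -> small shaping passes through unchanged
```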
tests/test_phase4.py CHANGED
@@ -28,6 +28,12 @@ def test_reward_weights_and_aggregator_helpers():
         conflict_raw=c,
         critical_queue_bonus=0.0,
         weighted_inner=weighted_inner,
+        weighted_base_only=weighted_inner,
+        shaping_synergy=0.0,
+        shaping_tradeoff=0.0,
+        shaping_potential=0.0,
+        shaping_scaffold=0.0,
+        shaping_quality=0.0,
         action_ok=True,
         episode_done=False,
         world_after=w,
@@ -92,6 +98,25 @@ def test_scripted_episode_reward_direction_and_log(tmp_path, monkeypatch):
     assert "reward" in row and "episode_id" in row
     assert row.get("action_type") == "reschedule_meeting"
     assert "conflict_raw" in row and "step_ok" in row
+    assert "shaping_total" in row and "shaping_to_base_ratio" in row
+    assert "shaping_scaffold" in row
+    assert row.get("reward_mode") == "full"
+
+
+def test_reward_mode_base_turns_off_shaping_terms():
+    env = GhostexecEnvironment(SCENARIO, reward_mode="base")
+    env.reset()
+    obs = env.step(
+        GhostexecAction(
+            action_type="reschedule_meeting",
+            meeting_id="m02",
+            new_time="2026-04-21T18:00:00",
+        )
+    )
+    bd = (obs.metadata or {}).get("reward_breakdown") or {}
+    assert float(bd.get("shaping_synergy") or 0.0) == pytest.approx(0.0)
+    assert float(bd.get("shaping_tradeoff") or 0.0) == pytest.approx(0.0)
+    assert float(bd.get("shaping_potential") or 0.0) == pytest.approx(0.0)
 
 
 def test_schema_drift_events_mutate_world():
uv.lock CHANGED
The diff for this file is too large to render. See raw diff