Upload folder using huggingface_hub

Files changed:
- README.md +1 -3
- inference.py +137 -12
- notebooks/ghostexec_unsloth_grpo_hf_api.ipynb +0 -0
- scripts/train_sft_then_grpo.py +1 -140
README.md
CHANGED
@@ -253,9 +253,7 @@ This performs:
 - GRPO continuation from the SFT adapter.
 - Mixed reward shaping where env-derived reward remains active and local shaping can be down-weighted/up-weighted via scales.
 - Optional complexity curriculum (`easy_to_full`) that starts with stronger scaffold/local signals and anneals to env-dominant reward later.
-- Stability-first optimization defaults (cosine schedule + warmup + grad clipping + higher GRPO KL beta)
-- `--reward-ema-decay 0..1` smooths the *env* reward channel (defaults come from `--training-preset`).
-- omit `--no-stability-tripwire` to enable early stopping when logs show repeated “env reward down + loss up” (GRPO) or repeated loss blow-ups (SFT).
+- Stability-first optimization defaults (cosine schedule + warmup + grad clipping + higher GRPO KL beta). Optional `--reward-ema-decay 0..1` smooths the *env* reward channel (defaults come from `--training-preset`). Training always runs the full `max_*_steps` (no early-stop callbacks).
 
 Recommended model strategy for hackathon iteration speed:
 - Start with `--model-preset small_iter_fast` (`unsloth/Qwen2.5-3B-Instruct`) + QLoRA.
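The `--reward-ema-decay` knob kept in the rewritten bullet is a plain exponential moving average over the env reward channel. A minimal sketch of that smoothing, assuming `decay` plays the role of `--reward-ema-decay` (the `smooth_rewards` helper is an illustrative name, not code from this repo):

```python
# Illustrative sketch only: exponential moving average over env rewards.
# `decay` corresponds to --reward-ema-decay; 0.0 passes rewards through,
# values near 1.0 weight history heavily. `smooth_rewards` is a made-up name.
def smooth_rewards(raw_rewards: list[float], decay: float) -> list[float]:
    smoothed: list[float] = []
    ema: float | None = None
    for r in raw_rewards:
        # First observation seeds the average; later ones blend in at (1 - decay).
        ema = r if ema is None else decay * ema + (1.0 - decay) * r
        smoothed.append(ema)
    return smoothed


print(smooth_rewards([0.2, 0.9, 0.1, 0.8], decay=0.9))
# ≈ [0.2, 0.27, 0.253, 0.308]: a noisy channel flattened toward its history
```

Higher decay trades responsiveness for variance reduction, which fits the bullet's stability-first framing.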
inference.py
CHANGED
@@ -1,9 +1,15 @@
 """
-Baseline runner for the Ghostexec submission.
+Baseline runner for the Ghostexec OpenEnv submission.
 
-
-
-
+Links (keep these in sync when you change the env):
+- **openenv.yaml** — `name`, `port`, `tasks[].id`, `tasks[].grader`, `max_steps`, `difficulties`
+- **graders.py** — episode-level scores in (0.01, 0.99); symbols referenced by `tasks[].grader`
+- **scenarios/*.json** — fixtures named in each task description in `openenv.yaml`
+- **server/** — FastAPI app from `openenv.yaml` `app:` (`server.app:app`)
+
+This script calls the deployed/local env over HTTP (`/reset`, `/step`), queries an LLM via the
+OpenAI-compatible HF router, then aggregates step rewards with the **same** grader functions
+used for OpenEnv validation (must match `openenv.yaml` task table).
 """
 
 from __future__ import annotations
@@ -11,6 +17,8 @@ from __future__ import annotations
 import argparse
 import json
 import os
+import re
+from pathlib import Path
 from typing import Any, Iterable
 
 import requests
@@ -23,11 +31,14 @@ except ImportError:
 from graders import dinner_disaster_grader, monday_morning_grader, phase2_core_grader
 from models import GhostexecAction
 
+REPO_ROOT = Path(__file__).resolve().parent
+OPENENV_SPEC = REPO_ROOT / "openenv.yaml"
 
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
 HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
-
+# Default matches openenv.yaml `port: 8000` and `uv run server` / Spaces proxy.
+ENV_URL = os.getenv("ENV_URL", "http://127.0.0.1:8000").rstrip("/")
 TASK_OVERRIDE = os.getenv("TASK_NAME", "").strip()
 BENCHMARK = "ghostexec"
 
@@ -44,6 +55,77 @@ TASK_TO_GRADER = {
     "dinner_disaster": dinner_disaster_grader,
 }
 
+_GRADER_TO_SYMBOL = {
+    phase2_core_grader: "graders.phase2_core_grader",
+    monday_morning_grader: "graders.monday_morning_grader",
+    dinner_disaster_grader: "graders.dinner_disaster_grader",
+}
+
+
+def load_openenv_task_rows(spec_path: Path) -> list[dict[str, str]]:
+    """Parse task `id` + `grader` from openenv.yaml without requiring PyYAML."""
+    if not spec_path.is_file():
+        return []
+    rows: list[dict[str, str]] = []
+    cur: dict[str, str] | None = None
+    for raw in spec_path.read_text(encoding="utf-8").splitlines():
+        line = raw.rstrip()
+        m_id = re.match(r"^\s*-\s+id:\s*(\S+)\s*$", line)
+        if m_id:
+            if cur and cur.get("id"):
+                rows.append(cur)
+            cur = {"id": m_id.group(1).strip()}
+            continue
+        if cur is not None:
+            m_gr = re.match(r"^\s+grader:\s*(\S+)\s*$", line)
+            if m_gr:
+                cur["grader"] = m_gr.group(1).strip()
+    if cur and cur.get("id"):
+        rows.append(cur)
+    return rows
+
+
+def openenv_max_steps(spec_path: Path) -> int | None:
+    if not spec_path.is_file():
+        return None
+    m = re.search(r"(?m)^max_steps:\s*(\d+)\s*$", spec_path.read_text(encoding="utf-8"))
+    return int(m.group(1)) if m else None
+
+
+def verify_openenv_alignment(spec_path: Path = OPENENV_SPEC) -> list[str]:
+    """Return human-readable warnings if inference tables drift from openenv.yaml."""
+    warnings: list[str] = []
+    rows = load_openenv_task_rows(spec_path)
+    if not rows:
+        warnings.append(f"Could not read tasks from {spec_path} — skipping alignment check.")
+        return warnings
+
+    yaml_ids = [r["id"] for r in rows]
+    if tuple(yaml_ids) != TASK_SETS["all"]:
+        warnings.append(
+            f"openenv.yaml task order/ids {yaml_ids!r} != inference TASK_SETS['all'] {list(TASK_SETS['all'])!r}"
+        )
+
+    for row in rows:
+        tid = row["id"]
+        gref = row.get("grader", "")
+        fn = TASK_TO_GRADER.get(tid)
+        if fn is None:
+            warnings.append(f"openenv.yaml task {tid!r} has no TASK_TO_GRADER entry in inference.py")
+            continue
+        expected = _GRADER_TO_SYMBOL.get(fn)
+        if expected and gref and gref != expected:
+            warnings.append(
+                f"Task {tid!r}: openenv.yaml grader {gref!r} != inference mapping {expected!r}"
+            )
+
+    for tid in TASK_SETS["all"]:
+        if tid not in yaml_ids:
+            warnings.append(f"inference TASK_SETS includes {tid!r} but openenv.yaml has no such task id")
+
+    return warnings
+
+
 SYSTEM_MESSAGE = """
 You are acting as an AI Chief-of-Staff assistant in Ghostexec.
 
@@ -78,8 +160,12 @@ Rules:
 """.strip()
 
 
-def emit_start(task_name: str) -> None:
-
+def emit_start(task_name: str, max_steps_hint: int | None) -> None:
+    ms = f" max_steps={max_steps_hint}" if max_steps_hint is not None else ""
+    print(
+        f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME} env_url={ENV_URL}{ms}",
+        flush=True,
+    )
 
 
 def emit_step(step_no: int, action_text: str, reward: float, done: bool, error: str | None) -> None:
@@ -202,13 +288,13 @@ def final_score(task_name: str, rewards: list[float]) -> float:
     return float(grader({"rewards": rewards}))
 
 
-def run_one_task(llm: Any, task_name: str) -> None:
+def run_one_task(llm: Any, task_name: str, *, max_steps_hint: int | None) -> None:
     rewards: list[float] = []
     steps_taken = 0
     score = 0.0
     success = False
 
-    emit_start(task_name)
+    emit_start(task_name, max_steps_hint)
 
     try:
         result = fetch_reset(task_name)
@@ -247,18 +333,57 @@ def run_one_task(llm: Any, task_name: str) -> None:
 
 
 def main() -> None:
-    parser = argparse.ArgumentParser(
+    parser = argparse.ArgumentParser(
+        description="Run the Ghostexec baseline agent (HTTP env + HF OpenAI-compatible router)."
+    )
     parser.add_argument(
         "--difficulty",
         choices=["easy", "medium", "hard", "all"],
         default="all",
-        help="Which task subset to run",
+        help="Which task subset to run (mirrors openenv.yaml difficulties / tasks).",
+    )
+    parser.add_argument(
+        "--env-url",
+        default="",
+        help="Override Ghostexec HTTP base URL (else ENV_URL env or default 127.0.0.1:8000).",
+    )
+    parser.add_argument(
+        "--list-tasks",
+        action="store_true",
+        help="Print tasks parsed from openenv.yaml and exit.",
+    )
+    parser.add_argument(
+        "--check-alignment",
+        action="store_true",
+        help="Verify inference.py TASK_TO_GRADER matches openenv.yaml; print warnings and exit 1 if drift.",
     )
     args = parser.parse_args()
 
+    global ENV_URL
+    if args.env_url.strip():
+        ENV_URL = args.env_url.strip().rstrip("/")
+
+    if args.list_tasks:
+        for row in load_openenv_task_rows(OPENENV_SPEC):
+            print(row.get("id", ""), "->", row.get("grader", "?"))
+        return
+
+    drift = verify_openenv_alignment(OPENENV_SPEC)
+    for w in drift:
+        print(f"[openenv] {w}", flush=True)
+
+    if args.check_alignment:
+        hard = [x for x in drift if not x.startswith("Could not read")]
+        if hard:
+            for x in hard:
+                print(f"[ALIGNMENT ERROR] {x}", flush=True)
+            raise SystemExit(1)
+        return
+
+    max_steps_hint = openenv_max_steps(OPENENV_SPEC)
    llm = client()
     for task_name in choose_tasks(args.difficulty):
-        run_one_task(llm, task_name)
+        run_one_task(llm, task_name, max_steps_hint=max_steps_hint)
 
 
 if __name__ == "__main__":
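The new `load_openenv_task_rows` / `openenv_max_steps` helpers only recognize a specific spec shape: `- id:` list items, an indented `grader:` field, and a top-level `max_steps:`. A hedged sketch of a spec those regexes would accept (grader symbols mirror `TASK_TO_GRADER`; the first two task ids are inferred from the grader names and the other field values are placeholders, not the repo's actual openenv.yaml):

```python
# Hypothetical spec shaped to satisfy the regexes in load_openenv_task_rows()
# and openenv_max_steps(); field values here are placeholders.
from pathlib import Path

SPEC_TEXT = """\
name: ghostexec
port: 8000
max_steps: 12
tasks:
  - id: phase2_core
    grader: graders.phase2_core_grader
  - id: monday_morning
    grader: graders.monday_morning_grader
  - id: dinner_disaster
    grader: graders.dinner_disaster_grader
"""

spec = Path("openenv_example.yaml")
spec.write_text(SPEC_TEXT, encoding="utf-8")

# With inference.py importable, the helpers would return:
#   load_openenv_task_rows(spec)
#   -> [{'id': 'phase2_core', 'grader': 'graders.phase2_core_grader'}, ...]
#   openenv_max_steps(spec)  # -> 12
```

Per the new CLI flags, `python inference.py --list-tasks` prints these parsed rows, and `--check-alignment` exits with status 1 when the spec and the in-script tables drift.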
notebooks/ghostexec_unsloth_grpo_hf_api.ipynb
CHANGED

The diff for this file is too large to render. See raw diff.
scripts/train_sft_then_grpo.py
CHANGED
@@ -67,115 +67,6 @@ TRAINING_PRESETS: dict[str, dict[str, float | int | str]] = {
 }
 
 
-def _as_float(x: object | None) -> float | None:
-    if x is None:
-        return None
-    try:
-        return float(x)
-    except Exception:
-        return None
-
-
-class StabilityTripwire(TrainerCallback):
-    """Stop training when logs show sustained reward collapse + loss blow-up."""
-
-    def __init__(
-        self,
-        *,
-        min_step: int,
-        reward_key: str,
-        loss_key: str,
-        reward_drop: float,
-        loss_spike: float,
-        bad_streak: int,
-    ) -> None:
-        self.min_step = min_step
-        self.reward_key = reward_key
-        self.loss_key = loss_key
-        self.reward_drop = reward_drop
-        self.loss_spike = loss_spike
-        self.bad_streak = bad_streak
-        self._best_reward: float | None = None
-        self._best_loss: float | None = None
-        self._streak = 0
-
-    def on_log(self, args, state, control, logs=None, **kw):  # type: ignore[no-untyped-def]
-        logs = logs or {}
-        step = int(getattr(state, "global_step", 0) or 0)
-        if step < self.min_step:
-            return control
-
-        r = _as_float(logs.get(self.reward_key))
-        loss = _as_float(logs.get(self.loss_key))
-
-        reward_bad = False
-        loss_bad = False
-
-        if r is not None:
-            if self._best_reward is None or r > self._best_reward:
-                self._best_reward = r
-            elif self._best_reward is not None and self._best_reward - r >= self.reward_drop:
-                reward_bad = True
-
-        if loss is not None:
-            if self._best_loss is None or loss < self._best_loss:
-                self._best_loss = loss
-            elif self._best_loss is not None and loss - self._best_loss >= self.loss_spike:
-                loss_bad = True
-
-        bad = reward_bad and loss_bad and r is not None and loss is not None
-
-        if bad:
-            self._streak += 1
-        else:
-            self._streak = 0
-
-        if self._streak >= self.bad_streak:
-            print(
-                f"[STABILITY] stopping: sustained instability "
-                f"(best {self.reward_key}={self._best_reward}, best loss={self._best_loss}, streak={self._streak})."
-            )
-            control.should_training_stop = True
-        return control
-
-
-class LossSpikeTripwire(TrainerCallback):
-    """SFT guardrail: stop if loss repeatedly blows up vs the best-so-far."""
-
-    def __init__(self, *, min_step: int, loss_key: str, loss_spike: float, bad_streak: int) -> None:
-        self.min_step = min_step
-        self.loss_key = loss_key
-        self.loss_spike = loss_spike
-        self.bad_streak = bad_streak
-        self._best_loss: float | None = None
-        self._streak = 0
-
-    def on_log(self, args, state, control, logs=None, **kw):  # type: ignore[no-untyped-def]
-        logs = logs or {}
-        step = int(getattr(state, "global_step", 0) or 0)
-        if step < self.min_step:
-            return control
-
-        loss = _as_float(logs.get(self.loss_key))
-        if loss is None:
-            return control
-
-        if self._best_loss is None or loss < self._best_loss:
-            self._best_loss = loss
-            self._streak = 0
-            return control
-
-        if self._best_loss is not None and loss - self._best_loss >= self.loss_spike:
-            self._streak += 1
-        else:
-            self._streak = 0
-
-        if self._streak >= self.bad_streak:
-            print(f"[STABILITY] stopping SFT: repeated loss spikes (best={self._best_loss}, streak={self._streak}).")
-            control.should_training_stop = True
-        return control
-
-
 def _extract_briefing(reset_payload: dict[str, Any]) -> str:
     obs = reset_payload.get("observation", reset_payload)
     if isinstance(obs, dict):
@@ -251,7 +142,6 @@ def run_sft_then_grpo(
     grpo_grad_accum: int,
     grpo_beta: float,
     reward_ema_decay: float,
-    stability_tripwire: bool,
 ) -> None:
     try:
         from datasets import load_dataset
@@ -315,16 +205,6 @@ def run_sft_then_grpo(
         dataset_text_field="prompt",
         formatting_func=lambda ex: [f"{p}\n\n{c}" for p, c in zip(ex["prompt"], ex["completion"])],
     )
-    if stability_tripwire:
-        sft_trainer.add_callback(
-            LossSpikeTripwire(
-                min_step=max(10, max_sft_steps // 6),
-                loss_key="loss",
-                loss_spike=0.85,
-                bad_streak=4,
-            )
-        )
-
     sft_before = _trainable_lora_sum_abs(policy)
     sft_trainer.train()
     sft_after = _trainable_lora_sum_abs(sft_trainer.model)
@@ -482,25 +362,13 @@ def run_sft_then_grpo(
         adam_beta2=0.95,
         report_to=[],
     )
-    grpo_callbacks = [_ProgressCallback()]
-    if stability_tripwire:
-        grpo_callbacks.append(
-            StabilityTripwire(
-                min_step=max(15, max_grpo_steps // 8),
-                reward_key="rewards/env_reward/mean",
-                loss_key="loss",
-                reward_drop=0.12,
-                loss_spike=0.35,
-                bad_streak=3,
-            )
-        )
     grpo_trainer = GRPOTrainer(
         model=sft_trainer.model,
         processing_class=tokenizer,
         reward_funcs=[env_reward, format_reward, semantic_action_reward, anti_idle_reward],
         train_dataset=ds,
         args=grpo_cfg,
-        callbacks=grpo_callbacks,
+        callbacks=[_ProgressCallback()],
     )
     grpo_before = _trainable_lora_sum_abs(sft_trainer.model)
     grpo_trainer.train()
@@ -556,11 +424,6 @@ def main() -> None:
         default=0.60,
         help="Fraction of GRPO steps used to ramp from easy scaffold to full env weighting.",
     )
-    parser.add_argument(
-        "--no-stability-tripwire",
-        action="store_true",
-        help="Disable oscillation/collapse early-stop guardrails (not recommended).",
-    )
     parser.add_argument(
         "--reward-ema-decay",
         type=float,
@@ -599,7 +462,6 @@ def main() -> None:
     sft_samples = args.sft_samples
     if args.reward_ema_decay >= 0.0:
         reward_ema_decay = float(args.reward_ema_decay)
-    stability_tripwire = not args.no_stability_tripwire
     print(f"Model preset: {args.model_preset} -> {model_name}")
     print(
         "Training preset:"
@@ -633,7 +495,6 @@ def main() -> None:
         grpo_grad_accum=grpo_grad_accum,
         grpo_beta=grpo_beta,
         reward_ema_decay=reward_ema_decay,
-        stability_tripwire=stability_tripwire,
     )
 
 
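With the tripwires removed, GRPO keeps only `_ProgressCallback`, which this diff never defines. A minimal stand-in in the same `TrainerCallback` style, assuming the real callback just echoes training progress (this is a sketch, not the repo's implementation; the `rewards/env_reward/mean` key comes from the removed tripwire config):

```python
from transformers import TrainerCallback


class ProgressCallback(TrainerCallback):
    """Hypothetical stand-in for _ProgressCallback: echo key metrics on each log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        logs = logs or {}
        step = int(getattr(state, "global_step", 0) or 0)
        loss = logs.get("loss")
        env_reward = logs.get("rewards/env_reward/mean")
        print(f"[progress] step={step} loss={loss} env_reward={env_reward}", flush=True)
        return control
```

Because this callback never sets `control.should_training_stop`, both phases now run the full `max_*_steps`, matching the rewritten README bullet.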