modelbuilderhq committed · verified
Commit d669b0f · 1 Parent(s): ee21104

Upload folder using huggingface_hub
README.md CHANGED
@@ -253,9 +253,7 @@ This performs:
 - GRPO continuation from the SFT adapter.
 - Mixed reward shaping where env-derived reward remains active and local shaping can be down-weighted/up-weighted via scales.
 - Optional complexity curriculum (`easy_to_full`) that starts with stronger scaffold/local signals and anneals to env-dominant reward later.
-- Stability-first optimization defaults (cosine schedule + warmup + grad clipping + higher GRPO KL beta) and optional guardrails:
-  - `--reward-ema-decay 0..1` smooths the *env* reward channel (defaults come from `--training-preset`).
-  - omit `--no-stability-tripwire` to enable early stopping when logs show repeated “env reward down + loss up” (GRPO) or repeated loss blow-ups (SFT).
+- Stability-first optimization defaults (cosine schedule + warmup + grad clipping + higher GRPO KL beta). Optional `--reward-ema-decay 0..1` smooths the *env* reward channel (defaults come from `--training-preset`). Training always runs the full `max_*_steps` (no early-stop callbacks).
 
 Recommended model strategy for hackathon iteration speed:
 - Start with `--model-preset small_iter_fast` (`unsloth/Qwen2.5-3B-Instruct`) + QLoRA.
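The `--reward-ema-decay` knob above is a plain exponential moving average over the env reward channel. Below is a minimal sketch of that smoothing, assuming the standard EMA recurrence; the `RewardEMA` name and standalone form are illustrative, not the repo's actual implementation:

```python
class RewardEMA:
    """Exponential moving average over a scalar reward stream.

    decay in [0, 1): higher decay reacts more slowly but damps noisy env rewards.
    """

    def __init__(self, decay: float) -> None:
        if not 0.0 <= decay < 1.0:
            raise ValueError("decay must be in [0, 1)")
        self.decay = decay
        self.value: float | None = None

    def update(self, reward: float) -> float:
        # Seed with the first observation, then blend: v <- d*v + (1-d)*r.
        if self.value is None:
            self.value = reward
        else:
            self.value = self.decay * self.value + (1.0 - self.decay) * reward
        return self.value


# With --reward-ema-decay 0.9, each new env reward moves the smoothed value by ~10%.
ema = RewardEMA(decay=0.9)
print([round(ema.update(r), 3) for r in (0.2, 0.8, 0.1, 0.9)])
```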
inference.py CHANGED
@@ -1,9 +1,15 @@
 """
-Baseline runner for the Ghostexec submission.
+Baseline runner for the Ghostexec OpenEnv submission.
 
-This script queries a chat model through the OpenAI client, sends its decision
-to the environment server, and prints machine-readable lines expected by simple
-evaluators/log parsers.
+Links (keep these in sync when you change the env):
+- **openenv.yaml** — `name`, `port`, `tasks[].id`, `tasks[].grader`, `max_steps`, `difficulties`
+- **graders.py** — episode-level scores in (0.01, 0.99); symbols referenced by `tasks[].grader`
+- **scenarios/*.json** — fixtures named in each task description in `openenv.yaml`
+- **server/** — FastAPI app from `openenv.yaml` `app:` (`server.app:app`)
+
+This script calls the deployed/local env over HTTP (`/reset`, `/step`), queries an LLM via the
+OpenAI-compatible HF router, then aggregates step rewards with the **same** grader functions
+used for OpenEnv validation (must match `openenv.yaml` task table).
 """
 
 from __future__ import annotations
@@ -11,6 +17,8 @@ from __future__ import annotations
 import argparse
 import json
 import os
+import re
+from pathlib import Path
 from typing import Any, Iterable
 
 import requests
@@ -23,11 +31,14 @@ except ImportError:
 from graders import dinner_disaster_grader, monday_morning_grader, phase2_core_grader
 from models import GhostexecAction
 
+REPO_ROOT = Path(__file__).resolve().parent
+OPENENV_SPEC = REPO_ROOT / "openenv.yaml"
 
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
 HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
-ENV_URL = os.getenv("ENV_URL", "http://localhost:7860").rstrip("/")
+# Default matches openenv.yaml `port: 8000` and `uv run server` / Spaces proxy.
+ENV_URL = os.getenv("ENV_URL", "http://127.0.0.1:8000").rstrip("/")
 TASK_OVERRIDE = os.getenv("TASK_NAME", "").strip()
 BENCHMARK = "ghostexec"
 
@@ -44,6 +55,77 @@ TASK_TO_GRADER = {
     "dinner_disaster": dinner_disaster_grader,
 }
 
+_GRADER_TO_SYMBOL = {
+    phase2_core_grader: "graders.phase2_core_grader",
+    monday_morning_grader: "graders.monday_morning_grader",
+    dinner_disaster_grader: "graders.dinner_disaster_grader",
+}
+
+
+def load_openenv_task_rows(spec_path: Path) -> list[dict[str, str]]:
+    """Parse task `id` + `grader` from openenv.yaml without requiring PyYAML."""
+    if not spec_path.is_file():
+        return []
+    rows: list[dict[str, str]] = []
+    cur: dict[str, str] | None = None
+    for raw in spec_path.read_text(encoding="utf-8").splitlines():
+        line = raw.rstrip()
+        m_id = re.match(r"^\s*-\s+id:\s*(\S+)\s*$", line)
+        if m_id:
+            if cur and cur.get("id"):
+                rows.append(cur)
+            cur = {"id": m_id.group(1).strip()}
+            continue
+        if cur is not None:
+            m_gr = re.match(r"^\s+grader:\s*(\S+)\s*$", line)
+            if m_gr:
+                cur["grader"] = m_gr.group(1).strip()
+    if cur and cur.get("id"):
+        rows.append(cur)
+    return rows
+
+
+def openenv_max_steps(spec_path: Path) -> int | None:
+    if not spec_path.is_file():
+        return None
+    m = re.search(r"(?m)^max_steps:\s*(\d+)\s*$", spec_path.read_text(encoding="utf-8"))
+    return int(m.group(1)) if m else None
+
+
+def verify_openenv_alignment(spec_path: Path = OPENENV_SPEC) -> list[str]:
+    """Return human-readable warnings if inference tables drift from openenv.yaml."""
+    warnings: list[str] = []
+    rows = load_openenv_task_rows(spec_path)
+    if not rows:
+        warnings.append(f"Could not read tasks from {spec_path} — skipping alignment check.")
+        return warnings
+
+    yaml_ids = [r["id"] for r in rows]
+    if tuple(yaml_ids) != TASK_SETS["all"]:
+        warnings.append(
+            f"openenv.yaml task order/ids {yaml_ids!r} != inference TASK_SETS['all'] {list(TASK_SETS['all'])!r}"
+        )
+
+    for row in rows:
+        tid = row["id"]
+        gref = row.get("grader", "")
+        fn = TASK_TO_GRADER.get(tid)
+        if fn is None:
+            warnings.append(f"openenv.yaml task {tid!r} has no TASK_TO_GRADER entry in inference.py")
+            continue
+        expected = _GRADER_TO_SYMBOL.get(fn)
+        if expected and gref and gref != expected:
+            warnings.append(
+                f"Task {tid!r}: openenv.yaml grader {gref!r} != inference mapping {expected!r}"
+            )
+
+    for tid in TASK_SETS["all"]:
+        if tid not in yaml_ids:
+            warnings.append(f"inference TASK_SETS includes {tid!r} but openenv.yaml has no such task id")
+
+    return warnings
+
+
 SYSTEM_MESSAGE = """
 You are acting as an AI Chief-of-Staff assistant in Ghostexec.
 
@@ -78,8 +160,12 @@ Rules:
 """.strip()
 
 
-def emit_start(task_name: str) -> None:
-    print(f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME}", flush=True)
+def emit_start(task_name: str, max_steps_hint: int | None) -> None:
+    ms = f" max_steps={max_steps_hint}" if max_steps_hint is not None else ""
+    print(
+        f"[START] task={task_name} env={BENCHMARK} model={MODEL_NAME} env_url={ENV_URL}{ms}",
+        flush=True,
+    )
 
 
 def emit_step(step_no: int, action_text: str, reward: float, done: bool, error: str | None) -> None:
@@ -202,13 +288,13 @@ def final_score(task_name: str, rewards: list[float]) -> float:
     return float(grader({"rewards": rewards}))
 
 
-def run_one_task(llm: Any, task_name: str) -> None:
+def run_one_task(llm: Any, task_name: str, *, max_steps_hint: int | None) -> None:
     rewards: list[float] = []
     steps_taken = 0
     score = 0.0
     success = False
 
-    emit_start(task_name)
+    emit_start(task_name, max_steps_hint)
 
     try:
         result = fetch_reset(task_name)
@@ -247,18 +333,57 @@ def run_one_task(llm: Any, task_name: str) -> None:
 
 
 def main() -> None:
-    parser = argparse.ArgumentParser(description="Run the Ghostexec baseline agent")
+    parser = argparse.ArgumentParser(
+        description="Run the Ghostexec baseline agent (HTTP env + HF OpenAI-compatible router)."
+    )
     parser.add_argument(
         "--difficulty",
         choices=["easy", "medium", "hard", "all"],
         default="all",
-        help="Which task subset to run",
+        help="Which task subset to run (mirrors openenv.yaml difficulties / tasks).",
+    )
+    parser.add_argument(
+        "--env-url",
+        default="",
+        help="Override Ghostexec HTTP base URL (else ENV_URL env or default 127.0.0.1:8000).",
+    )
+    parser.add_argument(
+        "--list-tasks",
+        action="store_true",
+        help="Print tasks parsed from openenv.yaml and exit.",
+    )
+    parser.add_argument(
+        "--check-alignment",
+        action="store_true",
+        help="Verify inference.py TASK_TO_GRADER matches openenv.yaml; print warnings and exit 1 if drift.",
     )
     args = parser.parse_args()
 
+    global ENV_URL
+    if args.env_url.strip():
+        ENV_URL = args.env_url.strip().rstrip("/")
+
+    if args.list_tasks:
+        for row in load_openenv_task_rows(OPENENV_SPEC):
+            print(row.get("id", ""), "->", row.get("grader", "?"))
+        return
+
+    drift = verify_openenv_alignment(OPENENV_SPEC)
+    for w in drift:
+        print(f"[openenv] {w}", flush=True)
+
+    if args.check_alignment:
+        hard = [x for x in drift if not x.startswith("Could not read")]
+        if hard:
+            for x in hard:
+                print(f"[ALIGNMENT ERROR] {x}", flush=True)
+            raise SystemExit(1)
+        return
+
+    max_steps_hint = openenv_max_steps(OPENENV_SPEC)
    llm = client()
     for task_name in choose_tasks(args.difficulty):
-        run_one_task(llm, task_name)
+        run_one_task(llm, task_name, max_steps_hint=max_steps_hint)
 
 
 if __name__ == "__main__":
notebooks/ghostexec_unsloth_grpo_hf_api.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/train_sft_then_grpo.py CHANGED
@@ -67,115 +67,6 @@ TRAINING_PRESETS: dict[str, dict[str, float | int | str]] = {
 }
 
 
-def _as_float(x: object | None) -> float | None:
-    if x is None:
-        return None
-    try:
-        return float(x)
-    except Exception:
-        return None
-
-
-class StabilityTripwire(TrainerCallback):
-    """Stop training when logs show sustained reward collapse + loss blow-up."""
-
-    def __init__(
-        self,
-        *,
-        min_step: int,
-        reward_key: str,
-        loss_key: str,
-        reward_drop: float,
-        loss_spike: float,
-        bad_streak: int,
-    ) -> None:
-        self.min_step = min_step
-        self.reward_key = reward_key
-        self.loss_key = loss_key
-        self.reward_drop = reward_drop
-        self.loss_spike = loss_spike
-        self.bad_streak = bad_streak
-        self._best_reward: float | None = None
-        self._best_loss: float | None = None
-        self._streak = 0
-
-    def on_log(self, args, state, control, logs=None, **kw):  # type: ignore[no-untyped-def]
-        logs = logs or {}
-        step = int(getattr(state, "global_step", 0) or 0)
-        if step < self.min_step:
-            return control
-
-        r = _as_float(logs.get(self.reward_key))
-        loss = _as_float(logs.get(self.loss_key))
-
-        reward_bad = False
-        loss_bad = False
-
-        if r is not None:
-            if self._best_reward is None or r > self._best_reward:
-                self._best_reward = r
-            elif self._best_reward is not None and self._best_reward - r >= self.reward_drop:
-                reward_bad = True
-
-        if loss is not None:
-            if self._best_loss is None or loss < self._best_loss:
-                self._best_loss = loss
-            elif self._best_loss is not None and loss - self._best_loss >= self.loss_spike:
-                loss_bad = True
-
-        bad = reward_bad and loss_bad and r is not None and loss is not None
-
-        if bad:
-            self._streak += 1
-        else:
-            self._streak = 0
-
-        if self._streak >= self.bad_streak:
-            print(
-                f"[STABILITY] stopping: sustained instability "
-                f"(best {self.reward_key}={self._best_reward}, best loss={self._best_loss}, streak={self._streak})."
-            )
-            control.should_training_stop = True
-        return control
-
-
-class LossSpikeTripwire(TrainerCallback):
-    """SFT guardrail: stop if loss repeatedly blows up vs the best-so-far."""
-
-    def __init__(self, *, min_step: int, loss_key: str, loss_spike: float, bad_streak: int) -> None:
-        self.min_step = min_step
-        self.loss_key = loss_key
-        self.loss_spike = loss_spike
-        self.bad_streak = bad_streak
-        self._best_loss: float | None = None
-        self._streak = 0
-
-    def on_log(self, args, state, control, logs=None, **kw):  # type: ignore[no-untyped-def]
-        logs = logs or {}
-        step = int(getattr(state, "global_step", 0) or 0)
-        if step < self.min_step:
-            return control
-
-        loss = _as_float(logs.get(self.loss_key))
-        if loss is None:
-            return control
-
-        if self._best_loss is None or loss < self._best_loss:
-            self._best_loss = loss
-            self._streak = 0
-            return control
-
-        if self._best_loss is not None and loss - self._best_loss >= self.loss_spike:
-            self._streak += 1
-        else:
-            self._streak = 0
-
-        if self._streak >= self.bad_streak:
-            print(f"[STABILITY] stopping SFT: repeated loss spikes (best={self._best_loss}, streak={self._streak}).")
-            control.should_training_stop = True
-        return control
-
-
 def _extract_briefing(reset_payload: dict[str, Any]) -> str:
     obs = reset_payload.get("observation", reset_payload)
     if isinstance(obs, dict):
@@ -251,7 +142,6 @@ def run_sft_then_grpo(
     grpo_grad_accum: int,
     grpo_beta: float,
     reward_ema_decay: float,
-    stability_tripwire: bool,
 ) -> None:
     try:
         from datasets import load_dataset
@@ -315,16 +205,6 @@
         dataset_text_field="prompt",
         formatting_func=lambda ex: [f"{p}\n\n{c}" for p, c in zip(ex["prompt"], ex["completion"])],
     )
-    if stability_tripwire:
-        sft_trainer.add_callback(
-            LossSpikeTripwire(
-                min_step=max(10, max_sft_steps // 6),
-                loss_key="loss",
-                loss_spike=0.85,
-                bad_streak=4,
-            )
-        )
-
    sft_before = _trainable_lora_sum_abs(policy)
     sft_trainer.train()
     sft_after = _trainable_lora_sum_abs(sft_trainer.model)
@@ -482,25 +362,13 @@ def run_sft_then_grpo(
         adam_beta2=0.95,
         report_to=[],
     )
-    grpo_callbacks = [_ProgressCallback()]
-    if stability_tripwire:
-        grpo_callbacks.append(
-            StabilityTripwire(
-                min_step=max(15, max_grpo_steps // 8),
-                reward_key="rewards/env_reward/mean",
-                loss_key="loss",
-                reward_drop=0.12,
-                loss_spike=0.35,
-                bad_streak=3,
-            )
-        )
     grpo_trainer = GRPOTrainer(
         model=sft_trainer.model,
         processing_class=tokenizer,
         reward_funcs=[env_reward, format_reward, semantic_action_reward, anti_idle_reward],
         train_dataset=ds,
         args=grpo_cfg,
-        callbacks=grpo_callbacks,
+        callbacks=[_ProgressCallback()],
     )
     grpo_before = _trainable_lora_sum_abs(sft_trainer.model)
     grpo_trainer.train()
@@ -556,11 +424,6 @@ def main() -> None:
         default=0.60,
         help="Fraction of GRPO steps used to ramp from easy scaffold to full env weighting.",
     )
-    parser.add_argument(
-        "--no-stability-tripwire",
-        action="store_true",
-        help="Disable oscillation/collapse early-stop guardrails (not recommended).",
-    )
     parser.add_argument(
         "--reward-ema-decay",
         type=float,
@@ -599,7 +462,6 @@ def main() -> None:
     sft_samples = args.sft_samples
     if args.reward_ema_decay >= 0.0:
         reward_ema_decay = float(args.reward_ema_decay)
-    stability_tripwire = not args.no_stability_tripwire
     print(f"Model preset: {args.model_preset} -> {model_name}")
     print(
         "Training preset:"
@@ -633,7 +495,6 @@ def main() -> None:
         grpo_grad_accum=grpo_grad_accum,
         grpo_beta=grpo_beta,
         reward_ema_decay=reward_ema_decay,
-        stability_tripwire=stability_tripwire,
     )
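Since this commit removes the tripwire callbacks entirely, training now always runs to `max_*_steps`. If you ever want the old guardrail behavior back without reverting, a trimmed-down version of the deleted `LossSpikeTripwire` can be re-attached externally via `add_callback`. A sketch only, reusing the old default thresholds from the removed code; the `MiniLossTripwire` name is made up here:

```python
from transformers import TrainerCallback


class MiniLossTripwire(TrainerCallback):
    """Cut-down re-creation of the removed LossSpikeTripwire: stop on repeated loss spikes."""

    def __init__(self, min_step: int = 10, loss_spike: float = 0.85, bad_streak: int = 4) -> None:
        self.min_step = min_step
        self.loss_spike = loss_spike
        self.bad_streak = bad_streak
        self._best: float | None = None
        self._streak = 0

    def on_log(self, args, state, control, logs=None, **kwargs):
        loss = (logs or {}).get("loss")
        if state.global_step < self.min_step or not isinstance(loss, (int, float)):
            return control
        if self._best is None or loss < self._best:
            self._best, self._streak = float(loss), 0  # new best-so-far resets the streak
        elif loss - self._best >= self.loss_spike:
            self._streak += 1  # loss blew up vs best-so-far
        else:
            self._streak = 0
        if self._streak >= self.bad_streak:
            control.should_training_stop = True
        return control


# Opt-in usage (the script itself no longer installs any early-stop callback):
# sft_trainer.add_callback(MiniLossTripwire())
```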