CreativeEngineer committed on
Commit
ebd0ff3
·
1 Parent(s): 729c711

feat: add llm rollout contract and simplify ppo smoke

Browse files
fusion_lab/llm_agent.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from dataclasses import asdict, dataclass
6
+ from typing import Final, Sequence
7
+
8
+ from fusion_lab.models import (
9
+ DirectionName,
10
+ MagnitudeName,
11
+ ParameterName,
12
+ StellaratorAction,
13
+ StellaratorObservation,
14
+ )
15
+ from server.environment import BUDGET, StellaratorEnvironment
16
+
17
# Closed vocabularies for the "run" action fields; parsed LLM plans are
# validated against these tuples.
RUN_PARAMETERS: Final[tuple[ParameterName, ...]] = (
    "aspect_ratio",
    "elongation",
    "rotational_transform",
    "triangularity_scale",
)
RUN_DIRECTIONS: Final[tuple[DirectionName, ...]] = ("increase", "decrease")
RUN_MAGNITUDES: Final[tuple[MagnitudeName, ...]] = ("small", "medium", "large")

# System turn shown verbatim to the LLM. Keep the action schema described here
# in sync with what _parse_action_item accepts.
SYSTEM_PROMPT: Final[str] = """You are an expert stellarator designer.

Goal:
- satisfy the P1 physics constraints
- then improve the design score by lowering max elongation

You control a 4-knob low-dimensional design:
- aspect_ratio
- elongation
- rotational_transform
- triangularity_scale

Action rules:
- output a JSON array
- each item must be either:
- {"intent":"run","parameter":"<parameter>","direction":"increase|decrease","magnitude":"small|medium|large"}
- {"intent":"restore_best"}
- {"intent":"submit"}
- keep the plan short and within the remaining budget
- use "submit" only when the design looks ready

Constraint directions:
- aspect_ratio <= 4.0
- average_triangularity <= -0.5
- edge_iota_over_nfp >= 0.3"""

# Greedy bracket match: grabs from the first '[' to the last ']' so a JSON
# action array can be lifted out of surrounding completion text.
ACTION_ARRAY_PATTERN: Final[re.Pattern[str]] = re.compile(r"\[[\s\S]*\]")
53
+
54
+
55
@dataclass(frozen=True)
class LLMStepTrace:
    """Immutable record of one environment step taken during an LLM rollout replay."""

    # 1-based index of the step within the episode.
    step: int
    # Human-readable action description (see action_label()).
    action_label: str
    reward: float
    p1_score: float
    p1_feasibility: float
    constraints_satisfied: bool
    # Fidelity tier the evaluator used for this step.
    evaluation_fidelity: str
    evaluation_failed: bool
    budget_remaining: int
    diagnostics_text: str
67
+
68
+
69
+ @dataclass(frozen=True)
70
+ class LLMEpisodeTrace:
71
+ seed: int
72
+ total_reward: float
73
+ final_score: float
74
+ final_feasibility: float
75
+ constraints_satisfied: bool
76
+ evaluation_failed: bool
77
+ steps: list[LLMStepTrace]
78
+
79
+ def asdict(self) -> dict[str, object]:
80
+ return asdict(self)
81
+
82
+
83
def action_label(action: StellaratorAction) -> str:
    """Return a compact human-readable label for *action*.

    Non-"run" intents label as the intent itself; "run" actions include the
    parameter, direction, and magnitude.
    """
    intent = action.intent
    if intent == "run":
        return f"{intent} {action.parameter} {action.direction} {action.magnitude}"
    return intent
87
+
88
+
89
def format_observation(observation: StellaratorObservation) -> str:
    """Render *observation* as the bullet list placed in the LLM user turn."""
    bullet_lines = [
        "Current stellarator state:",
        f"- max_elongation: {observation.max_elongation:.4f}",
        f"- aspect_ratio: {observation.aspect_ratio:.4f} (must stay <= 4.0)",
        f"- average_triangularity: {observation.average_triangularity:.6f} (must stay <= -0.5)",
        f"- edge_iota_over_nfp: {observation.edge_iota_over_nfp:.4f} (must stay >= 0.3)",
        f"- p1_score: {observation.p1_score:.4f}",
        f"- p1_feasibility: {observation.p1_feasibility:.6f}",
        f"- constraints_satisfied: {observation.constraints_satisfied}",
        f"- evaluation_fidelity: {observation.evaluation_fidelity}",
        f"- evaluation_failed: {observation.evaluation_failed}",
        f"- budget_remaining: {observation.budget_remaining}",
        f"- best_low_fidelity_score: {observation.best_low_fidelity_score:.4f}",
        f"- best_low_fidelity_feasibility: {observation.best_low_fidelity_feasibility:.6f}",
        f"- diagnostics: {observation.diagnostics_text}",
    ]
    # The trailing newline preserves the terminator the LLM prompt expects.
    return "\n".join(bullet_lines) + "\n"
108
+
109
+
110
def build_prompt(observation: StellaratorObservation) -> str:
    """Assemble a ChatML-style prompt: system turn, user turn, open assistant turn."""
    system_turn = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
    user_turn = f"<|im_start|>user\n{format_observation(observation)}<|im_end|>\n"
    return system_turn + user_turn + "<|im_start|>assistant\n"
116
+
117
+
118
def extract_json_plan(text: str) -> str | None:
    """Return the bracketed JSON-array span inside *text*, or None when absent."""
    match = ACTION_ARRAY_PATTERN.search(text)
    return None if match is None else match.group()
123
+
124
+
125
def _parse_action_item(item: object) -> StellaratorAction | None:
    """Validate one decoded plan entry and build its action.

    Returns None for anything malformed: non-dict entries, unknown intents, or
    "run" entries whose fields fall outside the allowed vocabularies.
    """
    if not isinstance(item, dict):
        return None

    intent = item.get("intent")
    if intent in ("submit", "restore_best"):
        return StellaratorAction(intent=intent)
    if intent != "run":
        return None

    parameter = item.get("parameter")
    direction = item.get("direction")
    # An omitted magnitude defaults to the smallest step.
    magnitude = item.get("magnitude", "small")
    valid = (
        parameter in RUN_PARAMETERS
        and direction in RUN_DIRECTIONS
        and magnitude in RUN_MAGNITUDES
    )
    if not valid:
        return None

    return StellaratorAction(
        intent="run",
        parameter=parameter,
        direction=direction,
        magnitude=magnitude,
    )
153
+
154
+
155
def parse_action_plan(text: str) -> list[StellaratorAction]:
    """Parse an LLM completion into a validated list of actions.

    Missing/invalid JSON or a non-list payload yields an empty plan. Malformed
    items are skipped, and the plan is truncated after the first "submit".
    """
    raw_plan = extract_json_plan(text)
    if raw_plan is None:
        return []
    try:
        decoded = json.loads(raw_plan)
    except json.JSONDecodeError:
        return []
    if not isinstance(decoded, list):
        return []

    actions: list[StellaratorAction] = []
    for entry in decoded:
        candidate = _parse_action_item(entry)
        if candidate is None:
            continue
        actions.append(candidate)
        # Nothing after a submit can execute, so stop parsing there.
        if candidate.intent == "submit":
            break
    return actions
175
+
176
+
177
def run_episode_with_actions(
    actions: Sequence[StellaratorAction],
    *,
    seed_idx: int,
) -> LLMEpisodeTrace:
    """Replay *actions* in a fresh environment and return the recorded trace.

    At most BUDGET actions are applied, and the loop stops early as soon as the
    environment reports the episode as done.
    """
    env = StellaratorEnvironment()
    obs = env.reset(seed=seed_idx)
    steps: list[LLMStepTrace] = []
    cumulative_reward = 0.0

    for step_number, action in enumerate(actions[:BUDGET], start=1):
        obs = env.step(action)
        # A missing reward (None) counts as zero.
        step_reward = float(obs.reward or 0.0)
        cumulative_reward += step_reward
        steps.append(
            LLMStepTrace(
                step=step_number,
                action_label=action_label(action),
                reward=step_reward,
                p1_score=obs.p1_score,
                p1_feasibility=obs.p1_feasibility,
                constraints_satisfied=obs.constraints_satisfied,
                evaluation_fidelity=obs.evaluation_fidelity,
                evaluation_failed=obs.evaluation_failed,
                budget_remaining=obs.budget_remaining,
                diagnostics_text=obs.diagnostics_text,
            )
        )
        if obs.done:
            break

    return LLMEpisodeTrace(
        seed=seed_idx,
        total_reward=round(cumulative_reward, 4),
        final_score=obs.p1_score,
        final_feasibility=obs.p1_feasibility,
        constraints_satisfied=obs.constraints_satisfied,
        evaluation_failed=obs.evaluation_failed,
        steps=steps,
    )
training/README.md CHANGED
@@ -19,3 +19,17 @@ Training policy:
19
 
20
  - install the training dependencies: `uv sync --extra training`
21
  - tiny low-fi PPO smoke run: `uv run --extra training python training/ppo_smoke.py`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  - install the training dependencies: `uv sync --extra training`
21
  - tiny low-fi PPO smoke run: `uv run --extra training python training/ppo_smoke.py`
22
+ - generate an LLM-ready prompt payload: `uv run python training/llm_rollout.py prompt --seed 0`
23
+ - replay an LLM completion or action plan: `uv run python training/llm_rollout.py replay --seed 0 --completion-file <path>`
24
+
25
+ ## Shared LLM Contract
26
+
27
+ The prompt/action/replay contract for LLM training lives in:
28
+
29
+ - `fusion_lab/llm_agent.py`
30
+
31
+ Use that module as the source of truth for:
32
+
33
+ - prompt formatting
34
+ - action-plan parsing
35
+ - local rollout replay
training/llm_rollout.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from datetime import UTC, datetime
6
+ from pathlib import Path
7
+ from typing import Final
8
+
9
+ from fusion_lab.llm_agent import (
10
+ build_prompt,
11
+ parse_action_plan,
12
+ run_episode_with_actions,
13
+ )
14
+ from fusion_lab.models import StellaratorAction
15
+ from server.environment import StellaratorEnvironment
16
+
17
# Default destination for rollout artifacts written by the "replay" subcommand.
DEFAULT_OUTPUT_DIR: Final[Path] = Path("training/artifacts/llm_rollout")
18
+
19
+
20
def parse_args() -> argparse.Namespace:
    """Build and evaluate the CLI: a "prompt" and a "replay" subcommand."""
    parser = argparse.ArgumentParser(
        description=(
            "Generate an LLM-ready prompt or replay an LLM completion against the live "
            "Fusion Design Lab environment."
        )
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # "prompt": emit a reset-state prompt payload.
    prompt_cmd = commands.add_parser("prompt", help="Print or save an LLM prompt.")
    prompt_cmd.add_argument("--seed", type=int, default=0, help="Reset seed index.")
    prompt_cmd.add_argument(
        "--output-file",
        type=Path,
        default=None,
        help="Optional JSON file path for the prompt payload.",
    )

    # "replay": run a completion or explicit plan through the environment.
    replay_cmd = commands.add_parser(
        "replay",
        help="Replay a completion or action-plan file and save a rollout artifact.",
    )
    replay_cmd.add_argument("--seed", type=int, default=0, help="Reset seed index.")
    replay_cmd.add_argument(
        "--completion-file",
        type=Path,
        default=None,
        help="Path to a raw LLM completion containing a JSON action array.",
    )
    replay_cmd.add_argument(
        "--action-plan-file",
        type=Path,
        default=None,
        help="Path to a JSON array of actions.",
    )
    replay_cmd.add_argument(
        "--output-dir",
        type=Path,
        default=DEFAULT_OUTPUT_DIR,
        help="Directory for rollout artifacts.",
    )
    return parser.parse_args()
62
+
63
+
64
def prompt_payload(seed: int) -> dict[str, object]:
    """Reset a fresh environment at *seed* and package the initial prompt payload."""
    env = StellaratorEnvironment()
    obs = env.reset(seed=seed)
    payload: dict[str, object] = {
        "created_at_utc": datetime.now(UTC).isoformat(),
        "seed": seed,
        "prompt": build_prompt(obs),
        "target_spec": obs.target_spec,
        "budget_remaining": obs.budget_remaining,
        "diagnostics_text": obs.diagnostics_text,
    }
    return payload
75
+
76
+
77
def parse_actions(args: argparse.Namespace) -> tuple[str, list[StellaratorAction]]:
    """Load the plan/completion text and parse it into actions.

    An explicit --action-plan-file takes precedence over --completion-file;
    raises ValueError when neither was provided.
    """
    if args.action_plan_file is not None:
        source_path = args.action_plan_file
    elif args.completion_file is not None:
        source_path = args.completion_file
    else:
        raise ValueError("replay requires --completion-file or --action-plan-file")

    return str(source_path), parse_action_plan(source_path.read_text())
88
+
89
+
90
def write_json(path: Path, payload: dict[str, object]) -> None:
    """Write *payload* as pretty-printed, key-sorted JSON at *path*.

    Parent directories are created as needed; the file ends with a newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(serialized + "\n")
93
+
94
+
95
def run_prompt(args: argparse.Namespace) -> None:
    """Handle the "prompt" subcommand: emit the payload to a file or stdout."""
    payload = prompt_payload(args.seed)
    if args.output_file is None:
        print(json.dumps(payload, indent=2))
        return
    write_json(args.output_file, payload)
    # Echo the destination so callers can pick up the artifact path.
    print(args.output_file)
102
+
103
+
104
def run_replay(args: argparse.Namespace) -> None:
    """Handle the "replay" subcommand: replay the plan and persist the rollout artifact."""
    source, actions = parse_actions(args)
    trace = run_episode_with_actions(actions, seed_idx=args.seed)
    timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
    output_path = args.output_dir / f"llm_rollout_{timestamp}.json"
    write_json(
        output_path,
        {
            "created_at_utc": datetime.now(UTC).isoformat(),
            "seed": args.seed,
            "source": source,
            "parsed_action_count": len(actions),
            # exclude_none keeps run-only fields out of submit/restore entries.
            "actions": [action.model_dump(exclude_none=True) for action in actions],
            "trace": trace.asdict(),
        },
    )
    # Echo the artifact path so callers can pick it up.
    print(output_path)
119
+
120
+
121
def main() -> None:
    """CLI entry point: dispatch to the selected subcommand."""
    args = parse_args()
    handler = run_prompt if args.command == "prompt" else run_replay
    handler(args)


if __name__ == "__main__":
    main()
training/ppo_smoke.py CHANGED
@@ -17,37 +17,16 @@ from server.contract import RESET_SEEDS
17
  from server.environment import BUDGET, StellaratorEnvironment
18
 
19
  DEFAULT_OUTPUT_DIR: Final[Path] = Path("training/artifacts/ppo_smoke")
20
- DEFAULT_TOTAL_TIMESTEPS: Final[int] = 128
21
  DEFAULT_EVAL_EPISODES: Final[int] = 3
 
22
 
23
- RUN_ACTION_SPECS: Final[tuple[tuple[str, str, str], ...]] = (
24
- ("aspect_ratio", "increase", "small"),
25
- ("aspect_ratio", "increase", "medium"),
26
- ("aspect_ratio", "increase", "large"),
27
- ("aspect_ratio", "decrease", "small"),
28
- ("aspect_ratio", "decrease", "medium"),
29
- ("aspect_ratio", "decrease", "large"),
30
- ("elongation", "increase", "small"),
31
- ("elongation", "increase", "medium"),
32
- ("elongation", "increase", "large"),
33
- ("elongation", "decrease", "small"),
34
- ("elongation", "decrease", "medium"),
35
- ("elongation", "decrease", "large"),
36
- ("rotational_transform", "increase", "small"),
37
  ("rotational_transform", "increase", "medium"),
38
- ("rotational_transform", "increase", "large"),
39
- ("rotational_transform", "decrease", "small"),
40
- ("rotational_transform", "decrease", "medium"),
41
- ("rotational_transform", "decrease", "large"),
42
- ("triangularity_scale", "increase", "small"),
43
  ("triangularity_scale", "increase", "medium"),
44
- ("triangularity_scale", "increase", "large"),
45
- ("triangularity_scale", "decrease", "small"),
46
- ("triangularity_scale", "decrease", "medium"),
47
- ("triangularity_scale", "decrease", "large"),
48
  )
49
- LOW_FI_ACTION_COUNT: Final[int] = len(RUN_ACTION_SPECS) + 1
50
- LOW_FI_RESTORE_ACTION_INDEX: Final[int] = len(RUN_ACTION_SPECS)
51
 
52
 
53
  @dataclass(frozen=True)
@@ -61,6 +40,7 @@ class TraceStep:
61
  constraints_satisfied: bool
62
  evaluation_failed: bool
63
  budget_remaining: int
 
64
  max_elongation: float
65
  average_triangularity: float
66
  edge_iota_over_nfp: float
@@ -75,9 +55,25 @@ class EpisodeTrace:
75
  final_feasibility: float
76
  constraints_satisfied: bool
77
  evaluation_failed: bool
 
78
  steps: list[TraceStep]
79
 
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  class LowFiSmokeEnv(gym.Env[np.ndarray, int]):
82
  metadata = {"render_modes": []}
83
 
@@ -89,7 +85,8 @@ class LowFiSmokeEnv(gym.Env[np.ndarray, int]):
89
  self.observation_space = spaces.Box(
90
  low=-np.inf,
91
  high=np.inf,
92
- shape=(12,),
 
93
  dtype=np.float32,
94
  )
95
  self.action_space = spaces.Discrete(LOW_FI_ACTION_COUNT)
@@ -109,7 +106,9 @@ class LowFiSmokeEnv(gym.Env[np.ndarray, int]):
109
  if seed is not None:
110
  self._episode_index = 0
111
  return seed % len(RESET_SEEDS)
112
- next_seed = self._episode_index % len(RESET_SEEDS)
 
 
113
  self._episode_index += 1
114
  return next_seed
115
 
@@ -120,30 +119,20 @@ class LowFiSmokeEnv(gym.Env[np.ndarray, int]):
120
  obs = self._env.step(self._decode_action(action))
121
  return (
122
  self._encode_observation(obs),
123
- float(obs.reward or 0.0),
124
  bool(obs.done),
125
  False,
126
  self._info(obs),
127
  )
128
 
129
  def _decode_action(self, action: int) -> StellaratorAction:
130
- if action == LOW_FI_RESTORE_ACTION_INDEX:
131
- return StellaratorAction(intent="restore_best")
132
- parameter, direction, magnitude = RUN_ACTION_SPECS[action]
133
- return StellaratorAction(
134
- intent="run",
135
- parameter=parameter,
136
- direction=direction,
137
- magnitude=magnitude,
138
- )
139
 
140
  def action_label(self, action: int) -> str:
141
- if action == LOW_FI_RESTORE_ACTION_INDEX:
142
- return "restore_best"
143
- parameter, direction, magnitude = RUN_ACTION_SPECS[action]
144
- return f"{parameter} {direction} {magnitude}"
145
 
146
  def _encode_observation(self, obs: StellaratorObservation) -> np.ndarray:
 
147
  budget_fraction = obs.budget_remaining / BUDGET
148
  step_fraction = obs.step_number / BUDGET
149
  return np.array(
@@ -155,11 +144,16 @@ class LowFiSmokeEnv(gym.Env[np.ndarray, int]):
155
  obs.p1_score,
156
  obs.p1_feasibility,
157
  obs.vacuum_well,
 
 
 
 
158
  budget_fraction,
159
  step_fraction,
160
  obs.best_low_fidelity_score,
161
  obs.best_low_fidelity_feasibility,
162
- float(obs.constraints_satisfied) - float(obs.evaluation_failed),
 
163
  ],
164
  dtype=np.float32,
165
  )
@@ -172,8 +166,22 @@ class LowFiSmokeEnv(gym.Env[np.ndarray, int]):
172
  "evaluation_failed": obs.evaluation_failed,
173
  "p1_score": obs.p1_score,
174
  "p1_feasibility": obs.p1_feasibility,
 
 
 
 
 
175
  }
176
 
 
 
 
 
 
 
 
 
 
177
 
178
  def parse_args() -> argparse.Namespace:
179
  parser = argparse.ArgumentParser(
@@ -216,26 +224,30 @@ def build_model(env: LowFiSmokeEnv, seed: int) -> PPO:
216
  seed=seed,
217
  verbose=0,
218
  device="cpu",
219
- n_steps=32,
220
- batch_size=32,
221
- n_epochs=4,
222
- gamma=0.98,
223
  learning_rate=3e-4,
224
  ent_coef=0.01,
225
  )
226
 
227
 
228
- def evaluate_policy(model: PPO, *, eval_episodes: int, base_seed: int) -> list[EpisodeTrace]:
 
 
229
  traces: list[EpisodeTrace] = []
 
 
230
  for episode in range(eval_episodes):
231
- env = LowFiSmokeEnv()
232
  seed = base_seed + episode
233
- obs, _ = env.reset(seed=seed)
 
234
  done = False
235
  total_reward = 0.0
236
  steps: list[TraceStep] = []
237
  step_index = 0
238
- final_info: dict[str, object] = {}
239
 
240
  while not done:
241
  action, _ = model.predict(obs, deterministic=True)
@@ -256,9 +268,10 @@ def evaluate_policy(model: PPO, *, eval_episodes: int, base_seed: int) -> list[E
256
  constraints_satisfied=bool(info["constraints_satisfied"]),
257
  evaluation_failed=bool(info["evaluation_failed"]),
258
  budget_remaining=int(info["budget_remaining"]),
259
- max_elongation=float(obs[0]),
260
- average_triangularity=float(obs[2]),
261
- edge_iota_over_nfp=float(obs[3]),
 
262
  )
263
  )
264
 
@@ -271,10 +284,11 @@ def evaluate_policy(model: PPO, *, eval_episodes: int, base_seed: int) -> list[E
271
  final_feasibility=float(final_info["p1_feasibility"]),
272
  constraints_satisfied=bool(final_info["constraints_satisfied"]),
273
  evaluation_failed=bool(final_info["evaluation_failed"]),
 
274
  steps=steps,
275
  )
276
  )
277
- return traces
278
 
279
 
280
  def artifact_payload(
@@ -282,6 +296,7 @@ def artifact_payload(
282
  total_timesteps: int,
283
  eval_episodes: int,
284
  seed: int,
 
285
  traces: list[EpisodeTrace],
286
  ) -> dict[str, object]:
287
  mean_reward = sum(trace.total_reward for trace in traces) / max(len(traces), 1)
@@ -292,12 +307,16 @@ def artifact_payload(
292
  "total_timesteps": total_timesteps,
293
  "eval_episodes": eval_episodes,
294
  "seed": seed,
295
- "train_reset_seed_indices": list(range(len(RESET_SEEDS))),
 
296
  "action_space_size": LOW_FI_ACTION_COUNT,
 
 
 
297
  "notes": (
298
- "Diagnostic-only PPO smoke run. Submit is intentionally excluded here so the "
299
- "smoke loop stays low-fidelity and fast. Training resets cycle through the "
300
- "frozen low-fidelity reset seeds to surface positive repair signal sooner."
301
  ),
302
  "summary": {
303
  "mean_eval_reward": round(mean_reward, 4),
@@ -320,7 +339,7 @@ def main() -> None:
320
  env = LowFiSmokeEnv()
321
  model = build_model(env, seed=args.seed)
322
  model.learn(total_timesteps=args.total_timesteps, progress_bar=False)
323
- traces = evaluate_policy(
324
  model,
325
  eval_episodes=args.eval_episodes,
326
  base_seed=args.seed,
@@ -329,10 +348,14 @@ def main() -> None:
329
  total_timesteps=args.total_timesteps,
330
  eval_episodes=args.eval_episodes,
331
  seed=args.seed,
 
332
  traces=traces,
333
  )
334
  output_path = write_artifact(args.output_dir, payload)
 
335
  print(output_path)
 
 
336
 
337
 
338
  if __name__ == "__main__":
 
17
  from server.environment import BUDGET, StellaratorEnvironment
18
 
19
  DEFAULT_OUTPUT_DIR: Final[Path] = Path("training/artifacts/ppo_smoke")
20
+ DEFAULT_TOTAL_TIMESTEPS: Final[int] = 32
21
  DEFAULT_EVAL_EPISODES: Final[int] = 3
22
+ ENCODED_OBSERVATION_DIM: Final[int] = 17
23
 
24
+ DIAGNOSTIC_RUN_ACTION_SPECS: Final[tuple[tuple[str, str, str], ...]] = (
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  ("rotational_transform", "increase", "medium"),
 
 
 
 
 
26
  ("triangularity_scale", "increase", "medium"),
 
 
 
 
27
  )
28
+ TRAIN_RESET_SEED_INDICES: Final[tuple[int, ...]] = (2,)
29
+ LOW_FI_ACTION_COUNT: Final[int] = len(DIAGNOSTIC_RUN_ACTION_SPECS)
30
 
31
 
32
  @dataclass(frozen=True)
 
40
  constraints_satisfied: bool
41
  evaluation_failed: bool
42
  budget_remaining: int
43
+ termination_reason: str
44
  max_elongation: float
45
  average_triangularity: float
46
  edge_iota_over_nfp: float
 
55
  final_feasibility: float
56
  constraints_satisfied: bool
57
  evaluation_failed: bool
58
+ termination_reason: str
59
  steps: list[TraceStep]
60
 
61
 
62
def diagnostic_action(action_index: int) -> StellaratorAction:
    """Map a discrete action index onto its fixed low-fidelity "run" action."""
    spec = DIAGNOSTIC_RUN_ACTION_SPECS[action_index]
    parameter, direction, magnitude = spec
    return StellaratorAction(
        intent="run",
        parameter=parameter,
        direction=direction,
        magnitude=magnitude,
    )
70
+
71
+
72
def diagnostic_action_label(action_index: int) -> str:
    """Human-readable label for the diagnostic action at *action_index*."""
    act = diagnostic_action(action_index)
    return f"{act.parameter} {act.direction} {act.magnitude}"
75
+
76
+
77
  class LowFiSmokeEnv(gym.Env[np.ndarray, int]):
78
  metadata = {"render_modes": []}
79
 
 
85
  self.observation_space = spaces.Box(
86
  low=-np.inf,
87
  high=np.inf,
88
+ # Keep this aligned with _encode_observation feature count.
89
+ shape=(ENCODED_OBSERVATION_DIM,),
90
  dtype=np.float32,
91
  )
92
  self.action_space = spaces.Discrete(LOW_FI_ACTION_COUNT)
 
106
  if seed is not None:
107
  self._episode_index = 0
108
  return seed % len(RESET_SEEDS)
109
+ if not TRAIN_RESET_SEED_INDICES:
110
+ raise ValueError("TRAIN_RESET_SEED_INDICES must define at least one seed index.")
111
+ next_seed = TRAIN_RESET_SEED_INDICES[self._episode_index % len(TRAIN_RESET_SEED_INDICES)]
112
  self._episode_index += 1
113
  return next_seed
114
 
 
119
  obs = self._env.step(self._decode_action(action))
120
  return (
121
  self._encode_observation(obs),
122
+ float(obs.reward if obs.reward is not None else 0.0),
123
  bool(obs.done),
124
  False,
125
  self._info(obs),
126
  )
127
 
128
  def _decode_action(self, action: int) -> StellaratorAction:
129
+ return diagnostic_action(action)
 
 
 
 
 
 
 
 
130
 
131
  def action_label(self, action: int) -> str:
132
+ return diagnostic_action_label(action)
 
 
 
133
 
134
  def _encode_observation(self, obs: StellaratorObservation) -> np.ndarray:
135
+ params = self._env.state.current_params
136
  budget_fraction = obs.budget_remaining / BUDGET
137
  step_fraction = obs.step_number / BUDGET
138
  return np.array(
 
144
  obs.p1_score,
145
  obs.p1_feasibility,
146
  obs.vacuum_well,
147
+ params.aspect_ratio,
148
+ params.elongation,
149
+ params.rotational_transform,
150
+ params.triangularity_scale,
151
  budget_fraction,
152
  step_fraction,
153
  obs.best_low_fidelity_score,
154
  obs.best_low_fidelity_feasibility,
155
+ float(obs.constraints_satisfied),
156
+ float(obs.evaluation_failed),
157
  ],
158
  dtype=np.float32,
159
  )
 
166
  "evaluation_failed": obs.evaluation_failed,
167
  "p1_score": obs.p1_score,
168
  "p1_feasibility": obs.p1_feasibility,
169
+ "max_elongation": obs.max_elongation,
170
+ "average_triangularity": obs.average_triangularity,
171
+ "edge_iota_over_nfp": obs.edge_iota_over_nfp,
172
+ "termination_reason": self._termination_reason(obs),
173
+ "current_seed": self._seed,
174
  }
175
 
176
def _termination_reason(self, obs: StellaratorObservation) -> str:
    """Classify how (or whether) the episode ended, checked in priority order.

    Evaluation failure dominates, then constraint satisfaction, then budget
    exhaustion; otherwise the episode is still running.
    """
    if obs.evaluation_failed:
        return "evaluation_failed"
    if obs.constraints_satisfied:
        return "constraints_satisfied"
    return "budget_exhausted" if obs.done else "in_progress"
184
+
185
 
186
  def parse_args() -> argparse.Namespace:
187
  parser = argparse.ArgumentParser(
 
224
  seed=seed,
225
  verbose=0,
226
  device="cpu",
227
+ n_steps=16,
228
+ batch_size=16,
229
+ n_epochs=8,
230
+ gamma=0.995,
231
  learning_rate=3e-4,
232
  ent_coef=0.01,
233
  )
234
 
235
 
236
+ def evaluate_policy(
237
+ model: PPO, *, eval_episodes: int, base_seed: int
238
+ ) -> tuple[list[EpisodeTrace], list[int]]:
239
  traces: list[EpisodeTrace] = []
240
+ eval_reset_seed_indices: list[int] = []
241
+ env = LowFiSmokeEnv()
242
  for episode in range(eval_episodes):
 
243
  seed = base_seed + episode
244
+ eval_reset_seed_indices.append(seed % len(RESET_SEEDS))
245
+ obs, info = env.reset(seed=seed)
246
  done = False
247
  total_reward = 0.0
248
  steps: list[TraceStep] = []
249
  step_index = 0
250
+ final_info = dict[str, object](info)
251
 
252
  while not done:
253
  action, _ = model.predict(obs, deterministic=True)
 
268
  constraints_satisfied=bool(info["constraints_satisfied"]),
269
  evaluation_failed=bool(info["evaluation_failed"]),
270
  budget_remaining=int(info["budget_remaining"]),
271
+ termination_reason=str(info["termination_reason"]),
272
+ max_elongation=float(info["max_elongation"]),
273
+ average_triangularity=float(info["average_triangularity"]),
274
+ edge_iota_over_nfp=float(info["edge_iota_over_nfp"]),
275
  )
276
  )
277
 
 
284
  final_feasibility=float(final_info["p1_feasibility"]),
285
  constraints_satisfied=bool(final_info["constraints_satisfied"]),
286
  evaluation_failed=bool(final_info["evaluation_failed"]),
287
+ termination_reason=str(final_info["termination_reason"]),
288
  steps=steps,
289
  )
290
  )
291
+ return traces, eval_reset_seed_indices
292
 
293
 
294
  def artifact_payload(
 
296
  total_timesteps: int,
297
  eval_episodes: int,
298
  seed: int,
299
+ eval_reset_seed_indices: list[int],
300
  traces: list[EpisodeTrace],
301
  ) -> dict[str, object]:
302
  mean_reward = sum(trace.total_reward for trace in traces) / max(len(traces), 1)
 
307
  "total_timesteps": total_timesteps,
308
  "eval_episodes": eval_episodes,
309
  "seed": seed,
310
+ "train_reset_seed_indices": list(TRAIN_RESET_SEED_INDICES),
311
+ "eval_reset_seed_indices": eval_reset_seed_indices,
312
  "action_space_size": LOW_FI_ACTION_COUNT,
313
+ "diagnostic_run_actions": [
314
+ diagnostic_action_label(action_index) for action_index in range(LOW_FI_ACTION_COUNT)
315
+ ],
316
  "notes": (
317
+ "Diagnostics-only low-fidelity PPO smoke; submit is excluded and the action "
318
+ "space is narrowed to a two-step repair arc. Evaluation runs across "
319
+ "frozen seeds and records full low-fi traces."
320
  ),
321
  "summary": {
322
  "mean_eval_reward": round(mean_reward, 4),
 
339
  env = LowFiSmokeEnv()
340
  model = build_model(env, seed=args.seed)
341
  model.learn(total_timesteps=args.total_timesteps, progress_bar=False)
342
+ traces, eval_reset_seed_indices = evaluate_policy(
343
  model,
344
  eval_episodes=args.eval_episodes,
345
  base_seed=args.seed,
 
348
  total_timesteps=args.total_timesteps,
349
  eval_episodes=args.eval_episodes,
350
  seed=args.seed,
351
+ eval_reset_seed_indices=eval_reset_seed_indices,
352
  traces=traces,
353
  )
354
  output_path = write_artifact(args.output_dir, payload)
355
+ summary = payload["summary"]
356
  print(output_path)
357
+ print(f"constraint_satisfaction_rate={summary['constraint_satisfaction_rate']}")
358
+ print(f"mean_eval_reward={summary['mean_eval_reward']}")
359
 
360
 
361
  if __name__ == "__main__":