Sizzing committed on
Commit
456f5a3
·
verified ·
1 Parent(s): 1b2b81e

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes; see the raw diff for the rest.

Files changed (50)
  1. .gitattributes +14 -0
  2. Dockerfile +7 -1
  3. README.md +543 -436
  4. aws_rl_env_colab.ipynb +233 -0
  5. compare/README.md +230 -0
  6. compare/compare_base_vs_sft.ipynb +0 -0
  7. compare/compare_base_vs_sft_with_outputs.ipynb +0 -0
  8. data/README.md +238 -0
  9. docs/figures/base_vs_sft_success.png +0 -0
  10. docs/figures/compare_dataset.png +3 -0
  11. docs/figures/compare_rl_env.png +3 -0
  12. docs/figures/env_init_screenshot.png +3 -0
  13. docs/figures/grpo_final_per_step.png +3 -0
  14. docs/figures/grpo_optuna_history.png +0 -0
  15. docs/figures/grpo_optuna_history_v0.png +0 -0
  16. docs/figures/grpo_optuna_hparams.png +0 -0
  17. docs/figures/grpo_optuna_importances.png +0 -0
  18. docs/figures/grpo_optuna_parallel.png +0 -0
  19. docs/figures/grpo_optuna_trial_curves.png +3 -0
  20. docs/figures/grpo_optuna_trials_comparison.png +3 -0
  21. docs/figures/grpo_per_tier_curve.png +0 -0
  22. docs/figures/grpo_reward_by_tier.png +0 -0
  23. docs/figures/grpo_reward_curve.png +3 -0
  24. docs/figures/ministack_logo.png +3 -0
  25. docs/figures/model_eval_chart.png +0 -0
  26. docs/figures/optuna_history.png +0 -0
  27. docs/figures/optuna_parallel.png +3 -0
  28. docs/figures/optuna_param_importance.png +0 -0
  29. docs/figures/optuna_slice.png +3 -0
  30. docs/figures/optuna_trial_curves.png +0 -0
  31. docs/figures/qualitative_rollouts.png +0 -0
  32. docs/figures/rl_env_eval_base_vs_sft.png +0 -0
  33. docs/figures/sft_loss_curve.png +3 -0
  34. docs/figures/sft_optuna_trials_table.png +0 -0
  35. docs/figures/sft_vs_grpo_by_tier.png +0 -0
  36. docs/figures/sft_vs_grpo_metrics_grid.png +0 -0
  37. docs/figures/sft_vs_grpo_scalar.png +0 -0
  38. docs/figures/single_step_eval.png +0 -0
  39. images/compare_dataset.png +3 -0
  40. images/compare_rl_env.png +3 -0
  41. pyproject.toml +10 -1
  42. scripts/README.md +260 -0
  43. server/README.md +596 -0
  44. server/app.py +150 -65
  45. tests/test_pool.py +325 -0
  46. train/README.md +545 -0
  47. train/train_grpo_lora.ipynb +0 -0
  48. train/train_sft_lora.ipynb +0 -0
  49. train_grpo.py +1283 -0
  50. train_grpo_lora_final.ipynb +0 -0
.gitattributes CHANGED
@@ -36,3 +36,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  aws_infra/ministack_logo.png filter=lfs diff=lfs merge=lfs -text
  scripts/Screenshot[[:space:]]2026-04-20[[:space:]]at[[:space:]]6.50.47 PM.png filter=lfs diff=lfs merge=lfs -text
  scripts/Screenshot[[:space:]]2026-04-20[[:space:]]at[[:space:]]6.50.47 PM.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/compare_dataset.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/compare_rl_env.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/env_init_screenshot.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/grpo_final_per_step.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/grpo_optuna_trial_curves.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/grpo_optuna_trials_comparison.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/grpo_reward_curve.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/ministack_logo.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/optuna_parallel.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/optuna_slice.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/sft_loss_curve.png filter=lfs diff=lfs merge=lfs -text
+ images/compare_dataset.png filter=lfs diff=lfs merge=lfs -text
+ images/compare_rl_env.png filter=lfs diff=lfs merge=lfs -text
+ scripts/Screenshot[[:space:]]2026-04-20[[:space:]]at[[:space:]]6.50.47 PM.png filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -80,7 +80,7 @@ RUN mkdir -p /root/.aws && \
  ENV AWS_ENDPOINT_URL=http://localhost:4566
 
  # Enable the web interface for OpenEnv (if applicable)
- ENV ENABLE_WEB_INTERFACE=false
+ ENV ENABLE_WEB_INTERFACE=true
 
  # Set PATH to use the virtual environment
  ENV PATH="/app/.venv/bin:$PATH"
@@ -90,6 +90,9 @@ ENV PYTHONPATH="/app/env:$PYTHONPATH"
 
  ENV AWS_RL_ENV_POOL_SIZE=8
  ENV AWS_RL_ENV_MINISTACK_BASE_PORT=4566
+ # Dedicated port for the web playground's lazily-spawned MiniStack.
+ # Kept outside the pool's range so a WebSocket session can never claim it.
+ ENV AWS_RL_ENV_WEB_MINISTACK_PORT=4565
 
  # DEV_MODE=1 enables live reload via --reload flag
  ENV DEV_MODE=0
@@ -100,6 +103,9 @@ ENV MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
  # Entrypoint: start N MiniStack instances (AWS_RL_ENV_POOL_SIZE, default 1),
  # then run the FastAPI server. Each MiniStack listens on a distinct port
  # starting at AWS_RL_ENV_MINISTACK_BASE_PORT (default 4566).
+ # The web playground's MiniStack on AWS_RL_ENV_WEB_MINISTACK_PORT is NOT
+ # started here — the FastAPI server spawns it lazily on the first /web/*
+ # request so training-only deployments pay zero cost.
  # cloudflared tunnel --url localhost:8000
  CMD ["sh", "-c", "\
  POOL_SIZE=\"${AWS_RL_ENV_POOL_SIZE:-1}\"; \
README.md CHANGED
@@ -11,244 +11,167 @@ tags:
  - openenv
  ---
 
- # AWS Cloud CLI and SRE Reinforcement Learning Environment
-
- An **OpenEnv RL environment** for training AI agents on real-world AWS cloud operations. The agent sends AWS CLI commands as actions, receives structured observations, and progresses through a **curriculum of 120+ tasks** across 5 difficulty tiers — from basic listing to SRE incident response and security posture auditing.
-
- The agent interacts with a **real-world AWS shell simulator** — a vendored MiniStack emulator (34 AWS services, in-memory, zero-cost) inside the same Docker container. Every executed command returns the same response as production AWS. The grading system awards rewards and penalties based on the **actual AWS infrastructure state** instead of static metrics. No AWS account needed.
-
- > **[Try the Playground](https://sizzing-aws-rl-env.hf.space/web)** | **[API Docs](https://sizzing-aws-rl-env.hf.space/docs)** | **[Hugging Face Space](https://huggingface.co/spaces/Sizzing/aws_rl_env)**
-
- ## Task Tiers (120+ Tasks)
-
- ### Warmup — 20 tasks
- > List resources — single read-only commands
-
- - Run one AWS CLI command to list or describe a resource type
- - S3 buckets, EC2 instances, DynamoDB tables, Lambda functions, RDS, EBS volumes
- - Graded by **command_match** — checks operation + service pair
- - No setup required, no state mutations
-
- ### Beginner — 20 tasks
- > Create single resources with verification
-
- - Create an S3 bucket, DynamoDB table, SQS queue, or Lambda function
- - Graded by **resource_creation** — verifies the exact resource exists in the AWS Infrastructure Simulator
- - Introduces resource name validation — "my-bucket-2" won't satisfy a check for "my-bucket"
- - First tier where idempotency bonus (+0.02) can be earned
-
- ### Intermediate — 20 tasks
- > Multi-step workflows — create, configure, connect
-
- - Ordered sequences: create a bucket then enable versioning, create a table then add an item
- - Graded by **multi_step** — validates each step was completed in order
- - Chaos injection begins at **10% probability** — resources may be silently mutated mid-episode
- - Rollback penalty (-0.1) starts to matter with multi-step create/delete patterns
-
- ### Advanced — 20 tasks
- > Cross-service architectures spanning multiple AWS services
-
- - Wire Lambda to SQS, configure API Gateway with integrations, build event-driven pipelines
- - Graded by **multi_step + services** — all required services must be configured
- - Chaos injection escalates to **20% probability** — DynamoDB throughput, Lambda configs may change
- - Hints cost more: 3 hints = only 61% of max reward (0.85³ decay)
-
- ### Expert — 20 tasks
- > SRE incidents, drift detection & security posture audits
-
- - Fix overly permissive S3 policies, replace broad IAM inline policies, repair broken infrastructure
- - Graded by **state_checks** — actual CLI commands run against MiniStack at grading time
- - Chaos injection at **30% probability** — maximum perturbation frequency
- - **6 drift detection tasks** — correct infra is provisioned, then 2-3 random mutations applied from a pool
- - Agent must audit environment, discover which resources drifted, and fix only those
- - Drift is randomized per episode — prevents memorization of fix sequences
 
  ---
 
- ## Features
-
- ### 1. Curriculum & Training
-
- Adaptive learning system that tracks mastery and selects optimal tasks.
-
- #### Progressive Difficulty
- - **What:** The environment organizes 120+ tasks across 5 tiers: Warmup, Beginner, Intermediate, Advanced, and Expert. Tasks progress from simple listing operations to complex SRE incident response and drift detection scenarios.
- - **Why:** Prevents the agent from being overwhelmed by complex tasks early on. Scaffolded difficulty ensures the agent builds foundational skills before tackling multi-service architectures.
- - **How:** The `CurriculumManager` maintains per-agent tier state. Promotion requires meeting a minimum episode count and success rate threshold. A fast-track mechanism allows agents scoring 90%+ on 3 consecutive episodes to skip the minimum wait.
- - **Metrics:** 5 Difficulty Tiers | 120+ Total Tasks | 90% Fast-track Threshold
-
- #### Mastery Tracking
- - **What:** Each task independently tracks the agent's performance using a weighted success rate over a sliding window. Tasks "graduate" when performance exceeds the mastery threshold consistently.
- - **Why:** Ensures the agent truly masters a skill before moving on. Prevents lucky single completions from being treated as mastery. Un-graduation catches skill decay.
- - **How:** A `mastery_window` of 10 episodes and a `mastery_threshold` of 0.7 (70% success). Minimum 3 attempts required before graduation. Recent results are weighted more heavily using exponential decay (factor 0.85). Graduated tasks can un-graduate if performance drops.
- - **Metrics:** 70% Mastery Threshold | 10 Window Size | 0.85 Decay Factor
-
- #### Spaced Repetition
- - **What:** Graduated tasks don't disappear — they resurface at exponentially increasing intervals (3, 6, 12, 24, 48 episodes) for re-testing, earning a +30 priority bonus when due.
- - **Why:** Prevents catastrophic forgetting. The agent must retain skills even as it learns new ones. Exponential spacing is the most efficient retention schedule, borrowed from cognitive science.
- - **How:** Each task tracks a `spaced_rep_interval` starting at 3 episodes. When a re-test passes, the interval doubles (up to 48). If it fails, the interval resets. `_is_spaced_rep_due()` checks elapsed episodes against the interval.
- - **Metrics:** +30 Spaced Rep Bonus | 3→48 Interval Range | 2x Interval Growth
-
- #### Priority Selection
- - **What:** Tasks are ranked by a composite score combining novelty, weakness, spaced repetition due dates, and recency. The highest-scoring task is selected for each episode.
- - **Why:** Optimizes the training curriculum by ensuring the agent explores new tasks, practices weak areas, revisits graduated skills, and maintains variety — all balanced automatically.
- - **How:** `score = novelty_bonus (+100 if never attempted) + weakness_weight (+50 × (1 - success_rate)) + spaced_rep_bonus (+30 if due) - recency_penalty (-20 if attempted in last 2 episodes)`. Uses exponential decay (0.85) to emphasize recent performance.
- - **Metrics:** +100 Novelty Bonus | +50 Max Weakness Weight | -20 Recency Penalty
-
- #### Tier Progression
- - **What:** Agents advance through tiers via standard promotion (minimum episodes + success rate) or fast-track (3 consecutive high-scoring episodes). Tiers gate access to increasingly complex task pools.
- - **Why:** Provides structure to the learning process. Standard promotion ensures sufficient exposure; fast-track rewards agents that demonstrate immediate competence.
- - **How:** Standard: complete `min_episodes` at the current tier with `success_rate >= advance_rate`. Fast-track: 3 consecutive episodes at >= 90% success bypasses the minimum episode requirement. Un-promotion is not supported — agents cannot drop tiers.
- - **Metrics:** 3 Fast-track Streak | 90% Fast-track Rate | 5 Total Tiers
-
- ### 2. Reward Shaping
-
- Dense reward signals that encourage operational discipline and real progress.
-
- ```
- if task_achieved: reward = 1.0
- else:
-     reward = partial_progress * 0.8       # base: scaled to [0.0, 0.8]
-     if progress_increased: reward += 0.1  # dense signal for advancing
-     if command_failed: reward *= 0.5      # penalty for errors
-     reward = clamp(reward, 0.0, 0.99)     # never 1.0 without completion
- reward *= 0.85 ** hints_used              # hint decay
- if survived_chaos: reward *= 1.05         # chaos survival bonus
- ```
-
- #### Rollback Penalty & Idempotency Bonus
- - **What:** Detects create→delete pairs on the same resource (rollbacks) and penalizes them (-0.1 each). Rewards graceful "already exists" handling (+0.02) where the agent retries idempotently.
- - **Why:** First RL environment rewarding operational discipline. In production, create-then-delete cycles are wasteful. Handling "already exists" gracefully is a sign of robust automation.
- - **How:** `EpisodeTracker.detect_rollbacks()` scans command history for paired create/delete operations on the same resource. Idempotency detection looks for commands that fail with "already exists" patterns (BucketAlreadyExists, ResourceInUseException, etc.) followed by successful continuation.
- - **Metrics:** -0.1 Rollback Penalty | +0.02 Idempotency Bonus | Per-pair Detection
-
- #### Shaped Reward System
- - **What:** Rewards are carefully shaped: 1.0 for full completion, 0.0-0.8 for partial progress, +0.1 progress bonus for advancing, ×0.5 for failures, capped at 0.99 without completion. Chaos bonus (×1.05) and hint decay (×0.85^n) layer on top.
- - **Why:** Dense reward signal prevents sparse-reward stagnation. The agent gets meaningful feedback on every step, not just at episode end. Capping at 0.99 ensures only real completion earns full credit.
- - **How:** `TaskGrader` dispatches to 5 strategies by tier: `command_match` (warmup), `resource_creation` (beginner), `multi_step` (intermediate), `multi_step+services` (advanced), and `state_checks` (expert). Each returns `partial_progress`, which is converted to reward with bonuses/penalties applied.
- - **Metrics:** 1.0 Max Reward | 0.99 Progress Cap | ×1.05 Chaos Bonus
-
- #### Multi-Strategy Grading
- - **What:** Five distinct grading strategies, one per tier: `command_match` checks operation+service pairs, `resource_creation` verifies resources exist, `multi_step` validates ordered sequences, advanced adds service coverage, and expert runs `state_checks` against MiniStack.
- - **Why:** Each tier tests fundamentally different skills. A single grading strategy would either be too lenient for beginners or miss the nuance needed for expert SRE tasks.
- - **How:** `TaskGrader.grade()` dispatches based on the task's `grading_strategy` field. Each strategy returns a `GradeResult` with `partial_progress` (0.0-1.0), a `completed` flag, and details. Grading is deterministic and fully automated.
- - **Metrics:** 5 Grading Strategies | 100% Automated | Per-tier Selection
-
- ### 3. Resilience & Adaptability
-
- Features that test agent robustness under unpredictable conditions.
-
- #### Progressive Hint System
- - **What:** A 3-level hint system where each level reveals progressively more detail: Level 1 names the AWS services, Level 2 describes the operations, Level 3 gives near-complete command structure. Each hint reduces the final reward by ×0.85.
- - **Why:** Creates an information-reward tradeoff unique in RL. The agent learns to wean off hints over time — initially relying on them for unfamiliar tasks, then solving independently for maximum reward. From a GRPO perspective, it creates a natural exploration/exploitation axis within a single episode.
- - **How:** The agent issues the special command `aws help --task-hint` as its action (intercepted before reaching MiniStack). Hints are auto-generated from `SuccessCriteria` fields (services, steps, operations). Reward decay: `final_reward *= 0.85 ^ hints_used` — 0 hints: 1.0×, 1 hint: 0.85×, 2 hints: 0.72×, 3 hints: 0.61×. The curriculum naturally penalizes hint-dependent agents: lower rewards → slower graduation.
- - **Metrics:** 3 Hint Levels | ×0.85 Decay Per Hint | ~61% Reward with 3 Hints
-
- #### Chaos Injection Engine
- - **What:** Silently mutates AWS resource state mid-episode to test agent resilience. Perturbations are scoped to services the current task uses. If the agent completes despite chaos, it earns a ×1.05 bonus.
- - **Why:** Tests whether the agent can handle unexpected state changes — a critical SRE skill. Prevents brittle memorization of exact command sequences. Probability scales with tier difficulty.
- - **How:** `ChaosEngine` selects perturbation templates specific to the services in use (S3 policy changes, DynamoDB throughput modifications, Lambda config alterations, etc.). Resource names are extracted from successful commands via regex. Chaos probability: 10% (Intermediate), 20% (Advanced), 30% (Expert).
- - **Metrics:** ×1.05 Chaos Survival Bonus | 10-30% Probability by Tier | 5 Service Templates
-
- #### Drift Detection Tasks
- - **What:** 6 expert-tier tasks where infrastructure is provisioned correctly, then 2-3 random mutations are applied from a pool. The agent must audit, discover drifted resources, and fix only those — without knowing which drifted.
- - **Why:** Randomized per episode, preventing memorization. Tests real SRE audit skills: the agent must reason about desired vs. actual state, not just follow a script.
- - **How:** `DriftEngine` randomly selects 2-3 mutations from a task's `possible_drifts` pool and applies them after setup. Each task defines a `desired_state_spec` (natural language) and `state_checks` (ground-truth CLI commands). Examples: S3 versioning/encryption drift, DynamoDB throughput changes, SNS subscription modifications.
- - **Metrics:** 6 Drift Tasks | 2-3 Mutations Per Episode | Random Selection Per Run
-
- ### 4. Security Posture Audit
-
- Tests *reasoning about configuration state* — the agent must READ and ANALYZE existing infrastructure, not just build things. Unlike SRE tasks (broken functionality), these have *working but insecure* infrastructure.
-
- #### Public S3 Bucket Lockdown
- - **What:** A pre-provisioned S3 bucket "public-assets" has an overly permissive bucket policy granting access to any principal (`Principal: *`). The agent must read the policy, identify the vulnerability, and replace it with a restrictive policy allowing only a specific IAM role.
- - **Why:** Tests security reasoning — the infrastructure is functional but insecure. Unlike SRE tasks where things are broken, here the agent must understand what "correct" security posture looks like and make the right judgment call.
- - **How:** Setup creates the bucket with a wide-open policy. State checks verify the new policy denies `Principal: *` and only allows the `app-role` principal to perform `s3:GetObject`.
- - **Metrics:** S3 Target Service | Policy Attack Surface | Expert Tier
-
- #### IAM Least Privilege
- - **What:** An IAM role "app-role" has an inline policy with `Action: *` and `Resource: *` — full admin access. The agent must replace it with a least-privilege policy allowing only `dynamodb:GetItem` and `dynamodb:PutItem` on the users table.
- - **Why:** IAM misconfiguration is the #1 cloud security risk. This task tests whether the agent understands permission scoping and can reason about what access an application actually needs vs. what it currently has.
- - **How:** Setup creates the role with a wildcard policy. The agent must craft a replacement policy document with specific actions and a resource ARN. State checks verify the policy document matches the expected least-privilege permissions.
- - **Metrics:** IAM Target Service | 2 Allowed Actions | Expert Tier
-
- #### Secrets in Lambda Environment
- - **What:** A Lambda function "data-processor" has a database password stored as a plaintext environment variable (`DB_PASSWORD=hunter2`). The agent must create a secret in Secrets Manager, update the Lambda to reference the secret ARN, and remove the plaintext variable.
- - **Why:** Plaintext secrets in environment variables are a critical security anti-pattern. This task combines multiple services (Lambda + Secrets Manager) and tests the agent's ability to perform a safe credential rotation without breaking the function.
- - **How:** Setup creates the Lambda with the plaintext env var. The agent must: (1) create a secret in Secrets Manager, (2) add a `SECRET_ARN` env var to the Lambda, (3) remove `DB_PASSWORD`. State checks verify all three conditions.
- - **Metrics:** 2 Services Involved | 3 Required Steps | Expert Tier
-
- ### 5. Anti-Reward-Hacking (8 Defense Layers)
-
- 8 defense layers that prevent the agent from gaming the reward system.
-
- #### 1. Ground-Truth Verification via MiniStack
- - **What:** The grader never trusts agent command output. It independently queries MiniStack (the simulated AWS backend) to verify resource state for 20+ services. Even if the agent crafts fake-looking stdout, the grader checks actual state.
- - **Why:** Prevents reward hacking through output fabrication. The agent cannot game the system by producing convincing but fake CLI output — ground truth is always checked server-side.
- - **How:** `ResourceVerifier` has per-service verification methods that query MiniStack directly. For expert tasks, `StateCheck` assertions run actual AWS CLI commands against MiniStack at grading time, checking either `output_contains` (substring) or `json_path` extraction with expected values.
- - **Metrics:** 20+ Verified Services | 100% Server-side | 0 Agent Visibility
-
- #### 2. Deduplication
- - **What:** `EpisodeTracker.has_executed_operation()` tracks which (operation, resource) pairs have been credited. Running the same successful command twice does NOT increase `partial_progress`. Progress can only increase; credit is never re-earned.
- - **Why:** Prevents the agent from gaming the reward system by repeating the same command to accumulate credit. Each unique operation earns credit exactly once.
- - **How:** `credit_operation()` records each (operation, resource) pair. Before granting credit, `is_operation_already_credited()` checks if this exact pair was already rewarded. The check is deterministic and happens at grading time.
- - **Metrics:** 1x Credit Per Operation | Exact Match Type | (op, res) Tracking Granularity
-
- #### 3. Grader Invisibility
- - **What:** The verification commands run by `ResourceVerifier` are NOT returned in the observation's `command_output`. They happen server-side during grading. The agent cannot observe or mimic them.
- - **Why:** If the agent could see which verification commands the grader runs, it could learn to craft fake outputs that match expected patterns. Keeping grader logic invisible forces the agent to actually perform the task.
- - **How:** `ResourceVerifier` executes AWS CLI commands against MiniStack in a separate execution context. Results are consumed internally by the grading pipeline. The observation returned to the agent only contains output from the agent's own commands.
- - **Metrics:** 0 Grader Cmds Exposed | Server Execution Context | 20+ Hidden Verifications
-
- #### 4. Command Allowlisting
- - **What:** Only commands starting with `aws` are executed. Any attempt to run shell commands, pipe to other tools, use redirects, or escape the sandbox is rejected with `success=False`.
- - **Why:** Prevents the agent from escaping the AWS CLI sandbox. Without this, the agent could potentially execute arbitrary shell commands, access the filesystem, or interfere with the environment.
- - **How:** The environment's `step()` method validates the command before execution. Commands not starting with `aws` are immediately rejected.
- - **Metrics:** `aws *` Allowed Pattern | 0 Shell Access | Instant Rejection
-
- #### 5. No Verification Reward
- - **What:** If the agent runs a command that matches a `state_check` command exactly (e.g., `aws s3api get-bucket-versioning --bucket app-config-store`), it gets no progress credit. Progress is only earned through `steps` operations (mutating commands), not read-only queries.
- - **Why:** Prevents the agent from gaming progress by running the same verification commands the grader uses. The agent can run read commands to understand state, but only mutation commands earn progress.
- - **How:** During grading, the `TaskGrader` checks if the agent's command matches any `state_check` command. Matching commands are flagged as verification-only and excluded from credit. Only commands matching `steps` operations (create, put, update, delete) earn `partial_progress`.
- - **Metrics:** 0 Credit for Reads | Mutations Are the Rewarded Actions | Exact Match Detection
-
- #### 6. Monotonic Progress
- - **What:** `partial_progress` can only increase within an episode. It is clamped to [0.0, 0.99] — reaching 1.0 requires actual task completion. The agent cannot lose progress, but also cannot re-earn it.
- - **Why:** Prevents cycling strategies where the agent creates and destroys resources repeatedly. Combined with deduplication, this ensures steady forward progress.
- - **How:** In `TaskGrader`, `previous_progress` tracks the highest progress seen. New progress is always `max(previous, current)`. Reward is clamped at 0.99 for partial completion, reserving 1.0 exclusively for verified full completion.
- - **Metrics:** 0.99 Max Without Completion | 1.0 Requires Full Completion | max() Progress Function
-
- #### 7. Resource Name Validation
- - **What:** For `resource_exists` checks, the verifier matches the exact resource name, not just any resource of that type. Creating "my-test-bucket-2" doesn't satisfy a check for "my-test-bucket".
- - **Why:** Prevents the agent from creating arbitrarily named resources to game the verification system. Forces precise execution of the task requirements.
- - **How:** `ResourceVerifier`'s per-service methods (`verify_s3_bucket`, `verify_dynamodb_table`, etc.) compare against the exact expected resource name from the task definition. Each of the 20+ supported services has its own verification logic.
- - **Metrics:** Exact Name Matching | 20+ Verified Services | 0 Partial Matches
-
- #### 8. State Checks Verify Final State
- - **What:** For expert SRE tasks, `state_checks` run actual AWS CLI commands against MiniStack at grading time. The grader verifies the final infrastructure state — not the commands the agent ran.
- - **Why:** The agent cannot fake the state. MiniStack is the ground truth. This decouples "what the agent did" from "what was actually achieved", making reward hacking extremely difficult.
- - **How:** Each expert task defines `state_checks` with command + assertion pairs. Assertions support `output_contains` (substring match on CLI output) and `json_path + expected` (JSON extraction). The grader runs these checks against the live MiniStack state independently of the agent.
- - **Metrics:** CLI Verification Method | 2 Assertion Types | Live State Source
 
  ---
 
- ## Supported AWS Services (34)
-
- | Category | Services |
- |----------|----------|
- | **Storage & DB** | S3, DynamoDB, RDS, ElastiCache, EFS |
- | **Compute** | Lambda, ECS, EC2, Step Functions |
- | **Messaging** | SQS, SNS, Kinesis, EventBridge, Firehose |
- | **API** | API Gateway v1/v2, ALB/ELBv2 |
- | **Security** | IAM, STS, Cognito, ACM, WAF v2, Secrets Manager |
- | **Monitoring** | CloudWatch, CloudWatch Logs, SSM |
- | **Infrastructure** | CloudFormation, Route53 |
- | **Other** | SES, Athena, Glue, EMR |
-
- ---
-
- ## Quick Start
 
  ```python
  from aws_rl_env import AwsRlAction, AwsRlEnv
@@ -261,7 +184,7 @@ with AwsRlEnv.from_docker_image("aws-rl-env:latest") as env:
      print(f"Reward: {result.reward}, Done: {result.done}")
  ```
 
- Or connect to a running server:
 
  ```python
  env = AwsRlEnv(base_url="http://localhost:8000")
@@ -269,7 +192,7 @@ result = env.reset()
  result = env.step(AwsRlAction(command="aws s3 ls"))
  ```
 
- WebSocket API:
 
  ```python
  import websockets, json
@@ -282,116 +205,42 @@ async with websockets.connect("wss://sizzing-aws-rl-env.hf.space/ws") as ws:
      obs = json.loads(await ws.recv())
  ```
 
- ---
-
- ## Architecture
 
  ```
- ┌─────────────────────────────────────────────────────────┐
- │                    Docker Container                     │
- │                                                         │
- │  ┌─────────────────────┐      ┌────────────────────┐   │
- │  │  FastAPI RL Server  │      │   AWS Simulator    │   │
- │  │     (port 8000)     │─────>│    (port 4566)     │   │
- │  │                     │      │  34 AWS services   │   │
- │  │  - Environment      │      │  In-memory state   │   │
- │  │  - Curriculum       │      │  Reset API         │   │
- │  │  - Grading Engine   │      │  (MiniStack)       │   │
- │  │  - Episode Tracker  │      │                    │   │
- │  │  - Hint Provider    │      │                    │   │
- │  └─────────────────────┘      └────────────────────┘   │
- │            ^                            ^               │
- │            | OpenEnv HTTP/WS            | AWS CLI calls │
- └────────────┼────────────────────────────┼───────────────┘
-              |                            |
-     RL Agent (client, external)      (internal only)
- ```
-
- ### Episode Lifecycle
-
- 1. **`reset()`** — Wipes AWS infrastructure state, selects the next task from the curriculum, provisions setup commands (if any), returns the initial observation
- 2. **`step(action)`** — Validates the command (`aws` prefix only), executes against MiniStack, records in tracker, grades with shaped reward, returns observation
- 3. **Hint request** — Agent sends `aws help --task-hint` to get a progressive hint (costs reward)
- 4. **Terminates** when `task_achieved == True` or max steps reached
-
- ---
-
- ## Core Classes
-
- ### `AwsRlEnvironment`
 
- [server/aws_rl_env_environment.py](server/aws_rl_env_environment.py) — Implements the OpenEnv `Environment` interface. Orchestrates all services.
 
- | Method | Description |
- |--------|-------------|
- | `reset()` | Wipe infra, select task, provision setup, return initial observation |
- | `step(action)` | Execute command (or intercept hint request), grade, update curriculum, return observation |
-
- ### `Curriculum`
-
- [server/services/curriculum.py](server/services/curriculum.py) — Priority-queue-based task selection with progressive difficulty.
-
- Selects the next task using a **max-heap scored by**:
-
- ```
- score = (
-     novelty_bonus       # +100 if never attempted (explore first)
-     + weakness_weight   # +50 * (1 - task_success_rate) — worse tasks get higher priority
-     + spaced_rep_bonus  # +30 if graduated task is "due" for re-test
-     - recency_penalty   # -20 if attempted in last 2 episodes (ensure variety)
- )
  ```
 
- ### `TaskGrader`
-
- [server/services/task_grader.py](server/services/task_grader.py) — Evaluates task completion using a dispatcher pattern. Rewards are always in [0.0, 1.0].
-
- **Grading strategies by tier:**
-
- | Tier | Strategy | How it works |
- |------|----------|--------------|
- | Warmup | Command match | Checks command contains service string + correct operation |
- | Beginner | Resource creation | Verifies resource actually exists in MiniStack via `ResourceVerifier` |
- | Intermediate | Multi-step | Tracks ordered sequence of (operation, resource) pairs |
- | Advanced | Multi-step + services | All steps completed AND all required services touched |
- | Expert | State checks | Runs arbitrary AWS CLI commands to assert end-state (ground truth) |
-
- ### `HintProvider`
-
- [server/services/hint_provider.py](server/services/hint_provider.py) — Generates progressive hints from `SuccessCriteria` fields.
-
- | Hint Level | What it reveals | Example |
- |-----------|----------------|---------|
- | Level 1 | Which AWS services to use | "You'll need IAM and Lambda" |
- | Level 2 | Which operations | "Start with create-role, then put-role-policy" |
- | Level 3 | Near-complete command structure | "Use: aws iam create-role --role-name ..." |
-
- ### `EpisodeTracker`
-
- [server/services/episode_tracker.py](server/services/episode_tracker.py) — Maintains per-episode step history. Parses AWS CLI commands to extract (service, operation, resource) tuples. Tracks credited operations for deduplication, monotonic progress, and hint usage.
-
- ### `ResourceVerifier`
-
- [server/services/resource_verifier.py](server/services/resource_verifier.py) — Queries MiniStack directly to verify ground-truth resource state. Service-specific checks for S3, DynamoDB, Lambda, SQS, SNS, IAM, Secrets Manager, and API Gateway. Also evaluates `StateCheck` assertions (substring match, JSON path extraction).
-
- ### `EnvironmentDesigner`
-
- [server/services/environment_designer.py](server/services/environment_designer.py) — Provisions initial AWS state via setup commands before the agent acts. Used by SRE/expert tasks to create broken or insecure infrastructure the agent must fix.
 
- ### `AwsBackend`
 
- [server/services/aws_backend.py](server/services/aws_backend.py) — Executes AWS CLI commands against MiniStack (`AWS_ENDPOINT_URL=http://localhost:4566`). Provides `reset_environment()` via MiniStack's `/_ministack/reset` endpoint.
 
- ### `AwsRlEnv` (Client)
 
- [client.py](client.py) — OpenEnv HTTP/WebSocket client. Wraps `reset()` and `step()` calls to the server.
 
  ---
 
- ## Data Models
 
- [models.py](models.py) — All Pydantic models and type aliases.
 
  ### Action
 
@@ -400,51 +249,53 @@ class AwsRlAction(Action):
      command: str  # AWS CLI command, e.g. "aws s3 ls"
  ```
 
  ### Observation
 
  ```python
  class AwsRlObservation(Observation):
      episode_id: EpisodeID
      step_count: StepCount
-     command_success: bool
-     command_output: str      # stdout from AWS CLI
-     error: str               # stderr if failed
-     task: TaskInfo | None    # masked task definition (hides success criteria)
      task_achieved: bool
-     partial_progress: float  # current task progress in [0.0, 1.0]
-     hints_used: int          # number of hints requested this episode
-     hint_text: str           # most recent hint text (if any)
  ```
 
- ### Environment State
 
  ```python
  class AwsRlState(State):
-     current_task: Task | None  # full task assigned for the episode
-     tracker: TrackerState      # episode tracker snapshot
-     infra_state: dict          # AWS infrastructure state keyed by service name
-     chaos_occurred: bool       # whether chaos was injected this episode
-     current_tier: str          # agent's current difficulty tier
 
  class TrackerState:
-     step_count: int                 # steps taken this episode
-     hints_used: int                 # hints requested this episode
-     progress: float                 # current partial progress [0.0, 1.0]
-     commands_executed: list[str]    # commands executed this episode
-     credited_operations: list[str]  # (operation, resource) pairs that earned credit
  ```
 
- ### Task Definitions
 
  ```python
  class Task:
      task_id: TaskID
-     difficulty: TaskDifficulty  # warmup | beginner | intermediate | advanced | expert
-     description: str            # human-readable goal
      success_criteria: SuccessCriteria
-     setup_commands: list[SetupCommand]   # pre-provision for SRE tasks
-     desired_state_spec: str | None       # natural-language desired end state (drift tasks)
-     possible_drifts: list[SetupCommand]  # pool of mutations for DriftEngine
 
  class TaskInfo:
      """Agent-visible subset of Task — masks success_criteria, setup_commands, and possible_drifts."""
@@ -454,143 +305,311 @@ class TaskInfo:
      desired_state_spec: str | None
 
  class SuccessCriteria:
-     command_contains: str | None                 # warmup/beginner
-     operation: str | None                        # warmup/beginner
-     resource_exists: ResourceExistsCheck | None  # beginner
-     steps: list[StepCriteria]                    # intermediate/advanced/expert
-     services: list[AwsService]                   # advanced/expert
-     state_checks: list[StateCheck]               # expert (ground truth)
  ```
 
- ### Curriculum Configuration
 
  ```python
  class TierConfig:
-     min_episodes: int         # minimum episodes before promotion
-     advance_rate: float       # tier success rate threshold (0.6 - 1.0)
-     mastery_window: int       # sliding window size (default: 10)
-     mastery_threshold: float  # per-task graduation threshold (default: 0.7)
      fast_track_rate: float    # early promotion threshold (default: 0.9)
-     chaos_probability: float  # probability of chaos injection per step (default: 0.0)
 
  class SpacedRepState:
-     interval: int                # episodes until next re-test (3 -> 48)
      last_graduated_episode: int  # when last graduated
  ```
 
  ---
 
- ## Project Structure
 
  ```
- aws-rl-env/
- ├── __init__.py                      # Exports: AwsRlEnv, AwsRlAction, AwsRlObservation
- ├── models.py                        # Pydantic data models & type aliases
- ├── client.py                        # AwsRlEnv OpenEnv client
- ├── inference.py                     # LLM agent inference script
- ├── inference-complete.py            # Full inference pipeline with curriculum
- ├── server/
- │   ├── app.py                       # FastAPI application + web UI endpoints
- │   ├── aws_rl_env_environment.py    # Core RL environment (reset/step)
- │   ├── templates/
- │   │   └── index.html               # Web playground UI
- │   ├── static/
- │   │   ├── css/style.css            # Playground styles
- │   │   └── js/app.js                # Playground frontend logic
- │   └── services/
- │       ├── aws_backend.py           # MiniStack command executor
- │       ├── task_grader.py           # Grading engine with reward shaping
- │       ├── curriculum.py            # Curriculum learning manager
- │       ├── episode_tracker.py       # Per-episode step history & hints
- │       ├── resource_verifier.py     # Ground-truth state verification
- │       ├── environment_designer.py  # Setup provisioning for SRE tasks
- │       ├── hint_provider.py         # Progressive hint generator
- │       ├── chaos_engine.py          # Chaos injection engine
- │       ├── drift_engine.py          # Drift detection engine
- │       ├── task_solutions.py        # Reference solutions for tasks
- │       └── tasks/
- │           ├── warmup.yaml          # 20 listing tasks
- │           ├── beginner.yaml        # 20 creation tasks
- │           ├── intermediate.yaml    # 20 multi-step tasks
- │           ├── advanced.yaml        # 20 architecture tasks
- │           ├── expert.yaml          # 20 SRE/security tasks
- │           └── drift.yaml           # Drift detection tasks
- ├── tests/                           # Unit tests for core services
- │   ├── test_aws_rl_env_environment.py
- │   ├── test_drift_engine.py
- │   ├── test_environment_designer.py
- │   ├── test_episode_tracker.py
- │   ├── test_hint_provider.py
- │   ├── test_resource_verifier.py
- │   └── test_task_grader.py
- ├── tests_tasks/                     # Integration tests per task tier
- │   ├── test_warmup_tasks.py
- │   ├── test_beginner_tasks.py
- │   ├── test_intermediate_tasks.py
- │   ├── test_advanced_tasks.py
- │   ├── test_expert_tasks.py
- │   └── test_drift_tasks.py
- ├── aws_infra/                       # MiniStack emulator (git subtree from ministackorg/ministack)
- │   └── ministack/
- │       ├── app.py                   # MiniStack ASGI router
- │       ├── core/                    # Routing, persistence, responses
- │       └── services/                # AWS service implementations
- ├── Dockerfile                       # Multi-stage build (server + MiniStack)
- ├── Makefile                         # Dev tasks: run, format, lint, docker-*
- ├── openenv.yaml                     # OpenEnv manifest
- └── pyproject.toml                   # Dependencies & build config
  ```
 
  ---
 
- ## Running
-
- ### Docker (recommended)
-
- ```bash
- make docker-build       # Build image
- make docker-run         # Run on port 8000
- make docker-run-detach  # Run in background
- make docker-health      # Health check
  ```
 
- ### Local (without Docker)
-
- Use the combined Makefile target:
 
- ```bash
- make run  # Starts MiniStack + server
  ```
 
- ### OpenEnv Deployment
 
  ```bash
- make openenv-validate  # Validate config
- make openenv-build     # Build environment
- make openenv-push      # Push to HuggingFace Spaces
  ```
 
  ---
 
- ## Configuration
 
- | Variable | Default | Description |
- |----------|---------|-------------|
- | `AWS_INFRA_URL` | `http://localhost:4566` | AWS infra endpoint |
- | `AWS_ACCESS_KEY_ID` | `test` | AWS credentials (any value works) |
- | `AWS_SECRET_ACCESS_KEY` | `test` | AWS credentials (any value works) |
- | `AWS_DEFAULT_REGION` | `us-east-1` | AWS region |
- | `MAX_STEPS` | `15` | Max steps per episode |
- | `API_BASE_URL` | | LLM API endpoint (for inference.py) |
- | `MODEL_NAME` | | LLM model name (for inference.py) |
- | `HF_TOKEN` | | HuggingFace token (for inference.py) |
- | `TEMPERATURE` | `0.7` | LLM sampling temperature |
 
  ---
 
- ## Curriculum Stats API
 
- The curriculum exposes detailed training progress:
 
  ```python
  curriculum.get_stats()
@@ -609,10 +628,98 @@ curriculum.get_stats()
 
  ---
 
- ## Links
 
  - **GitHub**: [github.com/udaykiranpadhy/aws-rl-env](https://github.com/udaykiranpadhy/aws-rl-env)
- - **Hugging Face Space**: [huggingface.co/spaces/Sizzing/aws_rl_env](https://huggingface.co/spaces/Sizzing/aws_rl_env)
- - **API Reference**: [/docs](https://sizzing-aws-rl-env.hf.space/docs)
- - **ReDoc**: [/redoc](https://sizzing-aws-rl-env.hf.space/redoc)
  - **Portfolio**: [portfolio.udaykp.dev](https://portfolio.udaykp.dev)
 
+ <p align="center">
+   <img src="docs/figures/ministack_logo.png" alt="MiniStack logo" height="110"/>
+ </p>
+
+ # AWS Cloud CLI & SRE — A Reinforcement-Learning Environment + Training Pipeline
+
+ > An OpenEnv-compatible RL environment with a curriculum of **120+ AWS tasks** across 5 difficulty tiers, paired with a complete **SFT → GRPO** training pipeline (Qwen2.5-Coder-3B + LoRA + Optuna). The vendored MiniStack simulator means **zero AWS cost**, real CLI semantics, and 8-way parallel rollouts that fit on a single GPU.
+
+ | | |
+ |---|---|
+ | **Live demo**   | [sizzing-aws-rl-env.hf.space/web](https://sizzing-aws-rl-env.hf.space/web) — try the playground in a browser |
+ | **API docs**    | [sizzing-aws-rl-env.hf.space/docs](https://sizzing-aws-rl-env.hf.space/docs) (Swagger), [/redoc](https://sizzing-aws-rl-env.hf.space/redoc) |
+ | **HF Space**    | [huggingface.co/spaces/Sizzing/aws_rl_env](https://huggingface.co/spaces/Sizzing/aws_rl_env) |
+ | **SFT adapter** | [Sizzing/aws-rl-sft-qwen25coder3b-adapter](https://huggingface.co/Sizzing/aws-rl-sft-qwen25coder3b-adapter) |
+ | **Dataset**     | [Sizzing/aws-rl-sft](https://huggingface.co/datasets/Sizzing/aws-rl-sft) |
+
+ ---
+
+ ## Table of contents
+
+ 1. [What this is & why it matters](#1-what-this-is--why-it-matters)
+ 2. [Highlights — full feature inventory](#2-highlights--full-feature-inventory)
+ 3. [Architecture](#3-architecture)
+ 4. [Live demo & Quick Start](#4-live-demo--quick-start)
+ 5. [Run on Colab](#5-run-on-colab)
+ 6. [Action / Observation spec](#6-action--observation-spec)
+ 7. [Curriculum & Reward (overview)](#7-curriculum--reward-overview)
+ 8. [Training pipeline (SFT → GRPO)](#8-training-pipeline-sft--grpo)
+ 9. [Parallel rollout architecture](#9-parallel-rollout-architecture)
+ 10. [MiniStack: vendored & customized](#10-ministack-vendored--customized)
+ 11. [Results & Benchmarks](#11-results--benchmarks)
+ 12. [Repository map](#12-repository-map)
+ 13. [Configuration & Running](#13-configuration--running)
+ 14. [Testing](#14-testing)
+ 15. [Tech stack](#15-tech-stack)
+ 16. [Links](#16-links)
+ 17. [Acknowledgments](#17-acknowledgments)
+
+ ---
+
+ ## 1. What this is & why it matters
+
+ Modern AI agents are increasingly asked to operate cloud infrastructure — provisioning resources, fixing misconfigurations, responding to drift. Training such agents needs (a) a realistic environment, (b) reliable reward signals, and (c) enough scale to make RL feasible. Existing options force a hard tradeoff: real AWS costs hundreds of dollars per training run and is impossible to reset; toy emulators don't behave like production AWS.
+
+ **This project closes that gap.** We built:
+
+ 1. **An OpenEnv-compatible RL environment** that speaks real AWS CLI semantics. The agent sends `aws s3 mb …`, `aws iam create-role …`, and so on — the exact same commands a human SRE would type.
+ 2. **A vendored, customized MiniStack simulator** that responds with production-equivalent JSON, runs locally for zero cost, supports 34 AWS services, and exposes a single-call state-introspection endpoint we added so the grader has cheap ground-truth access.
+ 3. **A 120+ task curriculum** across 5 tiers (warmup → expert) with adaptive selection, mastery tracking, spaced repetition, chaos injection, and drift-detection scenarios — every feature designed to keep the reward signal honest and prevent the agent from gaming it.
+ 4. **A complete SFT → GRPO training pipeline.** A 1,500-row synthetic dataset spanning 5 trajectory shapes, an 11-model base benchmark, LoRA fine-tuning, and TRL GRPO with multi-turn rollouts and Optuna hyperparameter search.
+ 5. **An 8-way parallel-rollout architecture.** Server-side MiniStack pool, client-side `GrpoPool`, in-process `MultiTurnEnvPool` — three coordinated layers that let G=8 concurrent rollouts run on one GPU without state contamination.
+
+ Everything is reproducible: the dataset is generated by a deterministic script, the model selection is documented end-to-end, training entry points run on Colab, and the env runs locally in a single Docker container with no external network requirement.
 
  ---
 
+ ## 2. Highlights — full feature inventory
+
+ This is the complete surface area of the project. Each entry links to deeper documentation in the corresponding sub-README.
+
+ ### Environment & Curriculum
+ - **[120+ tasks across 5 tiers](server/services/tasks/)** — warmup (25), beginner (25), intermediate (25), advanced (25), expert (24), drift (9). YAML-defined task spec per tier.
+ - **[Curriculum learning with priority scoring](server/README.md#7-curriculum-manager)** — `score = novelty + weakness − recency + spaced_rep_bonus` drives task selection; a minimal sketch follows this list.
+ - **[Mastery tracking](server/README.md#7-curriculum-manager)** — sliding 10-episode window, 0.7 threshold, 0.85 exponential decay, supports un-graduation.
+ - **[Spaced repetition](server/README.md#7-curriculum-manager)** — graduated tasks resurface at intervals `[3, 6, 12, 24, 48]` to prevent forgetting.
+ - **[Tier promotion](server/README.md#7-curriculum-manager)** — standard (min episodes + success rate) + fast-track (3 consecutive 90% episodes).
+ - **[Strategy pattern: simulator vs real AWS](server/README.md#4-strategy-pattern-simulator-vs-real-aws)** — `BACKEND_TYPE=simulator` (default) or `aws`, no code fork.
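+
+ A minimal sketch of that selection rule, with constants from this README (the `TaskStats` shape is illustrative, not the actual class in `server/services/curriculum.py`):
+
+ ```python
+ from dataclasses import dataclass
+
+ @dataclass
+ class TaskStats:                     # hypothetical container for per-task history
+     attempts: int                    # episodes that included this task
+     success_rate: float              # decayed success rate in [0.0, 1.0]
+     last_attempt_episode: int
+     spaced_rep_due: bool             # graduated task due for re-test
+
+ def priority_score(stats: TaskStats, current_episode: int) -> float:
+     """Composite priority: novelty + weakness + spaced repetition - recency."""
+     score = 0.0
+     if stats.attempts == 0:
+         score += 100.0                               # novelty: explore first
+     score += 50.0 * (1.0 - stats.success_rate)       # weakness: weak tasks rank higher
+     if stats.spaced_rep_due:
+         score += 30.0                                # graduated task due for re-test
+     if current_episode - stats.last_attempt_episode <= 2:
+         score -= 20.0                                # recency: keep variety
+     return score
+ ```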
+
+ ### Reward shaping
+ - **[Five grading strategies](server/README.md#8-reward-shaping--taskgrader)** — command-match (warmup), resource-creation (beginner), multi-step (intermediate), multi-step+services (advanced), state-checks (expert).
+ - **[Dense partial-progress signal](server/README.md#8-reward-shaping--taskgrader)** — clamped to `[0.0, 0.99]`, `1.0` reserved for verified completion.
+ - **[Rollback penalty](server/README.md#8-reward-shaping--taskgrader)** — `−0.1` per `(create-X, …, delete-X)` pair.
+ - **[Idempotency bonus](server/README.md#8-reward-shaping--taskgrader)** — `+0.02` for graceful "already exists" retry.
+ - **[Hint decay](server/README.md#13-hint-provider)** — three-level progressive hints with `0.85^n` reward multiplier.
+ - **[Chaos survival bonus](server/README.md#11-chaos-engine)** — `×1.05` if the agent completes a chaotic task. A worked sketch combining these pieces follows this list.
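+
+ Putting those pieces together, a sketch of the shaping rule using the constants above (the exact ordering of the multipliers is illustrative; `server/services/task_grader.py` is authoritative):
+
+ ```python
+ def shaped_reward(task_achieved: bool, partial_progress: float,
+                   progress_increased: bool, command_failed: bool,
+                   hints_used: int, survived_chaos: bool) -> float:
+     """Dense shaped reward: 1.0 is reserved for verified completion."""
+     if task_achieved:
+         reward = 1.0
+     else:
+         reward = partial_progress * 0.8           # base, scaled to [0.0, 0.8]
+         if progress_increased:
+             reward += 0.1                         # dense signal for advancing
+         if command_failed:
+             reward *= 0.5                         # penalty for errors
+         reward = min(max(reward, 0.0), 0.99)      # never 1.0 without completion
+     reward *= 0.85 ** hints_used                  # hint decay
+     if survived_chaos:
+         reward *= 1.05                            # chaos survival bonus
+     return reward
+ ```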
+
+ ### Resilience & adversarial features
+ - **[Chaos injection](server/README.md#11-chaos-engine)** — silent mid-episode mutations, tier-scaled probabilities (10/20/30%) on services the task is touching.
+ - **[Drift detection](server/README.md#12-drift-engine)** — 6 expert tasks, 2–3 random mutations from a per-task pool, randomized per episode (no memorization).
+ - **[Security-posture audit tasks](server/README.md#17-security-posture-audit-examples)** — S3 public bucket lockdown, IAM least-privilege, Lambda secret rotation.
+ - **[8-layer anti-reward-hacking](server/README.md#9-anti-reward-hacking--8-defense-layers)** — ground-truth verification, dedup, grader invisibility, command allow-list, no-credit-for-reads, monotonic progress, exact resource-name validation, final state checks.
+
+ ### Training pipeline
+ - **[Synthetic SFT dataset (1,500 rows)](data/README.md)** — 5 trajectory types: success / multi-step continuation / failure recovery / verification / hint usage.
+ - **[Rigorous base-model selection](data/sft/MODEL_EVALUATION.md)** — 11 models × 27 prompts, [Qwen2.5-Coder-3B-Instruct](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit) wins.
+ - **[LoRA SFT](train/README.md#1-sft-stage--supervised-lora)** — `r ∈ {8,16,32}`, `lora_alpha = r × multiplier`, attention-only adaptation (config sketch after this list).
+ - **[GRPO RL via TRL](train/README.md#2-grpo-stage--reinforcement-learning)** — group-relative advantages, KL to SFT reference, `dapo` loss, no critic.
+ - **[Multi-turn rollouts](train/README.md#4-multi-turn-rollouts--parallel-envs)** — up to `MAX_TURNS=6`, observation fed back as next-turn user message.
+ - **[Optuna hyperparameter search](train/README.md#3-optuna-hyperparameter-search)** — TPE sampler over 8-dim space, frozen held-out validation set.
+ - **[HuggingFace integration](data/README.md#7-huggingface-publishing)** — adapter + dataset published to Hub, OpenEnv Space deployment.
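+
+ For illustration, an attention-only LoRA config consistent with the bullets above; the concrete `r`, multiplier, dropout, and module list are assumptions for this sketch, and the actual searched space is documented in [train/README.md](train/README.md):
+
+ ```python
+ from peft import LoraConfig
+
+ r = 16                   # searched over {8, 16, 32}
+ alpha_multiplier = 2     # lora_alpha = r × multiplier
+
+ lora_config = LoraConfig(
+     r=r,
+     lora_alpha=r * alpha_multiplier,
+     # attention-only adaptation: no MLP modules are adapted
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+     lora_dropout=0.05,
+     bias="none",
+     task_type="CAUSAL_LM",
+ )
+ ```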
+
+ ### Parallel rollout architecture
+ - **[Server-side MiniStack pool](server/README.md#6-server-side-ministack-pool-parallel-rollouts)** — `MiniStackPool` ([server/app.py](server/app.py)), free-list of ports, lock-guarded acquire/release.
+ - **[Client-side GrpoPool](scripts/README.md#2-three-coordinated-pool-layers)** — async-native, all-or-nothing connect, `asyncio.gather` for concurrent rollouts.
+ - **[In-process MultiTurnEnvPool](train/README.md#4-multi-turn-rollouts--parallel-envs)** — sync API, owns a background asyncio loop, used by the trainer.
+ - **[8 isolated rollouts on one server](scripts/README.md#7-running-the-multi-connection-demo)** — proof in [scripts/TestMultipleConnects.ipynb](scripts/TestMultipleConnects.ipynb); a minimal concurrency sketch follows this list.
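+
+ The client-side pattern in miniature: G sessions opened up front (all-or-nothing), then driven concurrently with `asyncio.gather`. The message payloads below are placeholders rather than the real wire protocol (see `/schema`); `GrpoPool`'s actual API lives in [scripts/README.md](scripts/README.md):
+
+ ```python
+ import asyncio
+ import json
+
+ import websockets
+
+ WS_URL = "ws://localhost:8000/ws"  # one server; pooled MiniStacks behind it
+
+ async def rollout(ws) -> float:
+     """Drive one episode on an already-open session; return the last reward."""
+     await ws.send(json.dumps({"type": "reset"}))   # placeholder message shape
+     json.loads(await ws.recv())                    # initial observation
+     await ws.send(json.dumps({"type": "step", "command": "aws s3 ls"}))
+     obs = json.loads(await ws.recv())
+     return obs.get("reward", 0.0)
+
+ async def main(group_size: int = 8) -> None:
+     # all-or-nothing connect: each session claims its own MiniStack port
+     sessions = await asyncio.gather(
+         *(websockets.connect(WS_URL) for _ in range(group_size))
+     )
+     try:
+         rewards = await asyncio.gather(*(rollout(ws) for ws in sessions))
+         print(rewards)
+     finally:
+         await asyncio.gather(*(ws.close() for ws in sessions))
+
+ asyncio.run(main())
+ ```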
+
+ ### Vendored simulator
+ - **[MiniStack as git subtree](server/README.md#5-ministack-vendored-fork--customizations)** — vendored at [aws_infra/](aws_infra/) (commit `2c38c0b`). 34 AWS services. MIT.
+ - **[Custom `/_ministack/state` endpoint](server/README.md#5-ministack-vendored-fork--customizations)** — added in commit `a648c3a`; returns the full infra inventory in one call (example after this list).
+ - **[Upstream sync workflow](server/README.md#5-ministack-vendored-fork--customizations)** — periodic `git subtree pull`; isolated patches keep conflicts minimal.
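+
+ For example, from inside the container (only the endpoint path and its one-call contract are documented here; the response shape is whatever the vendored MiniStack returns):
+
+ ```bash
+ # Dump the full resource inventory of the MiniStack instance on port 4566
+ curl -s http://localhost:4566/_ministack/state | python -m json.tool
+ ```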
115
+
116
+ ### Operations & deployment
117
+ - **[OpenEnv-compliant](https://github.com/openai/openenv)** — `/reset`, `/step`, `/state`, `/schema`, `/ws` HTTP+WebSocket endpoints.
118
+ - **[Web playground UI](server/README.md#19-web-playground)** — `/web` route, 40 AWS service icons, Jinja2 + JS frontend.
119
+ - **[Docker-first deployment](Dockerfile)** — multi-stage build, container ships server + N MiniStack instances + AWS CLI.
120
+ - **[Comprehensive test suite](#14-testing)** — 10 unit tests + 6 tier-integration suites covering 134 tasks.
121
 
122
+ ---
 
 
 
 
123
 
124
+ ## 3. Architecture
 
 
 
 
125
 
126
+ ```
127
+ ┌────────────────────────────────── Docker container ──────────────────────────────────┐
128
+ │ │
129
+ │ FastAPI server (port 8000) │
130
+ │ ├── OpenEnv router /reset /step /state /schema /ws /health │
131
+ │ ├── Web playground /web (Jinja2 + 40 AWS icon SVGs) │
132
+ │ ├── env_factory per-WS-session AwsRlEnvironment instance │
133
+ │ │ (acquires a MiniStack port from MiniStackPool) │
134
+ │ └── Services │
135
+ │ Curriculum · TaskGrader · ResourceVerifier · ChaosEngine · DriftEngine │
136
+ │ HintProvider · EpisodeTracker · EnvironmentDesigner · EnvironmentStrategy │
137
+ │ │
138
+ │ │
139
+ │ MiniStack instances :4566 :4567 :4568 … :4566+POOL_SIZE-1 │
140
+ │ (vendored at aws_infra/, started by the Dockerfile entrypoint) │
141
+ │ │
142
+ └──────────────────────────────────────────────────────────────────────────────────────┘
143
+ ▲ ▲
144
+ │ HTTP/WS │ AWS CLI subprocess
145
+ │ │ (AWS_ENDPOINT_URL=http://localhost:4566+i)
146
+ │ │
147
+ ┌───────┴───────────┐ ┌───────┴───────────┐
148
+ │ RL Agent │ │ AWS CLI commands │
149
+ │ (client.py) │ │ the agent emits │
150
+ └───────────────────┘ └───────────────────┘
151
+ ```
152
 
153
+ A more visual diagram (architecture + curriculum progression) will live at `docs/figures/architecture_diagram.png` once added.
154
 
155
+ ### Episode lifecycle
156
 
157
+ 1. **`reset()`** — wipes simulator state, picks next task from the curriculum, runs `setup_commands`, applies drift if applicable, returns initial observation.
158
+ 2. **`step(action)`** — validates the command (must start with `aws `), intercepts hint requests, executes via the strategy, records in tracker, grades with shaped reward, optionally injects chaos, returns observation.
159
+ 3. **Hint** — agent sends `aws help --task-hint`; intercepted before reaching MiniStack; returns next-level hint, increments `hints_used` (which decays final reward by `0.85^n`).
160
+ 4. **Termination** — `task_achieved=True` or `step_count >= MAX_STEPS` (default 15). A minimal agent loop over this lifecycle is sketched below.
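+
+ A minimal sketch of that loop (`my_policy` is a placeholder for your model; the loop shape mirrors the client examples in §4):
+
+ ```python
+ from aws_rl_env import AwsRlAction, AwsRlEnv
+
+ def my_policy(last_result) -> str:
+     # placeholder policy — swap in your model; must return an "aws ..." string
+     return "aws s3 ls"
+
+ env = AwsRlEnv(base_url="http://localhost:8000")
+ env.reset()                                   # curriculum assigns a task, setup runs
+ last_result = None
+ while True:
+     result = env.step(AwsRlAction(command=my_policy(last_result)))
+     if result.done:                           # task_achieved or MAX_STEPS reached
+         break
+     last_result = result                      # feed output/reward into the next turn
+ # stuck mid-episode? "aws help --task-hint" requests a hint (0.85^n reward decay)
+ ```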
 
 
 
 
 
 
161
 
162
+ Full mechanics in [server/README.md](server/README.md).
163
 
164
  ---
165
 
166
+ ## 4. Live demo & Quick Start
167
 
168
+ ### Try it in a browser
 
 
 
 
 
 
 
 
 
169
 
170
+ The hosted playground lets you click around any task without writing code:
171
 
172
+ > **[sizzing-aws-rl-env.hf.space/web](https://sizzing-aws-rl-env.hf.space/web)**
173
+
174
+ ### Python client
175
 
176
  ```python
177
  from aws_rl_env import AwsRlAction, AwsRlEnv
 
184
  print(f"Reward: {result.reward}, Done: {result.done}")
185
  ```
186
 
187
+ Or against a running server:
188
 
189
  ```python
190
  env = AwsRlEnv(base_url="http://localhost:8000")
 
192
  result = env.step(AwsRlAction(command="aws s3 ls"))
193
  ```
194
 
195
+ ### WebSocket API
196
 
197
  ```python
198
  import websockets, json
 
205
  obs = json.loads(await ws.recv())
206
  ```
207
 
208
+ ### Local Docker
 
 
209
 
210
+ ```bash
211
+ make docker-build # build the image
212
+ make docker-run # foreground; serves on :8000
213
+ make docker-run-detach # background
214
+ make docker-health # liveness probe
215
```
216
 
217
+ For training (8-way parallel rollouts):
218
 
219
+ ```bash
220
+ AWS_RL_ENV_POOL_SIZE=8 make run
221
  ```
222
 
223
+ ---
224
 
225
+ ## 5. Run on Colab
226
 
227
+ The full pipeline is reproducible on a Colab GPU runtime. Drop your `HF_TOKEN` into Colab Secrets, set `ENV_BASE_URL` to your HF Space (or to a local server exposed via ngrok), and run.
228
 
229
+ | Notebook | What it does | Open in Colab |
230
+ |-------------------------------------------------------------------------------------|-------------------------------------------------------|----------------------------------------------|
231
+ | [aws_rl_env_colab.ipynb](aws_rl_env_colab.ipynb) | End-to-end driver: validation, Optuna search, full GRPO training, plotting, optional push-to-Hub | <!-- TODO: paste Colab URL here --> |
232
+ | [train/train_sft_lora.ipynb](train/train_sft_lora.ipynb) | Stage 1 — SFT LoRA fine-tuning of Qwen2.5-Coder-3B | <!-- TODO: paste Colab URL here --> |
233
+ | [train/train_grpo_lora.ipynb](train/train_grpo_lora.ipynb) | Stage 2 — GRPO RL training with multi-turn rollouts | <!-- TODO: paste Colab URL here --> |
234
+ | [compare/compare_base_vs_sft.ipynb](compare/compare_base_vs_sft.ipynb) | Side-by-side: base model vs SFT adapter (dataset + RL env) | <!-- TODO: paste Colab URL here --> |
235
+ | [scripts/TestMultipleConnects.ipynb](scripts/TestMultipleConnects.ipynb) | Demo: 8 simultaneous WebSocket sessions stay isolated | <!-- TODO: paste Colab URL here --> |
236
 
237
+ Replace each `<!-- TODO -->` with the Colab badge URL once published.
238
 
239
  ---
240
 
241
+ ## 6. Action / Observation spec
242
 
243
+ The full Pydantic data models — kept inline so any reader can wire up an agent without leaving this page. Source: [models.py](models.py).
244
 
245
  ### Action
246
 
 
249
  command: str # AWS CLI command, e.g. "aws s3 ls"
250
  ```
251
 
252
+ The environment validates that `command` starts with `aws `; anything else is rejected with `command_success=False`.
253
+
254
  ### Observation
255
 
256
  ```python
257
  class AwsRlObservation(Observation):
258
  episode_id: EpisodeID
259
  step_count: StepCount
260
+     command_success: bool          # exit code == 0
261
+     command_output: str            # stdout from the AWS CLI invocation
262
+     error: str                     # stderr (empty if success)
263
+     task: TaskInfo | None          # masked task definition (no success criteria)
264
  task_achieved: bool
265
+     partial_progress: float        # current task progress in [0.0, 1.0]
266
+     hints_used: int                # cumulative hint count this episode
267
+     hint_text: str                 # most recent hint text (if any)
268
  ```
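+
+ A sketch of how an agent loop might fold these fields into the next turn's prompt (field names come from the model above; the wording is illustrative):
+
+ ```python
+ def render_feedback(obs) -> str:
+     """Turn one AwsRlObservation into feedback text for the next turn."""
+     if obs.task_achieved:
+         return "Task complete."
+     if not obs.command_success:
+         return f"Command failed: {obs.error}"
+     return f"OK ({obs.partial_progress:.0%} progress): {obs.command_output}"
+ ```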
269
 
270
+ ### State
271
 
272
  ```python
273
  class AwsRlState(State):
274
+     current_task: Task | None      # full task assigned for the episode
275
+     tracker: TrackerState          # episode tracker snapshot
276
+     infra_state: dict              # AWS infrastructure state keyed by service name
277
+     chaos_occurred: bool           # whether chaos was injected this episode
278
+     current_tier: str              # agent's current difficulty tier
279
 
280
  class TrackerState:
281
+     step_count: int                # steps taken this episode
282
+     hints_used: int                # hints requested this episode
283
+     progress: float                # current partial progress [0.0, 1.0]
284
+     commands_executed: list[str]   # commands executed this episode
285
+     credited_operations: list[str] # (operation, resource) pairs that earned credit
286
  ```
287
 
288
+ ### Task definitions
289
 
290
  ```python
291
  class Task:
292
  task_id: TaskID
293
+     difficulty: TaskDifficulty     # warmup | beginner | intermediate | advanced | expert
294
+     description: str               # human-readable goal
295
    success_criteria: SuccessCriteria
296
+     setup_commands: list[SetupCommand]   # pre-provision for SRE tasks
297
+     desired_state_spec: str | None       # natural-language desired end state (drift tasks)
298
+     possible_drifts: list[SetupCommand]  # pool of mutations for DriftEngine
299
 
300
  class TaskInfo:
301
  """Agent-visible subset of Task — masks success_criteria, setup_commands, and possible_drifts."""
 
305
  desired_state_spec: str | None
306
 
307
  class SuccessCriteria:
308
+     command_contains: str | None                  # warmup/beginner
309
+     operation: str | None                         # warmup/beginner
310
+     resource_exists: ResourceExistsCheck | None   # beginner
311
+     steps: list[StepCriteria]                     # intermediate/advanced/expert
312
+     services: list[AwsService]                    # advanced/expert
313
+     state_checks: list[StateCheck]                # expert (ground truth)
314
  ```
315
 
316
+ ### Curriculum config
317
 
318
  ```python
319
  class TierConfig:
320
+     min_episodes: int          # minimum episodes before promotion
321
+     advance_rate: float        # tier success rate threshold (0.6 - 1.0)
322
+     mastery_window: int        # sliding window size (default: 10)
323
+     mastery_threshold: float   # per-task graduation threshold (default: 0.7)
324
    fast_track_rate: float     # early promotion threshold (default: 0.9)
325
+     chaos_probability: float   # probability of chaos injection per step
326
 
327
  class SpacedRepState:
328
+     interval: int              # episodes until next re-test (3–48)
329
  last_graduated_episode: int # when last graduated
330
  ```
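+
+ One plausible reading of how these knobs combine into a promotion check (a sketch — whether fast-track bypasses `min_episodes` is an assumption here; the authoritative logic lives in the server's Curriculum service):
+
+ ```python
+ def should_promote(episodes_in_tier: int, window_success_rate: float,
+                    min_episodes: int, advance_rate: float,
+                    fast_track_rate: float = 0.9) -> bool:
+     if window_success_rate >= fast_track_rate:
+         return True                          # early (fast-track) promotion
+     return (episodes_in_tier >= min_episodes
+             and window_success_rate >= advance_rate)
+ ```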
331
 
332
  ---
333
 
334
+ ## 7. Curriculum & Reward (overview)
335
+
336
+ The curriculum and reward stack is the heart of the project. This section is the elevator pitch; **the full mechanics — priority scoring math, anti-reward-hacking layers, chaos engine, drift engine — live in [server/README.md](server/README.md)**.
337
+
338
+ ### Priority scoring (one-formula task selection)
339
 
340
  ```
341
+ score = novelty_bonus        # +100 if never attempted
342
+       + weakness_weight      # +50 × (1 − task_success_rate)
343
+       + spaced_rep_bonus     # +30 if a graduated task is "due" for re-test
344
+       - recency_penalty      # −20 if attempted in the last 2 episodes
345
+ ```
346
+
347
+ Exploration, weakness-targeting, anti-forgetting, and variety — all balanced by one weighted sum.
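+
+ As a hedged transcription into Python (the real implementation lives in the server's Curriculum service and may weight terms differently):
+
+ ```python
+ def priority_score(attempts: int, task_success_rate: float,
+                    graduated_and_due: bool, attempted_in_last_2: bool) -> float:
+     score = 0.0
+     if attempts == 0:
+         score += 100.0                         # novelty bonus
+     score += 50.0 * (1.0 - task_success_rate)  # weakness weight
+     if graduated_and_due:
+         score += 30.0                          # spaced-repetition bonus
+     if attempted_in_last_2:
+         score -= 20.0                          # recency penalty
+     return score
+ ```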
348
+
349
+ ### Reward shaping
350
+
351
+ ```
352
+ if task_achieved:
353
+     reward = 1.0
354
+     if survived_chaos: reward *= 1.05      # chaos survival bonus
355
+ else:
356
+     reward = partial_progress * 0.8        # 0.8 from steps alone
357
+     if progress_increased: reward += 0.1   # dense progress signal
358
+     if command_failed: reward *= 0.5       # error penalty
359
+     reward -= 0.1 * rollback_count         # waste penalty
360
+     reward += 0.02 * idempotent_retries    # graceful retry bonus
361
+     reward = clamp(reward, 0.0, 0.99)      # 1.0 reserved for completion
362
+
363
+ reward *= 0.85 ** hints_used               # hint decay applied last
364
  ```
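+
+ The same rules, transcribed into a self-contained function (argument names are illustrative; the authoritative grader lives in `server/`):
+
+ ```python
+ def shaped_reward(task_achieved: bool, survived_chaos: bool,
+                   partial_progress: float, progress_increased: bool,
+                   command_failed: bool, rollback_count: int,
+                   idempotent_retries: int, hints_used: int) -> float:
+     if task_achieved:
+         reward = 1.0
+         if survived_chaos:
+             reward *= 1.05                     # chaos survival bonus
+     else:
+         reward = partial_progress * 0.8        # 0.8 from steps alone
+         if progress_increased:
+             reward += 0.1                      # dense progress signal
+         if command_failed:
+             reward *= 0.5                      # error penalty
+         reward -= 0.1 * rollback_count         # waste penalty
+         reward += 0.02 * idempotent_retries    # graceful retry bonus
+         reward = max(0.0, min(reward, 0.99))   # 1.0 reserved for completion
+     return reward * (0.85 ** hints_used)       # hint decay applied last
+ ```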
365
 
366
+ The agent's loss surface is intentionally narrow: only doing the task earns full reward, and every reward-hacking shortcut we identified during design has a defense layer (full list in [server/README.md §9](server/README.md#9-anti-reward-hacking--8-defense-layers)).
367
+
368
+ > Curriculum progression visual will live at `docs/figures/curriculum_progression.png`.
369
+
370
  ---
371
 
372
+ ## 8. Training pipeline (SFT → GRPO)
373
 
374
+ The training pipeline runs in two stages, both reproducible on Colab. Full detail in **[train/README.md](train/README.md)**.
375
 
376
+ ```
377
+ ┌────────── data/sft/ ──────────┐
378
+ │ 1,500 train · 150 val rows │
379
+ │ 5 trajectory types │
380
+ └───────────────┬───────────────┘
381
+
382
+ STAGE 1 — Supervised Fine-Tuning train/train_sft_lora.ipynb
383
+ Qwen2.5-Coder-3B-Instruct + LoRA r=8/16/32 (Optuna) → SFT adapter
384
+
385
+ │ Sizzing/aws-rl-sft-qwen25coder3b-adapter
386
+
387
+ STAGE 2 — GRPO RL train/train_grpo_lora.ipynb
388
+ G=8 parallel rollouts · multi-turn · reward = env return
389
+ Optuna over (lr, β, G, T, top_p, lora_r, max_turns)
390
  ```
391
 
392
+ ### Numbers worth knowing
393
 
394
+ | | |
395
+ |---|---|
396
+ | **Base model** | `unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit` — picked via [data/sft/MODEL_EVALUATION.md](data/sft/MODEL_EVALUATION.md) |
397
+ | **SFT LoRA** | `r ∈ {8,16,32}`, `lora_alpha = r × multiplier`, target = attention only, dropout `[0.005, 0.031]` |
398
+ | **GRPO config** | `G=8`, `β=0.04`, `lr=5e-6`, `T=0.9`, `top_p=0.95`, `max_turns=6`, loss=`dapo` |
399
+ | **Optuna search** | TPE sampler, 6 trials × 30 GRPO steps, frozen 10-task held-out val set |
400
+ | **Final training** | 200 GRPO steps with best config |
401
+
402
+ ### Training graphs
403
+
404
+ > Embed once notebook is executed:
405
+ > ![SFT loss curve](docs/figures/sft_loss_curve.png)
406
+ > ![GRPO mean reward over training](docs/figures/grpo_reward_curve.png)
407
+ > ![Per-rollout reward by curriculum tier](docs/figures/grpo_per_tier_curve.png)
408
+ > ![Optuna parameter importance](docs/figures/optuna_param_importance.png)
409
+
410
+ ---
411
+
412
+ ## 9. Parallel rollout architecture
413
+
414
+ GRPO needs `G` rollouts on the same task per training step. We run all G in parallel with **state isolation guaranteed**. Three coordinated pool layers make it work:
415
 
 
 
416
  ```
417
+ Trainer (G=8 generations needed per step)
418
+
419
+ ┌────────────────────┼────────────────────┐
420
+ ▼ ▼ ▼
421
+ MultiTurnEnvPool GrpoPool (in-process)
422
+ (train_grpo.py) (scripts/grpo_pool.py)
423
+ sync API async API
424
+ │ │
425
+ └─────── 8 WebSocket connections ────────┘
426
+
427
+
428
+ FastAPI server :8000
429
+ + OpenEnv max_concurrent_envs=8
430
+
431
+
432
+ MiniStackPool (free-list, lock-guarded)
433
+ acquire(port) on connect, release on disconnect
434
+
435
+
436
+ 8 isolated MiniStack instances :4566..:4573
437
+ ```
438
+
439
+ Wall-clock impact: an 8-rollout × 6-turn episode runs in ~300 ms of env time vs ~2.4 s sequential. Full mechanics, including the **all-or-nothing connect protocol** that prevents pool-slot leakage on flake, are in **[scripts/README.md](scripts/README.md)**.
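+
+ The client-side fan-out is plain `asyncio.gather` over one coroutine per pool slot (a sketch — `run_episode` is a hypothetical stand-in for GrpoPool's per-connection rollout):
+
+ ```python
+ import asyncio
+
+ async def collect_group(run_episode, group_size: int = 8) -> list[float]:
+     # one rollout per WebSocket connection; if any coroutine raises, the
+     # exception propagates and the caller retries the whole group
+     rewards = await asyncio.gather(*(run_episode(i) for i in range(group_size)))
+     return list(rewards)
+ ```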
440
+
441
+ ---
442
+
443
+ ## 10. MiniStack: vendored & customized
444
+
445
+ The simulator powering the env is **vendored** as a git subtree at [aws_infra/](aws_infra/), not pulled as a black-box dependency. We forked it because we needed:
446
 
447
+ 1. A custom `/_ministack/state` JSON endpoint so the grader can read the entire infra inventory in **one HTTP call** instead of iterating 20+ list APIs per grading pass. Added in commit `a648c3a "feat: Add support for service state retrieval and action listing across multiple AWS services"`.
448
+ 2. A reproducible build with no runtime network requirement — the Docker image bundles a specific MiniStack revision.
449
+ 3. The freedom to extend service coverage on demand.
450
+
451
+ Custom commits live as small, isolated patches so periodic upstream syncs (`af2e945`, `579597b`) replay cleanly. To inspect:
452
 
453
  ```bash
454
+ git show a648c3a # the state-endpoint diff
455
+ git log --oneline -- aws_infra/ # only the aws_infra subtree history
 
456
  ```
457
 
458
+ Full subtree workflow + commit-by-commit detail in [server/README.md §5](server/README.md#5-ministack-vendored-fork--customizations). Upstream MiniStack docs (81 KB) are preserved at [aws_infra/README.md](aws_infra/README.md).
459
+
460
  ---
461
 
462
+ ## 11. Results & Benchmarks
463
+
464
+ ### Base-model selection
465
+
466
+ We evaluated 11 chat models on 27 held-out prompts. **Qwen2.5-Coder-3B-Instruct** wins on every metric that matters: 41% exact match (highest), 63% operation match (highest), 3.1 s/call (3× faster than the 4B runner-up). Full report:
467
+
468
+ > **[data/sft/MODEL_EVALUATION.md](data/sft/MODEL_EVALUATION.md)** — 270-line writeup, per-model verdicts, methodology
469
+
470
+ > ![Top 4 candidate models on the held-out benchmark](docs/figures/model_eval_chart.png)
471
+
472
+ ### Base vs SFT — actual results
473
+
474
+ After running the SFT pipeline end-to-end, the eval delta on the same held-out prompts is striking. Numbers from [out/delta_summary.json](out/delta_summary.json):
475
+
476
+ | Metric | Base | Post-SFT | Delta |
477
+ |-----------------|:------:|:--------:|:-----------:|
478
+ | `format_pct` | 33.3% | **100.0%** | **+66.7 pp** |
479
+ | `exact_pct` | 38.9% | **88.9%** | **+50.0 pp** |
480
+ | `service_pct` | 77.8% | **88.9%** | +11.1 pp |
481
+ | `operation_pct` | 61.1% | **88.9%** | +27.8 pp |
482
+ | `avg_latency` | 2.03s | **1.40s** | −0.63s (faster!) |
483
+ | `avg_len` | 85.8 | 74.7 | −11 chars (tighter) |
484
+
485
+ > ![Base vs SFT eval-metrics comparison](docs/figures/base_vs_sft_success.png)
486
+
487
+ Every target from [data/sft/MODEL_EVALUATION.md §11](data/sft/MODEL_EVALUATION.md) is met or exceeded. Format compliance is now perfect; the model never wraps commands in fences or quotes after SFT. Exact-match jumped from 39% to 89% — the agent now emits the canonical command for ~9 of every 10 prompts.
488
+
489
+ The richer two-mode benchmark (dataset eval + live RL env eval) is in [compare/compare_base_vs_sft.ipynb](compare/compare_base_vs_sft.ipynb); methodology in [compare/README.md](compare/README.md).
490
+
491
+ > ![Dataset comparison: base vs SFT (per-row scores)](docs/figures/compare_dataset.png)
492
+ > ![RL env comparison: base vs SFT (per-episode rewards)](docs/figures/compare_rl_env.png)
493
+
494
+ ### SFT training curves
495
+
496
+ > ![SFT loss curve over training](docs/figures/sft_loss_curve.png)
497
+
498
+ ### Optuna SFT search
499
+
500
+ The best SFT trial (out of 6) used `lora_r=16, lora_alpha=16, dropout=0.0058, lr=4.03e-4, warmup=0.1`. Full study at [out/optuna_study.json](out/optuna_study.json).
501
+
502
+ > ![Optuna parameter importances](docs/figures/optuna_param_importance.png)
503
+ > ![Optuna optimization history](docs/figures/optuna_history.png)
504
+
505
+ ### GRPO results (live multi-step env eval)
506
 
507
+ After 35 GRPO steps on top of the SFT adapter (config from [out_grpo/optuna_best.json](out_grpo/optuna_best.json) — `lr=1.6e-5, β=0.0021, T=0.99`), we re-evaluated end-to-end on 100+ episodes:
508
+
509
+ | Metric | Base + SFT | Base + SFT + GRPO | Δ |
510
+ |-------------------------------|:---------:|:-----------------:|:------------:|
511
+ | Overall success rate | 86.8% | 86.2% | −0.6 pp |
512
+ | Overall mean reward | 0.883 | 0.877 | −0.006 |
513
+ | Beginner success | 96.2% | **100.0%** | **+3.8 pp** |
514
+ | Intermediate success | 81.0% | **87.0%** | **+6.0 pp** |
515
+ | Warmup success | 96.0% | 90.2% | −5.8 pp |
516
+ | Expert success | 22.2% | 22.2% | flat |
517
+ | Drift repair rate | 22.2% | 22.2% | flat |
518
+ | Destructive-action fail rate | 15.1% | 14.7% | −0.4 pp |
519
+ | Steps to solve | 1.45 | 1.55 | +0.10 |
520
+
521
+ > ![SFT vs GRPO metrics grid](docs/figures/sft_vs_grpo_metrics_grid.png)
522
+ > ![SFT vs GRPO by tier](docs/figures/sft_vs_grpo_by_tier.png)
523
+
524
+ **Honest reading:** the 35-step GRPO run preserves the SFT gains and modestly improves the middle tiers (beginner +3.8 pp, intermediate +6.0 pp) — but does not crack the **expert-tier bottleneck** (22% success on SRE / drift / security-posture tasks). With longer GRPO runs and more curriculum exposure to expert tasks, this is the next gain to chase. The full episode-level data is in [out_grpo/grpo_multi_step.json](out_grpo/grpo_multi_step.json).
525
+
526
+ ### GRPO training curves
527
+
528
+ Per-step training signals from the final 35-step GRPO run ([out_grpo/final_grpo/checkpoint-35/trainer_state.json](out_grpo/final_grpo/checkpoint-35/trainer_state.json)):
529
+
530
+ > ![GRPO final per-step training signals](docs/figures/grpo_final_per_step.png)
531
+ > ![GRPO env reward over training](docs/figures/grpo_reward_curve.png)
532
+
533
+ Optuna search across 4 trials picked the final config:
534
+
535
+ > ![GRPO Optuna trial comparison](docs/figures/grpo_optuna_trials_comparison.png)
536
+ > ![GRPO Optuna parameter importances](docs/figures/grpo_optuna_importances.png)
537
+ > ![GRPO Optuna optimization history](docs/figures/grpo_optuna_history.png)
538
+
539
+ ### Qualitative rollouts (post-GRPO)
540
+
541
+ One sample episode per tier from [out_grpo/qualitative_rollouts.json](out_grpo/qualitative_rollouts.json):
542
+
543
+ > ![Qualitative rollouts on representative tasks](docs/figures/qualitative_rollouts.png)
544
+
545
+ ---
546
+
547
+ ## 12. Repository map
548
+
549
+ | Path | Purpose | Sub-README |
550
+ |--------------------------------|--------------------------------------------------------------------|-----------------------------------------|
551
+ | [server/](server/) | OpenEnv FastAPI server, env logic, services, web playground | [server/README.md](server/README.md) |
552
+ | [train/](train/) | SFT and GRPO training notebooks | [train/README.md](train/README.md) |
553
+ | [data/](data/) | SFT dataset, base-model selection, eval harness | [data/README.md](data/README.md) · [MODEL_EVALUATION.md](data/sft/MODEL_EVALUATION.md) |
554
+ | [compare/](compare/) | Base vs SFT side-by-side benchmark | [compare/README.md](compare/README.md) |
555
+ | [scripts/](scripts/) | Parallel-rollout architecture + multi-connection demo | [scripts/README.md](scripts/README.md) |
556
+ | [aws_infra/](aws_infra/) | Vendored MiniStack simulator (git subtree) | [aws_infra/README.md](aws_infra/README.md) |
557
+ | [out/](out/) | Reference SFT training output (Optuna study, baseline + post-train metrics, plots, final adapter checkpoints) | (see [train/README.md §7](train/README.md#7-logging-and-artifacts)) |
558
+ | [out_grpo/](out_grpo/) | Reference GRPO training output (Optuna study, baseline + post-train multi-step eval, qualitative rollouts, final adapter, 10 ready plots) | (see [train/README.md §7](train/README.md#7-logging-and-artifacts)) |
559
+ | [tests/](tests/), [tests_tasks/](tests_tasks/) | Unit + tier-integration test suites | (see [§14](#14-testing)) |
560
+ | [models.py](models.py) | Pydantic data models for action/observation/task | (inline §6) |
561
+ | [client.py](client.py) | OpenEnv HTTP/WebSocket client wrapper | — |
562
+ | [inference.py](inference.py) | Single-model agent loop (matches RL eval mode of `compare/`) | — |
563
+ | [train_grpo.py](train_grpo.py) | GRPO trainer (1,283 LOC) — `MultiTurnEnvPool`, Optuna, plotting | (see [train/README.md](train/README.md)) |
564
+ | [aws_rl_env_colab.ipynb](aws_rl_env_colab.ipynb) | Colab driver for the full training pipeline | — |
565
+ | [docs/figures/](docs/figures/) | All README graphs and screenshots | — |
566
 
567
  ---
568
 
569
+ ## 13. Configuration & Running
570
+
571
+ ### Docker (recommended)
572
+
573
+ ```bash
574
+ make docker-build # build the image
575
+ make docker-run # foreground on :8000
576
+ make docker-run-detach # background
577
+ make docker-health # liveness probe
578
+ ```
579
+
580
+ ### Local
581
+
582
+ ```bash
583
+ make install-all # uv sync + install aws_infra (MiniStack) editable
584
+ make run # starts MiniStack pool + FastAPI server
585
+ ```
586
+
587
+ ### OpenEnv deployment
588
 
589
+ ```bash
590
+ make openenv-validate # validate config
591
+ make openenv-build # build environment
592
+ make openenv-push # push to HuggingFace Spaces
593
+ ```
594
+
595
+ ### Environment variables
596
+
597
+ | Variable | Default | Description |
598
+ |-------------------------------------|--------------------------|-------------------------------------------------------------------|
599
+ | `AWS_INFRA_URL` | `http://localhost:4566` | MiniStack endpoint (used when `POOL_SIZE=1`) |
600
+ | `AWS_RL_ENV_POOL_SIZE` | `1` | **Server-side MiniStack pool size; set to 8 for GRPO training** |
601
+ | `AWS_RL_ENV_MINISTACK_BASE_PORT` | `4566` | First MiniStack port; pool covers `[BASE, BASE + POOL_SIZE)` |
602
+ | `BACKEND_TYPE` | `simulator` | `simulator` (MiniStack) or `aws` (real AWS, no pool) |
603
+ | `AWS_ACCESS_KEY_ID` | `test` | AWS credentials (any value works for the simulator) |
604
+ | `AWS_SECRET_ACCESS_KEY` | `test` | AWS credentials (any value works for the simulator) |
605
+ | `AWS_DEFAULT_REGION` | `us-east-1` | AWS region |
606
+ | `MAX_STEPS` | `15` | Max steps per episode |
607
+ | `API_BASE_URL` | — | LLM API endpoint for [inference.py](inference.py) |
608
+ | `MODEL_NAME` | — | LLM model name for [inference.py](inference.py) |
609
+ | `HF_TOKEN` | — | HuggingFace token (dataset/adapter access, push) |
610
+ | `TEMPERATURE` | `0.7` | LLM sampling temperature |
611
+
612
+ ### Curriculum stats API
613
 
614
  ```python
615
  curriculum.get_stats()
 
628
 
629
  ---
630
 
631
+ ## 14. Testing
632
+
633
+ The test suite covers both isolated unit logic and end-to-end task execution against MiniStack.
634
+
635
+ ### Unit tests — [tests/](tests/)
636
+
637
+ ```bash
638
+ pytest tests/ -v
639
+ ```
640
+
641
+ | File | Covers |
642
+ |----------------------------------------------------------------------------------------------|-----------------------------------------------------------------|
643
+ | [test_aws_rl_env_environment.py](tests/test_aws_rl_env_environment.py) | Environment lifecycle, reset/step semantics, reward integration |
644
+ | [test_task_grader.py](tests/test_task_grader.py) | All 5 grading strategies, partial progress, penalties, bonuses |
645
+ | [test_resource_verifier.py](tests/test_resource_verifier.py) | Per-service ground-truth verification (20+ services) |
646
+ | [test_episode_tracker.py](tests/test_episode_tracker.py) | Command parsing, dedup, monotonic progress, rollback detection |
647
+ | [test_episode_context.py](tests/test_episode_context.py) | Per-episode context lifecycle |
648
+ | [test_drift_engine.py](tests/test_drift_engine.py) | Random drift selection, mutation application |
649
+ | [test_hint_provider.py](tests/test_hint_provider.py) | Three-level progressive hints, decay computation |
650
+ | [test_environment_designer.py](tests/test_environment_designer.py) | Setup-command provisioning |
651
+ | [test_pool.py](tests/test_pool.py) | Server-side `MiniStackPool` acquire/release, exhaustion |
652
+ | [test_grpo_pool.py](tests/test_grpo_pool.py) | Client-side `GrpoPool` connect/close, all-or-nothing rollback |
653
+
654
+ ### Tier integration tests — [tests_tasks/](tests_tasks/)
655
+
656
+ ```bash
657
+ pytest tests_tasks/ -v
658
+ ```
659
+
660
+ 133 tasks exercised end-to-end:
661
 
662
+ | File | Tasks |
663
+ |-----------------------------------------------------------------------------------------------------|------:|
664
+ | [test_warmup_tasks.py](tests_tasks/test_warmup_tasks.py) | 25 |
665
+ | [test_beginner_tasks.py](tests_tasks/test_beginner_tasks.py) | 25 |
666
+ | [test_intermediate_tasks.py](tests_tasks/test_intermediate_tasks.py) | 25 |
667
+ | [test_advanced_tasks.py](tests_tasks/test_advanced_tasks.py) | 25 |
668
+ | [test_expert_tasks.py](tests_tasks/test_expert_tasks.py) | 24 |
669
+ | [test_drift_tasks.py](tests_tasks/test_drift_tasks.py) | 9 |
670
+ | **Total** | **133** |
671
+
672
+ These tests double as the source of truth for canonical solutions used by the SFT dataset generator (extracted via AST — see [data/README.md §1](data/README.md#1-sft-dataset-generation)).
673
+
674
+ ---
675
+
676
+ ## 15. Tech stack
677
+
678
+ - **Python 3.12**, [`uv`](https://github.com/astral-sh/uv) for dependency management, multi-stage Docker
679
+ - **FastAPI**, **OpenEnv** (HTTP + WebSocket env protocol), **uvicorn**
680
+ - **TRL ≥ 0.21** (`GRPOTrainer`, `GRPOConfig`)
681
+ - **PEFT** (LoRA), **Unsloth** (4-bit quantized base, fused training kernels)
682
+ - **Transformers ≥ 4.45**, **datasets ≥ 2.20**, **HuggingFace Hub ≥ 0.24**
683
+ - **Optuna ≥ 3.6** (TPE sampler, SQLite study storage)
684
+ - **asyncio** + **websockets** + **httpx** (parallel rollout orchestration)
685
+ - **MiniStack** (vendored at [aws_infra/](aws_infra/), 34 AWS services)
686
+ - **AWS CLI v2** (subprocess invocation against MiniStack endpoint)
687
+ - **matplotlib**, **plotly** (training curves, Optuna visualizations)
688
+ - **pytest** (16 test files, ~250 KB of test code)
689
+
690
+ ---
691
+
692
+ ## 16. Links
693
+
694
+ - **Live demo**: [sizzing-aws-rl-env.hf.space/web](https://sizzing-aws-rl-env.hf.space/web)
695
+ - **HF Space**: [huggingface.co/spaces/Sizzing/aws_rl_env](https://huggingface.co/spaces/Sizzing/aws_rl_env)
696
+ - **API docs**: [/docs](https://sizzing-aws-rl-env.hf.space/docs) · [/redoc](https://sizzing-aws-rl-env.hf.space/redoc)
697
+ - **SFT adapter**: [Sizzing/aws-rl-sft-qwen25coder3b-adapter](https://huggingface.co/Sizzing/aws-rl-sft-qwen25coder3b-adapter)
698
+ - **Dataset**: [Sizzing/aws-rl-sft](https://huggingface.co/datasets/Sizzing/aws-rl-sft)
699
  - **GitHub**: [github.com/udaykiranpadhy/aws-rl-env](https://github.com/udaykiranpadhy/aws-rl-env)
 
 
 
700
  - **Portfolio**: [portfolio.udaykp.dev](https://portfolio.udaykp.dev)
701
+ - **Colab**: <!-- TODO: paste Colab URL here -->
702
+
703
+ ---
704
+
705
+ ## 17. Acknowledgments
706
+
707
+ - **MiniStack** — vendored at [aws_infra/](aws_infra/). Upstream license preserved. Custom modifications attributable to commits `a648c3a`, `a00e981`; periodic upstream syncs `af2e945`, `579597b`.
708
+ - **OpenEnv** — environment protocol and Python client framework.
709
+ - **TRL** (HuggingFace) — `GRPOTrainer` implementation.
710
+ - **Unsloth** — 4-bit quantized model loaders + fused training kernels.
711
+ - **AWS service icons** in [server/static/img/aws/](server/static/img/aws/) — used in the web playground.
712
+
713
+ ---
714
+
715
+ ## Sub-README index
716
+
717
+ For deep technical detail on any subsystem:
718
+
719
+ - [server/README.md](server/README.md) — environment internals (curriculum, reward shaping, anti-hacking, chaos, drift, MiniStack-fork detail)
720
+ - [train/README.md](train/README.md) — SFT + GRPO training pipeline (LoRA config, Optuna search, multi-turn rollouts)
721
+ - [scripts/README.md](scripts/README.md) — parallel-rollout architecture (3 pool layers, all-or-nothing connect, concurrency safety)
722
+ - [data/README.md](data/README.md) — dataset generation (5 trajectory types, AST extraction) + base-model selection summary
723
+ - [data/sft/MODEL_EVALUATION.md](data/sft/MODEL_EVALUATION.md) — full 11-model benchmark report
724
+ - [compare/README.md](compare/README.md) — base vs SFT comparison harness
725
+ - [aws_infra/README.md](aws_infra/README.md) — vendored MiniStack upstream documentation (81 KB)
aws_rl_env_colab.ipynb ADDED
@@ -0,0 +1,233 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "7b80aed985f7",
6
+ "metadata": {},
7
+ "source": [
8
+ "# AWS RL Env \u2014 GRPO Training (multi-turn + parallel envs)\n\nThis notebook trains a Qwen2.5-Coder-3B policy on the AWS RL environment using **GRPO** with:\n\n- **Multi-turn rollouts** \u2014 each task runs up to `MAX_TURNS` steps; each step is one `aws ...` command, the command's output is fed back into the next turn.\n- **Parallel environments** \u2014 `NUM_GENERATIONS` MiniStack-backed env sessions run concurrently, all rolling out the *same* curriculum-picked task.\n- **Curriculum** \u2014 `Curriculum.next_task()` picks one task per GRPO step; group-level reward feeds back via `Curriculum.record_result(...)` driving promotion + spaced repetition.\n- **Optuna** \u2014 TPE search over learning rate, KL coefficient, num_generations, temperature, top-p, LoRA rank, and max_turns. Frozen held-out validation tasks evaluate each trial.\n\nThe heavy lifting lives in [`train_grpo.py`](./train_grpo.py); this notebook is a thin driver that mirrors `kube-sre-gym/kube_sre_gym_colab.ipynb`.\n"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "651b17a160c4",
14
+ "metadata": {},
15
+ "source": [
16
+ "## 1 - Install dependencies"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "id": "41cb8a624696",
22
+ "metadata": {},
23
+ "execution_count": null,
24
+ "outputs": [],
25
+ "source": [
26
+ "%pip install -q --upgrade pip\n%pip install -q \\\n \"trl>=0.21\" \\\n \"transformers>=4.45\" \\\n \"peft>=0.13\" \\\n \"datasets>=2.20\" \\\n \"huggingface_hub>=0.24\" \\\n \"websockets>=13\" \\\n \"openenv-core[core]>=0.2.2\" \\\n \"pyyaml>=6.0\" \\\n \"matplotlib\" \\\n \"optuna>=3.6\" \\\n \"plotly\" \\\n \"kaleido\" \\\n \"httpx\"\n%pip install -q \"unsloth @ git+https://github.com/unslothai/unsloth.git\"\n"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "id": "23979a7833e6",
32
+ "metadata": {},
33
+ "source": [
34
+ "## 2 - Configuration\n\nEverything you'll typically tune lives in this cell."
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "id": "6b425cb11474",
40
+ "metadata": {},
41
+ "execution_count": null,
42
+ "outputs": [],
43
+ "source": [
44
+ "import os\nfrom pathlib import Path\nfrom datetime import datetime\n\n# --- Environment server ---\nENV_URL = os.environ.get(\"AWS_RL_ENV_URL\", \"http://localhost:8000\")\n\n# --- Model & adapter ---\nMODEL_ID = \"unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit\"\nSFT_ADAPTER = \"Sizzing/aws-rl-sft-qwen25coder3b-adapter\" # set to None to skip SFT init\nHUB_REPO = None # e.g. \"your-org/aws-rl-grpo-qwen25coder3b\"\n\n# --- Training defaults (Optuna may override) ---\nNUM_GENERATIONS = 8 # parallel envs == GRPO group size\nMAX_TURNS = 6 # multi-turn cap per episode\nMAX_STEPS = 200 # GRPO optimizer steps\nMAX_TOTAL_TOKENS = 4096 # token budget per episode (anti-OOM)\nMAX_PROMPT_LEN = 2048\nMAX_COMPL_LEN = 256\n\n# --- Optuna ---\nRUN_OPTUNA = True\nN_TRIALS = 6\nTRIAL_MAX_STEPS = 30\nVAL_TASKS_PER_TIER = 2\n\n# --- Output ---\nTIMESTAMP = datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")\nOUTPUT_DIR = Path(f\"outputs/aws-rl-grpo-{TIMESTAMP}\")\nOUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n\nprint(f\"Env URL : {ENV_URL}\")\nprint(f\"Model : {MODEL_ID}\")\nprint(f\"SFT adapter: {SFT_ADAPTER}\")\nprint(f\"Output dir : {OUTPUT_DIR}\")\nprint(f\"Optuna : {'on' if RUN_OPTUNA else 'off'} ({N_TRIALS} trials, {TRIAL_MAX_STEPS} steps each)\")\n"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "markdown",
49
+ "id": "df5456a79dea",
50
+ "metadata": {},
51
+ "source": [
52
+ "## 3 - Authenticate to HF Hub (and optionally W&B)"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "id": "860228b6c968",
58
+ "metadata": {},
59
+ "execution_count": null,
60
+ "outputs": [],
61
+ "source": [
62
+ "import os\n\n# HF Hub\ntry:\n from google.colab import userdata\n os.environ[\"HF_TOKEN\"] = userdata.get(\"HF_TOKEN\")\nexcept (ImportError, KeyError, ModuleNotFoundError):\n pass\nif os.environ.get(\"HF_TOKEN\"):\n from huggingface_hub import login\n login(token=os.environ[\"HF_TOKEN\"], add_to_git_credential=False)\n print(\"HF Hub: logged in\")\nelse:\n print(\"HF Hub: HF_TOKEN not set (push_to_hub will be disabled)\")\n\n# Optional: W&B\ntry:\n from google.colab import userdata\n os.environ.setdefault(\"WANDB_API_KEY\", userdata.get(\"WANDB_API_KEY\"))\nexcept (ImportError, KeyError, ModuleNotFoundError):\n pass\n"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "markdown",
67
+ "id": "e7918896aa17",
68
+ "metadata": {},
69
+ "source": [
70
+ "## 4 - Smoke-test the env URL"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "id": "1356c4b20164",
76
+ "metadata": {},
77
+ "execution_count": null,
78
+ "outputs": [],
79
+ "source": [
80
+ "import httpx\n\nresp = httpx.get(f\"{ENV_URL}/health\", timeout=10.0)\nprint(f\"GET {ENV_URL}/health -> {resp.status_code}\")\nprint(resp.text[:500])\nassert resp.status_code == 200, \"env server is not responding \u2014 start it before training\"\n"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "markdown",
85
+ "id": "32b12707b0e9",
86
+ "metadata": {},
87
+ "source": [
88
+ "## 5 - Imports from `train_grpo`\n\nAll heavy logic (rollout, env pool, reward funcs, Optuna search, training loop) lives in `train_grpo.py` at the repo root."
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "id": "0e989d6fe640",
94
+ "metadata": {},
95
+ "execution_count": null,
96
+ "outputs": [],
97
+ "source": [
98
+ "import json\nimport logging\nfrom pathlib import Path\n\nfrom train_grpo import (\n SYSTEM_PROMPT,\n DEFAULT_CFG,\n SamplingCfg,\n load_policy,\n MultiTurnEnvPool,\n plot_rewards,\n pick_validation_task_ids,\n evaluate_on_validation,\n optuna_search,\n run_training,\n)\n\nlogging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(name)s %(message)s\")\nprint(\"System prompt (first 200 chars):\")\nprint(SYSTEM_PROMPT[:200])\n"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "markdown",
103
+ "id": "f82fff889024",
104
+ "metadata": {},
105
+ "source": [
106
+ "## 6 - Pick fixed validation task ids\n\nA frozen list of tasks (k per tier) used as the held-out set across **all** Optuna trials and post-training comparisons. Stored to disk for reproducibility."
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "id": "94dd77b1304a",
112
+ "metadata": {},
113
+ "execution_count": null,
114
+ "outputs": [],
115
+ "source": [
116
+ "val_task_ids = pick_validation_task_ids(k_per_tier=VAL_TASKS_PER_TIER, seed=42)\nval_path = OUTPUT_DIR / \"val_task_ids.json\"\nval_path.write_text(json.dumps(val_task_ids))\nprint(f\"Validation task ids ({len(val_task_ids)}): {val_task_ids}\")\nprint(f\"Saved to {val_path}\")\n"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "markdown",
121
+ "id": "ee0704afd204",
122
+ "metadata": {},
123
+ "source": [
124
+ "## 7 - Optuna hyperparameter search\n\nSet `RUN_OPTUNA=False` in the config cell to skip and use `DEFAULT_CFG`."
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "id": "bf36301c7db8",
130
+ "metadata": {},
131
+ "execution_count": null,
132
+ "outputs": [],
133
+ "source": [
134
+ "best_cfg = None\n\nif RUN_OPTUNA:\n study = optuna_search(\n n_trials=N_TRIALS,\n trial_max_steps=TRIAL_MAX_STEPS,\n val_task_ids=val_task_ids,\n base_model=MODEL_ID,\n sft_adapter=SFT_ADAPTER,\n env_url=ENV_URL,\n output_dir=OUTPUT_DIR,\n max_total_tokens=MAX_TOTAL_TOKENS,\n max_completion_length=MAX_COMPL_LEN,\n max_prompt_length=MAX_PROMPT_LEN,\n )\n best_cfg = {**DEFAULT_CFG, **dict(study.best_params)}\n print(f\"\\nBest objective : {study.best_value:.4f}\")\n print(f\"Best params : {dict(study.best_params)}\")\nelse:\n print(\"Skipping Optuna; using DEFAULT_CFG.\")\n best_cfg = dict(DEFAULT_CFG)\n\nwith open(OUTPUT_DIR / \"best_cfg.json\", \"w\") as f:\n json.dump(best_cfg, f, indent=2)\nprint(f\"Saved best_cfg -> {OUTPUT_DIR / 'best_cfg.json'}\")\n"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "id": "f0323086fc29",
140
+ "metadata": {},
141
+ "execution_count": null,
142
+ "outputs": [],
143
+ "source": [
144
+ "# Optional Optuna visualisations (skip silently if Optuna wasn't run)\nif RUN_OPTUNA:\n try:\n import optuna.visualization as vis\n import plotly.io as pio\n pio.renderers.default = \"notebook\"\n vis.plot_optimization_history(study).show()\n vis.plot_param_importances(study).show()\n except Exception as e:\n print(f\"(visualisation skipped: {e})\")\n"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "markdown",
149
+ "id": "ecf4bdc095e0",
150
+ "metadata": {},
151
+ "source": [
152
+ "## 8 - Final GRPO training pass with the best config"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "id": "80fcd2297776",
158
+ "metadata": {},
159
+ "execution_count": null,
160
+ "outputs": [],
161
+ "source": [
162
+ "print(f\"Final config: {best_cfg}\")\n\n# Override via best_cfg, falling back to top-of-notebook defaults\nNUM_GENERATIONS = int(best_cfg[\"num_generations\"])\nMAX_TURNS = int(best_cfg[\"max_turns\"])\n\nrun_training(\n cfg=best_cfg,\n base_model=MODEL_ID,\n sft_adapter=SFT_ADAPTER,\n env_url=ENV_URL,\n output_dir=OUTPUT_DIR,\n max_steps=MAX_STEPS,\n max_total_tokens=MAX_TOTAL_TOKENS,\n max_completion_length=MAX_COMPL_LEN,\n max_prompt_length=MAX_PROMPT_LEN,\n push_to_hub=False,\n hub_repo=HUB_REPO,\n)\n"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "markdown",
167
+ "id": "b2305c48d920",
168
+ "metadata": {},
169
+ "source": [
170
+ "## 9 - Reward curves\n\n`plot_rewards` reads `reward_log.csv` (written incrementally by `EpisodeLogger`), so the chart is meaningful even if training was interrupted."
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "id": "f35ef4ee8206",
176
+ "metadata": {},
177
+ "execution_count": null,
178
+ "outputs": [],
179
+ "source": [
180
+ "from IPython.display import Image, display\n\nreward_csv = OUTPUT_DIR / \"reward_log.csv\"\nplot_path = OUTPUT_DIR / \"reward_plot.png\"\nplot_rewards(reward_csv, plot_path)\nif plot_path.exists():\n display(Image(filename=str(plot_path)))\nelse:\n print(\"No plot generated (no rows in reward_log.csv).\")\n"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "markdown",
185
+ "id": "fb7ed0eab6e6",
186
+ "metadata": {},
187
+ "source": [
188
+ "## 10 - Quick post-training validation re-run (optional)\n\nRun the same held-out tasks again on the freshly trained adapter and compare to whatever each Optuna trial achieved on the same set."
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "id": "9fd1d1dd95ef",
194
+ "metadata": {},
195
+ "execution_count": null,
196
+ "outputs": [],
197
+ "source": [
198
+ "# Re-load policy in inference mode\nmodel, tokenizer = load_policy(MODEL_ID, SFT_ADAPTER, trainable=False)\npool = MultiTurnEnvPool(ENV_URL, size=1)\npool.start()\n\nsampling = SamplingCfg(\n temperature=float(best_cfg[\"temperature\"]),\n top_p=float(best_cfg[\"top_p\"]),\n max_new_tokens=MAX_COMPL_LEN,\n max_prompt_length=MAX_PROMPT_LEN,\n)\n\ntry:\n metrics = evaluate_on_validation(\n model=model,\n tokenizer=tokenizer,\n pool=pool,\n val_task_ids=val_task_ids,\n system_prompt=SYSTEM_PROMPT,\n max_turns=int(best_cfg[\"max_turns\"]),\n max_total_tokens=MAX_TOTAL_TOKENS,\n sampling=sampling,\n )\n print(f\"Post-training validation metrics: {metrics}\")\n with open(OUTPUT_DIR / \"post_train_val.json\", \"w\") as f:\n json.dump(metrics, f, indent=2)\nfinally:\n pool.close()\n"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "markdown",
203
+ "id": "fb00a27401a8",
204
+ "metadata": {},
205
+ "source": [
206
+ "## 11 - Push to Hugging Face Hub (optional)"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "id": "601184d3ddf5",
212
+ "metadata": {},
213
+ "execution_count": null,
214
+ "outputs": [],
215
+ "source": [
216
+ "# Uncomment to push the trained adapter:\n#\n# from huggingface_hub import create_repo, upload_folder\n# create_repo(HUB_REPO, exist_ok=True, private=False)\n# upload_folder(folder_path=str(OUTPUT_DIR), repo_id=HUB_REPO, repo_type=\"model\")\n# print(f\"Pushed: https://huggingface.co/{HUB_REPO}\")\n"
217
+ ]
218
+ }
219
+ ],
220
+ "metadata": {
221
+ "kernelspec": {
222
+ "display_name": "Python (aws-rl-env)",
223
+ "language": "python",
224
+ "name": "aws-rl-env"
225
+ },
226
+ "language_info": {
227
+ "name": "python",
228
+ "version": "3.12"
229
+ }
230
+ },
231
+ "nbformat": 4,
232
+ "nbformat_minor": 5
233
+ }
compare/README.md ADDED
@@ -0,0 +1,230 @@
1
+ # `compare/` — Base Model vs SFT Adapter Benchmark
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory holds the side-by-side benchmark that answers the only question that ultimately matters: **did SFT actually make the model better at the task?**
6
+
7
+ The benchmark compares the base [Qwen2.5-Coder-3B-Instruct](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit) against our published SFT adapter [Sizzing/aws-rl-sft-qwen25coder3b-adapter](https://huggingface.co/Sizzing/aws-rl-sft-qwen25coder3b-adapter) under two evaluation modes — fast static dataset eval and slow live-environment eval. Both write structured metrics so the deltas are explicit.
8
+
9
+ > ![Dataset comparison: base vs SFT (per-row scores)](../docs/figures/compare_dataset.png)
10
+ > ![RL-env comparison: base vs SFT (per-episode rewards)](../docs/figures/compare_rl_env.png)
11
+
12
+ ---
13
+
14
+ ## Table of contents
15
+
16
+ 1. [What's compared](#1-whats-compared)
17
+ 2. [Two evaluation modes](#2-two-evaluation-modes)
18
+ 3. [Methodology](#3-methodology)
19
+ 4. [Metrics reported](#4-metrics-reported)
20
+ 5. [How to run](#5-how-to-run)
21
+ 6. [Reading the results](#6-reading-the-results)
22
+ 7. [Files in this directory](#7-files-in-this-directory)
23
+
24
+ ---
25
+
26
+ ## 1. What's compared
27
+
28
+ | | Base | SFT |
29
+ |---|---|---|
30
+ | **Model** | `unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit` | Same base + LoRA adapter |
31
+ | **Adapter** | None | `Sizzing/aws-rl-sft-qwen25coder3b-adapter` |
32
+ | **Training data** | Pretraining + Qwen instruction tuning | + 1,500 rows from [data/sft/aws_rl_sft.train.jsonl](../data/sft/aws_rl_sft.train.jsonl) |
33
+ | **Inference** | Same prompt template, same temperature | Identical |
34
+
35
+ The only variable is the LoRA adapter. Same base, same prompts, same decoding parameters, same evaluation set.
36
+
37
+ ---
38
+
39
+ ## 2. Two evaluation modes
40
+
41
+ The notebook runs two separate evaluations because they answer different questions:
42
+
43
+ ### Dataset eval (static)
44
+
45
+ | Question | Does the model emit the *canonical* command for held-out prompts, one-shot? |
46
+ |-----------|-----------------------------------------------------------------------------|
47
+ | Speed | Fast (~minutes) |
48
+ | Needs | HF token + dataset access; **no env server** |
49
+ | Source | [data/sft/aws_rl_sft.val.jsonl](../data/sft/aws_rl_sft.val.jsonl) (150 held-out rows) |
50
+ | Verifies | Format correctness + command-token match against canonical |
51
+
52
+ This is the same kind of pattern-matching benchmark as [data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md) — fast and deterministic. Useful as a regression check.
53
+
54
+ ### RL env eval (live)
55
+
56
+ | Question | Can the model actually *solve* a task end-to-end against a live environment? |
57
+ |-----------|------------------------------------------------------------------------------|
58
+ | Speed | Slow (~tens of minutes per model) |
59
+ | Needs | Dataset eval above + a running env server (HF Space or local) |
60
+ | Source | Same val tasks, but exercised through `client.AwsRlEnv` round-trips |
61
+ | Verifies | Multi-step task completion, partial progress, reward shaping, hint usage |
62
+
63
+ This is closer to what training optimizes for. A model can score well on dataset eval (right command on step 1) but fail RL env eval (can't recover from a step 1 typo, can't continue past the first turn). Both signals matter.
64
+
65
+ ---
66
+
67
+ ## 3. Methodology
68
+
69
+ ### Dataset eval
70
+
71
+ 1. Load `Sizzing/aws-rl-sft` dataset from HF Hub
72
+ 2. For each row in `val`, build the prompt from `messages[:-1]` (system + user, drop assistant)
73
+ 3. Generate the model's response (`max_new_tokens=128`, deterministic decoding)
74
+ 4. **Extract the AWS CLI line**: strip markdown fences, find first line starting with `aws `
75
+ 5. Score against `messages[-1].content` (the canonical assistant response):
76
+ - Format OK (extracted line starts with `aws`)
77
+ - Service match (same first word after `aws`)
78
+ - Operation match (same first two words)
79
+ - Exact match (full token-for-token equality)
80
+
81
+ This mirrors the methodology in [eval_lm_studio_models.py](../data/eval_lm_studio_models.py); the same scoring functions are reused.
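+
+ A hedged sketch of those extraction and scoring checks (the reused implementations live in [eval_lm_studio_models.py](../data/eval_lm_studio_models.py); treat this as illustrative):
+
+ ```python
+ def extract_aws_line(response: str) -> str | None:
+     """Drop markdown ticks/quotes, return the first line starting with 'aws '."""
+     for line in response.replace("`", " ").splitlines():
+         line = line.strip().strip("'\"")
+         if line.startswith("aws "):
+             return line
+     return None
+
+ def score(predicted: str, canonical: str) -> dict[str, bool]:
+     p, c = predicted.split(), canonical.split()
+     return {
+         "format_ok": predicted.startswith("aws "),
+         "service_match": p[:2] == c[:2],    # "aws <service>"
+         "operation_match": p[:3] == c[:3],  # "aws <service> <operation>"
+         "exact_match": p == c,
+     }
+ ```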
82
+
83
+ ### RL env eval
84
+
85
+ 1. Connect to the running env at `ENV_BASE_URL` (default: an HF Space; can be overridden to local)
86
+ 2. For each val task, run a full episode (up to `MAX_STEPS=15` turns):
87
+ - Build the prompt from system + task + observation history (matches [inference.py](../inference.py))
88
+ - Generate one AWS CLI command per turn
89
+ - Step the environment, record `reward`, `task_achieved`, `partial_progress`
90
+ 3. Aggregate per-episode metrics
91
+
92
+ The agent loop is identical to the training-time `rollout_one_episode` in [train_grpo.py](../train_grpo.py) — same prompt structure, same generation parameters, same termination logic. So the RL env eval is genuinely measuring "what would this model do during a GRPO rollout".
93
+
94
+ ---
95
+
96
+ ## 4. Metrics reported
97
+
98
+ ### Dataset eval
99
+
100
+ | Metric | Definition |
101
+ |----------------|-----------------------------------------------------------|
102
+ | `format_ok` | % of responses where the extracted line starts with `aws ` |
103
+ | `svc_match` | % matching the canonical service |
104
+ | `op_match` | % matching service + operation |
105
+ | `exact_match` | % matching the full canonical command token-for-token |
106
+
107
+ ### RL env eval (per episode)
108
+
109
+ | Metric | Definition |
110
+ |-------------------------|------------------------------------------------------------------|
111
+ | `avg_episode_reward` | Mean total reward accumulated per episode (sum of step rewards) |
112
+ | `completion_rate` | % of episodes ending in `task_achieved=True` |
113
+ | `avg_steps_to_complete` | Mean steps used by completed episodes (lower = more efficient) |
114
+ | `avg_max_progress` | Mean of the highest `partial_progress` reached per episode |
115
+ | `hint_usage_rate` | % of episodes where the agent requested at least one hint |
116
+ | `format_failure_rate` | % of agent commands that failed the `aws ` prefix gate |
117
+
118
+ The notebook produces per-tier breakdowns of all six metrics so you can see where SFT helped most (typically: warmup format-locking goes from ~85% → 100%; intermediate completion goes from a small base to a meaningful fraction).
119
+
120
+ ---
121
+
122
+ ## 5. How to run
123
+
124
+ ### Prerequisites
125
+
126
+ - HuggingFace token (`HF_TOKEN`) — needed to load the dataset and adapter
127
+ - A running env server — either:
128
+ - Your own HF Space deployment (set `ENV_BASE_URL` accordingly), or
129
+ - Local server: `make run` from the repo root, then `ENV_BASE_URL=http://localhost:8000`
130
+ - A GPU runtime (Colab T4 or better, A10/A100 ideal)
131
+
132
+ ### Notebooks
133
+
134
+ | Notebook | Open in Colab |
135
+ |---------------------------------------------------------------------|--------------------------------|
136
+ | [compare_base_vs_sft.ipynb](compare_base_vs_sft.ipynb) (clean) | <!-- TODO: paste Colab URL --> |
137
+ | [compare_base_vs_sft_with_outputs.ipynb](compare_base_vs_sft_with_outputs.ipynb) (with outputs) | <!-- TODO: paste Colab URL --> |
138
+
139
+ The two notebooks are functionally identical; the second has cell outputs preserved (18 display widgets, 26 stdout cells) for offline inspection.
140
+
141
+ ### Running steps
142
+
143
+ 1. Open the notebook in Colab (or local Jupyter)
144
+ 2. Edit the **CONFIG** cell:
145
+ ```python
146
+ BASE_MODEL = "unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit"
147
+ SFT_ADAPTER_REPO = "Sizzing/aws-rl-sft-qwen25coder3b-adapter"
148
+ DATASET_REPO = "Sizzing/aws-rl-sft"
149
+ ENV_BASE_URL = "https://your-hf-space.hf.space" # or local
150
+ ```
151
+ 3. Run all cells. Part 1 (dataset eval) finishes first; Part 2 (RL env eval) is the slow one.
152
+ 4. Compare the per-metric deltas between base and SFT.
153
+
154
+ ---
155
+
156
+ ## 6. Reading the results
157
+
158
+ ### Actual numbers from the run
159
+
160
+ From the saved outputs of [compare_base_vs_sft_with_outputs.ipynb](compare_base_vs_sft_with_outputs.ipynb):
161
+
162
+ #### Dataset eval
163
+
164
+ | Metric | Base | Base + SFT | Δ |
165
+ |---------------------------|:------:|:----------:|:----------:|
166
+ | `format_pct` | 33.3% | **100.0%** | **+66.7 pp** |
167
+ | `format_after_extract_pct`| 100.0% | 100.0% | 0 |
168
+ | `exact_pct` | 38.9% | **88.9%** | **+50.0 pp** |
169
+
170
+ #### RL env eval (live multi-step agent loop)
171
+
172
+ | Metric | Base | Base + SFT | Δ |
173
+ |-------------------------|:-----:|:----------:|:---------:|
174
+ | `avg_episode_reward` | 1.187 | **2.011** | **+0.824** |
175
+ | `reward_std` | 1.137 | 1.908 | +0.771 |
176
+ | `avg_steps` | 8.600 | **5.733** | **−2.867** |
177
+ | `avg_reward_per_step` | 0.138 | **0.351** | **+0.213** |
178
+
179
+ > ![RL-env eval: base vs SFT](../docs/figures/rl_env_eval_base_vs_sft.png)
180
+
181
+ The agent **earns more reward per episode while taking fewer steps** — exactly what good fine-tuning should produce. Reward-per-step jumps 2.5× because (a) the agent picks the right command more often (fewer wasted steps), and (b) format compliance is now perfect (no more `aws help` fallbacks).
182
+
183
+ #### Per-tier success in the RL eval
184
+
185
+ From the notebook's per-rollout traces (3 episodes per tier × 5 tiers = 15 episodes per model):
186
+
187
+ | Tier | Base (rollouts ✓ / 3) | Base + SFT (rollouts ✓ / 3) |
188
+ |--------------|:---------------------:|:----------------------------:|
189
+ | warmup | 3 | 3 |
190
+ | beginner | 3 | 3 |
191
+ | intermediate | 1 | 3 |
192
+ | advanced | 0 | 1 |
193
+ | expert | 0 | 2 |
194
+
195
+ SFT moves the **success frontier** up two tiers — the base model could not finish a single advanced or expert episode, while SFT completes 2 of 3 expert tasks (S3 lockdown, IAM least-privilege variants) within 5 steps.
196
+
197
+ ### What counts as a meaningful delta?
198
+
199
+ The val set is small (150 rows / ~10 unique tasks per RL eval), so individual percentage points have meaningful noise. Rules of thumb:
200
+
201
+ | Delta size | Significance |
202
+ |------------|------------------------------------------------|
203
+ | ±2pp | Within noise — don't claim improvement |
204
+ | 5–10pp | Likely real, look at per-tier breakdown |
205
+ | >10pp | Almost certainly real |
206
+
207
+ The deltas above (66.7 pp, 50.0 pp on dataset; 0.82 reward / −2.9 steps on RL eval) are well above the noise floor.
208
+
209
+ ### Going further with GRPO
210
+
211
+ Once the SFT adapter is in hand, the same comparison can be re-run against the GRPO adapter (`out_grpo/grpo_adapter/`). Multi-step results from the GRPO run are documented in the [main README §11](../README.md#11-results--benchmarks); the short version is GRPO@35-steps preserves SFT performance and modestly improves the middle tiers, while the expert tier remains the bottleneck.
212
+
213
+ ---
214
+
215
+ ## 7. Files in this directory
216
+
217
+ | File | Purpose |
218
+ |-----------------------------------------------------------------------------------------------------|------------------------------------------------------------------|
219
+ | [compare_base_vs_sft.ipynb](compare_base_vs_sft.ipynb) | Side-by-side dataset + RL env benchmark — clean version |
220
+ | [compare_base_vs_sft_with_outputs.ipynb](compare_base_vs_sft_with_outputs.ipynb) | Same notebook with cell outputs preserved (18 display widgets) |
221
+
222
+ ---
223
+
224
+ ## See also
225
+
226
+ - [Main README](../README.md) — top-level overview, results section
227
+ - [data/README.md](../data/README.md) — dataset that drives this comparison
228
+ - [data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md) — base-model selection benchmark (same scoring functions reused here)
229
+ - [train/README.md](../train/README.md) — how the SFT adapter being benchmarked here was produced
230
+ - [inference.py](../inference.py) — single-model agent loop (the prototype the RL eval mode is modeled after)
compare/compare_base_vs_sft.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
compare/compare_base_vs_sft_with_outputs.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
data/README.md ADDED
@@ -0,0 +1,238 @@
1
+ # `data/` — SFT Dataset Generation & Base-Model Selection
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory holds the SFT training corpus, the dataset generator that produced it, and the rigorous benchmark we used to pick the base model. Together they answer two questions a hackathon judge should be able to verify in under five minutes:
6
+
7
+ 1. **What did we train on?** A 1,500-row synthetic SFT corpus with five trajectory types covering success, continuation, failure recovery, verification, and hint usage. ([§1](#1-sft-dataset-generation))
8
+ 2. **Why this base model?** A reproducible 11-model benchmark across 27 held-out prompts. **Qwen2.5-Coder-3B-Instruct** wins on every metric that matters. ([§5](#5-base-model-selection-overview))
9
+
10
+ > ![Top 4 candidate models on the held-out benchmark](../docs/figures/model_eval_chart.png)
11
+
12
+ ---
13
+
14
+ ## Table of contents
15
+
16
+ 1. [SFT dataset generation](#1-sft-dataset-generation)
17
+ 2. [Five trajectory types](#2-five-trajectory-types)
18
+ 3. [Tier weighting](#3-tier-weighting)
19
+ 4. [Dataset format & artifacts](#4-dataset-format--artifacts)
20
+ 5. [Base-model selection — overview](#5-base-model-selection-overview)
21
+ 6. [Eval harness](#6-eval-harness)
22
+ 7. [HuggingFace publishing](#7-huggingface-publishing)
23
+ 8. [Files in this directory](#8-files-in-this-directory)
24
+
25
+ ---
26
+
27
+ ## 1. SFT dataset generation
28
+
29
+ [data/build_sft_dataset.py](build_sft_dataset.py) — 27 KB, single-script generator.
30
+
31
+ ### Approach
32
+
33
+ The dataset is **synthetically generated** but grounded in canonical solutions extracted from our integration test suite. Two design decisions worth flagging to judges:
34
+
35
+ #### AST-based extraction, not pytest execution
36
+
37
+ Each `tests_tasks/test_<tier>_tasks.py` file has a top-level constant (`WARMUP_COMMANDS`, `BEGINNER_COMMANDS`, …) mapping `task_id → canonical AWS CLI command`. We extract these via Python's `ast` module — we do **not** execute the test file. Reasons (a sketch of the extraction follows the list):
38
+
39
+ 1. `pytest` fixtures would spin up a MiniStack, hit AWS APIs, and add 30+ seconds of overhead per generation run.
40
+ 2. Static extraction is deterministic — no flake risk. The dataset is reproducible bit-for-bit given a seed.
41
+ 3. The canonical solutions are intentionally simple constant declarations that AST can parse without import side effects.
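+
+ A minimal sketch of that extraction (the real generator in [build_sft_dataset.py](build_sft_dataset.py) may differ in detail):
+
+ ```python
+ import ast
+ from pathlib import Path
+
+ def extract_commands(test_file: str, const_name: str) -> dict[str, str]:
+     """Pull a task_id -> command dict out of a test file without importing it."""
+     tree = ast.parse(Path(test_file).read_text())
+     for node in tree.body:                       # top-level statements only
+         if isinstance(node, ast.Assign) and any(
+             isinstance(t, ast.Name) and t.id == const_name for t in node.targets
+         ):
+             return ast.literal_eval(node.value)  # safe: literal constants only
+     raise KeyError(f"{const_name} not found in {test_file}")
+
+ # e.g. extract_commands("tests_tasks/test_warmup_tasks.py", "WARMUP_COMMANDS")
+ ```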
42
+
43
+ #### Plausible-output simulation
44
+
45
+ When generating multi-step continuations, we don't have a real MiniStack response to feed back into the user message — we have to fabricate one. The generator maps each AWS operation (`list-buckets`, `create-table`, `describe-instances`, …) to a JSON template, then interpolates the right resource names from the task. So an `aws s3api list-buckets` step in the user prompt history has output like:
46
+
47
+ ```json
48
+ {"Buckets":[{"Name":"my-app-data","CreationDate":"2026-04-15T..."}]}
49
+ ```
50
+
51
+ …instead of the empty `{"Buckets":[]}` you'd get from a fresh MiniStack. This is the difference between the SFT model learning "first step, always answer with the canonical command" (degenerate) and "first step depends on what's already been done" (correct).
52
+
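+ A sketch of the templating idea; the operation names and template shapes here are assumptions, not the generator's actual tables:
+
+ ```python
+ import json
+
+ # Hypothetical operation → output-template map (the real generator covers
+ # list-buckets, create-table, describe-instances, and more).
+ OUTPUT_TEMPLATES = {
+     "list-buckets": {"Buckets": [{"Name": "{name}", "CreationDate": "2026-04-15T00:00:00Z"}]},
+     "describe-table": {"Table": {"TableName": "{name}", "TableStatus": "ACTIVE"}},
+ }
+
+ def simulate_output(operation: str, name: str) -> str:
+     """Render a plausible, task-consistent output for a prior step in the prompt history."""
+     return json.dumps(OUTPUT_TEMPLATES[operation]).replace("{name}", name)
+
+ # simulate_output("list-buckets", "my-app-data")
+ # -> '{"Buckets": [{"Name": "my-app-data", "CreationDate": "2026-04-15T00:00:00Z"}]}'
+ ```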
53
+ ### Dynamic-ID filtering
54
+
55
+ Some tests reference resources whose IDs only exist at runtime — security groups (`sg-…`), subnets (`subnet-…`), VPCs (`vpc-…`), instance IDs (`i-…`). These commands cannot be deterministically captured by static extraction. The generator skips any task whose canonical command contains those patterns. The result: 72 unique tasks make it into the train split (out of 134 total tasks), all of which are deterministically reproducible.
56
+
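+ In sketch form, assuming only the ID prefixes listed above:
+
+ ```python
+ import re
+
+ # Resource-ID prefixes that only exist at runtime and therefore cannot be
+ # captured by static extraction.
+ DYNAMIC_ID = re.compile(r"\b(?:sg|subnet|vpc|i)-[0-9a-f]+")
+
+ def is_statically_reproducible(command: str) -> bool:
+     return DYNAMIC_ID.search(command) is None
+ ```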
57
+ ---
58
+
59
+ ## 2. Five trajectory types
60
+
61
+ The SFT corpus mixes five distinct trajectory shapes so the model learns to handle real multi-turn agent behavior, not just one-shot question answering. Actual proportions (from [data/sft/dataset_stats.json](sft/dataset_stats.json)):
62
+
63
+ | Source | Train pct (target) | Train rows | What the model sees |
64
+ |----------------------------|:------------------:|:----------:|-------------------------------------------------------------------------------------------|
65
+ | `success_first_step` | 55.1% (55%) | 826 | User → Task description → assistant emits the canonical command |
66
+ | `multi_step_continuation` | 20.1% (20%) | 301 | User → Task description + a baked-in history of N-1 prior commands and their outputs → assistant emits step N |
67
+ | `failure_recovery` | 15.5% (15%) | 232 | User → Task description + step 1 of a wrong command and its simulated error → assistant emits the recovery command |
68
+ | `verification` | 4.5% (5%) | 67 | User → Task already complete → assistant emits a read-only verification command |
69
+ | `hint_usage` | 4.9% (5%) | 74 | User → Task description → assistant emits `aws help --task-hint` (the agent action that requests a hint) |
70
+
71
+ Why include the last four sources at all?
72
+
73
+ - **`multi_step_continuation`** trains continuation behavior. Without it, the model overfits to step 1 and degrades on later turns.
74
+ - **`failure_recovery`** teaches the model that a typo / wrong command is recoverable. The reward signal during GRPO is dense — the model needs to know what "try again" looks like.
75
+ - **`verification`** trains the model to recognize when a task is done and respond appropriately. Production agents must distinguish "do something" from "confirm it's done".
76
+ - **`hint_usage`** lets the model learn that `aws help --task-hint` is the in-environment way to request help, not just a literal CLI command.
77
+
78
+ ---
79
+
80
+ ## 3. Tier weighting
81
+
82
+ [data/build_sft_dataset.py:54-60](build_sft_dataset.py) — sampling weights:
83
+
84
+ | Tier | Weight | Train rows | Why |
85
+ |--------------|:------:|:----------:|------------------------------------------------------------------------------------|
86
+ | warmup | 0.50 | 456 | Most rows. Format-locks the model on the simplest possible "aws X list" pattern. |
87
+ | beginner | 0.30 | 378 | Single-resource creation — bread and butter. |
88
+ | intermediate | 0.15 | 666 * | Multi-step workflows. Note actual count > target because each task contributes more rows via multi_step_continuation. |
89
+ | advanced | 0.05 | 0 | Cross-service architectures. Filtered out post-extraction (most have dynamic IDs). |
90
+ | expert | 0.00 | 0 | SRE / drift / security-posture. **Intentionally excluded from SFT.** |
91
+
92
+ > **Why expert tier is excluded from SFT.** The expert tasks (drift detection, security audits) have *randomized* state checks — there is no canonical command sequence. Trying to SFT on them would teach the model a particular fix script that is *wrong* on most episodes. These tasks are reserved for GRPO, where the env's `state_checks` reward signal handles the randomization correctly.
93
+
94
+ `*` Intermediate row count exceeds the simple weight because the multi-step trajectory generator naturally produces multiple rows per task (one for step 1, step 2, etc.).
95
+
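+ How the weights translate into sampling, as a minimal sketch (the actual generator also applies the dynamic-ID filter and the per-source trajectory mix):
+
+ ```python
+ import random
+
+ TIER_WEIGHTS = {"warmup": 0.50, "beginner": 0.30, "intermediate": 0.15, "advanced": 0.05}
+
+ def sample_tier(rng: random.Random) -> str:
+     tiers, weights = zip(*TIER_WEIGHTS.items())
+     return rng.choices(tiers, weights=weights, k=1)[0]
+
+ # Seeded, so the draw sequence is reproducible, mirroring the generator's
+ # bit-for-bit reproducibility guarantee.
+ rng = random.Random(42)
+ ```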
96
+ ---
97
+
98
+ ## 4. Dataset format & artifacts
99
+
100
+ ### JSONL chat-message schema
101
+
102
+ ```json
103
+ {
104
+ "messages": [
105
+ {"role": "system", "content": "You are an AWS cloud engineer interacting with a real AWS environment via CLI..."},
106
+ {"role": "user", "content": "TASK: Create an S3 bucket named my-app-data and enable versioning on it.\n\nPREVIOUS COMMANDS:\n[1] $ aws s3 mb s3://my-app-data\n output: make_bucket: my-app-data\n reward: 0.50\n\n---\n\nCURRENT OBSERVATION:\nProgress: 0.50 Achieved: False Step: 2"},
107
+ {"role": "assistant", "content": "aws s3api put-bucket-versioning --bucket my-app-data --versioning-configuration Status=Enabled"}
108
+ ],
109
+ "difficulty": "intermediate",
110
+ "source": "multi_step_continuation",
111
+ "task_id": 42
112
+ }
113
+ ```
114
+
115
+ Every row carries the `difficulty`, `source`, and `task_id` metadata — useful for filtering, ablations, and debugging.
116
+
117
+ ### Artifacts
118
+
119
+ [data/sft/](sft/):
120
+
121
+ | File | Size | Rows | Unique tasks | Use |
122
+ |--------------------------------------------------------------|------:|------:|:------------:|------------------------------------------------|
123
+ | [aws_rl_sft.train.jsonl](sft/aws_rl_sft.train.jsonl) | 2.2 MB | 1,500 | 72 | SFT training |
124
+ | [aws_rl_sft.val.jsonl](sft/aws_rl_sft.val.jsonl) | 218 KB | 150 | 63 | SFT validation; basis for [MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md) |
125
+ | [aws_rl_sft.reserve.jsonl](sft/aws_rl_sft.reserve.jsonl) | 294 KB | 200 | 66 | Held-out reserve for post-SFT regression checks |
126
+ | [dataset_stats.json](sft/dataset_stats.json) | 3.4 KB | — | — | Per-split source/tier/task breakdowns |
127
+ | [MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md) | 15 KB | — | — | Full model-selection writeup ([§5](#5-base-model-selection-overview)) |
128
+ | [model_eval_full.json](sft/model_eval_full.json) | 209 KB | 297 | — | Per-call eval data (11 models × 27 prompts) |
129
+ | [deepseek_r1_rerun.json](sft/deepseek_r1_rerun.json) | 5.3 KB | 27 | — | DeepSeek R1 re-run with `max_tokens=2048` |
130
+
131
+ ---
132
+
133
+ ## 5. Base-model selection — overview
134
+
135
+ This is the most rigorous decision in the whole project. Full reasoning, per-model verdicts, and methodology lives in **[data/sft/MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md)** — a 270-line standalone report. Read it before judging the project's technical depth; it's what convinces us we're training the right thing.
136
+
137
+ The 30-second summary:
138
+
139
+ | Model | exact% | op% | fmt% | Latency | Verdict |
140
+ |--------------------------------|:-----:|:----:|:------:|:-------:|--------------------------------------|
141
+ | **qwen2.5-coder-3b-instruct** | **41%** | **63%** | 85% | **3.1s** | ✅ Train this. Highest exact, fastest viable. |
142
+ | qwen/qwen3-4b-2507 | 33% | 59% | 100% | 10.4s | Fallback. Perfect format, 3× slower. |
143
+ | qwen2.5-coder-1.5b-instruct | 22% | 44% | 81% | 2.5s | Speed play if GRPO budget tight. |
144
+ | smollm2-1.7b-instruct | 7% | 37% | 63% | 2.1s | ❌ Ceiling too low. |
145
+ | (7 more) | 0% | … | … | … | ❌ Format-broken or wrong domain. |
146
+
147
+ > ![Per-model comparison: 5 quality metrics + latency](../docs/figures/model_eval_chart.png)
148
+
149
+ What the metrics mean:
150
+
151
+ - **`fmt%`**: raw output starts with `aws ` (no preamble, fences, or quotes). The agent's [inference.py:93](../inference.py) gate rejects everything else.
152
+ - **`+xtr%`**: `fmt%` after stripping markdown fences. The gap between the two means the model knows the answer but wraps it in junk.
153
+ - **`exact%`**: extracted command matches canonical token-for-token. The hardest metric.
154
+ - **`svc%`**: same AWS service as canonical. Domain orientation.
155
+ - **`op%`**: same service AND operation. The gap SFT closes most reliably.
156
+
157
+ The full table (11 models, 9 metrics, per-call logs) is in [data/sft/model_eval_full.json](sft/model_eval_full.json) — 297 records.
158
+
159
+ ---
160
+
161
+ ## 6. Eval harness
162
+
163
+ [data/eval_lm_studio_models.py](eval_lm_studio_models.py) — 9.9 KB, reusable.
164
+
165
+ - Calls each chat model loaded in LM Studio at `http://localhost:1234/v1/chat/completions` (OpenAI-compatible API)
166
+ - Sends the same 27 held-out prompts to each model
167
+ - Extracts `aws ...` from the response (stripping fences / preamble)
168
+ - Compares against the canonical command from the val split
169
+ - Writes per-call detail + aggregate metrics to JSON
170
+
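+ A minimal sketch of one eval call plus the extraction step; only the OpenAI-compatible endpoint above is assumed, and the real harness adds timing and aggregate metrics:
+
+ ```python
+ import requests
+
+ def ask(model: str, prompt: str) -> str:
+     r = requests.post(
+         "http://localhost:1234/v1/chat/completions",
+         json={"model": model,
+               "messages": [{"role": "user", "content": prompt}],
+               "temperature": 0},
+         timeout=120,
+     )
+     r.raise_for_status()
+     return r.json()["choices"][0]["message"]["content"]
+
+ def extract_aws_command(text: str) -> str | None:
+     # Strip fences and preamble; keep the first line that starts with `aws `.
+     for line in text.splitlines():
+         line = line.strip().strip("`\"'")
+         if line.startswith("aws "):
+             return line
+     return None
+ ```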
171
+ To re-run post-SFT:
172
+
173
+ ```bash
174
+ .venv/bin/python data/eval_lm_studio_models.py \
175
+ --max-per-combo 5 \
176
+ --out data/sft/model_eval_postsft.json
177
+ ```
178
+
179
+ A successful SFT run should see (predictions from [MODEL_EVALUATION.md §11](sft/MODEL_EVALUATION.md), and **actuals from our SFT run committed at [out/delta_summary.json](../out/delta_summary.json)**):
180
+
181
+ | Metric | Base | Target | **Actual (post-SFT)** |
182
+ |-----------|:-----:|:-------:|:---------------------:|
183
+ | `exact%` | 39% | 75%+ | **88.9%** ✅ |
184
+ | `op%` | 61% | 90%+ | **88.9%** ≈ |
185
+ | `svc%` | 78% | — | **88.9%** |
186
+ | `fmt%` | 33% | 100% | **100.0%** ✅ |
187
+ | latency | 2.03s | — | **1.40s** (faster) |
188
+
189
+ Every target from MODEL_EVALUATION.md is met outright or within a hair (op% lands at 88.9% against its 90% target). Format compliance is now perfect; exact-match jumped ~50 pp; the model is faster *and* tighter.
190
+
191
+ > ![Base vs SFT comparison (eval metrics)](../docs/figures/base_vs_sft_success.png)
192
+ > ![Single-step eval base vs SFT](../docs/figures/single_step_eval.png)
193
+
194
+ ---
195
+
196
+ ## 7. HuggingFace publishing
197
+
198
+ [data/upload_sft_to_hf.py](upload_sft_to_hf.py) — pushes the JSONL splits to HuggingFace Hub:
199
+
200
+ | Split | Hub repo |
201
+ |----------|-----------------------------------------------------|
202
+ | train | `Sizzing/aws-rl-sft-qwen25coder3b-train` |
203
+ | val | `Sizzing/aws-rl-sft-qwen25coder3b-val` |
204
+ | reserve | `Sizzing/aws-rl-sft-qwen25coder3b-reserve` |
205
+
206
+ The trained SFT adapter (output of [train/train_sft_lora.ipynb](../train/train_sft_lora.ipynb)) is published separately at:
207
+
208
+ - `Sizzing/aws-rl-sft-qwen25coder3b-adapter`
209
+
210
+ GRPO training picks it up by setting `SFT_ADAPTER = "Sizzing/aws-rl-sft-qwen25coder3b-adapter"` in [aws_rl_env_colab.ipynb](../aws_rl_env_colab.ipynb).
211
+
212
+ ---
213
+
214
+ ## 8. Files in this directory
215
+
216
+ | File | Purpose |
217
+ |--------------------------------------------------------------------|--------------------------------------------------------------------|
218
+ | [build_sft_dataset.py](build_sft_dataset.py) | Generator — AST extraction + 5 trajectory types + plausible outputs |
219
+ | [eval_lm_studio_models.py](eval_lm_studio_models.py) | Base-model benchmark harness (LM Studio API) |
220
+ | [upload_sft_to_hf.py](upload_sft_to_hf.py) | Push the SFT splits to HuggingFace |
221
+ | [sft/aws_rl_sft.train.jsonl](sft/aws_rl_sft.train.jsonl) | 1,500 SFT training rows |
222
+ | [sft/aws_rl_sft.val.jsonl](sft/aws_rl_sft.val.jsonl) | 150 validation rows |
223
+ | [sft/aws_rl_sft.reserve.jsonl](sft/aws_rl_sft.reserve.jsonl) | 200 reserve rows |
224
+ | [sft/dataset_stats.json](sft/dataset_stats.json) | Per-split source / tier / task counts |
225
+ | [sft/MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md) | **The base-model selection report (read this)** |
226
+ | [sft/model_eval_full.json](sft/model_eval_full.json) | Per-call eval data (11 models × 27 prompts) |
227
+ | [sft/deepseek_r1_rerun.json](sft/deepseek_r1_rerun.json) | R1 re-run with extended `max_tokens` |
228
+
229
+ ---
230
+
231
+ ## See also
232
+
233
+ - [Main README](../README.md)
234
+ - [data/sft/MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md) — full base-model selection writeup
235
+ - [train/README.md](../train/README.md) — how this dataset is consumed by SFT training
236
+ - [compare/README.md](../compare/README.md) — how the trained model is benchmarked vs the base
237
+ - [server/services/tasks/](../server/services/tasks/) — source of truth for task definitions (the YAML the generator reads)
238
+ - [tests_tasks/](../tests_tasks/) — canonical solutions the generator extracts via AST
docs/figures/base_vs_sft_success.png ADDED
docs/figures/compare_dataset.png ADDED

Git LFS Details

  • SHA256: 0192c7b5d9d57f278aac1a09d776329757ebaff2d3a29d791c3f5cda7258e724
  • Pointer size: 131 Bytes
  • Size of remote file: 280 kB
docs/figures/compare_rl_env.png ADDED

Git LFS Details

  • SHA256: eda0c69c8c28515195d005f0a4431b7c6e7959d1f99f5b7c44ed448ede523374
  • Pointer size: 131 Bytes
  • Size of remote file: 201 kB
docs/figures/env_init_screenshot.png ADDED

Git LFS Details

  • SHA256: 51a633c9058297eae3575abd5a4cb093d9204337bca4b69fd141f471d38ad5c8
  • Pointer size: 131 Bytes
  • Size of remote file: 372 kB
docs/figures/grpo_final_per_step.png ADDED

Git LFS Details

  • SHA256: f6d5d210de9f473d638cb75cf221e3e703eae9a3d00faa8fbcd122c17919e6ce
  • Pointer size: 131 Bytes
  • Size of remote file: 243 kB
docs/figures/grpo_optuna_history.png ADDED
docs/figures/grpo_optuna_history_v0.png ADDED
docs/figures/grpo_optuna_hparams.png ADDED
docs/figures/grpo_optuna_importances.png ADDED
docs/figures/grpo_optuna_parallel.png ADDED
docs/figures/grpo_optuna_trial_curves.png ADDED

Git LFS Details

  • SHA256: 8254a87ffe69f2c818b5b403dae41f32dc36c301ca491d8618c41164333f43c6
  • Pointer size: 131 Bytes
  • Size of remote file: 277 kB
docs/figures/grpo_optuna_trials_comparison.png ADDED

Git LFS Details

  • SHA256: 231ca2e7ecae1114a7e61d808f0b3736a22f4ddec7b90d7626cb0fb4d608c4c5
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
docs/figures/grpo_per_tier_curve.png ADDED
docs/figures/grpo_reward_by_tier.png ADDED
docs/figures/grpo_reward_curve.png ADDED

Git LFS Details

  • SHA256: 1d1222b3510873dadb8da9be7066e17220c5dab5c6456d11385f4e9f5c99b885
  • Pointer size: 131 Bytes
  • Size of remote file: 260 kB
docs/figures/ministack_logo.png ADDED

Git LFS Details

  • SHA256: d6ee9620212659d7f7e2da8dcc9ff39cf522d3f34ea07728d6e6ab00df876de5
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
docs/figures/model_eval_chart.png ADDED
docs/figures/optuna_history.png ADDED
docs/figures/optuna_parallel.png ADDED

Git LFS Details

  • SHA256: a235e7fc7050edfdf8f547a31d5630d737c5b85fd5e4f2bcdd0abf1677058926
  • Pointer size: 131 Bytes
  • Size of remote file: 218 kB
docs/figures/optuna_param_importance.png ADDED
docs/figures/optuna_slice.png ADDED

Git LFS Details

  • SHA256: b743ec4e945f9ee5239694224d587ee1c912a8d415910e924218c9b5074003fc
  • Pointer size: 131 Bytes
  • Size of remote file: 107 kB
docs/figures/optuna_trial_curves.png ADDED
docs/figures/qualitative_rollouts.png ADDED
docs/figures/rl_env_eval_base_vs_sft.png ADDED
docs/figures/sft_loss_curve.png ADDED

Git LFS Details

  • SHA256: e0c0d8d74358a2f95feee6e685e2d512f5ee5bda8ce869686c951114278c9a1a
  • Pointer size: 131 Bytes
  • Size of remote file: 178 kB
docs/figures/sft_optuna_trials_table.png ADDED
docs/figures/sft_vs_grpo_by_tier.png ADDED
docs/figures/sft_vs_grpo_metrics_grid.png ADDED
docs/figures/sft_vs_grpo_scalar.png ADDED
docs/figures/single_step_eval.png ADDED
images/compare_dataset.png ADDED

Git LFS Details

  • SHA256: 0192c7b5d9d57f278aac1a09d776329757ebaff2d3a29d791c3f5cda7258e724
  • Pointer size: 131 Bytes
  • Size of remote file: 280 kB
images/compare_rl_env.png ADDED

Git LFS Details

  • SHA256: eda0c69c8c28515195d005f0a4431b7c6e7959d1f99f5b7c44ed448ede523374
  • Pointer size: 131 Bytes
  • Size of remote file: 201 kB
pyproject.toml CHANGED
@@ -34,7 +34,16 @@ train = [
34
  "ipykernel",
35
  "ipywidgets>=8.1.0",
36
  "datasets>=4.8.4",
37
- "huggingface-hub>=1.9.0",
38
  ]
39
 
40
 
 
34
  "ipykernel",
35
  "ipywidgets>=8.1.0",
36
  "datasets>=4.8.4",
37
+ "huggingface-hub>=0.34,<1.0",
38
+ # GRPO training stack (versions mirror train/train_grpo_lora.ipynb)
39
+ "unsloth",
40
+ "trl>=0.18.2,<=0.24.0,!=0.19.0",
41
+ "peft",
42
+ "accelerate",
43
+ "bitsandbytes",
44
+ "transformers>=4.50,<5.0",
45
+ "optuna",
46
+ "matplotlib",
47
  ]
48
 
49
 
scripts/README.md ADDED
@@ -0,0 +1,260 @@
1
+ # `scripts/` — Parallel Rollout Architecture
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory holds the helper modules that make **8 concurrent multi-turn rollouts** against the AWS RL environment possible — the scaling trick that turns GRPO from a thought experiment into something you can actually train on a single GPU.
6
+
7
+ If you only read one section, read [§2 — Three coordinated pool layers](#2-three-coordinated-pool-layers). It explains the architecture in one page.
8
+
9
+ ---
10
+
11
+ ## Table of contents
12
+
13
+ 1. [Why parallel rollouts matter](#1-why-parallel-rollouts-matter)
14
+ 2. [Three coordinated pool layers](#2-three-coordinated-pool-layers)
15
+ 3. [Walking through one GRPO step](#3-walking-through-one-grpo-step)
16
+ 4. [The all-or-nothing connect protocol](#4-the-all-or-nothing-connect-protocol)
17
+ 5. [Concurrency-safety guarantees](#5-concurrency-safety-guarantees)
18
+ 6. [Configuration](#6-configuration)
19
+ 7. [Running the multi-connection demo](#7-running-the-multi-connection-demo)
20
+ 8. [Files in this directory](#8-files-in-this-directory)
21
+
22
+ ---
23
+
24
+ ## 1. Why parallel rollouts matter
25
+
26
+ GRPO computes **group-relative advantages**: every gradient step needs `G` rollouts on the *same* prompt so the algorithm can normalize rewards within the group. With `G = 8`, multi-turn episodes (≤ 6 turns), and an env step that round-trips an AWS CLI invocation through MiniStack (~50 ms), the math is:
27
+
28
+ ```
29
+ Serial: 8 rollouts × 6 turns × 50 ms = 2,400 ms env-time per GRPO step
30
+ Parallel: 6 turns × 50 ms (all 8 envs in flight) = 300 ms env-time per GRPO step
31
+ ```
32
+
33
+ That's an 8× speedup on the env side. The model forward pass still serialises (single GPU), so the practical end-to-end gain depends on the env/compute ratio — but for an env that takes ~50 ms per step, parallelism is the difference between a tractable training run and a 24-hour one.
34
+
35
+ The parallelism isn't free: each rollout needs **state isolation**. If two rollouts share an AWS world, rollout 1's S3 buckets bleed into rollout 2's view, the curriculum mastery numbers go to garbage, and the agent can hack the reward by piggy-backing off siblings. The three coordinated pools below exist to make state isolation cheap and automatic.
36
+
37
+ > ![8 simultaneous WebSocket sessions established to the env server](../docs/figures/env_init_screenshot.png)
38
+
39
+ ---
40
+
41
+ ## 2. Three coordinated pool layers
42
+
43
+ The system has **three pools** that work together. They look similar at first glance — all of them deal with N concurrent envs — but each operates at a different layer of the stack:
44
+
45
+ ```
46
+ ┌─────────────────────────────────────────────────────────────────────────────┐
47
+ │ Layer 3 — Trainer-process pool │
48
+ │ MultiTurnEnvPool (train_grpo.py) │
49
+ │ • owns a background asyncio loop │
50
+ │ • exposes a sync run_group() that the GRPO trainer can call │
51
+ │ • used by the in-process trainer (CLI: python train_grpo.py) │
52
+ └────────────────────────────────────┬────────────────────────────────────────┘
53
+ │ N WebSocket clients
54
+ ┌────────────────────────────────────▼────────────────────────────────────────┐
55
+ │ Layer 3 alt — Notebook-friendly pool │
56
+ │ GrpoPool (scripts/grpo_pool.py) │
57
+ │ • async-native API (async with GrpoPool(...) as pool: ...) │
58
+ │ • used by Colab notebooks where the cell IS the asyncio loop │
59
+ │ • simpler interface (no background thread) │
60
+ └────────────────────────────────────┬────────────────────────────────────────┘
61
+ │ N WebSocket clients
62
+ ┌────────────────────────────────────▼────────────────────────────────────────┐
63
+ │ Layer 2 — OpenEnv max_concurrent_envs │
64
+ │ create_app(env_factory, ..., max_concurrent_envs=POOL_SIZE) │
65
+ │ • OpenEnv reserves up to N env instances at once │
66
+ │ • returns 503 if a 9th client tries to connect when POOL_SIZE=8 │
67
+ └────────────────────────────────────┬────────────────────────────────────────┘
68
+ │ env_factory() invoked per session
69
+ ┌────────────────────────────────────▼────────────────────────────────────────┐
70
+ │ Layer 1 — Server-side MiniStack pool │
71
+ │ MiniStackPool (server/app.py) │
72
+ │ • free-list of MiniStack ports (BASE..BASE+POOL_SIZE-1) │
73
+ │ • acquire()/release() under a threading.Lock │
74
+ │ • each WS session binds to ONE port for its lifetime → state isolation │
75
+ └─────────────────────────────────────────────────────────────────────────────┘
76
+
77
+
78
+ N independent MiniStack processes
79
+ (started by Dockerfile / Makefile)
80
+ ```
81
+
82
+ ### Layer 1 — Server-side `MiniStackPool`
83
+
84
+ Lives in [server/app.py:75–138](../server/app.py). Documented in detail in [server/README.md §6](../server/README.md#6-server-side-ministack-pool-parallel-rollouts).
85
+
86
+ - A `threading.Lock`-guarded free list of port numbers
87
+ - `acquire()` returns a port; `release(port)` puts it back
88
+ - `RuntimeError("MiniStack pool exhausted")` if depleted
89
+ - The Dockerfile launches `POOL_SIZE` MiniStack processes on consecutive ports before the FastAPI server starts accepting connections
90
+
91
+ ### Layer 2 — OpenEnv `max_concurrent_envs`
92
+
93
+ When `create_app()` is called with `max_concurrent_envs=POOL_SIZE`, OpenEnv enforces the cap upstream — clients beyond the cap get a clean 503 instead of `RuntimeError`. Defence in depth.
94
+
95
+ ### Layer 3 — Client pools
96
+
97
+ Two flavours, same parallelism model, different ergonomics:
98
+
99
+ | | `MultiTurnEnvPool` ([train_grpo.py](../train_grpo.py)) | `GrpoPool` ([scripts/grpo_pool.py](grpo_pool.py)) |
100
+ |---|---|---|
101
+ | API | Sync — `pool.run_group(task, ...)` | Async — `await pool.run_group(rollout_fn)` |
102
+ | Loop | Owns a background thread + asyncio loop | Caller is the asyncio loop (Colab cell) |
103
+ | Use case | In-process trainer (`python train_grpo.py`) | Notebooks driving training from Colab |
104
+ | Connection | `await asyncio.gather(*(e.connect() for e in envs))` on background thread | Same, but on the caller's loop |
105
+ | `record_result()` | Trainer calls `Curriculum.record_result()` directly | `pool.record_group_result(task, rewards)` helper baked in |
106
+
107
+ Both share the **all-or-nothing connect protocol** described in §4.
108
+
109
+ ### Why two client pools?
110
+
111
+ Real life: the trainer process (`python train_grpo.py`) runs synchronously — TRL's `GRPOTrainer.train()` blocks. To use `await asyncio.gather` from inside that, we need a background asyncio loop on a separate thread. That's `MultiTurnEnvPool`.
112
+
113
+ Colab cells, on the other hand, *are* the asyncio loop (Jupyter ≥ 7 ships nest_asyncio under the hood). Running a background thread + loop there is overkill and creates ordering bugs. `GrpoPool` is the simpler async-native variant for that case.
114
+
115
+ The two pools share semantic invariants — same N, same all-or-nothing connect, same task scoping — so behaviour is identical regardless of which entry point you use.
116
+
117
+ ---
118
+
119
+ ## 3. Walking through one GRPO step
120
+
121
+ ```
122
+ 1. trainer picks one task from the Curriculum (1 task)
123
+ 2. pool.run_group(task) (asyncio.gather over N envs)
124
+ 3. for turn in 0..MAX_TURNS:
125
+ prompts = build_prompts(observations) (CPU)
126
+ completions = policy.generate(prompts) (1 batched fwd, GPU)
127
+ actions = parse_completions(completions) (CPU; extract `aws ...` line)
128
+ observations = await pool.run_group_step(actions) (N concurrent env.step)
129
+ 4. rewards = sum_per_episode(rewards_lists) (N floats)
130
+ 5. GRPO computes group-relative advantages, KL, loss (1 backward, GPU)
131
+ 6. Curriculum.record_result(task, mean(rewards)) (1 update)
132
+ ```
133
+
134
+ A couple of subtleties:
135
+
136
+ ### Generation is serialised, env-step is not
137
+
138
+ [train_grpo.py:_GENERATE_LOCK](../train_grpo.py) — a `threading.Lock` around `model.generate()`. The model lives on a single GPU; concurrent `generate()` calls would clobber each other. We let env step calls run concurrently (the slow part — WebSocket round-trip + MiniStack execution); only generation serialises.
139
+
140
+ ### Per-turn token accumulation
141
+
142
+ `rollout_one_episode()` accumulates `prompt_ids`, `completion_ids`, and `logprobs` across turns into a single sequence. GRPO then assigns the episode-level reward to that full sequence. This matches the multi-turn structure of the underlying decision problem.
143
+
144
+ ### Why every rollout in a group runs the same task
145
+
146
+ GRPO's group-relative advantage is `(reward_i − group_mean) / group_std`. If different rollouts ran different tasks, group statistics would mean nothing. The curriculum picks one task per GRPO step; the pool's `reset_group(task)` forces every env to that task; only then can the group statistics be meaningful.
147
+
148
+ ---
149
+
150
+ ## 4. The all-or-nothing connect protocol
151
+
152
+ [scripts/grpo_pool.py:58-82](grpo_pool.py) — the most non-obvious correctness detail in the whole pool stack.
153
+
154
+ ```python
155
+ async def connect(self) -> None:
156
+ if self.envs:
157
+ return
158
+ envs = [AwsRlEnv(base_url=self.base_url) for _ in range(self.size)]
159
+ try:
160
+ await asyncio.gather(*(e.connect() for e in envs))
161
+ except BaseException:
162
+ # Roll back: close every env (successful or not). return_exceptions
163
+ # so a close() failure doesn't mask the original connect error.
164
+ await asyncio.gather(
165
+ *(e.close() for e in envs),
166
+ return_exceptions=True,
167
+ )
168
+ raise
169
+ # Only publish the pool after the entire group connected successfully.
170
+ self.envs = envs
171
+ ```
172
+
173
+ What makes this important:
174
+
175
+ 1. **`asyncio.gather` raises on the first failure**. If 3 of 8 connects succeed and the 4th raises, the other 4 may or may not have connected yet. Their state is undefined.
176
+ 2. **Server-side state matters**. Each successful connect acquired a MiniStack port from the server pool. If we just `raise` without cleanup, those ports stay held until the WebSocket times out — typically minutes. The next training run hits "pool exhausted".
177
+ 3. **`self.envs` is published only after success**. If any partial state were exposed, callers might call `pool.run_group()` on a half-initialised pool and get N/M valid results.
178
+ 4. **`return_exceptions=True` on the rollback**. A close error must not mask the original connect error — the user needs to know the *real* reason connect failed, not a downstream cleanup failure.
179
+
180
+ These four invariants are the difference between "training reliably resumes after a flake" and "every flake leaks 7 ports and you're rebuilding the container at 3 AM".
181
+
182
+ `MultiTurnEnvPool._connect_all()` in [train_grpo.py:473-480](../train_grpo.py) implements the same pattern.
183
+
184
+ ---
185
+
186
+ ## 5. Concurrency-safety guarantees
187
+
188
+ | Concern | Guarantee | Where enforced |
189
+ |------------------------------|---------------------------------------------------------------------------------------------|-----------------------------------------------------------|
190
+ | Cross-rollout state isolation | Each WebSocket session holds its own MiniStack port for its lifetime | `MiniStackPool.acquire/release` ([server/app.py](../server/app.py)) |
191
+ | Curriculum coherence | One curriculum instance per training run; `record_result()` is the only mutation point | `make_rollout_func` in [train_grpo.py](../train_grpo.py) |
192
+ | GPU contention | `model.generate()` calls serialised behind `_GENERATE_LOCK` | [train_grpo.py:_GENERATE_LOCK](../train_grpo.py) |
193
+ | Pool slot leakage on flake | All-or-nothing connect with rollback close | `GrpoPool.connect`, `MultiTurnEnvPool._connect_all` |
194
+ | Hung shutdown | Pool close runs `asyncio.gather(..., return_exceptions=True)` then stops the loop with timeout | `MultiTurnEnvPool.close()` |
195
+ | Web playground vs pool collisions | Web routes refuse to mount when `POOL_SIZE > 1` | [server/app.py:171](../server/app.py) |
196
+
197
+ Tests covering these:
198
+
199
+ - [tests/test_pool.py](../tests/test_pool.py) — server-side `MiniStackPool` acquire/release, exhaustion behaviour
200
+ - [tests/test_grpo_pool.py](../tests/test_grpo_pool.py) — `GrpoPool` connect/close lifecycle, partial-connect rollback, group-result aggregation
201
+
202
+ ---
203
+
204
+ ## 6. Configuration
205
+
206
+ | Variable | Default | Purpose |
207
+ |-------------------------------------|---------|-------------------------------------------------------------------------------------|
208
+ | `AWS_RL_ENV_POOL_SIZE` | `1` | Server-side MiniStack pool size. Set to `8` for GRPO training. Must be ≥ training-time `num_generations`. |
209
+ | `AWS_RL_ENV_MINISTACK_BASE_PORT` | `4566` | First MiniStack port; the pool covers `[BASE, BASE + POOL_SIZE)` |
210
+ | `BACKEND_TYPE` | `simulator` | `simulator` (default; pool is meaningful) or `aws` (real AWS; pool disabled) |
211
+ | `NUM_GENERATIONS` (in trainer cfg) | `8` | Number of WebSocket clients the pool opens. Should equal `AWS_RL_ENV_POOL_SIZE` for full parallelism. |
212
+ | `MAX_TURNS` (in trainer cfg) | `6` | Per-rollout episode length cap |
213
+ | `MAX_TOTAL_TOKENS` (in trainer cfg) | `4096` | Per-episode token budget (anti-OOM) |
214
+
215
+ When deploying to HuggingFace Spaces, pool size is constrained by container memory — each MiniStack process is ~50–100 MB resident.
216
+
217
+ ---
218
+
219
+ ## 7. Running the multi-connection demo
220
+
221
+ [scripts/TestMultipleConnects.ipynb](TestMultipleConnects.ipynb) is a hands-on notebook that proves all 8 sessions stay isolated.
222
+
223
+ ```bash
224
+ # 1. Start the env server with pool size 8
225
+ AWS_RL_ENV_POOL_SIZE=8 make run
226
+
227
+ # 2. Run the notebook
228
+ jupyter notebook scripts/TestMultipleConnects.ipynb
229
+ ```
230
+
231
+ Expected output: 8 simultaneous "connection open" lines, 8 independent reset/step traces, no resource bleed across sessions.
232
+
233
+ The screenshot at [docs/figures/env_init_screenshot.png](../docs/figures/env_init_screenshot.png) captures one such run.
234
+
235
+ ---
236
+
237
+ ## 8. Files in this directory
238
+
239
+ | File | Purpose |
240
+ |-------------------------------------------------------|--------------------------------------------------------------------------|
241
+ | [grpo_pool.py](grpo_pool.py) (139 LOC) | Async-native `GrpoPool` — N persistent WebSockets, `asyncio.gather`, all-or-nothing connect, group-result aggregation |
242
+ | [grpo_train.py](grpo_train.py) (~430 LOC) | Alternative training entry point that uses `GrpoPool` directly (vs `train_grpo.py` which embeds `MultiTurnEnvPool`) |
243
+ | [TestMultipleConnects.ipynb](TestMultipleConnects.ipynb) | Hands-on demo proving 8 concurrent WebSockets stay isolated |
244
+
245
+ Related code outside this directory:
246
+
247
+ - [train_grpo.py](../train_grpo.py) — `MultiTurnEnvPool` class, the canonical in-process pool
248
+ - [server/app.py](../server/app.py) — `MiniStackPool`, `make_env_factory`, the server-side pool layer
249
+ - [client.py](../client.py) — `AwsRlEnv` WebSocket client used by both pools
250
+ - [tests/test_pool.py](../tests/test_pool.py), [tests/test_grpo_pool.py](../tests/test_grpo_pool.py) — concurrency tests
251
+
252
+ ---
253
+
254
+ ## See also
255
+
256
+ - [Main README](../README.md) — project overview
257
+ - [server/README.md](../server/README.md) — environment internals (server-side pool detail in §6)
258
+ - [train/README.md](../train/README.md) — SFT + GRPO training pipeline (this pool plugs into the GRPO loop)
259
+ - [tests/test_pool.py](../tests/test_pool.py) — server-side pool acquire/release tests
260
+ - [tests/test_grpo_pool.py](../tests/test_grpo_pool.py) — client-side pool lifecycle tests
server/README.md ADDED
@@ -0,0 +1,596 @@
1
+ # `server/` — AWS RL Environment Internals
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory implements the **OpenEnv-compatible FastAPI server** that powers the AWS RL Environment. The server exposes HTTP and WebSocket endpoints to a training agent, executes AWS CLI commands against a backing simulator (or real AWS), runs a reward / curriculum stack, and returns shaped observations.
6
+
7
+ If you only have time for the headline numbers, read [the main README](../README.md). This document is the reference for **how** the environment actually works — every defended invariant, every edge case, every config knob.
8
+
9
+ ---
10
+
11
+ ## Table of contents
12
+
13
+ 1. [Architecture overview](#1-architecture-overview)
14
+ 2. [HTTP / WebSocket endpoints](#2-http--websocket-endpoints)
15
+ 3. [Episode lifecycle](#3-episode-lifecycle)
16
+ 4. [Strategy pattern: Simulator vs Real AWS](#4-strategy-pattern-simulator-vs-real-aws)
17
+ 5. [MiniStack: vendored fork & customizations](#5-ministack-vendored-fork--customizations)
18
+ 6. [Server-side MiniStack pool (parallel rollouts)](#6-server-side-ministack-pool-parallel-rollouts)
19
+ 7. [Curriculum manager](#7-curriculum-manager)
20
+ 8. [Reward shaping & TaskGrader](#8-reward-shaping--taskgrader)
21
+ 9. [Anti-reward-hacking — 8 defense layers](#9-anti-reward-hacking--8-defense-layers)
22
+ 10. [Resource verifier](#10-resource-verifier)
23
+ 11. [Chaos engine](#11-chaos-engine)
24
+ 12. [Drift engine](#12-drift-engine)
25
+ 13. [Hint provider](#13-hint-provider)
26
+ 14. [Episode tracker](#14-episode-tracker)
27
+ 15. [Environment designer](#15-environment-designer)
28
+ 16. [Task definitions (YAML schema)](#16-task-definitions-yaml-schema)
29
+ 17. [Security-posture audit examples](#17-security-posture-audit-examples)
30
+ 18. [Curriculum stats API](#18-curriculum-stats-api)
31
+ 19. [Web playground](#19-web-playground)
32
+
33
+ ---
34
+
35
+ ## 1. Architecture overview
36
+
37
+ ```
38
+ ┌──────────────────────────────── server/ process ────────────────────────────────┐
39
+ │ │
40
+ │ FastAPI app (server/app.py) │
41
+ │ ├── OpenEnv router /reset /step /state /schema /ws /health │
42
+ │ ├── Web router /web /web/reset /web/step /web/state /web/solution │
43
+ │ └── env_factory ──► AwsRlEnvironment(strategy=…) │
44
+ │ │ │
45
+ │ ├── EpisodeTracker (per-episode state) │
46
+ │ ├── Curriculum (priority + mastery) │
47
+ │ ├── EnvironmentDesigner (setup commands) │
48
+ │ ├── HintProvider (3-level hints) │
49
+ │ ├── ChaosEngine (mid-episode mutations) │
50
+ │ ├── DriftEngine (drift-task injection) │
51
+ │ ├── TaskGrader (5-strategy dispatcher) │
52
+ │ ├── ResourceVerifier (ground-truth state) │
53
+ │ └── EnvironmentStrategy ──► SimulatorStrategy │
54
+ │ ╲ (talks to MiniStack) │
55
+ │ ╲ AwsStrategy │
56
+ │ (talks to real AWS) │
57
+ └─────────────────────────────────────────────────────────────────────────────────┘
58
+
59
+
60
+ MiniStack process(es) on :4566+
61
+ (own port per pool slot when AWS_RL_ENV_POOL_SIZE > 1)
62
+ ```
63
+
64
+ Files:
65
+
66
+ - [server/app.py](app.py) — FastAPI app, OpenEnv integration, MiniStack pool, web routes
67
+ - [server/aws_rl_env_environment.py](aws_rl_env_environment.py) — main `AwsRlEnvironment` orchestrator
68
+ - [server/services/](services/) — pluggable services (one concern per file, listed in §7–§16)
69
+ - [server/services/tasks/](services/tasks/) — YAML task definitions, one file per tier
70
+ - [server/templates/index.html](templates/index.html) — playground HTML
71
+ - [server/static/](static/) — playground JS/CSS, 40 AWS service icons
72
+
73
+ ---
74
+
75
+ ## 2. HTTP / WebSocket endpoints
76
+
77
+ OpenEnv-compatible (created via `openenv.core.env_server.http_server.create_app`):
78
+
79
+ | Method | Path | Purpose |
80
+ |--------|----------|-----------------------------------------------------------------|
81
+ | POST | `/reset` | Wipe infra, pick next task from curriculum, return observation |
82
+ | POST | `/step` | Execute action, grade, optionally inject chaos, return obs |
83
+ | GET | `/state` | Full `AwsRlState` snapshot (current task, tracker, infra state) |
84
+ | GET | `/schema`| JSON schemas for `AwsRlAction` / `AwsRlObservation` |
85
+ | GET | `/health`| Liveness probe |
86
+ | WS | `/ws` | Persistent session (one MiniStack acquired per connection) |
87
+
88
+ Web playground (always mounted; backed by a dedicated lazy MiniStack — see §6):
89
+
90
+ | Method | Path | Purpose |
91
+ |--------|------------------|-----------------------------------------------------------|
92
+ | GET | `/` | Redirect → `/web` |
93
+ | GET | `/web` | HTML playground (Jinja2 template `index.html`) |
94
+ | POST | `/web/reset` | Stateful reset for the playground's shared env |
95
+ | POST | `/web/step` | Stateful step for the playground's shared env |
96
+ | GET | `/web/state` | Current `AwsRlState` for the shared env |
97
+ | GET | `/web/solution` | Reveal next canonical solution command (debug aid) |
98
+
99
+ Auto-generated docs: `/docs` (Swagger), `/redoc` (ReDoc).
100
+
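+ A quick smoke test against a locally running server; the port and the `/step` payload shape are assumptions, and `/schema` is the authoritative contract:
+
+ ```python
+ import requests
+
+ BASE = "http://localhost:8000"  # assumed local port
+
+ print(requests.post(f"{BASE}/reset").json())    # masked TaskInfo + first observation
+ print(requests.post(f"{BASE}/step",
+                     json={"command": "aws s3 mb s3://my-app-data"}).json())
+ print(requests.get(f"{BASE}/health").json())    # liveness
+ ```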
101
+ ---
102
+
103
+ ## 3. Episode lifecycle
104
+
105
+ 1. **`reset()`**
106
+ 1. `EnvironmentStrategy.reset_environment()` — wipes simulator state (no-op for real AWS)
107
+ 2. `Curriculum.next_task()` — picks the next task (see §7 priority scoring)
108
+ 3. `EnvironmentDesigner.provision(task.setup_commands)` — runs preflight CLI commands to create the broken / insecure infra the agent must fix (used by SRE, drift, security-posture tasks)
109
+ 4. `DriftEngine.inject(task)` — for drift tasks, randomly applies 2–3 mutations from `task.possible_drifts`
110
+ 5. `EpisodeTracker.start(task)` — fresh tracker
111
+ 6. Returns initial `AwsRlObservation` with the masked `TaskInfo` (task description but **not** success criteria)
112
+
113
+ 2. **`step(action)`**
114
+ 1. **Validate** — only commands starting with `aws ` are accepted (see §9 layer 4)
115
+ 2. **Intercept hint requests** — `aws help --task-hint` returns next-level hint, increments `hints_used`, never reaches the simulator
116
+ 3. `EnvironmentStrategy.execute(command)` — runs the AWS CLI invocation, returns stdout / stderr / exit_code
117
+ 4. `EpisodeTracker.record(...)` — parses command, dedup-checks, updates `partial_progress`
118
+ 5. `TaskGrader.grade(...)` — returns shaped reward (see §8)
119
+ 6. `ChaosEngine.maybe_inject(...)` — at tier-scaled probability, executes a destructive mutation on a resource the agent just touched
120
+ 7. `Curriculum.record_step(...)` — accumulates step-level signal
121
+ 8. Returns updated `AwsRlObservation`
122
+
123
+ 3. **Termination**
124
+ - `obs.task_achieved == True`, **or**
125
+ - `step_count >= MAX_STEPS` (default 15, configurable via env var)
126
+ - On terminate: `Curriculum.record_result(task, achieved, reward)` updates per-task mastery and may promote the agent's tier
127
+
128
+ ---
129
+
130
+ ## 4. Strategy pattern: Simulator vs Real AWS
131
+
132
+ The environment supports two backends, swapped via the `BACKEND_TYPE` env var (default `simulator`):
133
+
134
+ ### `SimulatorStrategy` — [services/simulator_strategy.py](services/simulator_strategy.py)
135
+
136
+ - Talks to a MiniStack instance over HTTP (`AWS_INFRA_URL`, default `http://localhost:4566`)
137
+ - AWS CLI invocations are subprocessed with `AWS_ENDPOINT_URL` set so they hit MiniStack
138
+ - `reset_environment()` calls MiniStack's `/_ministack/reset` endpoint to wipe state
139
+ - `get_state()` reads the **custom** `/_ministack/state` endpoint (see §5) — one HTTP call returns the entire infra inventory used by `ResourceVerifier`
140
+
141
+ ### `AwsStrategy` — [services/aws_strategy.py](services/aws_strategy.py)
142
+
143
+ - Uses ambient AWS credentials (whatever the standard AWS CLI credential chain finds)
144
+ - No `AWS_ENDPOINT_URL` override — commands hit real AWS
145
+ - `reset_environment()` is a **no-op** (we cannot wipe a real AWS account; expert-level task scenarios assume a clean / sandboxed sub-account)
146
+ - Useful for end-to-end demonstrations, less so for RL training
147
+
148
+ Switching backends:
149
+
150
+ ```bash
151
+ export BACKEND_TYPE=aws # or "simulator" (default)
152
+ make run
153
+ ```
154
+
155
+ The factory in [server/app.py](app.py) wires the right strategy at startup.
156
+
157
+ ---
158
+
159
+ ## 5. MiniStack: vendored fork & customizations
160
+
161
+ > **Why this matters:** the simulator that the grader queries is not a black-box pip dependency — it's vendored in-tree as a git subtree at [aws_infra/](../aws_infra/) so we can extend it. The custom endpoints we added there are how `ResourceVerifier` and the grader can read full infra state in a single round-trip.
162
+
163
+ ### Vendored as a git subtree
164
+
165
+ `aws_infra/` was imported via `git subtree add` in commit **[`2c38c0b` "Bring mini stack to local"](../aws_infra/)** (PR #5). Upstream is the public MiniStack project. The full upstream README is preserved at [aws_infra/README.md](../aws_infra/README.md) (81 KB).
166
+
167
+ Why we vendored instead of taking a pip dependency:
168
+
169
+ 1. **Custom endpoints**: we needed JSON state-introspection endpoints (`/_ministack/state`, `/_ministack/actions`) that upstream did not ship. These are the integration seams between our env grader and the simulator.
170
+ 2. **Reproducible builds**: the Docker image ships a specific MiniStack revision; no runtime network fetch, identical behavior across environments.
171
+ 3. **Service-coverage extensions**: occasional patches to individual service handlers (e.g. RDS state retrieval used by `ResourceVerifier`).
172
+
173
+ ### Custom modifications on top of upstream
174
+
175
+ Each modification is a separate, cleanly-cherry-pickable commit so future upstream syncs are low-conflict.
176
+
177
+ | Commit | Title | What it adds |
178
+ |-----------|----------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|
179
+ | `a648c3a` | feat: Add support for service state retrieval and action listing across multiple AWS services | `/_ministack/state` returns the entire infra inventory as JSON in one call (the grader's primary read path). `/_ministack/actions` lists every supported operation per service — used by tooling and tests. |
180
+ | `a00e981` | chor: Small Fixes | Tightening / typo fixes on top of `a648c3a`. |
181
+ | `af2e945` | Sync MiniStack with latest changes | Periodic upstream sync. Replays our custom commits cleanly because they are isolated and well-scoped. |
182
+ | `579597b` | Sync MiniStack with latest changes | Subsequent upstream sync. |
183
+
184
+ To inspect any of these:
185
+
186
+ ```bash
187
+ git show a648c3a # see the full diff for the state endpoint
188
+ git log --oneline -- aws_infra/ # see only the aws_infra/ history
189
+ ```
190
+
191
+ ### Build integration
192
+
193
+ - [aws_infra/pyproject.toml](../aws_infra/pyproject.toml) declares MiniStack as its own package; we install it as an editable dependency via `make install-all`.
194
+ - The [Dockerfile](../Dockerfile) stages MiniStack explicitly so the resulting container has no external network requirement at runtime.
195
+ - The [aws_infra/Makefile](../aws_infra/Makefile) provides `make build` and `make test` targets if you want to work on MiniStack itself.
196
+ - `aws_infra/docker-compose.yml` lets you run MiniStack alone for debugging.
197
+
198
+ ### Upstream sync workflow
199
+
200
+ ```bash
201
+ # From the repo root
202
+ git subtree pull --prefix=aws_infra <upstream-remote> main --squash
203
+ # Resolve any conflicts (rare, because our patches live in identifiable commits)
204
+ # Test:
205
+ pytest tests/ -k "verifier or grader"
206
+ ```
207
+
208
+ ---
209
+
210
+ ## 6. Server-side MiniStack pool (parallel rollouts)
211
+
212
+ > **Why:** GRPO training generates `G=8` rollouts per step on the same task and computes group-relative advantages. To run those 8 rollouts truly in parallel **without state bleed**, every rollout needs its own AWS world. The server-side pool makes that possible.
213
+
214
+ ### Design — [server/app.py:75–138](app.py)
215
+
216
+ When the server boots, `make_env_factory(POOL_SIZE, BASE_PORT, BACKEND_TYPE)` decides which factory to install:
217
+
218
+ | Mode | What gets created |
219
+ |-------------------------------------------------|--------------------------------------------------------------------------------|
220
+ | `BACKEND_TYPE=aws` | No pool. All sessions share `AwsStrategy`. Pool would be meaningless on real AWS. |
221
+ | `AWS_RL_ENV_POOL_SIZE=1` (default) | No pool object; one shared `SimulatorStrategy` on the default port. |
222
+ | `AWS_RL_ENV_POOL_SIZE=N` (`N>1`, simulator) | A `MiniStackPool` (thread-safe free-list of ports `BASE..BASE+N-1`). Each WebSocket session calls `pool.acquire()` to get its own MiniStack port; on disconnect `env.close()` triggers `pool.release(port)`. |
223
+
224
+ The pool's `acquire()` raises `RuntimeError("MiniStack pool exhausted")` if a 9th client tries to connect when `POOL_SIZE=8`. OpenEnv's `create_app(..., max_concurrent_envs=POOL_SIZE)` enforces the same cap upstream so callers see a clean 503 instead.
225
+
226
+ ### The Dockerfile launches N MiniStacks
227
+
228
+ The container's entrypoint starts `POOL_SIZE` MiniStack processes on ports `4566..4566+POOL_SIZE-1` before the FastAPI server is ready to accept connections. Each MiniStack runs the same image but has its own in-memory state — so the 8 rollouts cannot accidentally see each other's S3 buckets, IAM roles, etc.
229
+
230
+ ### Web playground gets its own MiniStack (lazy, on a constant port)
231
+
232
+ The pool owns `[BASE..BASE+N-1]` for WebSocket sessions. The web playground's shared `_env` cannot share those ports — a `/web/step` would clobber whichever rollout currently holds the same MiniStack. Instead, the web UI uses a **dedicated MiniStack on a constant port outside the pool's range** (`AWS_RL_ENV_WEB_MINISTACK_PORT`, default `4565`). The pool is constructed as `range(BASE, BASE+N)`, so `pool.acquire()` can never hand out the web port.
233
+
234
+ That dedicated MiniStack is **spawned lazily** by the FastAPI server on the first `/web/*` request (`subprocess.Popen(["ministack", "-d"], env={"GATEWAY_PORT": "4565", ...})`). Training-only deployments — the common case — pay zero cost: the extra MiniStack only exists if a user actually opens the playground. First request takes ~1–3s for the bind; subsequent requests are fast (cached `_env`). A startup assertion refuses to boot if `AWS_RL_ENV_WEB_MINISTACK_PORT` falls inside the pool's range.
235
+
236
+ `POOL_SIZE=1` keeps the legacy single-MiniStack path: the web env shares `:4566` with the lone pool MiniStack — no extra process, no extra port.
237
+
238
+ ### Configuration
239
+
240
+ | Env var | Default | Purpose |
241
+ |------------------------------------|---------|---------------------------------------------------------------|
242
+ | `AWS_RL_ENV_POOL_SIZE` | `1` | Number of MiniStack instances + WebSocket session capacity |
243
+ | `AWS_RL_ENV_MINISTACK_BASE_PORT` | `4566` | First MiniStack port; pool covers `[BASE, BASE + N)` |
244
+ | `AWS_RL_ENV_WEB_MINISTACK_PORT` | `4565` | Web playground's dedicated MiniStack port (lazy spawn; must lie outside the pool's range when `POOL_SIZE>1`) |
245
+ | `BACKEND_TYPE` | `simulator` | `simulator` (default, MiniStack) or `aws` (real AWS, pool disabled) |
246
+
247
+ ### Cross-link
248
+
249
+ The **client side** of this pool — the `GrpoPool` and `MultiTurnEnvPool` that open N persistent WebSocket connections and run rollouts concurrently — is documented in [scripts/README.md](../scripts/README.md). Read that doc for the full multi-turn + multi-rollout walkthrough.
250
+
251
+ ---
252
+
253
+ ## 7. Curriculum manager
254
+
255
+ [services/curriculum.py](services/curriculum.py) — 536 LOC. Adaptive task selection with mastery tracking, spaced repetition, and tier promotion.
256
+
257
+ ### Per-tier configuration
258
+
259
+ | Tier | min_episodes | advance_rate | mastery_window | mastery_threshold | fast_track_rate | chaos_probability |
260
+ |--------------|:------------:|:------------:|:--------------:|:-----------------:|:---------------:|:-----------------:|
261
+ | warmup | 5 | 0.6 | 10 | 0.7 | 0.9 | 0.0 |
262
+ | beginner | 10 | 0.65 | 10 | 0.7 | 0.9 | 0.0 |
263
+ | intermediate | 15 | 0.65 | 10 | 0.7 | 0.9 | 0.10 |
264
+ | advanced | 15 | 0.7 | 10 | 0.7 | 0.9 | 0.20 |
265
+ | expert | 20 | 0.7 | 10 | 0.7 | 0.9 | 0.30 |
266
+
267
+ ### Priority scoring
268
+
269
+ For each episode the curriculum picks the highest-scored task within the agent's current tier:
270
+
271
+ ```
272
+ score = novelty_bonus # +100 if never attempted
273
+ + weakness_weight # +50 × (1 − task_success_rate)
274
+ + spaced_rep_bonus # +30 if a graduated task is "due" for re-test
275
+ − recency_penalty # −20 if attempted in the last 2 episodes
276
+ ```
277
+
278
+ This single formula simultaneously enforces exploration (novelty), targets weak spots (weakness), prevents forgetting (spaced rep), and avoids rut behavior (recency). No hand-coded scheduling — it falls out of the score.
279
+
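+ The same formula transcribed to Python; field names are assumptions, and the real implementation lives in [services/curriculum.py](services/curriculum.py):
+
+ ```python
+ def priority(task, recent_task_ids) -> float:
+     score = 0.0
+     if task.attempts == 0:
+         score += 100                             # novelty bonus
+     score += 50 * (1.0 - task.success_rate)     # weakness weighting
+     if task.graduated and task.due_for_retest:
+         score += 30                              # spaced-repetition bonus
+     if task.task_id in recent_task_ids[-2:]:
+         score -= 20                              # recency penalty
+     return score
+ ```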
280
+ ### Mastery model
281
+
282
+ - **Window**: the last 10 episodes for each task
283
+ - **Threshold**: a task graduates when its weighted success rate crosses 0.7
284
+ - **Decay**: `0.85` exponential — recent results count for more
285
+ - **Un-graduation**: if a graduated task drops back below threshold, it loses graduation and re-enters the rotation
286
+
287
+ ### Spaced repetition
288
+
289
+ Graduated tasks resurface at intervals `[3, 6, 12, 24, 48]` episodes. Pass on re-test → interval doubles (capped at 48). Fail → interval resets to 3. The `+30` priority bonus in the scoring formula is what surfaces them.
290
+
291
+ ### Tier promotion
292
+
293
+ Two paths:
294
+
295
+ - **Standard**: `tier_episodes >= min_episodes` and `tier_success_rate >= advance_rate`
296
+ - **Fast-track**: 3 consecutive episodes at ≥ `fast_track_rate` (0.9) — bypasses the minimum
297
+
298
+ Demotion is **not** supported — the agent's "ratchet" only goes up. (Mastery on individual tasks does decay; the *tier* does not.)
299
+
300
+ ### Notable APIs
301
+
302
+ - `Curriculum.next_task() -> Task` — selection
303
+ - `Curriculum.record_result(task, achieved, reward)` — episode-level callback
304
+ - `Curriculum.get_task_by_id(task_id) -> Task` — used by the GRPO validation harness for frozen held-out tasks
305
+ - `Curriculum.get_stats() -> dict` — see §18
306
+
307
+ ---
308
+
309
+ ## 8. Reward shaping & TaskGrader
310
+
311
+ [services/task_grader.py](services/task_grader.py) — 264 LOC. The grader is the single source of reward truth.
312
+
313
+ ### Reward formula
314
+
315
+ ```
316
+ if task_achieved:
317
+ reward = 1.0
318
+ if survived_chaos: reward *= 1.05 # ≤ 1.05 cap
319
+ else:
320
+ reward = partial_progress * 0.8 # ≤ 0.8 from steps alone
321
+ if progress_increased: reward += 0.1 # dense progress signal
322
+ if command_failed: reward *= 0.5 # error penalty
323
+ reward -= 0.1 * rollback_count # create→delete pairs
324
+ reward += 0.02 * idempotent_retries # graceful "already exists"
325
+ reward = clamp(reward, 0.0, 0.99) # 1.0 reserved for completion
326
+
327
+ reward *= 0.85 ** hints_used # hint decay applied last
328
+ ```
329
+
330
+ This is **dense by design** — the agent gets meaningful feedback on every step, not just at episode end.
331
+
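+ A worked example of the shaping: a mid-episode step where progress sits at 0.5 and just increased, the command succeeded, there are no rollbacks, and one hint has been used:
+
+ ```python
+ reward = 0.5 * 0.8                      # partial progress   -> 0.40
+ reward += 0.1                           # progress increased -> 0.50
+ reward = min(max(reward, 0.0), 0.99)    # 1.0 reserved for verified completion
+ reward *= 0.85 ** 1                     # one hint used      -> 0.425
+ ```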
332
+ ### Five grading strategies (dispatcher pattern)
333
+
334
+ `TaskGrader.grade()` dispatches on `task.success_criteria.grading_strategy`:
335
+
336
+ | Tier | Strategy | Mechanism | Partial-progress source |
337
+ |--------------|---------------------------|--------------------------------------------------------------------------------------------|--------------------------------------|
338
+ | Warmup | `command_match` | Latest command contains correct service + operation | Binary 0 or 1.0 |
339
+ | Beginner | `resource_creation` | Command match (0.5) + `ResourceVerifier` confirms exact resource exists in state (1.0) | Two-stage (0.5 → 1.0) |
340
+ | Intermediate | `multi_step` | Ordered list of `(operation, resource)` pairs; credit each new step | `completed_steps / total_steps` |
341
+ | Advanced | `multi_step + services` | Same as multi_step **and** all `services_required` must be touched | `completed_steps / total_steps` (capped until services satisfied) |
342
+ | Expert | `state_checks` | `ResourceVerifier` runs arbitrary AWS CLI commands at grading time and asserts on output | `0.7 × steps + 0.3 × state_checks` |
343
+
344
+ State-check assertions support two forms:
345
+ - `output_contains: <substring>` — substring match on stdout
346
+ - `json_path: <jq-style path>` + `expected: <value>` — JSON extraction with expected value
347
+
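+ Two illustrative `state_checks` entries, one per assertion form (the dict shape is an assumption; the YAML schema in §16 is authoritative):
+
+ ```python
+ state_checks = [
+     {   # substring assertion on stdout
+         "command": "aws s3api list-buckets",
+         "output_contains": "app-config-store",
+     },
+     {   # JSON-path assertion with an expected value
+         "command": "aws s3api get-bucket-versioning --bucket app-config-store",
+         "json_path": "Status",
+         "expected": "Enabled",
+     },
+ ]
+ ```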
348
+ This per-tier polymorphism is critical: a single grading rule would be too lax for warmup or too crude for SRE tasks.
349
+
350
+ ### Chaos survival bonus
351
+
352
+ If `ChaosEngine` injected a mutation during the episode and the agent still completed, reward is `1.05` instead of `1.0` (5% bonus) — and that bonus *stacks under* hint decay (so the agent that solves a chaotic task without hints gets the maximum).
353
+
354
+ ### Rollback penalty & idempotency bonus
355
+
356
+ - **Rollback** (`-0.1` per pair): `EpisodeTracker.detect_rollbacks()` scans the command history for `(create-X, … , delete-X)` pairs on the same resource. Production-style waste — heavily penalized.
357
+ - **Idempotency** (`+0.02`): if a command fails with a known "already exists" pattern (`BucketAlreadyExists`, `ResourceInUseException`, etc.) and the next command continues productively, the agent is rewarded for graceful retry behavior.
358
+
359
+ This is the first RL environment we know of that rewards *operational discipline* directly.
360
+
361
+ ---
362
+
363
+ ## 9. Anti-reward-hacking — 8 defense layers
364
+
365
+ The agent's only loss surface is the reward signal. We harden it so that the cheapest path to a high reward is *actually doing the task* — not gaming the grader.
366
+
367
+ ### Layer 1 — Ground-truth verification via MiniStack
368
+
369
+ The grader **never trusts agent command output**. For every resource check it asks `ResourceVerifier` to query MiniStack (or AWS) directly. Even if the agent crafts a perfectly-formed fake JSON response, the grader looks at server-side state.
370
+
371
+ > ResourceVerifier covers 20+ services with bespoke verification methods (S3, DynamoDB, Lambda, SQS, SNS, IAM, Secrets Manager, API Gateway, Cognito, RDS, EFS, ElastiCache, EC2, Step Functions, Glue, Athena, EMR, ECS, EKS, EventBridge, Kinesis, …).
372
+
373
+ ### Layer 2 — Deduplication
374
+
375
+ `EpisodeTracker.has_executed_operation()` records every `(operation, resource)` pair that earned credit. Re-running the same successful command does **not** re-earn `partial_progress`. Each unique operation pays out exactly once.
376
+
377
+ ### Layer 3 — Grader invisibility
378
+
379
+ The CLI commands run by `ResourceVerifier` and `state_checks` happen **server-side** and are not echoed into the agent's observation. The agent never sees which queries the grader is making, so it cannot reverse-engineer "fake outputs" that match the grader's expectations.
380
+
381
+ ### Layer 4 — Command allow-listing
382
+
383
+ `step()` rejects anything that doesn't start with `aws ` (`success=False`, no execution). No shell metacharacters, no piping, no redirection, no escape from the AWS CLI sandbox.
384
+
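+ The gate itself is tiny. A sketch, with the response type assumed:
+
+ ```python
+ # Reject anything that is not an AWS CLI invocation before it executes.
+ if not command.strip().startswith("aws "):
+     return StepResult(success=False, stderr="only `aws ...` commands are accepted")
+ ```
+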
385
+ ### Layer 5 — No verification reward
386
+
387
+ If the agent's command exactly matches one of the task's `state_checks` commands (e.g. `aws s3api get-bucket-versioning --bucket app-config-store`), it gets **zero** progress credit. Only mutating commands (create / put / update / delete) earn credit. Read-only auditing is freely allowed but never rewarded, so replaying the grader's own checks earns the agent nothing.
388
+
389
+ ### Layer 6 — Monotonic progress
390
+
391
+ `partial_progress` only ever increases within an episode. It is clamped at `0.99`; reaching `1.0` requires fully verified completion. The agent cannot lose progress, and a resource it deletes and recreates never pays out twice, so cycling strategies (create → delete → create) yield zero net gain.
392
+
393
+ ### Layer 7 — Resource-name validation
394
+
395
+ `ResourceVerifier` checks the **exact** resource name from the task definition. Creating `my-test-bucket-2` does not satisfy a check for `my-test-bucket`. The agent cannot creatively name its way around the spec.
396
+
397
+ ### Layer 8 — State checks verify the final state
398
+
399
+ For expert SRE tasks, the grader runs the canonical `state_checks` commands at grading time against the live MiniStack. The grade is "what is true now?", not "what did the agent claim?". This is the single hardest layer to circumvent.
400
+
401
+ These layers compose: even if one is bypassed (say, a resource named precisely to fool a single matcher), the others still independently produce the right reward.
402
+
403
+ ---
404
+
405
+ ## 10. Resource verifier
406
+
407
+ [services/resource_verifier.py](services/resource_verifier.py) — 362 LOC.
408
+
409
+ - **Per-service `verify_*` methods** for 20+ AWS services. Each method knows which API calls expose state for that service and how to read the response (e.g. `verify_s3_bucket(name)` calls `s3api list-buckets`, `verify_dynamodb_table(name)` calls `dynamodb describe-table`, etc.).
410
+ - **Single-shot state path**: when called via `SimulatorStrategy.get_state()`, the verifier reads MiniStack's custom `/_ministack/state` endpoint (added in commit `a648c3a`, see §5) which returns the full infra inventory in one HTTP call. This is dramatically faster than iterating 20+ list APIs per grading pass.
411
+ - **State-check evaluator**: handles `output_contains` (substring) and `json_path` + `expected` (JSON extraction with deep-path support) assertion types used by expert-tier tasks; a minimal sketch follows this list.
412
+ - **Live ground-truth source** — the verifier never consumes the agent's stdout. Always fresh state from the simulator.
413
+
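+ A minimal sketch of the two assertion forms (the function shape is an assumption):
+
+ ```python
+ import json
+
+ def check_assertion(stdout: str, assertion: dict) -> bool:
+     # Form 1: substring match on stdout.
+     if "output_contains" in assertion:
+         return assertion["output_contains"] in stdout
+     # Form 2: jq-style path into the JSON output, compared to an expected value.
+     value = json.loads(stdout)
+     for key in assertion["json_path"].lstrip(".").split("."):
+         value = value[int(key)] if isinstance(value, list) else value[key]
+     return value == assertion["expected"]
+ ```
+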
414
+ ---
415
+
416
+ ## 11. Chaos engine
417
+
418
+ [services/chaos_engine.py](services/chaos_engine.py) — 168 LOC.
419
+
420
+ Probabilistically perturbs AWS resource state mid-episode. Tests whether the agent can detect and recover from unexpected drift — a critical SRE skill.
421
+
422
+ - **Tier-scaled probability**: 0% warmup/beginner, 10% intermediate, 20% advanced, 30% expert
423
+ - **Service-scoped templates**: a chaos roll only fires on services the current task is touching. Resource names are extracted from the agent's recent successful commands via service-specific regex (e.g. `aws s3 mb s3://(\S+)` → bucket name).
424
+ - **Five service templates**: S3 policy / versioning changes, DynamoDB throughput modifications, Lambda configuration alterations, IAM detach-role-policy, SNS subscription mutations
425
+ - **Silent**: chaos commands run server-side; the agent observes only the *consequence* (a state inconsistency), never the cause
426
+ - **Reward bonus**: surviving chaos and completing the task pays `1.05` instead of `1.0`
427
+
428
+ The combination of "tier-scaled probability" + "task-scoped resource selection" means chaos is rare for warmup tasks (0%) and frequent for SRE tasks (30%) — exactly where it matters.
429
+
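+ The tier scaling reduces to a lookup plus one random draw, roughly:
+
+ ```python
+ import random
+
+ # Per-tier probabilities from the list above.
+ CHAOS_PROB = {"warmup": 0.0, "beginner": 0.0, "intermediate": 0.1,
+               "advanced": 0.2, "expert": 0.3}
+
+ def chaos_fires(difficulty: str) -> bool:
+     return random.random() < CHAOS_PROB[difficulty]
+ ```
+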
430
+ ---
431
+
432
+ ## 12. Drift engine
433
+
434
+ [services/drift_engine.py](services/drift_engine.py) — 67 LOC.
435
+
436
+ Specialised for the 6 drift-detection expert tasks defined in [services/tasks/drift.yaml](services/tasks/drift.yaml).
437
+
438
+ - Each drift task ships a pool of `possible_drifts` (each a small list of CLI commands that mutates a resource away from the desired spec).
439
+ - On `reset()`, the engine **randomly selects 2–3 drifts** from that pool and applies them after the setup-command phase (see the sketch after this list).
440
+ - The agent sees a `desired_state_spec` (natural language) and must audit the environment, identify which resources drifted, and fix only those.
441
+ - Random selection per episode means **no memorization** — the agent must reason about desired vs actual state, not recall a fix script.
442
+ - Examples: S3 versioning/encryption drift, DynamoDB throughput changes, SNS subscription modifications, Lambda env-var tampering.
443
+
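+ The per-episode selection is a two-line sample, roughly:
+
+ ```python
+ import random
+
+ # Pick 2-3 mutations from the task's pool; applied after setup commands.
+ def pick_drifts(possible_drifts: list) -> list:
+     k = random.randint(2, min(3, len(possible_drifts)))
+     return random.sample(possible_drifts, k)
+ ```
+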
444
+ ---
445
+
446
+ ## 13. Hint provider
447
+
448
+ [services/hint_provider.py](services/hint_provider.py) — 137 LOC.
449
+
450
+ Three-level progressive hints, requested via the special action `aws help --task-hint`:
451
+
452
+ | Level | What it reveals | Example |
453
+ |-------|---------------------------------------|----------------------------------------------------------|
454
+ | 1 | Required AWS services | "You'll need IAM and Lambda" |
455
+ | 2 | Operation sequence | "Start with `create-role`, then `put-role-policy`" |
456
+ | 3 | Near-complete command structure | "Use: `aws iam create-role --role-name …`" |
457
+
458
+ - Hints are **auto-derived** from the `SuccessCriteria` fields (services list, ordered steps, operation names) — no hand-written hint text per task.
459
+ - Reward decay: `final_reward *= 0.85 ** hints_used`. With three hints (max), the agent caps at `0.85³ ≈ 0.614` of normal reward; the full decay table is sketched after this list.
460
+ - The hint command is **intercepted before reaching MiniStack** so it does not consume an episode step nor affect simulator state.
461
+
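+ The decay schedule, tabulated (the values are just `0.85 ** n`):
+
+ ```python
+ for hints_used in range(4):
+     print(hints_used, 0.85 ** hints_used)
+ # 0: 1.0 · 1: 0.85 · 2: 0.7225 · 3: ~0.614
+ ```
+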
462
+ ---
463
+
464
+ ## 14. Episode tracker
465
+
466
+ [services/episode_tracker.py](services/episode_tracker.py) — 241 LOC.
467
+
468
+ Single source of per-episode state. Maintains:
469
+
470
+ - Step count, hint count, command history (raw + parsed)
471
+ - `partial_progress: float ∈ [0, 1]` (monotonic — see anti-hack layer 6)
472
+ - `credited_operations: set[(operation, resource)]` (for dedup — anti-hack layer 2)
473
+ - Rollback detection: scans history for `(create-X, …, delete-X)` pairs on same resource
474
+ - Idempotency detection: looks for known "already exists" error patterns
475
+
476
+ Parses each AWS CLI invocation into a structured tuple `(service, operation, resource_name)` for downstream services to query without re-parsing.
477
+
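+ A hypothetical version of that parse (the real flag list is longer):
+
+ ```python
+ import shlex
+
+ def parse_command(command: str):
+     tokens = shlex.split(command)   # e.g. "aws s3api create-bucket --bucket x"
+     service, operation = tokens[1], tokens[2]
+     resource = None
+     for flag in ("--bucket", "--table-name", "--function-name", "--role-name"):
+         if flag in tokens:
+             resource = tokens[tokens.index(flag) + 1]
+     return service, operation, resource
+ ```
+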
478
+ ---
479
+
480
+ ## 15. Environment designer
481
+
482
+ [services/environment_designer.py](services/environment_designer.py) — 99 LOC.
483
+
484
+ Provisioning helper for SRE / security-posture / drift tasks. A task can declare `setup_commands: list[SetupCommand]` — these are executed (server-side) **before** the agent starts so the world begins in a deliberately broken / insecure / over-provisioned state. Examples:
485
+
486
+ - "Public S3 bucket lockdown" (§17): creates `public-assets` with a wide-open bucket policy
487
+ - "IAM least-privilege": creates `app-role` with `Action: *` / `Resource: *`
488
+ - Drift tasks: provision the *correct* infra so the drift engine can mutate it
489
+
490
+ Setup failures abort the reset — partial setup is never exposed to the agent.
491
+
492
+ ---
493
+
494
+ ## 16. Task definitions (YAML schema)
495
+
496
+ [services/tasks/](services/tasks/) — one YAML file per tier:
497
+
498
+ - [warmup.yaml](services/tasks/warmup.yaml) — 25 listing tasks
499
+ - [beginner.yaml](services/tasks/beginner.yaml) — 25 single-resource creation tasks
500
+ - [intermediate.yaml](services/tasks/intermediate.yaml) — 25 multi-step workflows
501
+ - [advanced.yaml](services/tasks/advanced.yaml) — 25 cross-service architectures
502
+ - [expert.yaml](services/tasks/expert.yaml) — 24 SRE / security tasks
503
+ - [drift.yaml](services/tasks/drift.yaml) — 9 drift detection tasks
504
+
505
+ Sample task:
506
+
507
+ ```yaml
508
+ - task_id: 42
509
+ description: Create an S3 bucket named my-app-data and enable versioning on it.
510
+ difficulty: intermediate
511
+ success_criteria:
512
+ grading_strategy: multi_step
513
+ steps:
514
+ - operation: create-bucket
515
+ resource: my-app-data
516
+ - operation: put-bucket-versioning
517
+ resource: my-app-data
518
+ services: [s3]
519
+ setup_commands: []
520
+ possible_drifts: []
521
+ ```
522
+
523
+ Expert / drift tasks add `state_checks`, `desired_state_spec`, and `setup_commands`.
524
+
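+ Loading a tier file follows directly from the schema above (a sketch, assuming PyYAML):
+
+ ```python
+ import yaml
+
+ with open("services/tasks/intermediate.yaml") as f:
+     tasks = yaml.safe_load(f)      # top level is a list of task dicts
+
+ first = tasks[0]
+ print(first["task_id"], first["success_criteria"]["grading_strategy"])
+ ```
+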
525
+ ---
526
+
527
+ ## 17. Security-posture audit examples
528
+
529
+ These three expert-tier tasks test reasoning about *configuration state* — the infra is functional but insecure. The agent must read existing config and recognize the vulnerability.
530
+
531
+ ### Public S3 bucket lockdown
532
+
533
+ - **Setup**: bucket `public-assets` is provisioned with a bucket policy granting `Principal: *` access
534
+ - **Task**: replace the policy so only IAM role `app-role` can `s3:GetObject`
535
+ - **State checks**: bucket policy denies `Principal: *`, allows only `app-role`
536
+
537
+ ### IAM least privilege
538
+
539
+ - **Setup**: role `app-role` exists with an inline policy `Action: *, Resource: *`
540
+ - **Task**: replace with a least-privilege policy allowing only `dynamodb:GetItem` and `dynamodb:PutItem` on the users table
541
+ - **State checks**: policy document matches the expected ARN-scoped permissions
542
+
543
+ ### Lambda secret rotation
544
+
545
+ - **Setup**: Lambda `data-processor` has env var `DB_PASSWORD=hunter2` (plaintext)
546
+ - **Task**: create a Secrets Manager secret, add `SECRET_ARN` env var, remove `DB_PASSWORD`
547
+ - **State checks**: secret exists, Lambda has `SECRET_ARN`, no `DB_PASSWORD` remains
548
+
549
+ These are not hypothetical scenarios — they're the most common cloud-misconfiguration findings in real audits.
550
+
551
+ ---
552
+
553
+ ## 18. Curriculum stats API
554
+
555
+ `Curriculum.get_stats()` returns:
556
+
557
+ ```python
558
+ {
559
+ "episode_count": 42,
560
+ "tier": "intermediate",
561
+ "tier_episodes": 12,
562
+ "tier_success_rate": 0.75,
563
+ "graduated_tasks": [0, 2, 4],
564
+ "weak_spots": [11, 12],
565
+ "skill_profile": {0: 0.95, 1: 0.8, ...}, # per-task weighted success
566
+ "spaced_rep_due": [0, 2], # graduated tasks due for re-test
567
+ "avg_reward_last_10": 0.65,
568
+ }
569
+ ```
570
+
571
+ Useful for:
572
+ - Dashboarding training progress
573
+ - Logging to the GRPO `EpisodeLogger` CSV (see [train_grpo.py:635](../train_grpo.py))
574
+ - Driving the web playground's progress bar
575
+
576
+ ---
577
+
578
+ ## 19. Web playground
579
+
580
+ Always mounted at [http://localhost:8000/web](http://localhost:8000/web). When `POOL_SIZE>1` the playground is backed by a **dedicated lazy-spawned MiniStack** on `AWS_RL_ENV_WEB_MINISTACK_PORT` (default `4565`) — see §6. First request takes ~1–3s while that MiniStack binds; subsequent requests are fast.
581
+
582
+ - HTML: [server/templates/index.html](templates/index.html)
583
+ - Static assets: [server/static/](static/) — CSS, JS, and **40 AWS service icons** in [server/static/img/aws/](static/img/aws/)
584
+ - The playground talks to `/web/reset`, `/web/step`, `/web/state`, and `/web/solution` (the last one reveals the next canonical solution command — handy for demos and debugging task definitions).
585
+
586
+ The playground runs a **single shared environment instance** on its own MiniStack (or, with `POOL_SIZE=1`, the lone pool MiniStack on `:4566`). It is intentionally separate from the per-WebSocket sessions used during training so a curious user clicking around the web UI cannot interfere with an active GRPO rollout.
587
+
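+ Driving the playground from a script is just two POSTs. A sketch (the `command` field name inside the action is an assumption about `AwsRlAction`):
+
+ ```python
+ import requests
+
+ BASE = "http://localhost:8000"
+
+ obs = requests.post(f"{BASE}/web/reset").json()
+ print(obs["reward"], obs["done"])
+
+ step = requests.post(f"{BASE}/web/step",
+                      json={"action": {"command": "aws s3 ls"}}).json()
+ print(step["reward"], step["done"])
+ ```
+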
588
+ ---
589
+
590
+ ## See also
591
+
592
+ - [Main README](../README.md) — project overview, results, Colab links
593
+ - [scripts/README.md](../scripts/README.md) — client-side parallel rollout pool (`GrpoPool`, `MultiTurnEnvPool`, asyncio orchestration)
594
+ - [train/README.md](../train/README.md) — SFT + GRPO training pipeline
595
+ - [data/README.md](../data/README.md) — dataset generation + base-model selection
596
+ - [aws_infra/README.md](../aws_infra/README.md) — vendored MiniStack upstream docs (81 KB)
server/app.py CHANGED
@@ -28,8 +28,14 @@ Usage:
28
  python -m server.app
29
  """
30
 
 
31
  import os
 
 
 
 
32
  import threading
 
33
  from pathlib import Path
34
  from typing import Any, Callable, Dict, Iterable
35
 
@@ -71,6 +77,22 @@ POOL_SIZE = max(int(os.getenv("AWS_RL_ENV_POOL_SIZE", "1")), 1)
71
  BASE_MINISTACK_PORT = int(os.getenv("AWS_RL_ENV_MINISTACK_BASE_PORT", "4566"))
72
  BACKEND_TYPE = os.getenv("BACKEND_TYPE", "simulator") # "simulator" | "aws"
73
 
74
 
75
  class MiniStackPool:
76
  """Thread-safe free-list of MiniStack ports.
@@ -156,84 +178,147 @@ app = create_app(
156
  # The web playground needs state across requests, so we maintain a shared
157
  # environment instance and expose /web/reset and /web/step.
158
  #
159
- # Only mounted when POOL_SIZE <= 1. With a pool active, port 4566 is
160
- # claimed by the pool and a shared web _env would collide with the
161
- # per-session MiniStacks.
162
- # If POOL_SIZE=8 and web mounts anyway, the module-level _env = AwsRlEnvironment()
163
- # defaults to http://localhost:4566 which is also in the pool's range.
164
- # Any /web/step clobbers the MiniStack currently held by a WS session that
165
- # acquired port 4566. State corrupts both ways: web user's bucket appears in a
166
- # GRPO rollout; pool rollout's drift mutations show up in the web UI.
167
 
 
 
168
 
169
- # ---------------------------------------------------------------------------
170
 
171
- if POOL_SIZE <= 1:
172
- _env = AwsRlEnvironment()
 
 
173
 
174
- class WebStepRequest(BaseModel):
175
- action: Dict[str, Any]
176
 
177
- @app.post("/web/reset", include_in_schema=False)
178
- async def web_reset():
179
- obs = _env.reset()
180
- return {
181
- "observation": obs.model_dump(),
182
- "reward": obs.reward,
183
- "done": obs.done,
184
- }
185
 
186
- @app.get("/web/solution", include_in_schema=False)
187
- async def web_solution():
188
- """Return the next solution command for the current task step."""
189
- if not _env._current_task:
190
- return {
191
- "command": None,
192
- "error": "No active task. Start a new episode first.",
193
- }
194
-
195
- from server.services.task_solutions import get_next_solution
196
-
197
- result = get_next_solution(
198
- task_id=_env._current_task.task_id,
199
- backend=_env._backend,
200
- tracker=_env._tracker,
201
- )
202
- result["task_id"] = _env._current_task.task_id
203
- return result
204
-
205
- @app.get("/web/state", include_in_schema=False)
206
- async def web_state():
207
- """Return the full AwsRlState for the web UI."""
208
- return _env.state.model_dump()
209
-
210
- @app.post("/web/step", include_in_schema=False)
211
- async def web_step(request: WebStepRequest = Body(...)):
212
- action = AwsRlAction(**request.action)
213
- obs = _env.step(action)
214
  return {
215
- "observation": obs.model_dump(),
216
- "reward": obs.reward,
217
- "done": obs.done,
218
  }
219
 
220
- # ---------------------------------------------------------------------------
221
- # Custom web UI
222
- # ---------------------------------------------------------------------------
223
 
224
- _server_dir = Path(__file__).parent
225
- _templates = Jinja2Templates(directory=str(_server_dir / "templates"))
226
- app.mount(
227
- "/static", StaticFiles(directory=str(_server_dir / "static")), name="static"
228
  )
 
 
229
 
230
- @app.get("/", response_class=RedirectResponse, include_in_schema=False)
231
- async def root_redirect():
232
- return RedirectResponse(url="/web")
233
 
234
- @app.get("/web", response_class=HTMLResponse, include_in_schema=False)
235
- async def web_ui(request: Request):
236
- return _templates.TemplateResponse(request=request, name="index.html")
237
 
238
 
239
  def main(host: str = "0.0.0.0", port: int = 8000):
 
28
  python -m server.app
29
  """
30
 
31
+ import asyncio
32
  import os
33
+ import shutil
34
+ import socket
35
+ import subprocess
36
+ import sys
37
  import threading
38
+ import time
39
  from pathlib import Path
40
  from typing import Any, Callable, Dict, Iterable
41
 
 
77
  BASE_MINISTACK_PORT = int(os.getenv("AWS_RL_ENV_MINISTACK_BASE_PORT", "4566"))
78
  BACKEND_TYPE = os.getenv("BACKEND_TYPE", "simulator") # "simulator" | "aws"
79
 
80
+ # Constant, dedicated MiniStack port for the web playground. Kept outside the
81
+ # pool's range so a WebSocket session can never acquire it, eliminating the
82
+ # state-bleed risk that previously gated the web UI when POOL_SIZE > 1.
83
+ WEB_MINISTACK_PORT = int(os.getenv("AWS_RL_ENV_WEB_MINISTACK_PORT", "4565"))
84
+
85
+ if (
86
+ BACKEND_TYPE != "aws"
87
+ and POOL_SIZE > 1
88
+ and BASE_MINISTACK_PORT <= WEB_MINISTACK_PORT < BASE_MINISTACK_PORT + POOL_SIZE
89
+ ):
90
+ raise RuntimeError(
91
+ f"AWS_RL_ENV_WEB_MINISTACK_PORT={WEB_MINISTACK_PORT} collides with pool range "
92
+ f"[{BASE_MINISTACK_PORT}..{BASE_MINISTACK_PORT + POOL_SIZE - 1}]. "
93
+ f"Pick a port outside the pool's range."
94
+ )
95
+
96
 
97
  class MiniStackPool:
98
  """Thread-safe free-list of MiniStack ports.
 
178
  # The web playground needs state across requests, so we maintain a shared
179
  # environment instance and expose /web/reset and /web/step.
180
  #
181
+ # When POOL_SIZE > 1 the pool owns [BASE..BASE+N-1]; the web UI uses a
182
+ # dedicated MiniStack on WEB_MINISTACK_PORT (constant, outside the pool's
183
+ # range) so it can never collide with a WebSocket session. That MiniStack is
184
+ # spawned lazily on the first /web/* request — training-only deployments pay
185
+ # zero cost. Subsequent requests reuse the cached _web_env.
186
+ # ---------------------------------------------------------------------------
 
 
187
 
188
+ _web_env: AwsRlEnvironment | None = None
189
+ _web_env_lock = threading.Lock()
190
 
 
191
 
192
+ def _port_listening(port: int) -> bool:
193
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
194
+ s.settimeout(0.2)
195
+ return s.connect_ex(("127.0.0.1", port)) == 0
196
 
 
 
197
 
198
+ def _resolve_ministack_bin() -> str:
199
+ """Find the ministack entry point. Prefer the same venv as the running
200
+ Python (sys.executable's bin dir) before falling back to PATH — uvicorn
201
+ invoked via /full/path/to/.venv/bin/uvicorn doesn't always have the venv
202
+ on PATH, so a bare "ministack" lookup would FileNotFoundError.
203
+ """
204
+ candidate = Path(sys.executable).parent / "ministack"
205
+ if candidate.exists():
206
+ return str(candidate)
207
+ on_path = shutil.which("ministack")
208
+ if on_path:
209
+ return on_path
210
+ raise RuntimeError(
211
+ "Could not find the 'ministack' executable. Install with `uv sync` "
212
+ "or ensure the active venv's bin directory is on PATH."
213
+ )
214
+
215
+
216
+ def _spawn_web_ministack(port: int, timeout_s: float = 10.0) -> None:
217
+ if _port_listening(port):
218
+ return
219
+ subprocess.Popen(
220
+ [_resolve_ministack_bin(), "-d"],
221
+ env={**os.environ, "GATEWAY_PORT": str(port)},
222
+ stdout=subprocess.DEVNULL,
223
+ stderr=subprocess.DEVNULL,
224
+ )
225
+ deadline = time.monotonic() + timeout_s
226
+ while time.monotonic() < deadline:
227
+ if _port_listening(port):
228
+ return
229
+ time.sleep(0.1)
230
+ raise RuntimeError(f"Web MiniStack failed to bind {port} within {timeout_s}s")
231
+
232
+
233
+ def _get_web_env() -> AwsRlEnvironment:
234
+ global _web_env
235
+ if _web_env is not None:
236
+ return _web_env
237
+ with _web_env_lock:
238
+ if _web_env is not None:
239
+ return _web_env
240
+ if BACKEND_TYPE == "aws":
241
+ _web_env = AwsRlEnvironment(strategy=AwsStrategy())
242
+ elif POOL_SIZE > 1:
243
+ _spawn_web_ministack(WEB_MINISTACK_PORT)
244
+ _web_env = AwsRlEnvironment(
245
+ strategy=SimulatorStrategy(f"http://localhost:{WEB_MINISTACK_PORT}")
246
+ )
247
+ else:
248
+ _web_env = AwsRlEnvironment()
249
+ return _web_env
250
+
251
 
252
+ class WebStepRequest(BaseModel):
253
+ action: Dict[str, Any]
254
+
255
+
256
+ @app.post("/web/reset", include_in_schema=False)
257
+ async def web_reset():
258
+ env = await asyncio.to_thread(_get_web_env)
259
+ obs = env.reset()
260
+ return {
261
+ "observation": obs.model_dump(),
262
+ "reward": obs.reward,
263
+ "done": obs.done,
264
+ }
265
+
266
+
267
+ @app.get("/web/solution", include_in_schema=False)
268
+ async def web_solution():
269
+ """Return the next solution command for the current task step."""
270
+ env = await asyncio.to_thread(_get_web_env)
271
+ if not env._current_task:
 
272
  return {
273
+ "command": None,
274
+ "error": "No active task. Start a new episode first.",
 
275
  }
276
 
277
+ from server.services.task_solutions import get_next_solution
 
 
278
 
279
+ result = get_next_solution(
280
+ task_id=env._current_task.task_id,
281
+ backend=env._backend,
282
+ tracker=env._tracker,
283
  )
284
+ result["task_id"] = env._current_task.task_id
285
+ return result
286
+
287
+
288
+ @app.get("/web/state", include_in_schema=False)
289
+ async def web_state():
290
+ """Return the full AwsRlState for the web UI."""
291
+ env = await asyncio.to_thread(_get_web_env)
292
+ return env.state.model_dump()
293
+
294
+
295
+ @app.post("/web/step", include_in_schema=False)
296
+ async def web_step(request: WebStepRequest = Body(...)):
297
+ env = await asyncio.to_thread(_get_web_env)
298
+ action = AwsRlAction(**request.action)
299
+ obs = env.step(action)
300
+ return {
301
+ "observation": obs.model_dump(),
302
+ "reward": obs.reward,
303
+ "done": obs.done,
304
+ }
305
+
306
+
307
+ _server_dir = Path(__file__).parent
308
+ _templates = Jinja2Templates(directory=str(_server_dir / "templates"))
309
+ app.mount(
310
+ "/static", StaticFiles(directory=str(_server_dir / "static")), name="static"
311
+ )
312
+
313
+
314
+ @app.get("/", response_class=RedirectResponse, include_in_schema=False)
315
+ async def root_redirect():
316
+ return RedirectResponse(url="/web")
317
 
 
 
 
318
 
319
+ @app.get("/web", response_class=HTMLResponse, include_in_schema=False)
320
+ async def web_ui(request: Request):
321
+ return _templates.TemplateResponse(request=request, name="index.html")
322
 
323
 
324
  def main(host: str = "0.0.0.0", port: int = 8000):
tests/test_pool.py CHANGED
@@ -360,3 +360,328 @@ class TestFactoryConcurrencyIntegration:
360
  t.join()
361
 
362
  assert pool.free_count == 20
 
360
  t.join()
361
 
362
  assert pool.free_count == 20
363
+
364
+
365
+ # ---------------------------------------------------------------------------
366
+ # Web playground coexistence with the MiniStack pool
367
+ # ---------------------------------------------------------------------------
368
+
369
+
370
+ def _run_in_subprocess(env_overrides: dict[str, str], code: str) -> tuple[int, str, str]:
371
+ """Run `code` in a fresh subprocess with the given env overrides.
372
+
373
+ Mirrors the pattern used by TestServerAppImportIsSafeForLegacyPoolSizes
374
+ to avoid module-cache pollution across env-var changes.
375
+ """
376
+ import os
377
+ import subprocess
378
+ import sys
379
+
380
+ env = {**os.environ, **env_overrides}
381
+ result = subprocess.run(
382
+ [sys.executable, "-c", code],
383
+ env=env,
384
+ capture_output=True,
385
+ text=True,
386
+ check=False,
387
+ )
388
+ return result.returncode, result.stdout, result.stderr
389
+
390
+
391
+ class TestWebRoutesMountUnconditionally:
392
+ """The web playground used to be gated on POOL_SIZE <= 1. It now mounts
393
+ regardless of pool size, with a dedicated lazy MiniStack on
394
+ AWS_RL_ENV_WEB_MINISTACK_PORT.
395
+ """
396
+
397
+ def test_web_routes_present_when_pool_size_8(self) -> None:
398
+ code = (
399
+ "import server.app as m;"
400
+ "paths = {getattr(r, 'path', None) for r in m.app.routes};"
401
+ "import sys;"
402
+ "missing = {'/web', '/web/reset', '/web/state', '/web/step', '/web/solution'} - paths;"
403
+ "sys.stdout.write('MISSING=' + repr(missing))"
404
+ )
405
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "8"}, code)
406
+ assert rc == 0, f"import failed: {err}"
407
+ assert "MISSING=set()" in out, out
408
+
409
+ def test_web_routes_present_when_pool_size_1(self) -> None:
410
+ code = (
411
+ "import server.app as m;"
412
+ "paths = {getattr(r, 'path', None) for r in m.app.routes};"
413
+ "import sys;"
414
+ "missing = {'/web', '/web/reset', '/web/state', '/web/step', '/web/solution'} - paths;"
415
+ "sys.stdout.write('MISSING=' + repr(missing))"
416
+ )
417
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "1"}, code)
418
+ assert rc == 0, f"import failed: {err}"
419
+ assert "MISSING=set()" in out, out
420
+
421
+
422
+ class TestWebMiniStackPortConflictDetection:
423
+ """The startup-time guard refuses to boot if the configured web port falls
424
+ inside the pool's port range. Without it, a WebSocket session could acquire
425
+ the same port the web _env writes to and corrupt state in both directions.
426
+ """
427
+
428
+ def test_collision_inside_pool_range_raises(self) -> None:
429
+ code = "import server.app"
430
+ rc, _, err = _run_in_subprocess(
431
+ {
432
+ "AWS_RL_ENV_POOL_SIZE": "8",
433
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
434
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4570", # inside [4566..4573]
435
+ },
436
+ code,
437
+ )
438
+ assert rc != 0
439
+ assert "collides with pool range" in err
440
+
441
+ def test_web_port_just_below_pool_range_is_allowed(self) -> None:
442
+ code = "import server.app"
443
+ rc, _, err = _run_in_subprocess(
444
+ {
445
+ "AWS_RL_ENV_POOL_SIZE": "8",
446
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
447
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4565", # default
448
+ },
449
+ code,
450
+ )
451
+ assert rc == 0, err
452
+
453
+ def test_web_port_just_above_pool_range_is_allowed(self) -> None:
454
+ code = "import server.app"
455
+ rc, _, err = _run_in_subprocess(
456
+ {
457
+ "AWS_RL_ENV_POOL_SIZE": "8",
458
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
459
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4574", # one past 4573
460
+ },
461
+ code,
462
+ )
463
+ assert rc == 0, err
464
+
465
+ def test_collision_check_skipped_when_pool_size_1(self) -> None:
466
+ """POOL_SIZE=1 means no pool object exists, so the constant web port
467
+ is allowed to coincide with BASE_PORT (it just means the web env
468
+ shares the lone MiniStack). Backward-compat for legacy single-mode.
469
+ """
470
+ code = "import server.app"
471
+ rc, _, err = _run_in_subprocess(
472
+ {
473
+ "AWS_RL_ENV_POOL_SIZE": "1",
474
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
475
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4566",
476
+ },
477
+ code,
478
+ )
479
+ assert rc == 0, err
480
+
481
+ def test_collision_check_skipped_when_backend_aws(self) -> None:
482
+ """BACKEND_TYPE=aws skips the pool entirely (all sessions share
483
+ AwsStrategy), so a "collision" with the pool's range is hypothetical
484
+ — the pool object is never constructed. Refusing to boot here would
485
+ be a false positive.
486
+ """
487
+ code = "import server.app"
488
+ rc, _, err = _run_in_subprocess(
489
+ {
490
+ "AWS_RL_ENV_POOL_SIZE": "8",
491
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
492
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4570", # would collide if simulator
493
+ "BACKEND_TYPE": "aws",
494
+ },
495
+ code,
496
+ )
497
+ assert rc == 0, err
498
+
499
+
500
+ class TestWebEnvLazyConstruction:
501
+ def test_web_env_is_none_immediately_after_import(self) -> None:
502
+ """Lazy: the dedicated MiniStack should NOT spawn until a /web/*
503
+ request arrives. Importing the module must not subprocess anything.
504
+ """
505
+ code = (
506
+ "import server.app as m;"
507
+ "import sys;"
508
+ "sys.stdout.write('\\nRESULT=' + ('NONE' if m._web_env is None else 'NOT_NONE'))"
509
+ )
510
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "8"}, code)
511
+ assert rc == 0, err
512
+ assert out.strip().splitlines()[-1] == "RESULT=NONE"
513
+
514
+ def test_get_web_env_legacy_uses_default_port_for_pool_size_1(self) -> None:
515
+ """POOL_SIZE=1: web env shares the single MiniStack on :4566 — the
516
+ original behavior, locked down so it doesn't drift.
517
+ """
518
+ code = (
519
+ "import server.app as m;"
520
+ "env = m._get_web_env();"
521
+ "import sys;"
522
+ "sys.stdout.write('\\nRESULT=' + env._backend._aws_infra_url)"
523
+ )
524
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "1"}, code)
525
+ assert rc == 0, err
526
+ assert out.strip().splitlines()[-1] == "RESULT=http://localhost:4566"
527
+
528
+ def test_get_web_env_uses_aws_strategy_when_backend_aws(self) -> None:
529
+ """BACKEND_TYPE=aws: web env wires AwsStrategy too. No MiniStack spawn.
530
+ Fixes the latent inconsistency where the web playground always used
531
+ the simulator regardless of training backend.
532
+ """
533
+ code = (
534
+ "import server.app as m;"
535
+ "from server.services.aws_strategy import AwsStrategy;"
536
+ "env = m._get_web_env();"
537
+ "import sys;"
538
+ "sys.stdout.write('\\nRESULT=' + ('AWS' if isinstance(env._backend, AwsStrategy) else 'NOT_AWS'))"
539
+ )
540
+ rc, out, err = _run_in_subprocess(
541
+ {"AWS_RL_ENV_POOL_SIZE": "8", "BACKEND_TYPE": "aws"},
542
+ code,
543
+ )
544
+ assert rc == 0, err
545
+ assert out.strip().splitlines()[-1] == "RESULT=AWS"
546
+
547
+
548
+ class TestSpawnWebMiniStackShortCircuit:
549
+ """`_spawn_web_ministack` must not subprocess if the port is already
550
+ listening — otherwise a server restart would race against the existing
551
+ detached MiniStack and stall on the bind check.
552
+ """
553
+
554
+ def test_does_not_spawn_when_port_already_listening(self) -> None:
555
+ import socket
556
+
557
+ from server.app import _spawn_web_ministack
558
+
559
+ # Bind an ephemeral port to simulate a MiniStack already running.
560
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sentinel:
561
+ sentinel.bind(("127.0.0.1", 0))
562
+ sentinel.listen(1)
563
+ port = sentinel.getsockname()[1]
564
+
565
+ with patch("server.app.subprocess.Popen") as popen:
566
+ _spawn_web_ministack(port, timeout_s=0.5)
567
+
568
+ popen.assert_not_called()
569
+
570
+ def test_raises_on_bind_timeout(self) -> None:
571
+ """If the spawned MiniStack never binds, raise instead of hanging."""
572
+ from server.app import _spawn_web_ministack
573
+
574
+ # Pick a port that is almost certainly free; mock Popen so nothing
575
+ # actually starts. _spawn_web_ministack should poll and time out.
576
+ with patch("server.app.subprocess.Popen"):
577
+ with pytest.raises(RuntimeError, match="failed to bind"):
578
+ _spawn_web_ministack(port=1, timeout_s=0.3)
579
+
580
+
581
+ class TestGetWebEnvAdversarial:
582
+ """Stress-test _get_web_env against the failure modes a real deployment
583
+ will eventually hit: concurrent first-request races, ministack-not-installed,
584
+ and spawn timeouts.
585
+
586
+ Each test patches at the module level inside an isolated subprocess so
587
+ real ministacks are never spawned.
588
+ """
589
+
590
+ def test_concurrent_first_requests_spawn_at_most_once(self) -> None:
591
+ """N threads racing on the cold start must result in exactly one
592
+ Popen call. The double-checked lock + cached _web_env enforce this.
593
+ Otherwise a busy /web/* moment at boot would spawn N ministacks all
594
+ fighting for the same port.
595
+ """
596
+ code = """
597
+ import sys, threading
598
+ from unittest.mock import patch
599
+ import server.app as m
600
+ with patch('server.app._spawn_web_ministack') as spawn:
601
+ spawn.return_value = None
602
+ def call():
603
+ m._get_web_env()
604
+ threads = [threading.Thread(target=call) for _ in range(20)]
605
+ for t in threads: t.start()
606
+ for t in threads: t.join()
607
+ sys.stdout.write('\\nRESULT=' + str(spawn.call_count))
608
+ """
609
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "8"}, code)
610
+ assert rc == 0, err
611
+ assert out.strip().splitlines()[-1] == "RESULT=1"
612
+
613
+ def test_get_web_env_does_not_spawn_when_backend_aws(self) -> None:
614
+ """BACKEND_TYPE=aws path takes the AwsStrategy branch and never
615
+ subprocesses ministack — even with POOL_SIZE=8.
616
+ """
617
+ code = """
618
+ import sys
619
+ from unittest.mock import patch
620
+ import server.app as m
621
+ with patch('server.app.subprocess.Popen') as popen:
622
+ m._get_web_env()
623
+ sys.stdout.write('\\nRESULT=' + str(popen.call_count))
624
+ """
625
+ rc, out, err = _run_in_subprocess(
626
+ {"AWS_RL_ENV_POOL_SIZE": "8", "BACKEND_TYPE": "aws"},
627
+ code,
628
+ )
629
+ assert rc == 0, err
630
+ assert out.strip().splitlines()[-1] == "RESULT=0"
631
+
632
+ def test_get_web_env_does_not_spawn_when_pool_size_1(self) -> None:
633
+ """Legacy POOL_SIZE=1 path shares the lone pool MiniStack on :4566
634
+ and never spawns a separate web MiniStack.
635
+ """
636
+ code = """
637
+ import sys
638
+ from unittest.mock import patch
639
+ import server.app as m
640
+ with patch('server.app.subprocess.Popen') as popen:
641
+ m._get_web_env()
642
+ sys.stdout.write('\\nRESULT=' + str(popen.call_count))
643
+ """
644
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "1"}, code)
645
+ assert rc == 0, err
646
+ assert out.strip().splitlines()[-1] == "RESULT=0"
647
+
648
+ def test_get_web_env_retries_after_spawn_failure(self) -> None:
649
+ """If the first spawn fails (e.g., ministack not installed yet, or
650
+ the bind timed out), _web_env stays None so a later request can
651
+ retry instead of permanently caching the failure.
652
+ """
653
+ code = """
654
+ import sys
655
+ from unittest.mock import patch
656
+ import server.app as m
657
+ with patch('server.app._spawn_web_ministack', side_effect=RuntimeError('boom')):
658
+ failed = False
659
+ try:
660
+ m._get_web_env()
661
+ except RuntimeError:
662
+ failed = True
663
+ assert failed, 'expected first call to raise'
664
+ assert m._web_env is None, '_web_env must stay None after spawn failure'
665
+ sys.stdout.write('\\nRESULT=ok')
666
+ """
667
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "8"}, code)
668
+ assert rc == 0, err
669
+ assert out.strip().splitlines()[-1] == "RESULT=ok"
670
+
671
+ def test_pool_factory_capacity_independent_of_web_env(self) -> None:
672
+ """The web _env is a module-level singleton, NOT produced by the
673
+ WebSocket factory. So a pool of 8 still hands out 8 distinct ports;
674
+ the web env doesn't steal a slot. Critical for the user's "8 WS +
675
+ web UI" goal.
676
+ """
677
+ pool, factory = make_env_factory(pool_size=8, base_port=4566)
678
+ assert pool is not None
679
+ envs = [factory() for _ in range(8)]
680
+ assert pool.free_count == 0
681
+ # 9th must fail — same as before this change
682
+ with pytest.raises(RuntimeError, match="exhausted"):
683
+ factory()
684
+ # Sanity: all 8 ports distinct, none equal to 4565 (web port)
685
+ ports = {int(e._backend._aws_infra_url.rsplit(":", 1)[-1]) for e in envs}
686
+ assert len(ports) == 8
687
+ assert 4565 not in ports
train/README.md ADDED
@@ -0,0 +1,545 @@
1
+ # `train/` — SFT + GRPO Training Pipeline
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory holds the **training notebooks** for the AWS RL agent. Heavy logic for the GRPO loop lives at the repo root in [train_grpo.py](../train_grpo.py); the notebooks here are thin drivers that you can run end-to-end on Colab.
6
+
7
+ The training pipeline has two stages:
8
+
9
+ ```
10
+ ┌────────── data/sft/ ──────────┐
11
+ │ 1,500 train · 150 val rows │
12
+ │ 5 trajectory types │
13
+ └───────────────┬───────────────┘
14
+
15
+ ┌──────────────────────────────────▼──────────────────────────────────┐
16
+ │ STAGE 1 — Supervised Fine-Tuning (train_sft_lora.ipynb) │
17
+ │ Qwen2.5-Coder-3B-Instruct + LoRA r=8/16/32 (Optuna) → SFT adapter │
18
+ └──────────────────────────────────┬──────────────────────────────────┘
19
+ │ Sizzing/aws-rl-sft-qwen25coder3b-adapter
20
+ ┌──────────────────────────────────▼──────────────────────────────────┐
21
+ │ STAGE 2 — GRPO RL (train_grpo_lora.ipynb) │
22
+ │ G=8 parallel rollouts · multi-turn · reward = env return │
23
+ │ Optuna over (lr, β, G, T, top_p, lora_r, max_turns) │
24
+ └─────────────────────────────────────────────────────────────────────┘
25
+ ```
26
+
27
+ The two stages are intentionally separable: the SFT adapter is published to the Hugging Face Hub so anyone can pull it and start GRPO without re-running SFT.
28
+
29
+ ---
30
+
31
+ ## Table of contents
32
+
33
+ 1. [SFT stage — supervised LoRA](#1-sft-stage--supervised-lora)
34
+ 2. [GRPO stage — reinforcement learning](#2-grpo-stage--reinforcement-learning)
35
+ 3. [Optuna hyperparameter search](#3-optuna-hyperparameter-search)
36
+ 4. [Multi-turn rollouts + parallel envs](#4-multi-turn-rollouts--parallel-envs)
37
+ 5. [Training modes (CLI)](#5-training-modes-cli)
38
+ 6. [How to run](#6-how-to-run)
39
+ 7. [Logging and artifacts](#7-logging-and-artifacts)
40
+ 8. [Reproducing results](#8-reproducing-results)
41
+ 9. [Files in this directory](#9-files-in-this-directory)
42
+
43
+ ---
44
+
45
+ ## 1. SFT stage — supervised LoRA
46
+
47
+ [train/train_sft_lora.ipynb](train_sft_lora.ipynb) — primary SFT notebook.
48
+
49
+ ### Why SFT before GRPO?
50
+
51
+ Two reasons — both showed up in our base-model evaluation ([data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md)):
52
+
53
+ 1. **Format-locking**. Even strong coder models occasionally wrap commands in markdown fences or quotes. SFT removes that surface noise in one epoch.
54
+ 2. **Bootstrap the GRPO reward signal**. GRPO with a base model that's only 41% exact-match starts from a low-density reward landscape. Pre-training on canonical commands raises the baseline so GRPO can spend its compute on optimization, not search.
55
+
56
+ ### Base model
57
+
58
+ | Choice | `unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit` |
59
+ |--------|--|
60
+ | Why | Highest exact-match (41%) of 11 candidates we benchmarked, fastest viable inference (3.1 s/call), tightest output (86 chars). Full reasoning in [data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md). |
61
+ | Loader | Unsloth's 4-bit quantized variant — fits comfortably on a single 24 GB GPU, 2× faster training kernels |
62
+
63
+ ### LoRA config
64
+
65
+ ```python
66
+ r = trial.suggest_categorical("lora_r", [8, 16, 32])  # sampled first so alpha can scale with it
+ config = LoraConfig(
+     r = r,
+     lora_alpha = r * trial.suggest_categorical("lora_alpha_mul", [1, 2, 4]),
+     lora_dropout = trial.suggest_float("lora_dropout", 0.005, 0.031),
+     bias = "none",
+     task_type = "CAUSAL_LM",
+     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
+ )
74
+ ```
75
+
76
+ - Only attention projections are adapted — MLP / output heads stay frozen, keeping the trainable parameter count tiny (~10–40 M depending on rank).
77
+ - `lora_alpha = r × multiplier` keeps the effective scaling stable across rank variations during the Optuna search.
78
+
79
+ ### Optimization
80
+
81
+ | Hyperparameter | Value / Range |
82
+ |--------------------------|------------------------------------------|
83
+ | Optimizer | AdamW (Unsloth's fused implementation) |
84
+ | Learning rate | `[1e-4, 5e-4]` log-scale (Optuna) |
85
+ | Schedule | Cosine annealing |
86
+ | Warmup ratio | `{0.03, 0.1}` (Optuna; best 0.1) |
87
+ | Batch size | 2 per GPU |
88
+ | Epochs | 2 |
89
+ | Max sequence length | 512 |
90
+ | Packing | **Disabled** (we keep chat-template separators intact) |
91
+ | Loss masking | Assistant-only (user message tokens are masked from the loss) |
92
+
93
+ ### Dataset
94
+
95
+ [data/sft/aws_rl_sft.train.jsonl](../data/sft/aws_rl_sft.train.jsonl) — 1,500 examples. Format:
96
+
97
+ ```json
98
+ {
99
+ "messages": [
100
+ {"role": "system", "content": "You are an AWS cloud engineer..."},
101
+ {"role": "user", "content": "TASK: ...\n\nCURRENT OBSERVATION:\nProgress: 0.00 ..."},
102
+ {"role": "assistant", "content": "aws s3 mb s3://my-app-data"}
103
+ ],
104
+ "difficulty": "intermediate",
105
+ "source": "success_first_step",
106
+ "task_id": 42
107
+ }
108
+ ```
109
+
110
+ The dataset is a careful mix of **5 trajectory types** (success, multi-step continuation, failure recovery, verification, hint usage). Full generation methodology in [data/README.md](../data/README.md).
111
+
112
+ ### Training graphs
113
+
114
+ The actual SFT run shipped in [`out/`](../out/) achieved validation loss `0.052` after 188 training steps with the best Optuna trial.
115
+
116
+ > ![SFT loss curve](../docs/figures/sft_loss_curve.png)
117
+
118
+ ---
119
+
120
+ ## 2. GRPO stage — reinforcement learning
121
+
122
+ The core trainer lives at [train_grpo.py](../train_grpo.py) (1,283 LOC). Notebooks call into it:
123
+
124
+ - [train/train_grpo_lora.ipynb](train_grpo_lora.ipynb) — clean
125
+ - [train/train_grpo_lora_with_outputs.ipynb](train_grpo_lora_with_outputs.ipynb) — with execution outputs preserved
126
+ - [aws_rl_env_colab.ipynb](../aws_rl_env_colab.ipynb) — Colab driver wrapping the entire pipeline
127
+
128
+ ### What GRPO is, briefly
129
+
130
+ **GRPO** (Group Relative Policy Optimization) is the algorithm introduced by DeepSeekMath and adopted by TRL ≥ 0.18. Unlike PPO, GRPO does **not** train a critic. Instead:
131
+
132
+ 1. For one prompt (here, one curriculum-picked task), generate `G` completions
133
+ 2. Score each with the reward function(s)
134
+ 3. Compute group-relative advantage: `(reward_i − group_mean) / group_std`
135
+ 4. Backpropagate the policy gradient with that advantage
136
+ 5. Apply a KL penalty to the SFT reference model (coefficient `β`) to prevent drift
137
+
138
+ This is dramatically simpler than PPO (no value head, no GAE), more sample-efficient for verifier-style rewards, and a natural fit for our setup — the AWS RL env *is* the reward function.
139
+
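+ A toy illustration of steps 2–4 (numbers invented):
+
+ ```python
+ import statistics
+
+ rewards = [0.2, 0.9, 0.4, 0.9]      # G = 4 completions of one prompt
+ mean = statistics.mean(rewards)     # 0.6
+ std = statistics.pstdev(rewards)    # ~0.31
+ advantages = [(r - mean) / (std + 1e-8) for r in rewards]
+ # Completions above the group mean get positive advantage, those below negative.
+ ```
+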
140
+ ### TRL GRPOTrainer config
141
+
142
+ From [train_grpo.py:_build_grpo_config()](../train_grpo.py):
143
+
144
+ | Parameter | Default value | Notes |
145
+ |------------------------------------|---------------|-------------------------------------------------------------|
146
+ | `learning_rate` | `5e-6` | Optuna range `[1e-6, 1e-4]` log-scale |
147
+ | `beta` (KL coefficient) | `0.04` | Optuna range `[0.0, 0.1]` |
148
+ | `num_generations` (G) | `8` | Optuna `{4, 8}` |
149
+ | `temperature` | `0.9` | Optuna `[0.7, 1.0]` |
150
+ | `top_p` | `0.95` | Optuna `[0.85, 0.98]` |
151
+ | `per_device_train_batch_size` | `1` | |
152
+ | `gradient_accumulation_steps` | `8` | Effective batch 8 |
153
+ | `gradient_checkpointing` | `True` | `use_reentrant=False` — VRAM optimization |
154
+ | `max_completion_length` | `256` | Per-turn; one AWS CLI command fits comfortably |
155
+ | `max_prompt_length` | `2048` | Holds task + history + observation |
156
+ | `loss_type` | `"dapo"` | DAPO ("Decoupled Clip and Dynamic Sampling Policy Optimization"; the TRL default for GRPO) |
157
+ | `mask_truncated_completions` | `True` | Drop samples that hit `max_completion_length` |
158
+ | `warmup_ratio` | `0.05` | |
159
+ | `lr_scheduler_type` | `"cosine"` | |
160
+ | `max_grad_norm` | `1.0` | |
161
+ | `use_vllm` | `False` | Plain `model.generate()` — vLLM integration is future work |
162
+
163
+ ### Reward functions (TRL convention)
164
+
165
+ Three reward functions are registered, summed by GRPO:
166
+
167
+ ```python
168
+ reward_funcs=[reward_task, reward_achieved, reward_progress]
169
+ ```
170
+
171
+ - `reward_task(completions, **kwargs)` → episode return (sum of per-step env rewards). The dominant signal.
172
+ - `reward_achieved(completions, **kwargs)` → 1.0 if `task.task_achieved` at end of episode, else 0.0. Sparse but unambiguous.
173
+ - `reward_progress(completions, **kwargs)` → final `partial_progress` ∈ [0, 1]. Densifies the credit assignment for partial completions.
174
+
175
+ The env's reward shaping (see [server/README.md §8](../server/README.md#8-reward-shaping--taskgrader)) does most of the work — these three TRL functions are a thin façade.
176
+
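+ Each follows the TRL reward-function convention: take the batch of completions plus extra kwargs, return one float per completion. Shape only; the body here is illustrative:
+
+ ```python
+ # Illustrative shape: real implementations read results from the rollout pool.
+ def reward_progress(completions, **kwargs):
+     episodes = kwargs["episode_results"]   # hypothetical kwarg name
+     return [ep["final_progress"] for ep in episodes]
+ ```
+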
177
+ ### Episode = one rollout
178
+
179
+ - Each rollout runs **up to `MAX_TURNS=6` sequential AWS CLI commands**
180
+ - Each command's stdout/stderr/progress is fed back as the user message for the next turn (see `build_user_prompt()` and `format_observation()` in [train_grpo.py](../train_grpo.py)); a loop sketch follows this list
181
+ - The episode terminates on `task_achieved`, max turns, or `max_total_tokens` (per-episode token budget)
182
+ - Token sequences (prompt_ids, completion_ids, logprobs) are accumulated **across turns**, so GRPO assigns the episode-level reward to the full multi-turn token sequence — not just the last turn
183
+
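+ The loop, as a sketch (the `generate` callable and the env handle stand in for the real pool machinery):
+
+ ```python
+ def rollout(env, task, generate, max_turns=6):
+     obs = env.reset(task=task)
+     messages = [{"role": "user", "content": build_user_prompt(task, obs)}]
+     for _ in range(max_turns):
+         command = generate(messages)              # one AWS CLI command
+         obs = env.step(command)
+         messages.append({"role": "assistant", "content": command})
+         messages.append({"role": "user", "content": format_observation(obs)})
+         if obs.done:                              # task_achieved or terminal failure
+             break
+     return obs, messages
+ ```
+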
184
+ ### Curriculum integration
185
+
186
+ ```
187
+ trainer step:
188
+ 1. task = curriculum.next_task() # one task per GRPO step
189
+ 2. results = pool.run_group(task, ...) # G rollouts on that task
190
+ 3. mean_r = sum(group_rewards) / G
191
+ 4. curriculum.record_result(task, achieved=any_achieved, reward=mean_r)
192
+ 5. trainer applies group-relative advantages # standard GRPO
193
+ ```
194
+
195
+ The curriculum drives task selection — every rollout in a group runs the *same* task, forced through `env.reset(task=task)`. This matches GRPO's group-relative semantics: the whole group must share one prompt for the baseline to be meaningful.
196
+
197
+ Full curriculum mechanics (priority scoring, mastery, spaced rep, tier promotion) live in [server/README.md §7](../server/README.md#7-curriculum-manager).
198
+
199
+ ### Training graphs
200
+
201
+ The actual GRPO run shipped in [`out_grpo/`](../out_grpo/) ran 35 steps with the best Optuna config (`lr=1.6e-5`, `β=0.0021`, `T=0.99`). Per-step signals from [`out_grpo/final_grpo/checkpoint-35/trainer_state.json`](../out_grpo/final_grpo/checkpoint-35/trainer_state.json):
202
+
203
+ > ![GRPO final per-step training signals](../docs/figures/grpo_final_per_step.png)
204
+ > ![GRPO env reward over training](../docs/figures/grpo_reward_curve.png)
205
+ > ![Success by tier (multi-step)](../docs/figures/grpo_per_tier_curve.png)
206
+ > ![Reward by tier (multi-step)](../docs/figures/grpo_reward_by_tier.png)
207
+
208
+ Notable signals from the run:
209
+
210
+ | | |
211
+ |---|---|
212
+ | `env_reward/mean` | 0.31 (mean over 16 reward-logged steps), max 0.94, min 0.13 |
213
+ | `kl` | 0.15 (mean) — KL stays small despite tiny β |
214
+ | `completion_length` | 87 tokens (mean) — agent emits compact AWS CLI commands |
215
+ | Format compliance | **100%** (`format_reward/mean = 1.0` every step) |
216
+
217
+ Multi-step end-to-end re-eval after GRPO ([out_grpo/grpo_multi_step.json](../out_grpo/grpo_multi_step.json)):
218
+
219
+ > ![SFT vs GRPO multi-step metrics grid](../docs/figures/sft_vs_grpo_metrics_grid.png)
220
+
221
+ These are produced by [`plot_rewards()`](../train_grpo.py) reading `reward_log.csv` written by `EpisodeLogger`, plus the post-hoc plots generated during the GRPO notebook run.
222
+
223
+ ---
224
+
225
+ ## 3. Optuna hyperparameter search
226
+
227
+ [train_grpo.py:optuna_search()](../train_grpo.py)
228
+
229
+ ### Search space
230
+
231
+ | Parameter | Range | Reason |
232
+ |-------------------|------------------------------------|------------------------------------------------------------------------|
233
+ | `learning_rate` | `[1e-6, 1e-4]` log | GRPO is sensitive to LR; log-scale is the right prior |
234
+ | `beta` | `[0.0, 0.1]` | KL coefficient. 0 = pure RL (drift risk), 0.1 = anchored to SFT |
235
+ | `num_generations` | `{4, 8}` | Group size. Larger → tighter advantage estimates but slower |
236
+ | `temperature` | `[0.7, 1.0]` | Exploration knob |
237
+ | `top_p` | `[0.85, 0.98]` | Nucleus sampling |
238
+ | `lora_r` | `{8, 16, 32}` | Adapter capacity |
239
+ | `lora_alpha_mul` | `{1, 2, 4}` | `lora_alpha = lora_r × multiplier` |
240
+ | `max_turns` | `{4, 6, 8}` | Episode length cap |
241
+
242
+ ### Objective
243
+
244
+ ```
245
+ objective = 0.7 × achieved_rate + 0.3 × mean_progress
246
+ ```
247
+
248
+ Calculated on the held-out validation tasks at the end of each trial. Weighting `achieved_rate` higher matches the project goal — actual task completion matters more than partial progress.
249
+
250
+ ### Sampler
251
+
252
+ `optuna.samplers.TPESampler(seed=42)` — Tree-structured Parzen Estimator. TPE outperforms random search on 8-dim spaces with ~6 trials in our experience.
253
+
254
+ Persisted to `outputs/.../optuna.db` (SQLite), so trials can be resumed if a Colab session disconnects.
255
+
256
+ ### Frozen validation set
257
+
258
+ `pick_validation_task_ids(k_per_tier=2, seed=42)` picks 2 tasks per tier (≈10 tasks total) at the start of training. The same set is used by every Optuna trial and the final post-training eval — no benchmark leakage between trials.
259
+
260
+ ### SFT-stage Optuna results (6 trials)
261
+
262
+ The SFT-stage Optuna run shipped in [`out/optuna_study.json`](../out/optuna_study.json) explored a 5-parameter space (`lora_r`, `lora_alpha_mul`, `lora_dropout`, `learning_rate`, `warmup_ratio`). 6 trials, validation loss as objective (lower = better):
263
+
264
+ | Trial | r | α | dropout | lr | warmup | val_loss |
265
+ |------:|---:|---:|:-------:|:---------:|:------:|:--------:|
266
+ | **0** | 16 | 16 | 0.006 | 4.03e-4 | 0.10 | **0.0523** ★ |
267
+ | 1 | 16 | 16 | 0.030 | 2.33e-4 | 0.03 | 0.0790 |
268
+ | 2 | 8 | 32 | 0.020 | 2.29e-4 | 0.03 | 0.0587 |
269
+ | 3 | 8 | 16 | 0.030 | 1.17e-4 | 0.03 | 0.1199 |
270
+ | 4 | 16 | 16 | 0.031 | 2.31e-4 | 0.03 | 0.0793 |
271
+ | 5 | 8 | 32 | 0.009 | 1.37e-4 | 0.10 | 0.0828 |
272
+
273
+ > ![SFT Optuna trial comparison table](../docs/figures/sft_optuna_trials_table.png)
274
+
275
+ ```json
276
+ {
277
+ "best_value": 0.052,
278
+ "best_params": {
279
+ "lora_r": 16,
280
+ "lora_alpha_mul": 1, // → lora_alpha = 16
281
+ "lora_dropout": 0.005808,
282
+ "learning_rate": 4.03e-4,
283
+ "warmup_ratio": 0.1
284
+ }
285
+ }
286
+ ```
287
+
288
+ Visualized:
289
+
290
+ > ![Optuna parameter importances](../docs/figures/optuna_param_importance.png)
291
+ > ![Optuna optimization history](../docs/figures/optuna_history.png)
292
+ > ![Optuna parallel coordinate plot](../docs/figures/optuna_parallel.png)
293
+ > ![Optuna slice plot](../docs/figures/optuna_slice.png)
294
+ > ![Optuna trial training curves](../docs/figures/optuna_trial_curves.png)
295
+
296
+ ### GRPO-stage Optuna results (4 trials)
297
+
298
+ The GRPO-stage Optuna run shipped in [`out_grpo/optuna_best.json`](../out_grpo/optuna_best.json) explored a 3-parameter space (`learning_rate`, `beta`, `temperature`). 4 trials, single-step env reward as objective (higher = better):
299
+
300
+ | Trial | lr | β | T | env_reward | success |
301
+ |------:|:---------:|:--------:|:-----:|:----------:|:-------:|
302
+ | 0 | varied | varied | varied| 0.473 | 25.0% |
303
+ | 1 | varied | varied | varied| 0.469 | 25.0% |
304
+ | 2 | varied | varied | varied| 0.469 | 25.0% |
305
+ | **3** | 1.60e-5 | 0.0021 | 0.99 | **0.552** | **33.3%** ★ |
306
+
307
+ > ![GRPO Optuna trial comparison](../docs/figures/grpo_optuna_trials_comparison.png)
308
+ > ![GRPO Optuna importances](../docs/figures/grpo_optuna_importances.png)
309
+ > ![GRPO Optuna parallel coordinate](../docs/figures/grpo_optuna_parallel.png)
310
+ > ![GRPO Optuna hparams](../docs/figures/grpo_optuna_hparams.png)
311
+ > ![GRPO Optuna trial curves](../docs/figures/grpo_optuna_trial_curves.png)
312
+
313
+ The winning GRPO config uses a **much smaller learning rate** (1.6e-5, vs 4.0e-4 for SFT) and a **tiny KL coefficient** (β=0.0021) — both expected for an RL phase that is only correcting the SFT-bootstrapped policy, not retraining it.
314
+
315
+ ---
316
+
317
+ ## 4. Multi-turn rollouts + parallel envs
318
+
319
+ This section is a quick overview — the full mechanics, including the three pool layers and asyncio orchestration, are in [scripts/README.md](../scripts/README.md).
320
+
321
+ ### MultiTurnEnvPool
322
+
323
+ [train_grpo.py:MultiTurnEnvPool](../train_grpo.py) — owns a background thread running an asyncio loop, opens N WebSocket sessions on startup, exposes a synchronous `run_group(task, ...)` API.
324
+
325
+ - One pool instance lives for the duration of training
326
+ - `run_group()` calls `asyncio.gather()` over `rollout_one_episode(env, task, ...)` for each of the N envs — every rollout runs the same task in its own MiniStack (see server-side pool in [server/README.md §6](../server/README.md#6-server-side-ministack-pool-parallel-rollouts))
327
+ - Returns a list of `{prompt_ids, completion_ids, logprobs, task_reward, task_achieved, final_progress, num_steps, transcript, task_id, difficulty}`
328
+
329
+ ### Why parallelism matters here
330
+
331
+ GRPO's group-relative advantage requires `G` rollouts before any gradient. Run serially, MAX_TURNS=6 turns × ~50 ms per env step ≈ 300 ms per rollout, or ~2.4 s of env time per training step at G=8. With parallel rollouts that drops to ~300 ms (the slowest of the 8), so the model forward pass dominates, exactly as desired.
332
+
333
+ ### Generation lock
334
+
335
+ Because the policy lives on a single GPU, `model.generate()` calls across the asyncio.gather group are serialised behind a `_GENERATE_LOCK` (`threading.Lock`). The env step calls — the slow part — happily overlap. This is the single non-obvious detail that makes the parallel rollout approach actually work.
336
+
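+ A sketch of the pattern (names beyond `_GENERATE_LOCK` are illustrative):
+
+ ```python
+ import asyncio
+ import threading
+
+ _GENERATE_LOCK = threading.Lock()      # one generate() on the GPU at a time
+
+ async def generate_locked(model, inputs):
+     def _generate():
+         with _GENERATE_LOCK:
+             return model.generate(**inputs)
+     # Run off the event loop so the other rollouts' env I/O keeps overlapping.
+     return await asyncio.to_thread(_generate)
+ ```
+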
+ ---
+
+ ## 5. Training modes (CLI)
+
+ ```bash
+ # Optuna search only — produces best_cfg.json
+ python train_grpo.py --mode optuna --n-trials 6 --trial-max-steps 30
+
+ # Train once with explicit hyperparams (no search)
+ python train_grpo.py --mode train \
+     --env-url http://localhost:8000 \
+     --num-generations 8 --max-turns 6 --max-steps 200
+
+ # Search → train: Optuna trials, then a full-length run with the best config
+ python train_grpo.py --mode full --n-trials 6 --max-steps 200
+ ```
+
+ All modes write to `outputs/aws-rl-grpo-<TIMESTAMP>/`.
+
+ ---
+
+ ## 6. How to run
+
+ ### Prerequisites
+
+ - A running env server: `make run` from the repo root (starts MiniStack + FastAPI on `http://localhost:8000`)
+   - For pool size > 1: `AWS_RL_ENV_POOL_SIZE=8 make run`
+ - A GPU with ≥ 24 GB VRAM (A10, T4×2, A100, L4 all confirmed working)
+ - A HuggingFace token (`HF_TOKEN`) if you want to push the trained adapter
+
+ ### Local
+
+ ```bash
+ # 1. Start the env server in one terminal
+ AWS_RL_ENV_POOL_SIZE=8 make run
+
+ # 2. Run training in another terminal
+ python train_grpo.py --mode full --n-trials 6 --max-steps 200
+ ```
+
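+ Before launching a long run, it's worth smoke-testing the env connection. A minimal sketch using the repo's client, with the same calls `train_grpo.py` makes (the `aws s3 ls` command is just an example):
+
+ ```python
+ import asyncio
+
+ from client import AwsRlEnv
+ from models import AwsRlAction
+ from server.services.curriculum import Curriculum
+
+ async def smoke_test() -> None:
+     env = AwsRlEnv(base_url="http://localhost:8000")
+     await env.connect()
+     try:
+         await env.reset(task=Curriculum().next_task())
+         res = await env.step(AwsRlAction(command="aws s3 ls"))
+         print("reward:", res.reward)
+         print("output:", res.observation.command_output)
+     finally:
+         await env.close()
+
+ asyncio.run(smoke_test())
+ ```
+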
+ ### Colab
+
+ The notebook [aws_rl_env_colab.ipynb](../aws_rl_env_colab.ipynb) wraps the full pipeline (env URL config, HF login, val set, Optuna, training, plotting, optional push-to-Hub):
+
+ | Notebook | Open in Colab |
+ |----------|---------------|
+ | GRPO end-to-end driver | <!-- TODO: paste Colab URL here --> |
+ | SFT-only ([train/train_sft_lora.ipynb](train_sft_lora.ipynb)) | <!-- TODO: paste Colab URL here --> |
+ | GRPO-only ([train/train_grpo_lora.ipynb](train_grpo_lora.ipynb)) | <!-- TODO: paste Colab URL here --> |
+
+ Note: the Colab notebooks expect the env server to be reachable. Two options:
+
+ 1. **HF Space tunnel**: deploy the env to your own HF Space and point `ENV_URL` at it (see the main README's deployment section)
+ 2. **ngrok**: run the env locally and expose it via ngrok / cloudflared so Colab can reach it (example commands below)
+
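+ Hedged example commands for option 2 (both assume the env is already listening on port 8000; check each tool's docs for install/auth):
+
+ ```bash
+ # ngrok (needs an ngrok account + auth token configured)
+ ngrok http 8000
+
+ # cloudflared quick tunnel (no account needed)
+ cloudflared tunnel --url http://localhost:8000
+ ```
+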
+ ---
+
+ ## 7. Logging and artifacts
+
+ ### Reference SFT output: [`out/`](../out/)
+
+ A complete SFT training run is committed (small files only) at the repo root for reproducibility:
+
+ ```
+ out/
+ ├── baseline_metrics.json   # eval scores BEFORE SFT (33% fmt, 39% exact, ...)
+ ├── delta_summary.json      # base vs post-SFT delta (the headline numbers)
+ ├── optuna_study.json       # SFT Optuna study summary (all 6 trials + best)
+ ├── optuna/                 # per-trial workspaces (trial-0..trial-5)
+ ├── final_sft/              # final TRL SFT trainer checkpoints (gitignored)
+ │   ├── checkpoint-100/     # adapter + optimizer + tokenizer at step 100
+ │   ├── checkpoint-150/
+ │   └── checkpoint-188/     # last checkpoint (final adapter)
+ └── plots/                  # 7 ready PNGs (loss curves, Optuna plots, eval comparison)
+ ```
+
+ The contents of `out/plots/` are mirrored into [`docs/figures/`](../docs/figures/) so the READMEs render them. The full TRL checkpoints in `out/final_sft/` are kept locally for reproducibility but are gitignored (each is ~50 MB; total ~175 MB).
+
+ ### Reference GRPO output: [`out_grpo/`](../out_grpo/)
+
+ A complete GRPO training run is also committed at the repo root:
+
+ ```
+ out_grpo/
+ ├── baseline_single_step.json   # post-SFT single-step eval (90% reward, 85% success)
+ ├── baseline_multi_step.json    # post-SFT multi-step eval (86.8% success, 0.88 reward, by tier)
+ ├── grpo_multi_step.json        # post-GRPO multi-step eval (86.2% success, 0.88 reward, by tier)
+ ├── optuna_best.json            # GRPO Optuna best params + resolved config
+ ├── optuna.db                   # SQLite Optuna study (4 trials)
+ ├── optuna/trial-0..3/          # per-trial trainer_state.json + single_step_metrics.json
+ ├── qualitative_rollouts.json   # 5 hand-picked sample rollouts (one per tier, post-GRPO)
+ ├── final_grpo/                 # final TRL GRPO checkpoints (gitignored)
+ │   ├── checkpoint-25/
+ │   └── checkpoint-35/          # last checkpoint (final GRPO adapter)
+ ├── grpo_adapter/               # exported final adapter for HF Hub upload (gitignored)
+ ├── graphs/                     # 10 ready PNGs (Optuna views, training curves, by-tier breakdowns)
+ └── graphs.zip
+ ```
+
+ The 10 graphs from `out_grpo/graphs/` are mirrored into [`docs/figures/`](../docs/figures/) under descriptive names (`grpo_optuna_history.png`, `grpo_reward_curve.png`, `grpo_per_tier_curve.png`, `sft_vs_grpo_scalar.png`, `grpo_reward_by_tier.png`, etc.). The full TRL checkpoints in `out_grpo/final_grpo/` and the exported adapter in `out_grpo/grpo_adapter/` are gitignored (~160 MB total).
+
+ ### GRPO output layout
+
+ Each GRPO run writes to a fresh `outputs/aws-rl-grpo-<TIMESTAMP>/`:
+
+ | File | Written by | Contents |
+ |------|------------|----------|
+ | `reward_log.csv` | `EpisodeLogger` | One row per rollout: `step, rollout_idx, task_id, difficulty, task_reward, task_achieved, final_progress, num_steps, tier, tier_success_rate, timestamp` |
+ | `transcripts.jsonl` | `EpisodeLogger` | Same rows + the full multi-turn transcript per rollout (commands, outputs, rewards) |
+ | `optuna.db` | Optuna | SQLite study (resumable) |
+ | `best_cfg.json` | `optuna_search()` | Final winning hyperparameters |
+ | `trial_NNN/` | `_run_one_trial()` | Per-trial trainer checkpoints + `trial_metrics.json` |
+ | `val_task_ids.json` | Notebook driver | Frozen held-out validation set (for reproducibility) |
+ | `post_train_val.json` | Notebook §10 | Final post-training validation metrics |
+ | `reward_plot.png` | `plot_rewards()` | Group mean reward + per-tier scatter |
+ | `<adapter_dir>/` | TRL `GRPOTrainer.save` | Trained LoRA adapter (`adapter_config.json`, `adapter_model.safetensors`, etc.) |
+
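+ Because `reward_log.csv` is deliberately flat, it can be analysed with standard tools. A minimal sketch (assumes `pandas` is installed; column names are the ones in the table above):
+
+ ```python
+ import pandas as pd
+
+ df = pd.read_csv("outputs/aws-rl-grpo-<TIMESTAMP>/reward_log.csv")  # fill in your run dir
+
+ # Mean group reward per GRPO step, and success rate per curriculum tier.
+ print(df.groupby("step")["task_reward"].mean().tail())
+ print(df.groupby("tier")["task_achieved"].mean())
+ ```
+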
+ Push to HF Hub:
+
+ ```python
+ from huggingface_hub import create_repo, upload_folder
+
+ create_repo("your-org/aws-rl-grpo-qwen25coder3b", exist_ok=True, private=False)
+ upload_folder(folder_path=str(OUTPUT_DIR), repo_id="your-org/aws-rl-grpo-qwen25coder3b")
+ ```
+
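+ To consume a pushed adapter later, load it on top of the 4-bit base the same way `load_policy` does. A minimal inference-side sketch (the adapter id shown is the published SFT one; swap in your own GRPO repo id):
+
+ ```python
+ from peft import PeftModel
+ from unsloth import FastLanguageModel
+
+ base, tokenizer = FastLanguageModel.from_pretrained(
+     model_name="unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit",
+     max_seq_length=3072,
+     load_in_4bit=True,
+ )
+ model = PeftModel.from_pretrained(base, "Sizzing/aws-rl-sft-qwen25coder3b-adapter")
+ FastLanguageModel.for_inference(model)
+ ```
+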
+ ---
+
+ ## 8. Reproducing results
+
+ ### Actual SFT result (committed at [`out/`](../out/))
+
+ ```
+ SFT (188 steps, best Optuna trial, ~30 min on A10):
+   best val_loss    : 0.052
+   best lora_r      : 16
+   best lora_alpha  : 16   (alpha_mul=1)
+   best lora_dropout: 0.0058
+   best lr          : 4.03e-4
+   best warmup      : 0.10
+
+ Held-out eval (post-SFT, same prompts as base):
+   format_pct    : 33.3% → 100.0%   (+66.7 pp)
+   exact_pct     : 38.9% → 88.9%    (+50.0 pp)
+   service_pct   : 77.8% → 88.9%    (+11.1 pp)
+   operation_pct : 61.1% → 88.9%    (+27.8 pp)
+   avg_latency   : 2.03s → 1.40s    (−0.63s)
+   avg_len       : 85.8  → 74.7     (tighter outputs)
+ ```
+
+ Every target from [data/sft/MODEL_EVALUATION.md §11](../data/sft/MODEL_EVALUATION.md) is met or exceeded.
+
+ ### Actual GRPO result (committed at [`out_grpo/`](../out_grpo/))
+
+ ```
+ GRPO (35 steps from best Optuna trial, ~1.5 hr on A10):
+   best lr          : 1.60e-5
+   best beta        : 0.0021
+   best temperature : 0.99
+   num_generations  : 8
+
+ Per-step training signals (16 reward-logged steps):
+   env_reward (mean): 0.31   max: 0.94   min: 0.13
+   KL to SFT ref    : 0.15 mean       (small β = 0.0021 keeps drift in check)
+   format_reward    : 1.00 every step (perfect format compliance)
+   completion length: 87 tokens mean  (compact AWS CLI commands)
+
+ Multi-step end-to-end eval (n≈108 episodes):
+                        Base+SFT   Base+SFT+GRPO      Δ
+   overall_success        86.8%        86.2%       −0.5 pp
+   overall_reward         0.883        0.877       −0.006
+   beginner_success       96.2%       100.0%       +3.8 pp ✓
+   intermediate_success   81.0%        87.0%       +6.0 pp ✓
+   warmup_success         96.0%        90.2%       −5.8 pp
+   expert_success         22.2%        22.2%       flat (bottleneck)
+   drift_repair           22.2%        22.2%       flat
+   destructive_fail       15.1%        14.7%       −0.4 pp
+   steps_to_solve          1.45         1.55       +0.10
+ ```
+
+ **Honest reading.** A 35-step GRPO run from a strong SFT starting point (already 86.8% success) is short by RL standards. It preserves the SFT gains and modestly improves the middle tiers, but it does not crack the expert-tier ceiling — the 22% expert / 22% drift-repair numbers stay flat because 35 GRPO steps × G=8 = 280 rollouts contain too few expert episodes, with the curriculum focusing primarily on warmup/beginner/intermediate.
+
+ Variance comes mostly from Optuna trial composition. The published SFT adapter (`Sizzing/aws-rl-sft-qwen25coder3b-adapter`) is the SFT result; the GRPO adapter is re-exported on each run into `out_grpo/grpo_adapter/`.
+
+ ---
+
+ ## 9. Files in this directory
+
+ | File | Purpose |
+ |------|---------|
+ | [train_sft_lora.ipynb](train_sft_lora.ipynb) | Stage 1 — supervised LoRA fine-tuning |
+ | [train_grpo_lora.ipynb](train_grpo_lora.ipynb) | Stage 2 — GRPO RL training (clean) |
+ | [train_grpo_lora_with_outputs.ipynb](train_grpo_lora_with_outputs.ipynb) | Same notebook with cell outputs preserved |
+
+ Heavy logic referenced from these notebooks:
+
+ - [train_grpo.py](../train_grpo.py) — the `MultiTurnEnvPool`, GRPO config, Optuna search, `plot_rewards`, and the `run_training` entry point (sketched below)
+ - [aws_rl_env_colab.ipynb](../aws_rl_env_colab.ipynb) — Colab driver that imports from `train_grpo.py`
+ - [scripts/grpo_pool.py](../scripts/grpo_pool.py) and [scripts/grpo_train.py](../scripts/grpo_train.py) — alternative client-side pool entry point (covered in [scripts/README.md](../scripts/README.md))
+
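+ `run_training` can also be driven programmatically instead of via the CLI. A minimal sketch using the defaults from `train_grpo.py` (assumes the env server is up and a suitable GPU is available):
+
+ ```python
+ from pathlib import Path
+
+ from train_grpo import DEFAULT_CFG, PolicySpec, run_training
+
+ run_training(
+     dict(DEFAULT_CFG),
+     base_model=PolicySpec.base_model,
+     sft_adapter=PolicySpec.sft_adapter,
+     env_url="http://localhost:8000",
+     output_dir=Path("outputs/aws-rl-grpo-manual"),
+     max_steps=200,
+ )
+ ```
+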
+ ---
+
+ ## See also
+
+ - [Main README](../README.md)
+ - [data/README.md](../data/README.md) — dataset generation, base-model selection
+ - [data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md) — full 11-model benchmark
+ - [scripts/README.md](../scripts/README.md) — parallel-rollout architecture deep-dive
+ - [server/README.md](../server/README.md) — environment internals (curriculum, reward shaping, anti-hacking)
+ - [compare/README.md](../compare/README.md) — base vs SFT comparison harness
train/train_grpo_lora.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
train/train_sft_lora.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
train_grpo.py ADDED
@@ -0,0 +1,1283 @@
+ """GRPO training for the AWS RL environment — multi-turn rollouts + parallel envs.
+
+ Mirrors the kube-sre-gym training pattern (heavy logic in this module, thin
+ notebook on top):
+ - Each "episode" runs up to MAX_TURNS steps.
+ - Each step = one ``aws ...`` command; the command's stdout/stderr is fed
+   back into the next turn's prompt as the user message.
+ - Each GRPO step picks ONE curriculum task and runs G concurrent rollouts
+   (one per env in MultiTurnEnvPool) sharing that task.
+ - prompt_ids / completion_ids / logprobs are accumulated across turns so
+   GRPO assigns episode-level reward to the full token sequence.
+
+ Usage (CLI)::
+
+     # Single training pass with explicit hyperparams
+     python train_grpo.py --mode train \\
+         --env-url http://localhost:8000 \\
+         --num-generations 8 --max-turns 6 --max-steps 200
+
+     # Optuna search over hyperparams, then dump best_cfg.json
+     python train_grpo.py --mode optuna --n-trials 6
+
+     # Optuna search, then full-length retrain using the best config
+     python train_grpo.py --mode full --n-trials 6 --max-steps 200
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import asyncio
+ import csv
+ import gc
+ import json
+ import logging
+ import re
+ import threading
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Callable, Optional
+
+ import torch
+ from datasets import Dataset
+ from peft import LoraConfig, PeftModel
+ from transformers import AutoTokenizer
+ from trl import GRPOConfig, GRPOTrainer
+
+ from client import AwsRlEnv
+ from models import AwsRlAction, AwsRlObservation, Task, TaskDifficulty, TaskID
+ from server.services.curriculum import Curriculum
+
+ logger = logging.getLogger(__name__)
+
+
+ # ============================================================
+ # System prompt — multi-turn AWS CLI agent
+ # ============================================================
+
+ SYSTEM_PROMPT = """You are an expert AWS Operations agent. You operate a simulated AWS cloud by emitting ONE AWS CLI command per turn.
+
+ The user message contains:
+ - The task description.
+ - (Optional) A history of your previous commands and their outputs from earlier in this episode — use them to decide your next move.
+ - The most recent observation (last command's stdout / stderr / progress).
+
+ Each turn:
+ 1. Optionally reason inside a single <think>...</think> block. Keep it concise.
+ 2. After </think>, on a NEW LINE, output EXACTLY ONE AWS CLI command starting with "aws ".
+
+ Hard rules:
+ - The command line must contain ONLY the command — no markdown, no backticks, no quotes around it, no trailing commentary.
+ - If a command failed last turn, try a DIFFERENT approach. Do not repeat the exact same command twice in a row.
+ - When the task description names a specific resource (a bucket, table, queue, etc.), use that exact name.
+ """
+
+
+ DEFAULT_CFG: dict[str, Any] = {
+     "learning_rate": 5e-6,
+     "beta": 0.04,
+     "num_generations": 8,
+     "temperature": 0.9,
+     "top_p": 0.95,
+     "lora_r": 16,
+     "lora_alpha_mul": 2,
+     "max_turns": 6,
+ }
+
+
+ # ============================================================
+ # Helpers — prompt formatting + command parsing
+ # ============================================================
+
+ _THINK_BLOCK = re.compile(r"<think\b[^>]*>.*?</think>", re.DOTALL | re.IGNORECASE)
+ _OPEN_THINK = re.compile(r"<think\b[^>]*>.*", re.DOTALL | re.IGNORECASE)
+
+
+ def extract_aws_command(raw: str) -> str:
+     """Strip <think> blocks + markdown fences, return the first ``aws ...`` line.
+
+     Falls back to ``aws help`` so the env always gets a syntactically valid
+     command (the env will just produce a help-text observation, which is a
+     better RL signal than a parse error).
+     """
+     cleaned = _THINK_BLOCK.sub("", raw)
+     cleaned = _OPEN_THINK.sub("", cleaned)
+     for line in cleaned.splitlines():
+         line = line.strip().strip("`").strip()
+         if line.startswith("aws "):
+             return line
+     return "aws help"
+
+
+ def _truncate(text: str, n: int) -> str:
+     if not text:
+         return ""
+     if len(text) <= n:
+         return text
+     return text[: n - 3] + "..."
+
+
+ def format_observation(obs: AwsRlObservation) -> str:
+     """Render the latest env observation as a compact text block."""
+     parts: list[str] = []
+     if obs.command_output:
+         parts.append(f"Output:\n{_truncate(obs.command_output, 800)}")
+     if obs.error:
+         parts.append(f"Error:\n{_truncate(obs.error, 400)}")
+     parts.append(
+         f"Progress: {obs.partial_progress:.2f} "
+         f"Achieved: {obs.task_achieved} Step: {obs.step_count}"
+     )
+     if obs.hint_text:
+         parts.append(f"Hint: {_truncate(obs.hint_text, 200)}")
+     return "\n".join(parts)
+
+
+ def format_history(history: list[dict], keep_last: int = 6) -> str:
+     """Render the last ``keep_last`` (cmd, output, reward) tuples for context."""
+     if not history:
+         return ""
+     recent = history[-keep_last:]
+     rendered: list[str] = ["PREVIOUS COMMANDS:"]
+     for i, h in enumerate(recent, start=max(1, len(history) - keep_last + 1)):
+         rendered.append(
+             f"[{i}] $ {h['command']}\n"
+             f"    output: {_truncate(h['output'], 300)}\n"
+             f"    reward: {h['reward']:.2f}"
+         )
+     return "\n".join(rendered)
+
+
+ def apply_chat_template(tokenizer: AutoTokenizer, messages: list[dict]) -> str:
+     """Apply a chat template; fall back to a plain rendering if none is set."""
+     if getattr(tokenizer, "chat_template", None):
+         try:
+             return tokenizer.apply_chat_template(
+                 messages, add_generation_prompt=True, tokenize=False
+             )
+         except TypeError:
+             return tokenizer.apply_chat_template(messages, tokenize=False)
+     parts: list[str] = []
+     for m in messages:
+         parts.append(f"<|{m['role']}|>\n{m['content']}\n")
+     parts.append("<|assistant|>\n")
+     return "".join(parts)
+
+
+ def build_user_prompt(task: Task, obs: AwsRlObservation, history: list[dict]) -> str:
+     desc = task.description
+     if task.desired_state_spec:
+         desc = f"{desc}\n\nDesired end state:\n{task.desired_state_spec}"
+     history_text = format_history(history)
+     obs_text = format_observation(obs)
+     if history_text:
+         return f"TASK: {desc}\n\n{history_text}\n\n---\n\nCURRENT OBSERVATION:\n{obs_text}"
+     return f"TASK: {desc}\n\nCURRENT OBSERVATION:\n{obs_text}"
+
+
+ # ============================================================
+ # Policy loading — Unsloth 4-bit base + LoRA-from-SFT-adapter
+ # ============================================================
+
+
+ @dataclass
+ class PolicySpec:
+     base_model: str = "unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit"
+     sft_adapter: str = "Sizzing/aws-rl-sft-qwen25coder3b-adapter"
+     max_seq_length: int = 3072
+
+
+ def load_policy(
+     base_model: str,
+     sft_adapter: Optional[str] = None,
+     max_seq_length: int = 3072,
+     trainable: bool = True,
+ ):
+     """Load Unsloth 4-bit base + (optional) LoRA adapter from the SFT run.
+
+     ``trainable=True`` returns a PeftModel ready for GRPO training (Unsloth's
+     training kernels enabled, input require-grads hook installed).
+     ``trainable=False`` returns the same stack in inference mode for eval.
+     """
+     from unsloth import FastLanguageModel
+
+     base, tokenizer = FastLanguageModel.from_pretrained(
+         model_name=base_model,
+         max_seq_length=max_seq_length,
+         load_in_4bit=True,
+     )
+     if sft_adapter:
+         model = PeftModel.from_pretrained(base, sft_adapter, is_trainable=trainable)
+     else:
+         # No adapter: GRPOTrainer can attach a fresh LoRA via peft_config later.
+         model = base
+
+     if trainable:
+         FastLanguageModel.for_training(model)
+         if hasattr(model, "enable_input_require_grads"):
+             model.enable_input_require_grads()
+     else:
+         FastLanguageModel.for_inference(model)
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     return model, tokenizer
+
+
+ def free_model(model) -> None:
+     """Release VRAM held by ``model`` and any captured optimizer state."""
+     try:
+         del model
+     except Exception:
+         pass
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+
+ # ============================================================
+ # Multi-turn rollout — one episode in one env
+ # ============================================================
+
+
+ @dataclass
+ class SamplingCfg:
+     temperature: float = 0.9
+     top_p: float = 0.95
+     max_new_tokens: int = 256
+     max_prompt_length: int = 2048
+
+
+ _GENERATE_LOCK = threading.Lock()
+ """Serialise model.generate() calls across the asyncio.gather rollout group.
+
+ The model lives on a single GPU; concurrent generate() calls would collide.
+ We let the env step run concurrently (the slow part — WebSocket round-trip +
+ MiniStack execution); only the generation is serialised.
+ """
+
+
+ def _generate_with_logprobs(
+     model,
+     tokenizer,
+     prompt_text: str,
+     sampling: SamplingCfg,
+ ) -> tuple[list[int], list[int], list[float]]:
+     """Generate one completion + return per-token logprobs.
+
+     Returns: (prompt_ids, completion_ids, completion_logprobs).
+     """
+     with _GENERATE_LOCK:
+         prompt_input = tokenizer(
+             prompt_text,
+             return_tensors="pt",
+             truncation=True,
+             max_length=sampling.max_prompt_length,
+         ).to(model.device)
+
+         was_training = model.training
+         model.eval()
+         try:
+             with torch.no_grad():
+                 gen_out = model.generate(
+                     **prompt_input,
+                     max_new_tokens=sampling.max_new_tokens,
+                     do_sample=True,
+                     temperature=sampling.temperature,
+                     top_p=sampling.top_p,
+                     return_dict_in_generate=True,
+                     output_scores=True,
+                     pad_token_id=tokenizer.pad_token_id,
+                 )
+         finally:
+             if was_training:
+                 model.train()
+
+         prompt_ids = prompt_input.input_ids[0].tolist()
+         prompt_len = len(prompt_ids)
+         completion_seq = gen_out.sequences[0, prompt_len:].tolist()
+
+         # Per-token logprobs from raw logits.
+         logprobs: list[float] = []
+         for i, scores_t in enumerate(gen_out.scores):
+             if i >= len(completion_seq):
+                 break
+             lp = torch.log_softmax(scores_t[0].float(), dim=-1)
+             logprobs.append(float(lp[completion_seq[i]].item()))
+
+         return prompt_ids, completion_seq, logprobs
+
+
+ async def rollout_one_episode(
+     env: AwsRlEnv,
+     task: Task,
+     model,
+     tokenizer,
+     system_prompt: str,
+     max_turns: int,
+     max_total_tokens: int,
+     sampling: SamplingCfg,
+ ) -> dict:
+     """Run one multi-turn episode in one env, accumulating tokens across turns."""
+     try:
+         res = await env.reset(task=task)
+     except Exception as e:
+         logger.warning("reset() failed for task=%s: %s", task.task_id, e)
+         return {
+             "prompt_ids": [],
+             "completion_ids": [],
+             "logprobs": [],
+             "task_reward": -1.0,
+             "task_achieved": False,
+             "final_progress": 0.0,
+             "num_steps": 0,
+             "transcript": [{"error": f"reset failed: {e!r}"}],
+         }
+     obs: AwsRlObservation = res.observation
+
+     prompt_ids: list[int] = []
+     completion_ids: list[int] = []
+     logprobs: list[float] = []
+     step_rewards: list[float] = []
+     history: list[dict] = []
+     final_progress = float(getattr(obs, "partial_progress", 0.0) or 0.0)
+     final_achieved = bool(getattr(obs, "task_achieved", False))
+
+     for _turn in range(max_turns):
+         if res.done:
+             break
+         if len(completion_ids) >= max_total_tokens:
+             break
+
+         user_text = build_user_prompt(task, obs, history)
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_text},
+         ]
+         prompt_text = apply_chat_template(tokenizer, messages)
+
+         # Generation runs on the calling thread (blocking) but env.step calls
+         # for other rollouts in this group can overlap because they're all
+         # awaiting in the same loop.
+         loop = asyncio.get_running_loop()
+         turn_prompt_ids, turn_completion_ids, turn_logprobs = await loop.run_in_executor(
+             None, _generate_with_logprobs, model, tokenizer, prompt_text, sampling
+         )
+         completion_text = tokenizer.decode(turn_completion_ids, skip_special_tokens=True)
+         cmd = extract_aws_command(completion_text)
+
+         try:
+             res = await env.step(AwsRlAction(command=cmd))
+             step_reward = float(res.reward or 0.0)
+         except Exception as e:
+             logger.warning("step() error on cmd=%r: %s", cmd[:80], e)
+             step_reward = -0.1
+             history.append(
+                 {
+                     "command": cmd,
+                     "output": f"ERROR: {e!r}",
+                     "reward": step_reward,
+                 }
+             )
+             prompt_ids.extend(turn_prompt_ids)
+             completion_ids.extend(turn_completion_ids)
+             logprobs.extend(turn_logprobs)
+             step_rewards.append(step_reward)
+             break
+
+         prompt_ids.extend(turn_prompt_ids)
+         completion_ids.extend(turn_completion_ids)
+         logprobs.extend(turn_logprobs)
+         step_rewards.append(step_reward)
+         obs = res.observation
+         final_progress = float(getattr(obs, "partial_progress", 0.0) or 0.0)
+         final_achieved = bool(getattr(obs, "task_achieved", False))
+         history.append(
+             {
+                 "command": cmd,
+                 "output": _truncate(getattr(obs, "command_output", "") or "", 500),
+                 "reward": step_reward,
+             }
+         )
+
+     return {
+         "prompt_ids": prompt_ids,
+         "completion_ids": completion_ids,
+         "logprobs": logprobs,
+         "task_reward": float(sum(step_rewards)) if step_rewards else -1.0,
+         "task_achieved": final_achieved,
+         "final_progress": final_progress,
+         "num_steps": len(history),
+         "transcript": history,
+         "task_id": int(task.task_id),
+         "difficulty": task.difficulty.value,
+     }
+
+
+ # ============================================================
+ # MultiTurnEnvPool — sync wrapper around N async env sessions
+ # ============================================================
+
+
+ class MultiTurnEnvPool:
+     """N persistent WebSocket env sessions, exposed via a sync ``run_group`` API.
+
+     Owns a background thread running an asyncio loop. Connect / close happens
+     once for the lifetime of training. Submitted coroutines run in the
+     background loop via ``asyncio.run_coroutine_threadsafe`` and the calling
+     thread blocks on the resulting concurrent.futures.Future.
+     """
+
+     def __init__(self, base_url: str, size: int, timeout_s: float = 120.0) -> None:
+         if size < 1:
+             raise ValueError("size must be >= 1")
+         self.base_url = base_url
+         self.size = size
+         self.timeout_s = timeout_s
+         self._envs: list[AwsRlEnv] = []
+         self._loop: Optional[asyncio.AbstractEventLoop] = None
+         self._thread: Optional[threading.Thread] = None
+         self._ready = threading.Event()
+         self._setup_error: Optional[BaseException] = None
+
+     def start(self) -> None:
+         """Open N WebSocket sessions on the background loop."""
+         if self._thread is not None:
+             return
+
+         def run() -> None:
+             loop = asyncio.new_event_loop()
+             self._loop = loop
+             asyncio.set_event_loop(loop)
+             try:
+                 loop.run_until_complete(self._connect_all())
+             except BaseException as e:
+                 self._setup_error = e
+                 self._ready.set()
+                 return
+             self._ready.set()
+             loop.run_forever()
+
+         self._thread = threading.Thread(target=run, daemon=True, name="env-pool")
+         self._thread.start()
+         self._ready.wait()
+         if self._setup_error is not None:
+             raise RuntimeError(
+                 f"MultiTurnEnvPool failed to connect {self.size} sessions to "
+                 f"{self.base_url}: {self._setup_error!r}"
+             )
+         logger.info("MultiTurnEnvPool: %d sessions on %s", self.size, self.base_url)
+
+     async def _connect_all(self) -> None:
+         envs = [AwsRlEnv(base_url=self.base_url) for _ in range(self.size)]
+         try:
+             await asyncio.gather(*(e.connect() for e in envs))
+         except BaseException:
+             await asyncio.gather(*(e.close() for e in envs), return_exceptions=True)
+             raise
+         self._envs = envs
+
+     def close(self) -> None:
+         if self._thread is None or self._loop is None:
+             return
+         loop = self._loop
+
+         async def _shutdown() -> None:
+             await asyncio.gather(
+                 *(e.close() for e in self._envs), return_exceptions=True
+             )
+
+         try:
+             fut = asyncio.run_coroutine_threadsafe(_shutdown(), loop)
+             fut.result(timeout=10.0)
+         except Exception as e:
+             logger.warning("Pool shutdown error (ignored): %s", e)
+         finally:
+             loop.call_soon_threadsafe(loop.stop)
+             self._thread.join(timeout=5.0)
+             self._thread = None
+             self._loop = None
+             self._envs = []
+
+     def run_group(
+         self,
+         task: Task,
+         model,
+         tokenizer,
+         system_prompt: str,
+         max_turns: int,
+         max_total_tokens: int,
+         sampling: SamplingCfg,
+     ) -> list[dict]:
+         """Run N concurrent multi-turn rollouts on the same task. Sync; blocks."""
+         assert self._loop is not None and self._envs, "call start() first"
+
+         async def _gather() -> list[dict]:
+             return list(
+                 await asyncio.gather(
+                     *(
+                         rollout_one_episode(
+                             env,
+                             task,
+                             model,
+                             tokenizer,
+                             system_prompt,
+                             max_turns,
+                             max_total_tokens,
+                             sampling,
+                         )
+                         for env in self._envs
+                     )
+                 )
+             )
+
+         fut = asyncio.run_coroutine_threadsafe(_gather(), self._loop)
+         return fut.result(timeout=self.timeout_s * max(1, max_turns))
+
+     def __enter__(self) -> "MultiTurnEnvPool":
+         self.start()
+         return self
+
+     def __exit__(self, *exc) -> None:
+         self.close()
+
+
+ # ============================================================
+ # Reward functions (TRL convention) + rollout_func factory
+ # ============================================================
+
+
+ def reward_task(completions: list[str], **kwargs) -> list[float]:
+     rewards = kwargs.get("task_reward")
+     if rewards is None:
+         return [0.0 for _ in completions]
+     return [float(r) for r in rewards]
+
+
+ def reward_achieved(completions: list[str], **kwargs) -> list[float]:
+     flags = kwargs.get("task_achieved")
+     if flags is None:
+         return [0.0 for _ in completions]
+     return [float(f) for f in flags]
+
+
+ def reward_progress(completions: list[str], **kwargs) -> list[float]:
+     progress = kwargs.get("final_progress")
+     if progress is None:
+         return [0.0 for _ in completions]
+     return [float(p) for p in progress]
+
+
+ def make_rollout_func(
+     curriculum: Curriculum,
+     pool: MultiTurnEnvPool,
+     model,
+     tokenizer,
+     system_prompt: str,
+     max_turns: int,
+     max_total_tokens: int,
+     sampling: SamplingCfg,
+     log_episode: Callable[[Task, list[dict]], None],
+ ) -> Callable:
+     """Build the closure GRPO calls each step.
+
+     ``prompts`` length equals ``num_generations``. We ignore the prompt strings
+     because the curriculum drives task selection — every rollout in the group
+     runs the same task forced through ``env.reset(task=...)``.
+     """
+
+     def rollout_func(prompts: list[str], trainer: GRPOTrainer) -> dict[str, list]:
+         task = curriculum.next_task()
+         results = pool.run_group(
+             task,
+             model,
+             tokenizer,
+             system_prompt,
+             max_turns,
+             max_total_tokens,
+             sampling,
+         )
+         # Pad / truncate to len(prompts) — defence in depth, group size should match.
+         if len(results) < len(prompts):
+             results.extend(results[-1:] * (len(prompts) - len(results)))
+         results = results[: len(prompts)]
+
+         group_rewards = [r["task_reward"] for r in results]
+         group_achieved = [r["task_achieved"] for r in results]
+         group_progress = [r["final_progress"] for r in results]
+
+         curriculum.record_result(
+             task,
+             achieved=any(group_achieved),
+             reward=float(sum(group_rewards) / len(group_rewards)) if group_rewards else 0.0,
+         )
+         log_episode(task, results)
+
+         return {
+             "prompt_ids": [r["prompt_ids"] for r in results],
+             "completion_ids": [r["completion_ids"] for r in results],
+             "logprobs": [r["logprobs"] for r in results],
+             "task_reward": group_rewards,
+             "task_achieved": [float(a) for a in group_achieved],
+             "final_progress": group_progress,
+         }
+
+     return rollout_func
+
+
+ # ============================================================
+ # CSV / JSONL logging + reward plotter
+ # ============================================================
+
+
+ class EpisodeLogger:
+     """Append-only CSV + JSONL writer for per-rollout episode rows."""
+
+     HEADER = [
+         "step",
+         "rollout_idx",
+         "task_id",
+         "difficulty",
+         "task_reward",
+         "task_achieved",
+         "final_progress",
+         "num_steps",
+         "tier",
+         "tier_success_rate",
+         "timestamp",
+     ]
+
+     def __init__(self, output_dir: Path) -> None:
+         self.output_dir = output_dir
+         output_dir.mkdir(parents=True, exist_ok=True)
+         self.csv_path = output_dir / "reward_log.csv"
+         self.jsonl_path = output_dir / "transcripts.jsonl"
+         if not self.csv_path.exists():
+             with open(self.csv_path, "w", newline="") as f:
+                 csv.writer(f).writerow(self.HEADER)
+         self._step_counter = 0
+
+     def log(self, task: Task, results: list[dict], curriculum: Curriculum) -> None:
+         self._step_counter += 1
+         stats = curriculum.get_stats()
+         ts = datetime.now().isoformat()
+         with open(self.csv_path, "a", newline="") as f:
+             writer = csv.writer(f)
+             for i, r in enumerate(results):
+                 writer.writerow(
+                     [
+                         self._step_counter,
+                         i,
+                         int(task.task_id),
+                         task.difficulty.value,
+                         f"{r['task_reward']:.4f}",
+                         int(bool(r["task_achieved"])),
+                         f"{r['final_progress']:.4f}",
+                         r["num_steps"],
+                         stats["tier"],
+                         stats["tier_success_rate"],
+                         ts,
+                     ]
+                 )
+         with open(self.jsonl_path, "a") as f:
+             for i, r in enumerate(results):
+                 f.write(
+                     json.dumps(
+                         {
+                             "step": self._step_counter,
+                             "rollout_idx": i,
+                             "task_id": int(task.task_id),
+                             "difficulty": task.difficulty.value,
+                             "task_reward": r["task_reward"],
+                             "task_achieved": bool(r["task_achieved"]),
+                             "final_progress": r["final_progress"],
+                             "num_steps": r["num_steps"],
+                             "tier": stats["tier"],
+                             "transcript": r["transcript"],
+                         }
+                     )
+                     + "\n"
+                 )
+
+         rewards = [r["task_reward"] for r in results]
+         achieved = [bool(r["task_achieved"]) for r in results]
+         logger.info(
+             "Step %d task=%d (%s) rewards=%s achieved=%d/%d tier=%s tier_rate=%.2f",
+             self._step_counter,
+             int(task.task_id),
+             task.difficulty.value,
+             [round(r, 2) for r in rewards],
+             sum(achieved),
+             len(achieved),
+             stats["tier"],
+             stats["tier_success_rate"],
+         )
+
+
+ def plot_rewards(csv_path: Path, out_path: Path) -> None:
+     """Per-step mean group reward + 10-step rolling avg + per-tier curves."""
+     import matplotlib
+
+     matplotlib.use("Agg")
+     import matplotlib.pyplot as plt
+
+     if not csv_path.exists():
+         logger.warning("No CSV at %s — skipping plot.", csv_path)
+         return
+
+     steps_data: dict[int, list[float]] = {}
+     tier_data: dict[str, list[tuple[int, float]]] = {}
+     with open(csv_path) as f:
+         reader = csv.DictReader(f)
+         for row in reader:
+             step = int(row["step"])
+             r = float(row["task_reward"])
+             tier = row["tier"]
+             steps_data.setdefault(step, []).append(r)
+             tier_data.setdefault(tier, []).append((step, r))
+
+     if not steps_data:
+         logger.warning("CSV at %s has no rows — skipping plot.", csv_path)
+         return
+
+     steps = sorted(steps_data.keys())
+     means = [sum(steps_data[s]) / len(steps_data[s]) for s in steps]
+
+     rolling = []
+     window = 10
+     for i in range(len(means)):
+         lo = max(0, i - window + 1)
+         rolling.append(sum(means[lo : i + 1]) / (i - lo + 1))
+
+     fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
+     ax1.plot(steps, means, label="mean group reward", alpha=0.5)
+     ax1.plot(steps, rolling, label=f"rolling avg (k={window})", linewidth=2)
+     ax1.set_xlabel("GRPO step")
+     ax1.set_ylabel("reward")
+     ax1.set_title("Group mean reward over training")
+     ax1.legend()
+     ax1.grid(alpha=0.3)
+
+     for tier, points in tier_data.items():
+         xs = [p[0] for p in points]
+         ys = [p[1] for p in points]
+         ax2.scatter(xs, ys, s=10, alpha=0.5, label=tier)
+     ax2.set_xlabel("GRPO step")
+     ax2.set_ylabel("reward")
+     ax2.set_title("Per-rollout reward by curriculum tier")
+     ax2.legend()
+     ax2.grid(alpha=0.3)
+
+     fig.tight_layout()
+     fig.savefig(out_path, dpi=120)
+     plt.close(fig)
+     logger.info("Reward plot written to %s", out_path)
+
+
+ # ============================================================
+ # Validation eval + Optuna search
+ # ============================================================
+
+
+ def pick_validation_task_ids(
+     curriculum: Optional[Curriculum] = None,
+     k_per_tier: int = 2,
+     seed: int = 42,
+ ) -> list[int]:
+     """Pick a frozen list of task ids — k per tier — for held-out validation."""
+     import random
+
+     rng = random.Random(seed)
+     cur = curriculum or Curriculum()
+     chosen: list[int] = []
+     for tier in TaskDifficulty:
+         try:
+             from server.services.curriculum import load_tier
+
+             tier_tasks = load_tier(tier, cur._tasks_dir)
+         except Exception as e:
+             logger.warning("Could not load tier %s for val: %s", tier.value, e)
+             continue
+         if not tier_tasks:
+             continue
+         sample = rng.sample(tier_tasks, k=min(k_per_tier, len(tier_tasks)))
+         chosen.extend(int(t.task_id) for t in sample)
+     return chosen
+
+
+ def evaluate_on_validation(
+     model,
+     tokenizer,
+     pool: MultiTurnEnvPool,
+     val_task_ids: list[int],
+     system_prompt: str,
+     max_turns: int,
+     max_total_tokens: int,
+     sampling: SamplingCfg,
+     curriculum: Optional[Curriculum] = None,
+ ) -> dict[str, float]:
+     """Run ONE rollout per val task on env[0] of the pool. Return aggregate metrics."""
+     cur = curriculum or Curriculum()
+     achieved_flags: list[float] = []
+     progresses: list[float] = []
+     rewards: list[float] = []
+
+     async def _eval_one(task: Task) -> dict:
+         env = pool._envs[0]
+         return await rollout_one_episode(
+             env,
+             task,
+             model,
+             tokenizer,
+             system_prompt,
+             max_turns,
+             max_total_tokens,
+             sampling,
+         )
+
+     for tid in val_task_ids:
+         try:
+             task = cur.get_task_by_id(TaskID(int(tid)))
+         except KeyError:
+             logger.warning("val task_id=%d not found — skipping", tid)
+             continue
+         fut = asyncio.run_coroutine_threadsafe(_eval_one(task), pool._loop)
+         try:
+             res = fut.result(timeout=pool.timeout_s * max(1, max_turns))
+         except Exception as e:
+             logger.warning("val rollout failed for task=%d: %s", tid, e)
+             continue
+         achieved_flags.append(float(res["task_achieved"]))
+         progresses.append(float(res["final_progress"]))
+         rewards.append(float(res["task_reward"]))
+
+     n = max(1, len(achieved_flags))
+     return {
+         "achieved_rate": sum(achieved_flags) / n,
+         "mean_progress": sum(progresses) / n,
+         "mean_reward": sum(rewards) / n,
+         "n_evaluated": float(len(achieved_flags)),
+     }
+
+
+ def _build_grpo_config(
+     output_dir: Path,
+     cfg: dict[str, Any],
+     max_steps: int,
+     max_completion_length: int,
+     max_prompt_length: int,
+     save_steps: int = 25,
+     save_strategy: str = "steps",
+     report_to: str = "none",
+ ) -> GRPOConfig:
+     return GRPOConfig(
+         output_dir=str(output_dir),
+         max_steps=max_steps,
+         learning_rate=float(cfg["learning_rate"]),
+         beta=float(cfg["beta"]),
+         num_generations=int(cfg["num_generations"]),
+         generation_batch_size=int(cfg["num_generations"]),
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=8,
+         gradient_checkpointing=True,
+         gradient_checkpointing_kwargs={"use_reentrant": False},
+         max_completion_length=max_completion_length,
+         max_prompt_length=max_prompt_length,
+         temperature=float(cfg["temperature"]),
+         top_p=float(cfg["top_p"]),
+         logging_steps=1,
+         save_strategy=save_strategy,
+         save_steps=save_steps,
+         save_total_limit=3,
+         report_to=report_to,
+         loss_type="dapo",
+         mask_truncated_completions=True,
+         warmup_ratio=0.05,
+         lr_scheduler_type="cosine",
+         max_grad_norm=1.0,
+         use_vllm=False,
+         remove_unused_columns=False,
+     )
+
+
+ def _build_dummy_dataset(num_rows: int) -> Dataset:
+     """A length-only dataset; the prompts are ignored by ``rollout_func``."""
+     return Dataset.from_dict({"prompt": ["solve"] * max(1, num_rows)})
+
+
+ def optuna_search(
+     n_trials: int,
+     trial_max_steps: int,
+     val_task_ids: list[int],
+     base_model: str,
+     sft_adapter: Optional[str],
+     env_url: str,
+     output_dir: Path,
+     max_total_tokens: int = 2048,
+     max_completion_length: int = 256,
+     max_prompt_length: int = 2048,
+     seed: int = 42,
+ ):
+     """TPE-sampled hyperparam search. Persists to ``output_dir/optuna.db``."""
+     import optuna
+
+     output_dir.mkdir(parents=True, exist_ok=True)
+     study = optuna.create_study(
+         direction="maximize",
+         study_name="aws-rl-grpo",
+         storage=f"sqlite:///{output_dir / 'optuna.db'}",
+         load_if_exists=True,
+         sampler=optuna.samplers.TPESampler(seed=seed),
+     )
+
+     def _objective(trial: optuna.Trial) -> float:
+         cfg = {
+             "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
+             "beta": trial.suggest_float("beta", 0.0, 0.1),
+             "num_generations": trial.suggest_categorical("num_generations", [4, 8]),
+             "temperature": trial.suggest_float("temperature", 0.7, 1.0),
+             "top_p": trial.suggest_float("top_p", 0.85, 0.98),
+             "lora_r": trial.suggest_categorical("lora_r", [8, 16, 32]),
+             "lora_alpha_mul": trial.suggest_categorical("lora_alpha_mul", [1, 2, 4]),
+             "max_turns": trial.suggest_categorical("max_turns", [4, 6, 8]),
+         }
+         trial_dir = output_dir / f"trial_{trial.number:03d}"
+         return _run_one_trial(
+             cfg=cfg,
+             trial_max_steps=trial_max_steps,
+             val_task_ids=val_task_ids,
+             base_model=base_model,
+             sft_adapter=sft_adapter,
+             env_url=env_url,
+             output_dir=trial_dir,
+             max_total_tokens=max_total_tokens,
+             max_completion_length=max_completion_length,
+             max_prompt_length=max_prompt_length,
+         )
+
+     study.optimize(_objective, n_trials=n_trials, gc_after_trial=True)
+
+     best_path = output_dir / "best_cfg.json"
+     payload = {"best_value": study.best_value, "best_params": dict(study.best_params)}
+     with open(best_path, "w") as f:
+         json.dump(payload, f, indent=2)
+     logger.info(
+         "Optuna study finished. best_value=%.4f best_params=%s -> %s",
+         study.best_value,
+         study.best_params,
+         best_path,
+     )
+     return study
+
+
+ def _run_one_trial(
+     cfg: dict[str, Any],
+     trial_max_steps: int,
+     val_task_ids: list[int],
+     base_model: str,
+     sft_adapter: Optional[str],
+     env_url: str,
+     output_dir: Path,
+     max_total_tokens: int,
+     max_completion_length: int,
+     max_prompt_length: int,
+ ) -> float:
+     """One Optuna trial: load → train → eval on val tasks → tear down → return objective."""
+     output_dir.mkdir(parents=True, exist_ok=True)
+     logger.info("Optuna trial cfg=%s -> %s", cfg, output_dir)
+
+     model = tokenizer = None
+     pool: Optional[MultiTurnEnvPool] = None
+     trainer: Optional[GRPOTrainer] = None
+     try:
+         model, tokenizer = load_policy(base_model, sft_adapter, trainable=True)
+
+         pool = MultiTurnEnvPool(env_url, size=int(cfg["num_generations"]))
+         pool.start()
+
+         curriculum = Curriculum()
+         sampling = SamplingCfg(
+             temperature=float(cfg["temperature"]),
+             top_p=float(cfg["top_p"]),
+             max_new_tokens=max_completion_length,
+             max_prompt_length=max_prompt_length,
+         )
+         ep_logger = EpisodeLogger(output_dir)
+         rollout_func = make_rollout_func(
+             curriculum=curriculum,
+             pool=pool,
+             model=model,
+             tokenizer=tokenizer,
+             system_prompt=SYSTEM_PROMPT,
+             max_turns=int(cfg["max_turns"]),
+             max_total_tokens=max_total_tokens,
+             sampling=sampling,
+             log_episode=lambda task, results: ep_logger.log(task, results, curriculum),
+         )
+
+         dataset = _build_dummy_dataset(trial_max_steps * int(cfg["num_generations"]))
+         grpo_cfg = _build_grpo_config(
+             output_dir=output_dir,
+             cfg=cfg,
+             max_steps=trial_max_steps,
+             max_completion_length=max_completion_length,
+             max_prompt_length=max_prompt_length,
+             save_strategy="no",
+             report_to="none",
+         )
+
+         trainer = GRPOTrainer(
+             model=model,
+             processing_class=tokenizer,
+             reward_funcs=[reward_task, reward_achieved, reward_progress],
+             train_dataset=dataset,
+             args=grpo_cfg,
+             rollout_func=rollout_func,
+             peft_config=None if sft_adapter else _lora_config(cfg),
+         )
+         trainer.train()
+
+         metrics = evaluate_on_validation(
+             model=trainer.model,
+             tokenizer=tokenizer,
+             pool=pool,
+             val_task_ids=val_task_ids,
+             system_prompt=SYSTEM_PROMPT,
+             max_turns=int(cfg["max_turns"]),
+             max_total_tokens=max_total_tokens,
+             sampling=sampling,
+             curriculum=curriculum,
+         )
+         objective = 0.7 * metrics["achieved_rate"] + 0.3 * metrics["mean_progress"]
+         with open(output_dir / "trial_metrics.json", "w") as f:
+             json.dump({"cfg": cfg, "metrics": metrics, "objective": objective}, f, indent=2)
+         logger.info("Trial done: metrics=%s objective=%.4f", metrics, objective)
+         return float(objective)
+     finally:
+         if trainer is not None:
+             try:
+                 del trainer
+             except Exception:
+                 pass
+         if model is not None:
+             free_model(model)
+         if pool is not None:
+             try:
+                 pool.close()
+             except Exception:
+                 logger.exception("Pool close error during trial cleanup")
+         gc.collect()
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+
+ def _lora_config(cfg: dict[str, Any]) -> LoraConfig:
+     r = int(cfg["lora_r"])
+     alpha_mul = int(cfg["lora_alpha_mul"])
+     return LoraConfig(
+         r=r,
+         lora_alpha=r * alpha_mul,
+         lora_dropout=0.05,
+         bias="none",
+         task_type="CAUSAL_LM",
+         target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+     )
+
+
+ # ============================================================
+ # Main training entrypoint (single training pass)
+ # ============================================================
+
+
+ def run_training(
+     cfg: dict[str, Any],
+     *,
+     base_model: str,
+     sft_adapter: Optional[str],
+     env_url: str,
+     output_dir: Path,
+     max_steps: int,
+     max_total_tokens: int = 4096,
+     max_completion_length: int = 256,
+     max_prompt_length: int = 2048,
+     push_to_hub: bool = False,
+     hub_repo: Optional[str] = None,
+ ) -> Path:
+     """Run a full GRPO training pass with the supplied config dict."""
+     output_dir.mkdir(parents=True, exist_ok=True)
+     logger.info("run_training cfg=%s -> %s", cfg, output_dir)
+
+     model, tokenizer = load_policy(base_model, sft_adapter, trainable=True)
+     pool = MultiTurnEnvPool(env_url, size=int(cfg["num_generations"]))
+     pool.start()
+
+     curriculum = Curriculum()
+     sampling = SamplingCfg(
+         temperature=float(cfg["temperature"]),
+         top_p=float(cfg["top_p"]),
+         max_new_tokens=max_completion_length,
+         max_prompt_length=max_prompt_length,
+     )
+     ep_logger = EpisodeLogger(output_dir)
+     rollout_func = make_rollout_func(
+         curriculum=curriculum,
+         pool=pool,
+         model=model,
+         tokenizer=tokenizer,
+         system_prompt=SYSTEM_PROMPT,
+         max_turns=int(cfg["max_turns"]),
+         max_total_tokens=max_total_tokens,
+         sampling=sampling,
+         log_episode=lambda task, results: ep_logger.log(task, results, curriculum),
+     )
+
+     dataset = _build_dummy_dataset(max_steps * int(cfg["num_generations"]))
+     grpo_cfg = _build_grpo_config(
+         output_dir=output_dir,
+         cfg=cfg,
+         max_steps=max_steps,
+         max_completion_length=max_completion_length,
+         max_prompt_length=max_prompt_length,
+     )
+
+     trainer = GRPOTrainer(
+         model=model,
+         processing_class=tokenizer,
+         reward_funcs=[reward_task, reward_achieved, reward_progress],
+         train_dataset=dataset,
+         args=grpo_cfg,
+         rollout_func=rollout_func,
+         peft_config=None if sft_adapter else _lora_config(cfg),
+     )
+
+     try:
+         trainer.train()
+     finally:
+         try:
+             pool.close()
+         except Exception:
+             logger.exception("Pool close error after training")
+         try:
+             plot_rewards(ep_logger.csv_path, output_dir / "reward_plot.png")
+         except Exception as e:
+             logger.warning("plot_rewards failed: %s", e)
+
+     trainer.save_model(str(output_dir))
+     logger.info("Adapter saved to %s", output_dir)
+
+     if push_to_hub and hub_repo:
+         trainer.push_to_hub(repo_id=hub_repo)
+         logger.info("Adapter pushed to https://huggingface.co/%s", hub_repo)
+
+     return output_dir
+
+
+ # ============================================================
+ # CLI
+ # ============================================================
+
+
+ def _parse_args() -> argparse.Namespace:
+     p = argparse.ArgumentParser(description=__doc__)
+     p.add_argument("--mode", choices=["train", "optuna", "full"], default="train")
+     p.add_argument("--base-model", default=PolicySpec.base_model)
+     p.add_argument("--sft-adapter", default=PolicySpec.sft_adapter,
+                    help="HF repo id of the SFT adapter (use empty string to disable)")
+     p.add_argument("--env-url", default="http://localhost:8000")
+     p.add_argument("--output-dir", default=None)
+
+     # Train-mode hyperparams (mirror DEFAULT_CFG keys)
+     p.add_argument("--num-generations", type=int, default=DEFAULT_CFG["num_generations"])
+     p.add_argument("--max-turns", type=int, default=DEFAULT_CFG["max_turns"])
+     p.add_argument("--max-steps", type=int, default=200)
+     p.add_argument("--learning-rate", type=float, default=DEFAULT_CFG["learning_rate"])
+     p.add_argument("--beta", type=float, default=DEFAULT_CFG["beta"])
+     p.add_argument("--temperature", type=float, default=DEFAULT_CFG["temperature"])
+     p.add_argument("--top-p", type=float, default=DEFAULT_CFG["top_p"])
+     p.add_argument("--lora-r", type=int, default=DEFAULT_CFG["lora_r"])
+     p.add_argument("--lora-alpha-mul", type=int, default=DEFAULT_CFG["lora_alpha_mul"])
+     p.add_argument("--max-prompt-length", type=int, default=2048)
+     p.add_argument("--max-completion-length", type=int, default=256)
+     p.add_argument("--max-total-tokens", type=int, default=4096)
+
+     # Optuna-specific
+     p.add_argument("--n-trials", type=int, default=6)
+     p.add_argument("--trial-max-steps", type=int, default=30)
+     p.add_argument("--val-tasks-per-tier", type=int, default=2)
+
+     p.add_argument("--push-to-hub", action="store_true")
+     p.add_argument("--hub-repo", default=None)
+     return p.parse_args()
+
+
+ def _resolve_output_dir(args: argparse.Namespace) -> Path:
+     if args.output_dir:
+         return Path(args.output_dir)
+     ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+     return Path("outputs") / f"aws-rl-grpo-{ts}"
+
+
+ def _cli_cfg(args: argparse.Namespace) -> dict[str, Any]:
+     return {
+         "learning_rate": args.learning_rate,
+         "beta": args.beta,
+         "num_generations": args.num_generations,
+         "temperature": args.temperature,
+         "top_p": args.top_p,
+         "lora_r": args.lora_r,
+         "lora_alpha_mul": args.lora_alpha_mul,
+         "max_turns": args.max_turns,
+     }
+
+
+ def main() -> None:
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s %(levelname)s %(name)s %(message)s",
+     )
+     args = _parse_args()
+     output_dir = _resolve_output_dir(args)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     sft_adapter = args.sft_adapter or None
+
+     if args.mode in ("optuna", "full"):
+         val_ids = pick_validation_task_ids(k_per_tier=args.val_tasks_per_tier)
+         with open(output_dir / "val_task_ids.json", "w") as f:
+             json.dump(val_ids, f)
+         study = optuna_search(
+             n_trials=args.n_trials,
+             trial_max_steps=args.trial_max_steps,
+             val_task_ids=val_ids,
+             base_model=args.base_model,
+             sft_adapter=sft_adapter,
+             env_url=args.env_url,
+             output_dir=output_dir,
+             max_total_tokens=args.max_total_tokens,
+             max_completion_length=args.max_completion_length,
+             max_prompt_length=args.max_prompt_length,
+         )
+         if args.mode == "optuna":
+             return
+         cfg = {**DEFAULT_CFG, **dict(study.best_params)}
+     else:
+         cfg = _cli_cfg(args)
+
+     run_training(
+         cfg,
+         base_model=args.base_model,
+         sft_adapter=sft_adapter,
+         env_url=args.env_url,
+         output_dir=output_dir,
+         max_steps=args.max_steps,
+         max_total_tokens=args.max_total_tokens,
+         max_completion_length=args.max_completion_length,
+         max_prompt_length=args.max_prompt_length,
+         push_to_hub=args.push_to_hub,
+         hub_repo=args.hub_repo,
+     )
+
+
+ if __name__ == "__main__":
+     main()
train_grpo_lora_final.ipynb ADDED
The diff for this file is too large to render. See raw diff