Sizzing committed on
Commit
456f5a3
·
verified ·
1 Parent(s): 1b2b81e

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes; see the raw diff for the rest.

Files changed (50)
  1. .gitattributes +14 -0
  2. Dockerfile +7 -1
  3. README.md +543 -436
  4. aws_rl_env_colab.ipynb +233 -0
  5. compare/README.md +230 -0
  6. compare/compare_base_vs_sft.ipynb +0 -0
  7. compare/compare_base_vs_sft_with_outputs.ipynb +0 -0
  8. data/README.md +238 -0
  9. docs/figures/base_vs_sft_success.png +0 -0
  10. docs/figures/compare_dataset.png +3 -0
  11. docs/figures/compare_rl_env.png +3 -0
  12. docs/figures/env_init_screenshot.png +3 -0
  13. docs/figures/grpo_final_per_step.png +3 -0
  14. docs/figures/grpo_optuna_history.png +0 -0
  15. docs/figures/grpo_optuna_history_v0.png +0 -0
  16. docs/figures/grpo_optuna_hparams.png +0 -0
  17. docs/figures/grpo_optuna_importances.png +0 -0
  18. docs/figures/grpo_optuna_parallel.png +0 -0
  19. docs/figures/grpo_optuna_trial_curves.png +3 -0
  20. docs/figures/grpo_optuna_trials_comparison.png +3 -0
  21. docs/figures/grpo_per_tier_curve.png +0 -0
  22. docs/figures/grpo_reward_by_tier.png +0 -0
  23. docs/figures/grpo_reward_curve.png +3 -0
  24. docs/figures/ministack_logo.png +3 -0
  25. docs/figures/model_eval_chart.png +0 -0
  26. docs/figures/optuna_history.png +0 -0
  27. docs/figures/optuna_parallel.png +3 -0
  28. docs/figures/optuna_param_importance.png +0 -0
  29. docs/figures/optuna_slice.png +3 -0
  30. docs/figures/optuna_trial_curves.png +0 -0
  31. docs/figures/qualitative_rollouts.png +0 -0
  32. docs/figures/rl_env_eval_base_vs_sft.png +0 -0
  33. docs/figures/sft_loss_curve.png +3 -0
  34. docs/figures/sft_optuna_trials_table.png +0 -0
  35. docs/figures/sft_vs_grpo_by_tier.png +0 -0
  36. docs/figures/sft_vs_grpo_metrics_grid.png +0 -0
  37. docs/figures/sft_vs_grpo_scalar.png +0 -0
  38. docs/figures/single_step_eval.png +0 -0
  39. images/compare_dataset.png +3 -0
  40. images/compare_rl_env.png +3 -0
  41. pyproject.toml +10 -1
  42. scripts/README.md +260 -0
  43. server/README.md +596 -0
  44. server/app.py +150 -65
  45. tests/test_pool.py +325 -0
  46. train/README.md +545 -0
  47. train/train_grpo_lora.ipynb +0 -0
  48. train/train_sft_lora.ipynb +0 -0
  49. train_grpo.py +1283 -0
  50. train_grpo_lora_final.ipynb +0 -0
.gitattributes CHANGED
@@ -36,3 +36,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  aws_infra/ministack_logo.png filter=lfs diff=lfs merge=lfs -text
  scripts/Screenshot[[:space:]]2026-04-20[[:space:]]at[[:space:]]6.50.47 PM.png filter=lfs diff=lfs merge=lfs -text
  scripts/Screenshot[[:space:]]2026-04-20[[:space:]]at[[:space:]]6.50.47 PM.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/compare_dataset.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/compare_rl_env.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/env_init_screenshot.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/grpo_final_per_step.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/grpo_optuna_trial_curves.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/grpo_optuna_trials_comparison.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/grpo_reward_curve.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/ministack_logo.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/optuna_parallel.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/optuna_slice.png filter=lfs diff=lfs merge=lfs -text
+ docs/figures/sft_loss_curve.png filter=lfs diff=lfs merge=lfs -text
+ images/compare_dataset.png filter=lfs diff=lfs merge=lfs -text
+ images/compare_rl_env.png filter=lfs diff=lfs merge=lfs -text
+ scripts/Screenshot[[:space:]]2026-04-20[[:space:]]at[[:space:]]6.50.47 PM.png filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -80,7 +80,7 @@ RUN mkdir -p /root/.aws && \
  ENV AWS_ENDPOINT_URL=http://localhost:4566
 
  # Enable the web interface for OpenEnv (if applicable)
- ENV ENABLE_WEB_INTERFACE=false
+ ENV ENABLE_WEB_INTERFACE=true
 
  # Set PATH to use the virtual environment
  ENV PATH="/app/.venv/bin:$PATH"
@@ -90,6 +90,9 @@ ENV PYTHONPATH="/app/env:$PYTHONPATH"
 
  ENV AWS_RL_ENV_POOL_SIZE=8
  ENV AWS_RL_ENV_MINISTACK_BASE_PORT=4566
+ # Dedicated port for the web playground's lazily-spawned MiniStack.
+ # Kept outside the pool's range so a WebSocket session can never claim it.
+ ENV AWS_RL_ENV_WEB_MINISTACK_PORT=4565
 
  # DEV_MODE=1 enables live reload via --reload flag
  ENV DEV_MODE=0
@@ -100,6 +103,9 @@ ENV MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
  # Entrypoint: start N MiniStack instances (AWS_RL_ENV_POOL_SIZE, default 1),
  # then run the FastAPI server. Each MiniStack listens on a distinct port
  # starting at AWS_RL_ENV_MINISTACK_BASE_PORT (default 4566).
+ # The web playground's MiniStack on AWS_RL_ENV_WEB_MINISTACK_PORT is NOT
+ # started here — the FastAPI server spawns it lazily on the first /web/*
+ # request so training-only deployments pay zero cost.
  # cloudflared tunnel --url localhost:8000
  CMD ["sh", "-c", "\
  POOL_SIZE=\"${AWS_RL_ENV_POOL_SIZE:-1}\"; \
README.md CHANGED
@@ -11,244 +11,167 @@ tags:
  - openenv
  ---
 
- # AWS Cloud CLI and SRE Reinforcement Learning Environment
-
- An **OpenEnv RL environment** for training AI agents on real-world AWS cloud operations. The agent sends AWS CLI commands as actions, receives structured observations, and progresses through a **curriculum of 120+ tasks** across 5 difficulty tiers — from basic listing to SRE incident response and security posture auditing.
-
- The agent interacts with a **real-world AWS shell simulator** — a vendored MiniStack emulator (34 AWS services, in-memory, zero-cost) inside the same Docker container. Every executed command returns the same response as production AWS. The grading system awards rewards and penalties based on the **actual AWS infrastructure state** instead of static metrics. No AWS account needed.
-
- > **[Try the Playground](https://sizzing-aws-rl-env.hf.space/web)** | **[API Docs](https://sizzing-aws-rl-env.hf.space/docs)** | **[Hugging Face Space](https://huggingface.co/spaces/Sizzing/aws_rl_env)**
-
- ## Task Tiers (120+ Tasks)
-
- ### Warmup — 20 tasks
- > List resources — single read-only commands
-
- - Run one AWS CLI command to list or describe a resource type
- - S3 buckets, EC2 instances, DynamoDB tables, Lambda functions, RDS, EBS volumes
- - Graded by **command_match** — checks operation + service pair
- - No setup required, no state mutations
-
- ### Beginner — 20 tasks
- > Create single resources with verification
-
- - Create an S3 bucket, DynamoDB table, SQS queue, or Lambda function
- - Graded by **resource_creation** — verifies the exact resource exists in the AWS Infrastructure Simulator
- - Introduces resource name validation — "my-bucket-2" won't satisfy a check for "my-bucket"
- - First tier where idempotency bonus (+0.02) can be earned
-
- ### Intermediate — 20 tasks
- > Multi-step workflows — create, configure, connect
-
- - Ordered sequences: create a bucket then enable versioning, create a table then add an item
- - Graded by **multi_step** — validates each step was completed in order
- - Chaos injection begins at **10% probability** — resources may be silently mutated mid-episode
- - Rollback penalty (-0.1) starts to matter with multi-step create/delete patterns
-
- ### Advanced — 20 tasks
- > Cross-service architectures spanning multiple AWS services
-
- - Wire Lambda to SQS, configure API Gateway with integrations, build event-driven pipelines
- - Graded by **multi_step + services** — all required services must be configured
- - Chaos injection escalates to **20% probability** — DynamoDB throughput, Lambda configs may change
- - Hints cost more: 3 hints = only 61% of max reward (0.85³ decay)
-
- ### Expert — 20 tasks
- > SRE incidents, drift detection & security posture audits
-
- - Fix overly permissive S3 policies, replace broad IAM inline policies, repair broken infrastructure
- - Graded by **state_checks** — actual CLI commands run against MiniStack at grading time
- - Chaos injection at **30% probability** — maximum perturbation frequency
- - **6 drift detection tasks** — correct infra is provisioned, then 2-3 random mutations applied from a pool
- - Agent must audit environment, discover which resources drifted, and fix only those
- - Drift is randomized per episode — prevents memorization of fix sequences
 
  ---
 
- ## Features
-
- ### 1. Curriculum & Training
-
- Adaptive learning system that tracks mastery and selects optimal tasks.
-
- #### Progressive Difficulty
- - **What:** The environment organizes 120+ tasks across 5 tiers: Warmup, Beginner, Intermediate, Advanced, and Expert. Tasks progress from simple listing operations to complex SRE incident response and drift detection scenarios.
- - **Why:** Prevents the agent from being overwhelmed by complex tasks early on. Scaffolded difficulty ensures the agent builds foundational skills before tackling multi-service architectures.
- - **How:** The `CurriculumManager` maintains per-agent tier state. Promotion requires meeting a minimum episode count and success rate threshold. A fast-track mechanism allows agents scoring 90%+ on 3 consecutive episodes to skip the minimum wait.
- - **Metrics:** 5 Difficulty Tiers | 120+ Total Tasks | 90% Fast-track Threshold
-
- #### Mastery Tracking
- - **What:** Each task independently tracks the agent's performance using a weighted success rate over a sliding window. Tasks "graduate" when performance exceeds the mastery threshold consistently.
- - **Why:** Ensures the agent truly masters a skill before moving on. Prevents lucky single completions from being treated as mastery. Un-graduation catches skill decay.
- - **How:** A `mastery_window` of 10 episodes and a `mastery_threshold` of 0.7 (70% success). Minimum 3 attempts required before graduation. Recent results are weighted more heavily using exponential decay (factor 0.85). Graduated tasks can un-graduate if performance drops.
- - **Metrics:** 70% Mastery Threshold | 10 Window Size | 0.85 Decay Factor
-
- #### Spaced Repetition
- - **What:** Graduated tasks don't disappear — they resurface at exponentially increasing intervals (3, 6, 12, 24, 48 episodes) for re-testing, earning a +30 priority bonus when due.
- - **Why:** Prevents catastrophic forgetting. The agent must retain skills even as it learns new ones. Exponential spacing is the most efficient retention schedule, borrowed from cognitive science.
- - **How:** Each task tracks a `spaced_rep_interval` starting at 3 episodes. When a re-test passes, the interval doubles (up to 48). If it fails, the interval resets. `_is_spaced_rep_due()` checks elapsed episodes against the interval.
- - **Metrics:** +30 Spaced Rep Bonus | 3→48 Interval Range | 2x Interval Growth
-
- #### Priority Selection
- - **What:** Tasks are ranked by a composite score combining novelty, weakness, spaced repetition due dates, and recency. The highest-scoring task is selected for each episode.
- - **Why:** Optimizes the training curriculum by ensuring the agent explores new tasks, practices weak areas, revisits graduated skills, and maintains variety — all balanced automatically.
- - **How:** `score = novelty_bonus (+100 if never attempted) + weakness_weight (+50 × (1 - success_rate)) + spaced_rep_bonus (+30 if due) - recency_penalty (-20 if attempted in last 2 episodes)`. Uses exponential decay (0.85) to emphasize recent performance.
- - **Metrics:** +100 Novelty Bonus | +50 Max Weakness Weight | -20 Recency Penalty
-
- #### Tier Progression
- - **What:** Agents advance through tiers via standard promotion (minimum episodes + success rate) or fast-track (3 consecutive high-scoring episodes). Tiers gate access to increasingly complex task pools.
- - **Why:** Provides structure to the learning process. Standard promotion ensures sufficient exposure; fast-track rewards agents that demonstrate immediate competence.
- - **How:** Standard: complete `min_episodes` at the current tier with `success_rate >= advance_rate`. Fast-track: 3 consecutive episodes at >= 90% success bypasses the minimum episode requirement. Un-promotion is not supported — agents cannot drop tiers.
- - **Metrics:** 3 Fast-track Streak | 90% Fast-track Rate | 5 Total Tiers
-
- ### 2. Reward Shaping
-
- Dense reward signals that encourage operational discipline and real progress.
-
- ```
- if task_achieved: reward = 1.0
- else:
-     reward = partial_progress * 0.8       # base: scaled to [0.0, 0.8]
-     if progress_increased: reward += 0.1  # dense signal for advancing
-     if command_failed: reward *= 0.5      # penalty for errors
-     reward = clamp(reward, 0.0, 0.99)     # never 1.0 without completion
- reward *= 0.85 ** hints_used              # hint decay
- if survived_chaos: reward *= 1.05         # chaos survival bonus
- ```
-
- #### Rollback Penalty & Idempotency Bonus
- - **What:** Detects create→delete pairs on the same resource (rollbacks) and penalizes them (-0.1 each). Rewards graceful "already exists" handling (+0.02) where the agent retries idempotently.
- - **Why:** First RL environment rewarding operational discipline. In production, create-then-delete cycles are wasteful. Handling "already exists" gracefully is a sign of robust automation.
- - **How:** `EpisodeTracker.detect_rollbacks()` scans command history for paired create/delete operations on the same resource. Idempotency detection looks for commands that fail with "already exists" patterns (BucketAlreadyExists, ResourceInUseException, etc.) followed by successful continuation.
- - **Metrics:** -0.1 Rollback Penalty | +0.02 Idempotency Bonus | Per-pair Detection
-
- #### Shaped Reward System
- - **What:** Rewards are carefully shaped: 1.0 for full completion, 0.0-0.8 for partial progress, +0.1 progress bonus for advancing, ×0.5 for failures, capped at 0.99 without completion. Chaos bonus (×1.05) and hint decay (×0.85^n) layer on top.
- - **Why:** Dense reward signal prevents sparse-reward stagnation. The agent gets meaningful feedback on every step, not just at episode end. Capping at 0.99 ensures only real completion earns full credit.
- - **How:** `TaskGrader` dispatches to 5 strategies by tier: `command_match` (warmup), `resource_creation` (beginner), `multi_step` (intermediate), `multi_step+services` (advanced), and `state_checks` (expert). Each returns `partial_progress`, which is converted to reward with bonuses/penalties applied.
- - **Metrics:** 1.0 Max Reward | 0.99 Progress Cap | ×1.05 Chaos Bonus
-
- #### Multi-Strategy Grading
- - **What:** Five distinct grading strategies, one per tier: `command_match` checks operation+service pairs, `resource_creation` verifies resources exist, `multi_step` validates ordered sequences, advanced adds service coverage, and expert runs `state_checks` against MiniStack.
- - **Why:** Each tier tests fundamentally different skills. A single grading strategy would either be too lenient for beginners or miss the nuance needed for expert SRE tasks.
- - **How:** `TaskGrader.grade()` dispatches based on the task's `grading_strategy` field. Each strategy returns a `GradeResult` with `partial_progress` (0.0-1.0), a `completed` flag, and details. Grading is deterministic and fully automated.
- - **Metrics:** 5 Grading Strategies | 100% Automated | Per-tier Selection
-
- ### 3. Resilience & Adaptability
-
- Features that test agent robustness under unpredictable conditions.
-
- #### Progressive Hint System
- - **What:** A 3-level hint system where each level reveals progressively more detail: Level 1 names the AWS services, Level 2 describes the operations, Level 3 gives near-complete command structure. Each hint reduces the final reward by ×0.85.
- - **Why:** Creates an information-reward tradeoff unique in RL. The agent learns to wean off hints over time — initially relying on them for unfamiliar tasks, then solving independently for maximum reward. From a GRPO perspective, it creates a natural exploration/exploitation axis within a single episode.
- - **How:** The agent issues the special command `aws help --task-hint` as its action (intercepted before reaching MiniStack). Hints are auto-generated from `SuccessCriteria` fields (services, steps, operations). Reward decay: `final_reward *= 0.85 ^ hints_used` — 0 hints: 1.0×, 1 hint: 0.85×, 2 hints: 0.72×, 3 hints: 0.61×. The curriculum naturally penalizes hint-dependent agents: lower rewards → slower graduation.
- - **Metrics:** 3 Hint Levels | ×0.85 Decay Per Hint | ~61% Reward with 3 Hints
-
- #### Chaos Injection Engine
- - **What:** Silently mutates AWS resource state mid-episode to test agent resilience. Perturbations are scoped to services the current task uses. If the agent completes despite chaos, it earns a ×1.05 bonus.
- - **Why:** Tests whether the agent can handle unexpected state changes — a critical SRE skill. Prevents brittle memorization of exact command sequences. Probability scales with tier difficulty.
- - **How:** `ChaosEngine` selects perturbation templates specific to the services in use (S3 policy changes, DynamoDB throughput modifications, Lambda config alterations, etc.). Resource names are extracted from successful commands via regex. Chaos probability: 10% (Intermediate), 20% (Advanced), 30% (Expert).
- - **Metrics:** ×1.05 Chaos Survival Bonus | 10-30% Probability by Tier | 5 Service Templates
-
- #### Drift Detection Tasks
- - **What:** 6 expert-tier tasks where infrastructure is provisioned correctly, then 2-3 random mutations are applied from a pool. The agent must audit, discover drifted resources, and fix only those — without knowing which drifted.
- - **Why:** Randomized per episode, preventing memorization. Tests real SRE audit skills: the agent must reason about desired vs. actual state, not just follow a script.
- - **How:** `DriftEngine` randomly selects 2-3 mutations from a task's `possible_drifts` pool and applies them after setup. Each task defines a `desired_state_spec` (natural language) and `state_checks` (ground-truth CLI commands). Examples: S3 versioning/encryption drift, DynamoDB throughput changes, SNS subscription modifications.
- - **Metrics:** 6 Drift Tasks | 2-3 Mutations Per Episode | Random Selection Per Run
-
- ### 4. Security Posture Audit
-
- Tests *reasoning about configuration state* — the agent must READ and ANALYZE existing infrastructure, not just build things. Unlike SRE tasks (broken functionality), these have *working but insecure* infrastructure.
-
- #### Public S3 Bucket Lockdown
- - **What:** A pre-provisioned S3 bucket "public-assets" has an overly permissive bucket policy granting access to any principal (`Principal: *`). The agent must read the policy, identify the vulnerability, and replace it with a restrictive policy allowing only a specific IAM role.
- - **Why:** Tests security reasoning — the infrastructure is functional but insecure. Unlike SRE tasks where things are broken, here the agent must understand what "correct" security posture looks like and make the right judgment call.
- - **How:** Setup creates the bucket with a wide-open policy. State checks verify the new policy denies `Principal: *` and only allows the `app-role` principal to perform `s3:GetObject`.
- - **Metrics:** S3 Target Service | Policy Attack Surface | Expert Tier
-
- #### IAM Least Privilege
- - **What:** An IAM role "app-role" has an inline policy with `Action: *` and `Resource: *` — full admin access. The agent must replace it with a least-privilege policy allowing only `dynamodb:GetItem` and `dynamodb:PutItem` on the users table.
- - **Why:** IAM misconfiguration is the #1 cloud security risk. This task tests whether the agent understands permission scoping and can reason about what access an application actually needs vs. what it currently has.
- - **How:** Setup creates the role with a wildcard policy. The agent must craft a replacement policy document with specific actions and a resource ARN. State checks verify the policy document matches the expected least-privilege permissions.
- - **Metrics:** IAM Target Service | 2 Allowed Actions | Expert Tier
-
- #### Secrets in Lambda Environment
- - **What:** A Lambda function "data-processor" has a database password stored as a plaintext environment variable (`DB_PASSWORD=hunter2`). The agent must create a secret in Secrets Manager, update the Lambda to reference the secret ARN, and remove the plaintext variable.
- - **Why:** Plaintext secrets in environment variables are a critical security anti-pattern. This task combines multiple services (Lambda + Secrets Manager) and tests the agent's ability to perform a safe credential rotation without breaking the function.
- - **How:** Setup creates the Lambda with the plaintext env var. The agent must: (1) create a secret in Secrets Manager, (2) add a `SECRET_ARN` env var to the Lambda, (3) remove `DB_PASSWORD`. State checks verify all three conditions.
- - **Metrics:** 2 Services Involved | 3 Required Steps | Expert Tier
-
- ### 5. Anti-Reward-Hacking (8 Defense Layers)
-
- 8 defense layers that prevent the agent from gaming the reward system.
-
- #### 1. Ground-Truth Verification via MiniStack
- - **What:** The grader never trusts agent command output. It independently queries MiniStack (the simulated AWS backend) to verify resource state for 20+ services. Even if the agent crafts fake-looking stdout, the grader checks actual state.
- - **Why:** Prevents reward hacking through output fabrication. The agent cannot game the system by producing convincing but fake CLI output — ground truth is always checked server-side.
- - **How:** `ResourceVerifier` has per-service verification methods that query MiniStack directly. For expert tasks, `StateCheck` assertions run actual AWS CLI commands against MiniStack at grading time, checking either `output_contains` (substring) or `json_path` extraction with expected values.
- - **Metrics:** 20+ Verified Services | 100% Server-side | 0 Agent Visibility
-
- #### 2. Deduplication
- - **What:** `EpisodeTracker.has_executed_operation()` tracks which (operation, resource) pairs have been credited. Running the same successful command twice does NOT increase `partial_progress`. Progress can only increase; credit is never re-earned.
- - **Why:** Prevents the agent from gaming the reward system by repeating the same command to accumulate credit. Each unique operation earns credit exactly once.
- - **How:** `credit_operation()` records each (operation, resource) pair. Before granting credit, `is_operation_already_credited()` checks if this exact pair was already rewarded. The check is deterministic and happens at grading time.
- - **Metrics:** 1x Credit Per Operation | Exact Match Type | (op, res) Tracking Granularity
-
- #### 3. Grader Invisibility
- - **What:** The verification commands run by `ResourceVerifier` are NOT returned in the observation's `command_output`. They happen server-side during grading. The agent cannot observe or mimic them.
- - **Why:** If the agent could see which verification commands the grader runs, it could learn to craft fake outputs that match expected patterns. Keeping grader logic invisible forces the agent to actually perform the task.
- - **How:** `ResourceVerifier` executes AWS CLI commands against MiniStack in a separate execution context. Results are consumed internally by the grading pipeline. The observation returned to the agent only contains output from the agent's own commands.
- - **Metrics:** 0 Grader Cmds Exposed | Server Execution Context | 20+ Hidden Verifications
-
- #### 4. Command Allowlisting
- - **What:** Only commands starting with `aws` are executed. Any attempt to run shell commands, pipe to other tools, use redirects, or escape the sandbox is rejected with `success=False`.
- - **Why:** Prevents the agent from escaping the AWS CLI sandbox. Without this, the agent could potentially execute arbitrary shell commands, access the filesystem, or interfere with the environment.
- - **How:** The environment's `step()` method validates the command before execution. Commands not starting with `aws` are immediately rejected.
- - **Metrics:** `aws *` Allowed Pattern | 0 Shell Access | Instant Rejection
-
- #### 5. No Verification Reward
- - **What:** If the agent runs a command that matches a `state_check` command exactly (e.g., `aws s3api get-bucket-versioning --bucket app-config-store`), it gets no progress credit. Progress is only earned through `steps` operations (mutating commands), not read-only queries.
- - **Why:** Prevents the agent from gaming progress by running the same verification commands the grader uses. The agent can run read commands to understand state, but only mutation commands earn progress.
- - **How:** During grading, the `TaskGrader` checks if the agent's command matches any `state_check` command. Matching commands are flagged as verification-only and excluded from credit. Only commands matching `steps` operations (create, put, update, delete) earn `partial_progress`.
- - **Metrics:** 0 Credit for Reads | Mutations Are the Rewarded Actions | Exact Match Detection
-
- #### 6. Monotonic Progress
- - **What:** `partial_progress` can only increase within an episode. It is clamped to [0.0, 0.99] — reaching 1.0 requires actual task completion. The agent cannot lose progress, but also cannot re-earn it.
- - **Why:** Prevents cycling strategies where the agent creates and destroys resources repeatedly. Combined with deduplication, this ensures steady forward progress.
- - **How:** In `TaskGrader`, `previous_progress` tracks the highest progress seen. New progress is always `max(previous, current)`. Reward is clamped at 0.99 for partial completion, reserving 1.0 exclusively for verified full completion.
- - **Metrics:** 0.99 Max Without Completion | 1.0 Requires Full Completion | max() Progress Function
-
- #### 7. Resource Name Validation
- - **What:** For `resource_exists` checks, the verifier matches the exact resource name, not just any resource of that type. Creating "my-test-bucket-2" doesn't satisfy a check for "my-test-bucket".
- - **Why:** Prevents the agent from creating arbitrarily named resources to game the verification system. Forces precise execution of the task requirements.
- - **How:** `ResourceVerifier`'s per-service methods (`verify_s3_bucket`, `verify_dynamodb_table`, etc.) compare against the exact expected resource name from the task definition. Each of the 20+ supported services has its own verification logic.
- - **Metrics:** Exact Name Matching | 20+ Verified Services | 0 Partial Matches
-
- #### 8. State Checks Verify Final State
- - **What:** For expert SRE tasks, `state_checks` run actual AWS CLI commands against MiniStack at grading time. The grader verifies the final infrastructure state — not the commands the agent ran.
- - **Why:** The agent cannot fake the state. MiniStack is the ground truth. This decouples "what the agent did" from "what was actually achieved", making reward hacking extremely difficult.
- - **How:** Each expert task defines `state_checks` with command + assertion pairs. Assertions support `output_contains` (substring match on CLI output) and `json_path + expected` (JSON extraction). The grader runs these checks against the live MiniStack state independently of the agent.
- - **Metrics:** CLI Verification Method | 2 Assertion Types | Live State Source
 
  ---
 
- ## Supported AWS Services (34)
-
- | Category | Services |
- |----------|----------|
- | **Storage & DB** | S3, DynamoDB, RDS, ElastiCache, EFS |
- | **Compute** | Lambda, ECS, EC2, Step Functions |
- | **Messaging** | SQS, SNS, Kinesis, EventBridge, Firehose |
- | **API** | API Gateway v1/v2, ALB/ELBv2 |
- | **Security** | IAM, STS, Cognito, ACM, WAF v2, Secrets Manager |
- | **Monitoring** | CloudWatch, CloudWatch Logs, SSM |
- | **Infrastructure** | CloudFormation, Route53 |
- | **Other** | SES, Athena, Glue, EMR |
-
- ---
-
- ## Quick Start
 
  ```python
  from aws_rl_env import AwsRlAction, AwsRlEnv
@@ -261,7 +184,7 @@ with AwsRlEnv.from_docker_image("aws-rl-env:latest") as env:
      print(f"Reward: {result.reward}, Done: {result.done}")
  ```
 
- Or connect to a running server:
 
  ```python
  env = AwsRlEnv(base_url="http://localhost:8000")
@@ -269,7 +192,7 @@ result = env.reset()
  result = env.step(AwsRlAction(command="aws s3 ls"))
  ```
 
- WebSocket API:
 
  ```python
  import websockets, json
@@ -282,116 +205,42 @@ async with websockets.connect("wss://sizzing-aws-rl-env.hf.space/ws") as ws:
      obs = json.loads(await ws.recv())
  ```
 
- ---
-
- ## Architecture
 
  ```
- ┌─────────────────────────────────────────────────────────┐
- │                    Docker Container                     │
- │                                                         │
- │  ┌─────────────────────┐      ┌────────────────────┐   │
- │  │  FastAPI RL Server  │      │   AWS Simulator    │   │
- │  │     (port 8000)     │─────>│    (port 4566)     │   │
- │  │                     │      │  34 AWS services   │   │
- │  │  - Environment      │      │  In-memory state   │   │
- │  │  - Curriculum       │      │  Reset API         │   │
- │  │  - Grading Engine   │      │  (MiniStack)       │   │
- │  │  - Episode Tracker  │      │                    │   │
- │  │  - Hint Provider    │      │                    │   │
- │  └─────────────────────┘      └────────────────────┘   │
- │            ^                            ^               │
- │            | OpenEnv HTTP/WS            | AWS CLI calls │
- └────────────┼────────────────────────────┼───────────────┘
-              |                            |
-     RL Agent (client, external)      (internal only)
- ```
-
- ### Episode Lifecycle
-
- 1. **`reset()`** — Wipes AWS infrastructure state, selects the next task from the curriculum, provisions setup commands (if any), returns the initial observation
- 2. **`step(action)`** — Validates the command (`aws` prefix only), executes against MiniStack, records in tracker, grades with shaped reward, returns observation
- 3. **Hint request** — Agent sends `aws help --task-hint` to get a progressive hint (costs reward)
- 4. **Terminates** when `task_achieved == True` or max steps reached
-
- ---
-
- ## Core Classes
-
- ### `AwsRlEnvironment`
 
- [server/aws_rl_env_environment.py](server/aws_rl_env_environment.py) — Implements the OpenEnv `Environment` interface. Orchestrates all services.
 
- | Method | Description |
- |--------|-------------|
- | `reset()` | Wipe infra, select task, provision setup, return initial observation |
- | `step(action)` | Execute command (or intercept hint request), grade, update curriculum, return observation |
-
- ### `Curriculum`
-
- [server/services/curriculum.py](server/services/curriculum.py) — Priority-queue-based task selection with progressive difficulty.
-
- Selects the next task using a **max-heap scored by**:
-
- ```
- score = (
-     novelty_bonus       # +100 if never attempted (explore first)
-     + weakness_weight   # +50 * (1 - task_success_rate) — worse tasks get higher priority
-     + spaced_rep_bonus  # +30 if graduated task is "due" for re-test
-     - recency_penalty   # -20 if attempted in last 2 episodes (ensure variety)
- )
  ```
 
- ### `TaskGrader`
-
- [server/services/task_grader.py](server/services/task_grader.py) — Evaluates task completion using a dispatcher pattern. Rewards are always in [0.0, 1.0].
-
- **Grading strategies by tier:**
-
- | Tier | Strategy | How it works |
- |------|----------|--------------|
- | Warmup | Command match | Checks command contains service string + correct operation |
- | Beginner | Resource creation | Verifies resource actually exists in MiniStack via `ResourceVerifier` |
- | Intermediate | Multi-step | Tracks ordered sequence of (operation, resource) pairs |
- | Advanced | Multi-step + services | All steps completed AND all required services touched |
- | Expert | State checks | Runs arbitrary AWS CLI commands to assert end-state (ground truth) |
-
- ### `HintProvider`
-
- [server/services/hint_provider.py](server/services/hint_provider.py) — Generates progressive hints from `SuccessCriteria` fields.
-
- | Hint Level | What it reveals | Example |
- |-----------|----------------|---------|
- | Level 1 | Which AWS services to use | "You'll need IAM and Lambda" |
- | Level 2 | Which operations | "Start with create-role, then put-role-policy" |
- | Level 3 | Near-complete command structure | "Use: aws iam create-role --role-name ..." |
-
- ### `EpisodeTracker`
-
- [server/services/episode_tracker.py](server/services/episode_tracker.py) — Maintains per-episode step history. Parses AWS CLI commands to extract (service, operation, resource) tuples. Tracks credited operations for deduplication, monotonic progress, and hint usage.
-
- ### `ResourceVerifier`
-
- [server/services/resource_verifier.py](server/services/resource_verifier.py) — Queries MiniStack directly to verify ground-truth resource state. Service-specific checks for S3, DynamoDB, Lambda, SQS, SNS, IAM, Secrets Manager, and API Gateway. Also evaluates `StateCheck` assertions (substring match, JSON path extraction).
-
- ### `EnvironmentDesigner`
-
- [server/services/environment_designer.py](server/services/environment_designer.py) — Provisions initial AWS state via setup commands before the agent acts. Used by SRE/expert tasks to create broken or insecure infrastructure the agent must fix.
 
- ### `AwsBackend`
 
- [server/services/aws_backend.py](server/services/aws_backend.py) — Executes AWS CLI commands against MiniStack (`AWS_ENDPOINT_URL=http://localhost:4566`). Provides `reset_environment()` via MiniStack's `/_ministack/reset` endpoint.
 
- ### `AwsRlEnv` (Client)
 
- [client.py](client.py) — OpenEnv HTTP/WebSocket client. Wraps `reset()` and `step()` calls to the server.
 
  ---
 
- ## Data Models
 
- [models.py](models.py) — All Pydantic models and type aliases.
 
  ### Action
 
@@ -400,51 +249,53 @@ class AwsRlAction(Action):
      command: str  # AWS CLI command, e.g. "aws s3 ls"
  ```
 
  ### Observation
 
  ```python
  class AwsRlObservation(Observation):
      episode_id: EpisodeID
      step_count: StepCount
-     command_success: bool
-     command_output: str      # stdout from AWS CLI
-     error: str               # stderr if failed
-     task: TaskInfo | None    # masked task definition (hides success criteria)
      task_achieved: bool
-     partial_progress: float  # current task progress in [0.0, 1.0]
-     hints_used: int          # number of hints requested this episode
-     hint_text: str           # most recent hint text (if any)
  ```
 
- ### Environment State
 
  ```python
  class AwsRlState(State):
-     current_task: Task | None  # full task assigned for the episode
-     tracker: TrackerState      # episode tracker snapshot
-     infra_state: dict          # AWS infrastructure state keyed by service name
-     chaos_occurred: bool       # whether chaos was injected this episode
-     current_tier: str          # agent's current difficulty tier
 
  class TrackerState:
-     step_count: int                 # steps taken this episode
-     hints_used: int                 # hints requested this episode
-     progress: float                 # current partial progress [0.0, 1.0]
-     commands_executed: list[str]    # commands executed this episode
-     credited_operations: list[str]  # (operation, resource) pairs that earned credit
  ```
 
- ### Task Definitions
 
  ```python
  class Task:
      task_id: TaskID
-     difficulty: TaskDifficulty  # warmup | beginner | intermediate | advanced | expert
-     description: str            # human-readable goal
      success_criteria: SuccessCriteria
-     setup_commands: list[SetupCommand]   # pre-provision for SRE tasks
-     desired_state_spec: str | None       # natural-language desired end state (drift tasks)
-     possible_drifts: list[SetupCommand]  # pool of mutations for DriftEngine
 
  class TaskInfo:
      """Agent-visible subset of Task — masks success_criteria, setup_commands, and possible_drifts."""
@@ -454,143 +305,311 @@ class TaskInfo:
      desired_state_spec: str | None
 
  class SuccessCriteria:
-     command_contains: str | None                 # warmup/beginner
-     operation: str | None                        # warmup/beginner
-     resource_exists: ResourceExistsCheck | None  # beginner
-     steps: list[StepCriteria]                    # intermediate/advanced/expert
-     services: list[AwsService]                   # advanced/expert
-     state_checks: list[StateCheck]               # expert (ground truth)
  ```
 
- ### Curriculum Configuration
 
  ```python
  class TierConfig:
-     min_episodes: int         # minimum episodes before promotion
-     advance_rate: float       # tier success rate threshold (0.6 - 1.0)
-     mastery_window: int       # sliding window size (default: 10)
-     mastery_threshold: float  # per-task graduation threshold (default: 0.7)
      fast_track_rate: float    # early promotion threshold (default: 0.9)
-     chaos_probability: float  # probability of chaos injection per step (default: 0.0)
 
  class SpacedRepState:
-     interval: int                # episodes until next re-test (3 -> 48)
      last_graduated_episode: int  # when last graduated
  ```
 
  ---
 
- ## Project Structure
 
  ```
- aws-rl-env/
- ├── __init__.py                      # Exports: AwsRlEnv, AwsRlAction, AwsRlObservation
- ├── models.py                        # Pydantic data models & type aliases
- ├── client.py                        # AwsRlEnv OpenEnv client
- ├── inference.py                     # LLM agent inference script
- ├── inference-complete.py            # Full inference pipeline with curriculum
- ├── server/
- │   ├── app.py                       # FastAPI application + web UI endpoints
- │   ├── aws_rl_env_environment.py    # Core RL environment (reset/step)
- │   ├── templates/
- │   │   └── index.html               # Web playground UI
- │   ├── static/
- │   │   ├── css/style.css            # Playground styles
- │   │   └── js/app.js                # Playground frontend logic
- │   └── services/
- │       ├── aws_backend.py           # MiniStack command executor
- │       ├── task_grader.py           # Grading engine with reward shaping
- │       ├── curriculum.py            # Curriculum learning manager
- │       ├── episode_tracker.py       # Per-episode step history & hints
- │       ├── resource_verifier.py     # Ground-truth state verification
- │       ├── environment_designer.py  # Setup provisioning for SRE tasks
- │       ├── hint_provider.py         # Progressive hint generator
- │       ├── chaos_engine.py          # Chaos injection engine
- │       ├── drift_engine.py          # Drift detection engine
- │       ├── task_solutions.py        # Reference solutions for tasks
- │       └── tasks/
- │           ├── warmup.yaml          # 20 listing tasks
- │           ├── beginner.yaml        # 20 creation tasks
- │           ├── intermediate.yaml    # 20 multi-step tasks
- │           ├── advanced.yaml        # 20 architecture tasks
- │           ├── expert.yaml          # 20 SRE/security tasks
- │           └── drift.yaml           # Drift detection tasks
- ├── tests/                           # Unit tests for core services
- │   ├── test_aws_rl_env_environment.py
- │   ├── test_drift_engine.py
- │   ├── test_environment_designer.py
- │   ├── test_episode_tracker.py
- │   ├── test_hint_provider.py
- │   ├── test_resource_verifier.py
- │   └── test_task_grader.py
- ├── tests_tasks/                     # Integration tests per task tier
- │   ├── test_warmup_tasks.py
- │   ├── test_beginner_tasks.py
- │   ├── test_intermediate_tasks.py
- │   ├── test_advanced_tasks.py
- │   ├── test_expert_tasks.py
- │   └── test_drift_tasks.py
- ├── aws_infra/                       # MiniStack emulator (git subtree from ministackorg/ministack)
- │   └── ministack/
- │       ├── app.py                   # MiniStack ASGI router
- │       ├── core/                    # Routing, persistence, responses
- │       └── services/                # AWS service implementations
- ├── Dockerfile                       # Multi-stage build (server + MiniStack)
- ├── Makefile                         # Dev tasks: run, format, lint, docker-*
- ├── openenv.yaml                     # OpenEnv manifest
- └── pyproject.toml                   # Dependencies & build config
  ```
 
  ---
 
- ## Running
-
- ### Docker (recommended)
-
- ```bash
- make docker-build       # Build image
- make docker-run         # Run on port 8000
- make docker-run-detach  # Run in background
- make docker-health      # Health check
  ```
 
- ### Local (without Docker)
-
- Use the combined Makefile target:
 
- ```bash
- make run  # Starts MiniStack + server
  ```
 
- ### OpenEnv Deployment
 
  ```bash
- make openenv-validate  # Validate config
- make openenv-build     # Build environment
- make openenv-push      # Push to HuggingFace Spaces
  ```
 
  ---
 
- ## Configuration
 
- | Variable | Default | Description |
- |----------|---------|-------------|
- | `AWS_INFRA_URL` | `http://localhost:4566` | AWS infra endpoint |
- | `AWS_ACCESS_KEY_ID` | `test` | AWS credentials (any value works) |
- | `AWS_SECRET_ACCESS_KEY` | `test` | AWS credentials (any value works) |
- | `AWS_DEFAULT_REGION` | `us-east-1` | AWS region |
- | `MAX_STEPS` | `15` | Max steps per episode |
- | `API_BASE_URL` | | LLM API endpoint (for inference.py) |
- | `MODEL_NAME` | | LLM model name (for inference.py) |
- | `HF_TOKEN` | | HuggingFace token (for inference.py) |
- | `TEMPERATURE` | `0.7` | LLM sampling temperature |
 
  ---
 
- ## Curriculum Stats API
 
- The curriculum exposes detailed training progress:
 
  ```python
  curriculum.get_stats()
@@ -609,10 +628,98 @@ curriculum.get_stats()
 
  ---
 
- ## Links
 
  - **GitHub**: [github.com/udaykiranpadhy/aws-rl-env](https://github.com/udaykiranpadhy/aws-rl-env)
- - **Hugging Face Space**: [huggingface.co/spaces/Sizzing/aws_rl_env](https://huggingface.co/spaces/Sizzing/aws_rl_env)
- - **API Reference**: [/docs](https://sizzing-aws-rl-env.hf.space/docs)
- - **ReDoc**: [/redoc](https://sizzing-aws-rl-env.hf.space/redoc)
  - **Portfolio**: [portfolio.udaykp.dev](https://portfolio.udaykp.dev)
 
+ <p align="center">
+   <img src="docs/figures/ministack_logo.png" alt="MiniStack logo" height="110"/>
+ </p>
+
+ # AWS Cloud CLI & SRE — A Reinforcement-Learning Environment + Training Pipeline
+
+ > An OpenEnv-compatible RL environment with a curriculum of **120+ AWS tasks** across 5 difficulty tiers, paired with a complete **SFT → GRPO** training pipeline (Qwen2.5-Coder-3B + LoRA + Optuna). The vendored MiniStack simulator means **zero AWS cost**, real CLI semantics, and 8-way parallel rollouts that fit on a single GPU.
+
+ | | |
+ |---|---|
+ | **Live demo**   | [sizzing-aws-rl-env.hf.space/web](https://sizzing-aws-rl-env.hf.space/web) — try the playground in a browser |
+ | **API docs**    | [sizzing-aws-rl-env.hf.space/docs](https://sizzing-aws-rl-env.hf.space/docs) (Swagger), [/redoc](https://sizzing-aws-rl-env.hf.space/redoc) |
+ | **HF Space**    | [huggingface.co/spaces/Sizzing/aws_rl_env](https://huggingface.co/spaces/Sizzing/aws_rl_env) |
+ | **SFT adapter** | [Sizzing/aws-rl-sft-qwen25coder3b-adapter](https://huggingface.co/Sizzing/aws-rl-sft-qwen25coder3b-adapter) |
+ | **Dataset**     | [Sizzing/aws-rl-sft](https://huggingface.co/datasets/Sizzing/aws-rl-sft) |
+
+ ---
+
+ ## Table of contents
+
+ 1. [What this is & why it matters](#1-what-this-is--why-it-matters)
+ 2. [Highlights — full feature inventory](#2-highlights--full-feature-inventory)
+ 3. [Architecture](#3-architecture)
+ 4. [Live demo & Quick Start](#4-live-demo--quick-start)
+ 5. [Run on Colab](#5-run-on-colab)
+ 6. [Action / Observation spec](#6-action--observation-spec)
+ 7. [Curriculum & Reward (overview)](#7-curriculum--reward-overview)
+ 8. [Training pipeline (SFT → GRPO)](#8-training-pipeline-sft--grpo)
+ 9. [Parallel rollout architecture](#9-parallel-rollout-architecture)
+ 10. [MiniStack: vendored & customized](#10-ministack-vendored--customized)
+ 11. [Results & Benchmarks](#11-results--benchmarks)
+ 12. [Repository map](#12-repository-map)
+ 13. [Configuration & Running](#13-configuration--running)
+ 14. [Testing](#14-testing)
+ 15. [Tech stack](#15-tech-stack)
+ 16. [Links](#16-links)
+ 17. [Acknowledgments](#17-acknowledgments)
+
+ ---
+
+ ## 1. What this is & why it matters
+
+ Modern AI agents are increasingly asked to operate cloud infrastructure — provisioning resources, fixing misconfigurations, responding to drift. Training such agents needs (a) a realistic environment, (b) reliable reward signals, and (c) enough scale to make RL feasible. Existing options force a hard tradeoff: real AWS costs hundreds of dollars per training run and is impossible to reset; toy emulators don't behave like production AWS.
+
+ **This project closes that gap.** We built:
+
+ 1. **An OpenEnv-compatible RL environment** that speaks real AWS CLI semantics. The agent sends `aws s3 mb …`, `aws iam create-role …`, and so on — the exact same commands a human SRE would type.
+ 2. **A vendored, customized MiniStack simulator** that responds with production-equivalent JSON, runs locally for zero cost, supports 34 AWS services, and exposes a single-call state-introspection endpoint we added so the grader has cheap ground-truth access.
+ 3. **A 120+ task curriculum** across 5 tiers (warmup → expert) with adaptive selection, mastery tracking, spaced repetition, chaos injection, and drift-detection scenarios — every feature designed to keep the reward signal honest and prevent the agent from gaming it.
+ 4. **A complete SFT → GRPO training pipeline.** A 1,500-row synthetic dataset spanning 5 trajectory shapes, an 11-model base benchmark, LoRA fine-tuning, and TRL GRPO with multi-turn rollouts and Optuna hyperparameter search.
+ 5. **An 8-way parallel-rollout architecture.** Server-side MiniStack pool, client-side `GrpoPool`, in-process `MultiTurnEnvPool` — three coordinated layers that let G=8 concurrent rollouts run on one GPU without state contamination.
+
+ Everything is reproducible: the dataset is generated by a deterministic script, the model selection is documented end-to-end, training entry points run on Colab, and the env runs locally in a single Docker container with no external network requirement.
 
  ---
 
+ ## 2. Highlights — full feature inventory
+
+ This is the complete surface area of the project. Each entry links to deeper documentation in the corresponding sub-README.
+
+ ### Environment & Curriculum
+ - **[120+ tasks across 5 tiers](server/services/tasks/)** — warmup (25), beginner (25), intermediate (25), advanced (25), expert (24), drift (9). YAML-defined task spec per tier.
+ - **[Curriculum learning with priority scoring](server/README.md#7-curriculum-manager)** — `score = novelty + weakness − recency + spaced_rep_bonus` drives task selection; a minimal sketch follows this list.
+ - **[Mastery tracking](server/README.md#7-curriculum-manager)** — sliding 10-episode window, 0.7 threshold, 0.85 exponential decay, supports un-graduation.
+ - **[Spaced repetition](server/README.md#7-curriculum-manager)** — graduated tasks resurface at intervals `[3, 6, 12, 24, 48]` to prevent forgetting.
+ - **[Tier promotion](server/README.md#7-curriculum-manager)** — standard (min episodes + success rate) + fast-track (3 consecutive 90% episodes).
+ - **[Strategy pattern: simulator vs real AWS](server/README.md#4-strategy-pattern-simulator-vs-real-aws)** — `BACKEND_TYPE=simulator` (default) or `aws`, no code fork.
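+
+ A minimal sketch of that selection rule, with constants from this README (the `TaskStats` shape is illustrative, not the actual class in `server/services/curriculum.py`):
+
+ ```python
+ from dataclasses import dataclass
+
+ @dataclass
+ class TaskStats:                     # hypothetical container for per-task history
+     attempts: int                    # episodes that included this task
+     success_rate: float              # decayed success rate in [0.0, 1.0]
+     last_attempt_episode: int
+     spaced_rep_due: bool             # graduated task due for re-test
+
+ def priority_score(stats: TaskStats, current_episode: int) -> float:
+     """Composite priority: novelty + weakness + spaced repetition - recency."""
+     score = 0.0
+     if stats.attempts == 0:
+         score += 100.0                               # novelty: explore first
+     score += 50.0 * (1.0 - stats.success_rate)       # weakness: weak tasks rank higher
+     if stats.spaced_rep_due:
+         score += 30.0                                # graduated task due for re-test
+     if current_episode - stats.last_attempt_episode <= 2:
+         score -= 20.0                                # recency: keep variety
+     return score
+ ```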
+
+ ### Reward shaping
+ - **[Five grading strategies](server/README.md#8-reward-shaping--taskgrader)** — command-match (warmup), resource-creation (beginner), multi-step (intermediate), multi-step+services (advanced), state-checks (expert).
+ - **[Dense partial-progress signal](server/README.md#8-reward-shaping--taskgrader)** — clamped to `[0.0, 0.99]`, `1.0` reserved for verified completion.
+ - **[Rollback penalty](server/README.md#8-reward-shaping--taskgrader)** — `−0.1` per `(create-X, …, delete-X)` pair.
+ - **[Idempotency bonus](server/README.md#8-reward-shaping--taskgrader)** — `+0.02` for graceful "already exists" retry.
+ - **[Hint decay](server/README.md#13-hint-provider)** — three-level progressive hints with `0.85^n` reward multiplier.
+ - **[Chaos survival bonus](server/README.md#11-chaos-engine)** — `×1.05` if the agent completes a chaotic task. A worked sketch combining these pieces follows this list.
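+
+ Putting those pieces together, a sketch of the shaping rule using the constants above (the exact ordering of the multipliers is illustrative; `server/services/task_grader.py` is authoritative):
+
+ ```python
+ def shaped_reward(task_achieved: bool, partial_progress: float,
+                   progress_increased: bool, command_failed: bool,
+                   hints_used: int, survived_chaos: bool) -> float:
+     """Dense shaped reward: 1.0 is reserved for verified completion."""
+     if task_achieved:
+         reward = 1.0
+     else:
+         reward = partial_progress * 0.8           # base, scaled to [0.0, 0.8]
+         if progress_increased:
+             reward += 0.1                         # dense signal for advancing
+         if command_failed:
+             reward *= 0.5                         # penalty for errors
+         reward = min(max(reward, 0.0), 0.99)      # never 1.0 without completion
+     reward *= 0.85 ** hints_used                  # hint decay
+     if survived_chaos:
+         reward *= 1.05                            # chaos survival bonus
+     return reward
+ ```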
+
+ ### Resilience & adversarial features
+ - **[Chaos injection](server/README.md#11-chaos-engine)** — silent mid-episode mutations, tier-scaled probabilities (10/20/30%) on services the task is touching.
+ - **[Drift detection](server/README.md#12-drift-engine)** — 6 expert tasks, 2–3 random mutations from a per-task pool, randomized per episode (no memorization).
+ - **[Security-posture audit tasks](server/README.md#17-security-posture-audit-examples)** — S3 public bucket lockdown, IAM least-privilege, Lambda secret rotation.
+ - **[8-layer anti-reward-hacking](server/README.md#9-anti-reward-hacking--8-defense-layers)** — ground-truth verification, dedup, grader invisibility, command allow-list, no-credit-for-reads, monotonic progress, exact resource-name validation, final state checks.
+
+ ### Training pipeline
+ - **[Synthetic SFT dataset (1,500 rows)](data/README.md)** — 5 trajectory types: success / multi-step continuation / failure recovery / verification / hint usage.
+ - **[Rigorous base-model selection](data/sft/MODEL_EVALUATION.md)** — 11 models × 27 prompts, [Qwen2.5-Coder-3B-Instruct](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit) wins.
+ - **[LoRA SFT](train/README.md#1-sft-stage--supervised-lora)** — `r ∈ {8,16,32}`, `lora_alpha = r × multiplier`, attention-only adaptation (config sketch after this list).
+ - **[GRPO RL via TRL](train/README.md#2-grpo-stage--reinforcement-learning)** — group-relative advantages, KL to SFT reference, `dapo` loss, no critic.
+ - **[Multi-turn rollouts](train/README.md#4-multi-turn-rollouts--parallel-envs)** — up to `MAX_TURNS=6`, observation fed back as next-turn user message.
+ - **[Optuna hyperparameter search](train/README.md#3-optuna-hyperparameter-search)** — TPE sampler over 8-dim space, frozen held-out validation set.
+ - **[HuggingFace integration](data/README.md#7-huggingface-publishing)** — adapter + dataset published to Hub, OpenEnv Space deployment.
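+
+ For illustration, an attention-only LoRA config consistent with the bullets above; the concrete `r`, multiplier, dropout, and module list are assumptions for this sketch, and the actual searched space is documented in [train/README.md](train/README.md):
+
+ ```python
+ from peft import LoraConfig
+
+ r = 16                   # searched over {8, 16, 32}
+ alpha_multiplier = 2     # lora_alpha = r × multiplier
+
+ lora_config = LoraConfig(
+     r=r,
+     lora_alpha=r * alpha_multiplier,
+     # attention-only adaptation: no MLP modules are adapted
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+     lora_dropout=0.05,
+     bias="none",
+     task_type="CAUSAL_LM",
+ )
+ ```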
+
+ ### Parallel rollout architecture
+ - **[Server-side MiniStack pool](server/README.md#6-server-side-ministack-pool-parallel-rollouts)** — `MiniStackPool` ([server/app.py](server/app.py)), free-list of ports, lock-guarded acquire/release.
+ - **[Client-side GrpoPool](scripts/README.md#2-three-coordinated-pool-layers)** — async-native, all-or-nothing connect, `asyncio.gather` for concurrent rollouts.
+ - **[In-process MultiTurnEnvPool](train/README.md#4-multi-turn-rollouts--parallel-envs)** — sync API, owns a background asyncio loop, used by the trainer.
+ - **[8 isolated rollouts on one server](scripts/README.md#7-running-the-multi-connection-demo)** — proof in [scripts/TestMultipleConnects.ipynb](scripts/TestMultipleConnects.ipynb); a minimal concurrency sketch follows this list.
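+
+ The client-side pattern in miniature: G sessions opened up front (all-or-nothing), then driven concurrently with `asyncio.gather`. The message payloads below are placeholders rather than the real wire protocol (see `/schema`); `GrpoPool`'s actual API lives in [scripts/README.md](scripts/README.md):
+
+ ```python
+ import asyncio
+ import json
+
+ import websockets
+
+ WS_URL = "ws://localhost:8000/ws"  # one server; pooled MiniStacks behind it
+
+ async def rollout(ws) -> float:
+     """Drive one episode on an already-open session; return the last reward."""
+     await ws.send(json.dumps({"type": "reset"}))   # placeholder message shape
+     json.loads(await ws.recv())                    # initial observation
+     await ws.send(json.dumps({"type": "step", "command": "aws s3 ls"}))
+     obs = json.loads(await ws.recv())
+     return obs.get("reward", 0.0)
+
+ async def main(group_size: int = 8) -> None:
+     # all-or-nothing connect: each session claims its own MiniStack port
+     sessions = await asyncio.gather(
+         *(websockets.connect(WS_URL) for _ in range(group_size))
+     )
+     try:
+         rewards = await asyncio.gather(*(rollout(ws) for ws in sessions))
+         print(rewards)
+     finally:
+         await asyncio.gather(*(ws.close() for ws in sessions))
+
+ asyncio.run(main())
+ ```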
+
+ ### Vendored simulator
+ - **[MiniStack as git subtree](server/README.md#5-ministack-vendored-fork--customizations)** — vendored at [aws_infra/](aws_infra/) (commit `2c38c0b`). 34 AWS services. MIT.
+ - **[Custom `/_ministack/state` endpoint](server/README.md#5-ministack-vendored-fork--customizations)** — added in commit `a648c3a`; returns the full infra inventory in one call (example after this list).
+ - **[Upstream sync workflow](server/README.md#5-ministack-vendored-fork--customizations)** — periodic `git subtree pull`; isolated patches keep conflicts minimal.
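+
+ For example, from inside the container (only the endpoint path and its one-call contract are documented here; the response shape is whatever the vendored MiniStack returns):
+
+ ```bash
+ # Dump the full resource inventory of the MiniStack instance on port 4566
+ curl -s http://localhost:4566/_ministack/state | python -m json.tool
+ ```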
115
+
116
+ ### Operations & deployment
117
+ - **[OpenEnv-compliant](https://github.com/openai/openenv)** — `/reset`, `/step`, `/state`, `/schema`, `/ws` HTTP+WebSocket endpoints.
118
+ - **[Web playground UI](server/README.md#19-web-playground)** — `/web` route, 40 AWS service icons, Jinja2 + JS frontend.
119
+ - **[Docker-first deployment](Dockerfile)** — multi-stage build, container ships server + N MiniStack instances + AWS CLI.
120
+ - **[Comprehensive test suite](#14-testing)** — 10 unit tests + 6 tier-integration suites covering 134 tasks.
121
 
122
+ ---
 
 
 
 
123
 
124
+ ## 3. Architecture
 
 
 
 
125
 
126
+ ```
127
+ ┌────────────────────────────────── Docker container ──────────────────────────────────┐
128
+ │ │
129
+ │ FastAPI server (port 8000) │
130
+ │ ├── OpenEnv router /reset /step /state /schema /ws /health │
131
+ │ ├── Web playground /web (Jinja2 + 40 AWS icon SVGs) │
132
+ │ ├── env_factory per-WS-session AwsRlEnvironment instance │
133
+ │ │ (acquires a MiniStack port from MiniStackPool) │
134
+ │ └── Services │
135
+ │ Curriculum · TaskGrader · ResourceVerifier · ChaosEngine · DriftEngine │
136
+ │ HintProvider · EpisodeTracker · EnvironmentDesigner · EnvironmentStrategy │
137
+ │ │
138
+ │ │
139
+ │ MiniStack instances :4566 :4567 :4568 … :4566+POOL_SIZE-1 │
140
+ │ (vendored at aws_infra/, started by the Dockerfile entrypoint) │
141
+ │ │
142
+ └──────────────────────────────────────────────────────────────────────────────────────┘
143
+ ▲ ▲
144
+ │ HTTP/WS │ AWS CLI subprocess
145
+ │ │ (AWS_ENDPOINT_URL=http://localhost:4566+i)
146
+ │ │
147
+ ┌───────┴───────────┐ ┌───────┴───────────┐
148
+ │ RL Agent │ │ AWS CLI commands │
149
+ │ (client.py) │ │ the agent emits │
150
+ └───────────────────┘ └───────────────────┘
151
+ ```
152
 
153
+ A more visual diagram (architecture + curriculum progression) will live at `docs/figures/architecture_diagram.png` once added.
154
 
155
+ ### Episode lifecycle
156
 
157
+ 1. **`reset()`** — wipes simulator state, picks next task from the curriculum, runs `setup_commands`, applies drift if applicable, returns initial observation.
158
+ 2. **`step(action)`** — validates the command (must start with `aws `), intercepts hint requests, executes via the strategy, records in tracker, grades with shaped reward, optionally injects chaos, returns observation.
159
+ 3. **Hint** — agent sends `aws help --task-hint`; intercepted before reaching MiniStack; returns next-level hint, increments `hints_used` (which decays final reward by `0.85^n`).
160
+ 4. **Termination** — `task_achieved=True` or `step_count >= MAX_STEPS` (default 15). A minimal agent loop over this lifecycle is sketched below.
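+
+ A minimal sketch of that loop (`my_policy` is a placeholder for your model; the loop shape mirrors the client examples in §4):
+
+ ```python
+ from aws_rl_env import AwsRlAction, AwsRlEnv
+
+ def my_policy(last_result) -> str:
+     # placeholder policy — swap in your model; must return an "aws ..." string
+     return "aws s3 ls"
+
+ env = AwsRlEnv(base_url="http://localhost:8000")
+ env.reset()                                   # curriculum assigns a task, setup runs
+ last_result = None
+ while True:
+     result = env.step(AwsRlAction(command=my_policy(last_result)))
+     if result.done:                           # task_achieved or MAX_STEPS reached
+         break
+     last_result = result                      # feed output/reward into the next turn
+ # stuck mid-episode? "aws help --task-hint" requests a hint (0.85^n reward decay)
+ ```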
 
 
 
 
 
 
161
 
162
+ Full mechanics in [server/README.md](server/README.md).
163
 
164
  ---
165
 
166
+ ## 4. Live demo & Quick Start
167
 
168
+ ### Try it in a browser
 
 
 
 
 
 
 
 
 
169
 
170
+ The hosted playground lets you click around any task without writing code:
171
 
172
+ > **[sizzing-aws-rl-env.hf.space/web](https://sizzing-aws-rl-env.hf.space/web)**
173
+
174
+ ### Python client
175
 
176
  ```python
177
  from aws_rl_env import AwsRlAction, AwsRlEnv
 
184
  print(f"Reward: {result.reward}, Done: {result.done}")
185
  ```
186
 
187
+ Or against a running server:
188
 
189
  ```python
190
  env = AwsRlEnv(base_url="http://localhost:8000")
 
192
  result = env.step(AwsRlAction(command="aws s3 ls"))
193
  ```
194
 
195
+ ### WebSocket API
196
 
197
  ```python
198
  import websockets, json
 
205
  obs = json.loads(await ws.recv())
206
  ```
207
 
208
+ ### Local Docker
 
 
209
 
210
+ ```bash
211
+ make docker-build # build the image
212
+ make docker-run # foreground; serves on :8000
213
+ make docker-run-detach # background
214
+ make docker-health # liveness probe
215
```
216
 
217
+ For training (8-way parallel rollouts):
218
 
219
+ ```bash
220
+ AWS_RL_ENV_POOL_SIZE=8 make run
221
  ```
222
 
223
+ ---
224
 
225
+ ## 5. Run on Colab
226
 
227
+ The full pipeline is reproducible on a Colab GPU runtime. Drop your `HF_TOKEN` into Colab Secrets, set `ENV_BASE_URL` to your HF Space (or to a local server exposed via ngrok), and run.
228
 
229
+ | Notebook | What it does | Open in Colab |
230
+ |-------------------------------------------------------------------------------------|-------------------------------------------------------|----------------------------------------------|
231
+ | [aws_rl_env_colab.ipynb](aws_rl_env_colab.ipynb) | End-to-end driver: validation, Optuna search, full GRPO training, plotting, optional push-to-Hub | <!-- TODO: paste Colab URL here --> |
232
+ | [train/train_sft_lora.ipynb](train/train_sft_lora.ipynb) | Stage 1 — SFT LoRA fine-tuning of Qwen2.5-Coder-3B | <!-- TODO: paste Colab URL here --> |
233
+ | [train/train_grpo_lora.ipynb](train/train_grpo_lora.ipynb) | Stage 2 — GRPO RL training with multi-turn rollouts | <!-- TODO: paste Colab URL here --> |
234
+ | [compare/compare_base_vs_sft.ipynb](compare/compare_base_vs_sft.ipynb) | Side-by-side: base model vs SFT adapter (dataset + RL env) | <!-- TODO: paste Colab URL here --> |
235
+ | [scripts/TestMultipleConnects.ipynb](scripts/TestMultipleConnects.ipynb) | Demo: 8 simultaneous WebSocket sessions stay isolated | <!-- TODO: paste Colab URL here --> |
236
 
237
+ Replace each `<!-- TODO -->` with the Colab badge URL once published.
238
 
239
  ---
240
 
241
+ ## 6. Action / Observation spec
242
 
243
+ The full Pydantic data models — kept inline so any reader can wire up an agent without leaving this page. Source: [models.py](models.py).
244
 
245
  ### Action
246
 
 
249
  command: str # AWS CLI command, e.g. "aws s3 ls"
250
  ```
251
 
252
+ The environment validates that `command` starts with `aws `; anything else is rejected with `command_success=False`.
253
+
254
  ### Observation
255
 
256
  ```python
257
  class AwsRlObservation(Observation):
258
  episode_id: EpisodeID
259
  step_count: StepCount
260
+     command_success: bool          # exit code == 0
261
+     command_output: str            # stdout from the AWS CLI invocation
262
+     error: str                     # stderr (empty if success)
263
+     task: TaskInfo | None          # masked task definition (no success criteria)
264
  task_achieved: bool
265
+     partial_progress: float        # current task progress in [0.0, 1.0]
266
+     hints_used: int                # cumulative hint count this episode
267
+     hint_text: str                 # most recent hint text (if any)
268
  ```
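+
+ A sketch of how an agent loop might fold these fields into the next turn's prompt (field names come from the model above; the wording is illustrative):
+
+ ```python
+ def render_feedback(obs) -> str:
+     """Turn one AwsRlObservation into feedback text for the next turn."""
+     if obs.task_achieved:
+         return "Task complete."
+     if not obs.command_success:
+         return f"Command failed: {obs.error}"
+     return f"OK ({obs.partial_progress:.0%} progress): {obs.command_output}"
+ ```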
269
 
270
+ ### State
271
 
272
  ```python
273
  class AwsRlState(State):
274
+     current_task: Task | None      # full task assigned for the episode
275
+     tracker: TrackerState          # episode tracker snapshot
276
+     infra_state: dict              # AWS infrastructure state keyed by service name
277
+     chaos_occurred: bool           # whether chaos was injected this episode
278
+     current_tier: str              # agent's current difficulty tier
279
 
280
  class TrackerState:
281
+     step_count: int                # steps taken this episode
282
+     hints_used: int                # hints requested this episode
283
+     progress: float                # current partial progress [0.0, 1.0]
284
+     commands_executed: list[str]   # commands executed this episode
285
+     credited_operations: list[str] # (operation, resource) pairs that earned credit
286
  ```
287
 
288
+ ### Task definitions
289
 
290
  ```python
291
  class Task:
292
  task_id: TaskID
293
+     difficulty: TaskDifficulty     # warmup | beginner | intermediate | advanced | expert
294
+     description: str               # human-readable goal
295
    success_criteria: SuccessCriteria
296
+     setup_commands: list[SetupCommand]   # pre-provision for SRE tasks
297
+     desired_state_spec: str | None       # natural-language desired end state (drift tasks)
298
+     possible_drifts: list[SetupCommand]  # pool of mutations for DriftEngine
299
 
300
  class TaskInfo:
301
  """Agent-visible subset of Task — masks success_criteria, setup_commands, and possible_drifts."""
 
305
  desired_state_spec: str | None
306
 
307
  class SuccessCriteria:
308
+     command_contains: str | None                  # warmup/beginner
309
+     operation: str | None                         # warmup/beginner
310
+     resource_exists: ResourceExistsCheck | None   # beginner
311
+     steps: list[StepCriteria]                     # intermediate/advanced/expert
312
+     services: list[AwsService]                    # advanced/expert
313
+     state_checks: list[StateCheck]                # expert (ground truth)
314
  ```
315
 
316
+ ### Curriculum config
317
 
318
  ```python
319
  class TierConfig:
320
+     min_episodes: int          # minimum episodes before promotion
321
+     advance_rate: float        # tier success rate threshold (0.6 - 1.0)
322
+     mastery_window: int        # sliding window size (default: 10)
323
+     mastery_threshold: float   # per-task graduation threshold (default: 0.7)
324
    fast_track_rate: float     # early promotion threshold (default: 0.9)
325
+     chaos_probability: float   # probability of chaos injection per step
326
 
327
  class SpacedRepState:
328
+     interval: int              # episodes until next re-test (3–48)
329
  last_graduated_episode: int # when last graduated
330
  ```
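+
+ One plausible reading of how these knobs combine into a promotion check (a sketch — whether fast-track bypasses `min_episodes` is an assumption here; the authoritative logic lives in the server's Curriculum service):
+
+ ```python
+ def should_promote(episodes_in_tier: int, window_success_rate: float,
+                    min_episodes: int, advance_rate: float,
+                    fast_track_rate: float = 0.9) -> bool:
+     if window_success_rate >= fast_track_rate:
+         return True                          # early (fast-track) promotion
+     return (episodes_in_tier >= min_episodes
+             and window_success_rate >= advance_rate)
+ ```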
331
 
332
  ---
333
 
334
+ ## 7. Curriculum & Reward (overview)
335
+
336
+ The curriculum and reward stack is the heart of the project. This section is the elevator pitch; **the full mechanics — priority scoring math, anti-reward-hacking layers, chaos engine, drift engine — live in [server/README.md](server/README.md)**.
337
+
338
+ ### Priority scoring (one-formula task selection)
339
 
340
  ```
341
+ score = novelty_bonus        # +100 if never attempted
342
+       + weakness_weight      # +50 × (1 − task_success_rate)
343
+       + spaced_rep_bonus     # +30 if a graduated task is "due" for re-test
344
+       - recency_penalty      # −20 if attempted in the last 2 episodes
345
+ ```
346
+
347
+ Exploration, weakness-targeting, anti-forgetting, and variety — all balanced by one weighted sum.
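+
+ As a hedged transcription into Python (the real implementation lives in the server's Curriculum service and may weight terms differently):
+
+ ```python
+ def priority_score(attempts: int, task_success_rate: float,
+                    graduated_and_due: bool, attempted_in_last_2: bool) -> float:
+     score = 0.0
+     if attempts == 0:
+         score += 100.0                         # novelty bonus
+     score += 50.0 * (1.0 - task_success_rate)  # weakness weight
+     if graduated_and_due:
+         score += 30.0                          # spaced-repetition bonus
+     if attempted_in_last_2:
+         score -= 20.0                          # recency penalty
+     return score
+ ```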
348
+
349
+ ### Reward shaping
350
+
351
+ ```
352
+ if task_achieved:
353
+     reward = 1.0
354
+     if survived_chaos: reward *= 1.05      # chaos survival bonus
355
+ else:
356
+     reward = partial_progress * 0.8        # 0.8 from steps alone
357
+     if progress_increased: reward += 0.1   # dense progress signal
358
+     if command_failed: reward *= 0.5       # error penalty
359
+     reward -= 0.1 * rollback_count         # waste penalty
360
+     reward += 0.02 * idempotent_retries    # graceful retry bonus
361
+     reward = clamp(reward, 0.0, 0.99)      # 1.0 reserved for completion
362
+
363
+ reward *= 0.85 ** hints_used               # hint decay applied last
364
  ```
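+
+ The same rules, transcribed into a self-contained function (argument names are illustrative; the authoritative grader lives in `server/`):
+
+ ```python
+ def shaped_reward(task_achieved: bool, survived_chaos: bool,
+                   partial_progress: float, progress_increased: bool,
+                   command_failed: bool, rollback_count: int,
+                   idempotent_retries: int, hints_used: int) -> float:
+     if task_achieved:
+         reward = 1.0
+         if survived_chaos:
+             reward *= 1.05                     # chaos survival bonus
+     else:
+         reward = partial_progress * 0.8        # 0.8 from steps alone
+         if progress_increased:
+             reward += 0.1                      # dense progress signal
+         if command_failed:
+             reward *= 0.5                      # error penalty
+         reward -= 0.1 * rollback_count         # waste penalty
+         reward += 0.02 * idempotent_retries    # graceful retry bonus
+         reward = max(0.0, min(reward, 0.99))   # 1.0 reserved for completion
+     return reward * (0.85 ** hints_used)       # hint decay applied last
+ ```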
365
 
366
+ The agent's loss surface is intentionally narrow: only doing the task earns full reward, and every reward-hacking shortcut we identified during design has a defense layer (full list in [server/README.md §9](server/README.md#9-anti-reward-hacking--8-defense-layers)).
367
+
368
+ > Curriculum progression visual will live at `docs/figures/curriculum_progression.png`.
369
+
370
  ---
371
 
372
+ ## 8. Training pipeline (SFT → GRPO)
373
 
374
+ The training pipeline runs in two stages, both reproducible on Colab. Full detail in **[train/README.md](train/README.md)**.
375
 
376
+ ```
377
+ ┌────────── data/sft/ ──────────┐
378
+ │ 1,500 train · 150 val rows │
379
+ │ 5 trajectory types │
380
+ └───────────────┬───────────────┘
381
+
382
+ STAGE 1 — Supervised Fine-Tuning train/train_sft_lora.ipynb
383
+ Qwen2.5-Coder-3B-Instruct + LoRA r=8/16/32 (Optuna) → SFT adapter
384
+
385
+ │ Sizzing/aws-rl-sft-qwen25coder3b-adapter
386
+
387
+ STAGE 2 — GRPO RL train/train_grpo_lora.ipynb
388
+ G=8 parallel rollouts · multi-turn · reward = env return
389
+ Optuna over (lr, β, G, T, top_p, lora_r, max_turns)
390
  ```
391
 
392
+ ### Numbers worth knowing
393
 
394
+ | | |
395
+ |---|---|
396
+ | **Base model** | `unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit` — picked via [data/sft/MODEL_EVALUATION.md](data/sft/MODEL_EVALUATION.md) |
397
+ | **SFT LoRA** | `r ∈ {8,16,32}`, `lora_alpha = r × multiplier`, target = attention only, dropout `[0.005, 0.031]` |
398
+ | **GRPO config** | `G=8`, `β=0.04`, `lr=5e-6`, `T=0.9`, `top_p=0.95`, `max_turns=6`, loss=`dapo` |
399
+ | **Optuna search** | TPE sampler, 6 trials × 30 GRPO steps, frozen 10-task held-out val set |
400
+ | **Final training** | 200 GRPO steps with best config |
401
+
402
+ ### Training graphs
403
+
404
+ > Embed once notebook is executed:
405
+ > ![SFT loss curve](docs/figures/sft_loss_curve.png)
406
+ > ![GRPO mean reward over training](docs/figures/grpo_reward_curve.png)
407
+ > ![Per-rollout reward by curriculum tier](docs/figures/grpo_per_tier_curve.png)
408
+ > ![Optuna parameter importance](docs/figures/optuna_param_importance.png)
409
+
410
+ ---
411
+
412
+ ## 9. Parallel rollout architecture
413
+
414
+ GRPO needs `G` rollouts on the same task per training step. We run all G in parallel with **state isolation guaranteed**. Three coordinated pool layers make it work:
415
 
 
 
416
  ```
417
+ Trainer (G=8 generations needed per step)
418
+
419
+ ┌────────────────────┼────────────────────┐
420
+ ▼ ▼ ▼
421
+ MultiTurnEnvPool GrpoPool (in-process)
422
+ (train_grpo.py) (scripts/grpo_pool.py)
423
+ sync API async API
424
+ │ │
425
+ └─────── 8 WebSocket connections ────────┘
426
+
427
+
428
+ FastAPI server :8000
429
+ + OpenEnv max_concurrent_envs=8
430
+
431
+
432
+ MiniStackPool (free-list, lock-guarded)
433
+ acquire(port) on connect, release on disconnect
434
+
435
+
436
+ 8 isolated MiniStack instances :4566..:4573
437
+ ```
438
+
439
+ Wall-clock impact: an 8-rollout × 6-turn episode runs in ~300 ms of env time vs ~2.4 s sequential. Full mechanics, including the **all-or-nothing connect protocol** that prevents pool-slot leakage on flake, are in **[scripts/README.md](scripts/README.md)**.
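+
+ The client-side fan-out is plain `asyncio.gather` over one coroutine per pool slot (a sketch — `run_episode` is a hypothetical stand-in for GrpoPool's per-connection rollout):
+
+ ```python
+ import asyncio
+
+ async def collect_group(run_episode, group_size: int = 8) -> list[float]:
+     # one rollout per WebSocket connection; if any coroutine raises, the
+     # exception propagates and the caller retries the whole group
+     rewards = await asyncio.gather(*(run_episode(i) for i in range(group_size)))
+     return list(rewards)
+ ```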
440
+
441
+ ---
442
+
443
+ ## 10. MiniStack: vendored & customized
444
+
445
+ The simulator powering the env is **vendored** as a git subtree at [aws_infra/](aws_infra/), not pulled as a black-box dependency. We forked it because we needed:
446
 
447
+ 1. A custom `/_ministack/state` JSON endpoint so the grader can read the entire infra inventory in **one HTTP call** instead of iterating 20+ list APIs per grading pass. Added in commit `a648c3a "feat: Add support for service state retrieval and action listing across multiple AWS services"`.
448
+ 2. A reproducible build with no runtime network requirement — the Docker image bundles a specific MiniStack revision.
449
+ 3. The freedom to extend service coverage on demand.
450
+
451
+ Custom commits live as small, isolated patches so periodic upstream syncs (`af2e945`, `579597b`) replay cleanly. To inspect:
452
 
453
  ```bash
454
+ git show a648c3a # the state-endpoint diff
455
+ git log --oneline -- aws_infra/ # only the aws_infra subtree history
 
456
  ```
457
 
458
+ Full subtree workflow + commit-by-commit detail in [server/README.md §5](server/README.md#5-ministack-vendored-fork--customizations). Upstream MiniStack docs (81 KB) are preserved at [aws_infra/README.md](aws_infra/README.md).
459
+
460
  ---
461
 
462
+ ## 11. Results & Benchmarks
463
+
464
+ ### Base-model selection
465
+
466
+ We evaluated 11 chat models on 27 held-out prompts. **Qwen2.5-Coder-3B-Instruct** wins on every metric that matters: 41% exact match (highest), 63% operation match (highest), 3.1 s/call (3× faster than the 4B runner-up). Full report:
467
+
468
+ > **[data/sft/MODEL_EVALUATION.md](data/sft/MODEL_EVALUATION.md)** — 270-line writeup, per-model verdicts, methodology
469
+
470
+ > ![Top 4 candidate models on the held-out benchmark](docs/figures/model_eval_chart.png)
471
+
472
+ ### Base vs SFT — actual results
473
+
474
+ After running the SFT pipeline end-to-end, the eval delta on the same held-out prompts is striking. Numbers from [out/delta_summary.json](out/delta_summary.json):
475
+
476
+ | Metric | Base | Post-SFT | Delta |
477
+ |-----------------|:------:|:--------:|:-----------:|
478
+ | `format_pct` | 33.3% | **100.0%** | **+66.7 pp** |
479
+ | `exact_pct` | 38.9% | **88.9%** | **+50.0 pp** |
480
+ | `service_pct` | 77.8% | **88.9%** | +11.1 pp |
481
+ | `operation_pct` | 61.1% | **88.9%** | +27.8 pp |
482
+ | `avg_latency` | 2.03s | **1.40s** | −0.63s (faster!) |
483
+ | `avg_len` | 85.8 | 74.7 | −11 chars (tighter) |
484
+
485
+ > ![Base vs SFT eval-metrics comparison](docs/figures/base_vs_sft_success.png)
486
+
487
+ Every target from [data/sft/MODEL_EVALUATION.md §11](data/sft/MODEL_EVALUATION.md) is met or exceeded. Format compliance is now perfect; the model never wraps commands in fences or quotes after SFT. Exact-match jumped from 39% to 89% — the agent now emits the canonical command for ~9 of every 10 prompts.
488
+
489
+ The richer two-mode benchmark (dataset eval + live RL env eval) is in [compare/compare_base_vs_sft.ipynb](compare/compare_base_vs_sft.ipynb); methodology in [compare/README.md](compare/README.md).
490
+
491
+ > ![Dataset comparison: base vs SFT (per-row scores)](docs/figures/compare_dataset.png)
492
+ > ![RL env comparison: base vs SFT (per-episode rewards)](docs/figures/compare_rl_env.png)
493
+
494
+ ### SFT training curves
495
+
496
+ > ![SFT loss curve over training](docs/figures/sft_loss_curve.png)
497
+
498
+ ### Optuna SFT search
499
+
500
+ The best SFT trial (out of 6) used `lora_r=16, lora_alpha=16, dropout=0.0058, lr=4.03e-4, warmup=0.1`. Full study at [out/optuna_study.json](out/optuna_study.json).
501
+
502
+ > ![Optuna parameter importances](docs/figures/optuna_param_importance.png)
503
+ > ![Optuna optimization history](docs/figures/optuna_history.png)
504
+
505
+ ### GRPO results (live multi-step env eval)
506
 
507
+ After 35 GRPO steps on top of the SFT adapter (config from [out_grpo/optuna_best.json](out_grpo/optuna_best.json) — `lr=1.6e-5, β=0.0021, T=0.99`), we re-evaluated end-to-end on 100+ episodes:
508
+
509
+ | Metric | Base + SFT | Base + SFT + GRPO | Δ |
510
+ |-------------------------------|:---------:|:-----------------:|:------------:|
511
+ | Overall success rate | 86.8% | 86.2% | −0.6 pp |
512
+ | Overall mean reward | 0.883 | 0.877 | −0.006 |
513
+ | Beginner success | 96.2% | **100.0%** | **+3.8 pp** |
514
+ | Intermediate success | 81.0% | **87.0%** | **+6.0 pp** |
515
+ | Warmup success | 96.0% | 90.2% | −5.8 pp |
516
+ | Expert success | 22.2% | 22.2% | flat |
517
+ | Drift repair rate | 22.2% | 22.2% | flat |
518
+ | Destructive-action fail rate | 15.1% | 14.7% | −0.4 pp |
519
+ | Steps to solve | 1.45 | 1.55 | +0.10 |
520
+
521
+ > ![SFT vs GRPO metrics grid](docs/figures/sft_vs_grpo_metrics_grid.png)
522
+ > ![SFT vs GRPO by tier](docs/figures/sft_vs_grpo_by_tier.png)
523
+
524
+ **Honest reading:** the 35-step GRPO run preserves the SFT gains and modestly improves the middle tiers (beginner +3.8 pp, intermediate +6.0 pp) — but does not crack the **expert-tier bottleneck** (22% success on SRE / drift / security-posture tasks). With longer GRPO runs and more curriculum exposure to expert tasks, this is the next gain to chase. The full episode-level data is in [out_grpo/grpo_multi_step.json](out_grpo/grpo_multi_step.json).
525
+
526
+ ### GRPO training curves
527
+
528
+ Per-step training signals from the final 35-step GRPO run ([out_grpo/final_grpo/checkpoint-35/trainer_state.json](out_grpo/final_grpo/checkpoint-35/trainer_state.json)):
529
+
530
+ > ![GRPO final per-step training signals](docs/figures/grpo_final_per_step.png)
531
+ > ![GRPO env reward over training](docs/figures/grpo_reward_curve.png)
532
+
533
+ Optuna search across 4 trials picked the final config:
534
+
535
+ > ![GRPO Optuna trial comparison](docs/figures/grpo_optuna_trials_comparison.png)
536
+ > ![GRPO Optuna parameter importances](docs/figures/grpo_optuna_importances.png)
537
+ > ![GRPO Optuna optimization history](docs/figures/grpo_optuna_history.png)
538
+
539
+ ### Qualitative rollouts (post-GRPO)
540
+
541
+ One sample episode per tier from [out_grpo/qualitative_rollouts.json](out_grpo/qualitative_rollouts.json):
542
+
543
+ > ![Qualitative rollouts on representative tasks](docs/figures/qualitative_rollouts.png)
544
+
545
+ ---
546
+
547
+ ## 12. Repository map
548
+
549
+ | Path | Purpose | Sub-README |
550
+ |--------------------------------|--------------------------------------------------------------------|-----------------------------------------|
551
+ | [server/](server/) | OpenEnv FastAPI server, env logic, services, web playground | [server/README.md](server/README.md) |
552
+ | [train/](train/) | SFT and GRPO training notebooks | [train/README.md](train/README.md) |
553
+ | [data/](data/) | SFT dataset, base-model selection, eval harness | [data/README.md](data/README.md) · [MODEL_EVALUATION.md](data/sft/MODEL_EVALUATION.md) |
554
+ | [compare/](compare/) | Base vs SFT side-by-side benchmark | [compare/README.md](compare/README.md) |
555
+ | [scripts/](scripts/) | Parallel-rollout architecture + multi-connection demo | [scripts/README.md](scripts/README.md) |
556
+ | [aws_infra/](aws_infra/) | Vendored MiniStack simulator (git subtree) | [aws_infra/README.md](aws_infra/README.md) |
557
+ | [out/](out/) | Reference SFT training output (Optuna study, baseline + post-train metrics, plots, final adapter checkpoints) | (see [train/README.md §7](train/README.md#7-logging-and-artifacts)) |
558
+ | [out_grpo/](out_grpo/) | Reference GRPO training output (Optuna study, baseline + post-train multi-step eval, qualitative rollouts, final adapter, 10 ready plots) | (see [train/README.md §7](train/README.md#7-logging-and-artifacts)) |
559
+ | [tests/](tests/), [tests_tasks/](tests_tasks/) | Unit + tier-integration test suites | (see [§14](#14-testing)) |
560
+ | [models.py](models.py) | Pydantic data models for action/observation/task | (inline §6) |
561
+ | [client.py](client.py) | OpenEnv HTTP/WebSocket client wrapper | — |
562
+ | [inference.py](inference.py) | Single-model agent loop (matches RL eval mode of `compare/`) | — |
563
+ | [train_grpo.py](train_grpo.py) | GRPO trainer (1,283 LOC) — `MultiTurnEnvPool`, Optuna, plotting | (see [train/README.md](train/README.md)) |
564
+ | [aws_rl_env_colab.ipynb](aws_rl_env_colab.ipynb) | Colab driver for the full training pipeline | — |
565
+ | [docs/figures/](docs/figures/) | All README graphs and screenshots | — |
566
 
567
  ---
568
 
569
+ ## 13. Configuration & Running
570
+
571
+ ### Docker (recommended)
572
+
573
+ ```bash
574
+ make docker-build # build the image
575
+ make docker-run # foreground on :8000
576
+ make docker-run-detach # background
577
+ make docker-health # liveness probe
578
+ ```
579
+
580
+ ### Local
581
+
582
+ ```bash
583
+ make install-all # uv sync + install aws_infra (MiniStack) editable
584
+ make run # starts MiniStack pool + FastAPI server
585
+ ```
586
+
587
+ ### OpenEnv deployment
588
 
589
+ ```bash
590
+ make openenv-validate # validate config
591
+ make openenv-build # build environment
592
+ make openenv-push # push to HuggingFace Spaces
593
+ ```
594
+
595
+ ### Environment variables
596
+
597
+ | Variable | Default | Description |
598
+ |-------------------------------------|--------------------------|-------------------------------------------------------------------|
599
+ | `AWS_INFRA_URL` | `http://localhost:4566` | MiniStack endpoint (used when `POOL_SIZE=1`) |
600
+ | `AWS_RL_ENV_POOL_SIZE` | `1` | **Server-side MiniStack pool size; set to 8 for GRPO training** |
601
+ | `AWS_RL_ENV_MINISTACK_BASE_PORT` | `4566` | First MiniStack port; pool covers `[BASE, BASE + POOL_SIZE)` |
602
+ | `BACKEND_TYPE` | `simulator` | `simulator` (MiniStack) or `aws` (real AWS, no pool) |
603
+ | `AWS_ACCESS_KEY_ID` | `test` | AWS credentials (any value works for the simulator) |
604
+ | `AWS_SECRET_ACCESS_KEY` | `test` | AWS credentials (any value works for the simulator) |
605
+ | `AWS_DEFAULT_REGION` | `us-east-1` | AWS region |
606
+ | `MAX_STEPS` | `15` | Max steps per episode |
607
+ | `API_BASE_URL` | — | LLM API endpoint for [inference.py](inference.py) |
608
+ | `MODEL_NAME` | — | LLM model name for [inference.py](inference.py) |
609
+ | `HF_TOKEN` | — | HuggingFace token (dataset/adapter access, push) |
610
+ | `TEMPERATURE` | `0.7` | LLM sampling temperature |
611
+
612
+ ### Curriculum stats API
613
 
614
  ```python
615
  curriculum.get_stats()
 
628
 
629
  ---
630
 
631
+ ## 14. Testing
632
+
633
+ The test suite covers both isolated unit logic and end-to-end task execution against MiniStack.
634
+
635
+ ### Unit tests — [tests/](tests/)
636
+
637
+ ```bash
638
+ pytest tests/ -v
639
+ ```
640
+
641
+ | File | Covers |
642
+ |----------------------------------------------------------------------------------------------|-----------------------------------------------------------------|
643
+ | [test_aws_rl_env_environment.py](tests/test_aws_rl_env_environment.py) | Environment lifecycle, reset/step semantics, reward integration |
644
+ | [test_task_grader.py](tests/test_task_grader.py) | All 5 grading strategies, partial progress, penalties, bonuses |
645
+ | [test_resource_verifier.py](tests/test_resource_verifier.py) | Per-service ground-truth verification (20+ services) |
646
+ | [test_episode_tracker.py](tests/test_episode_tracker.py) | Command parsing, dedup, monotonic progress, rollback detection |
647
+ | [test_episode_context.py](tests/test_episode_context.py) | Per-episode context lifecycle |
648
+ | [test_drift_engine.py](tests/test_drift_engine.py) | Random drift selection, mutation application |
649
+ | [test_hint_provider.py](tests/test_hint_provider.py) | Three-level progressive hints, decay computation |
650
+ | [test_environment_designer.py](tests/test_environment_designer.py) | Setup-command provisioning |
651
+ | [test_pool.py](tests/test_pool.py) | Server-side `MiniStackPool` acquire/release, exhaustion |
652
+ | [test_grpo_pool.py](tests/test_grpo_pool.py) | Client-side `GrpoPool` connect/close, all-or-nothing rollback |
653
+
654
+ ### Tier integration tests — [tests_tasks/](tests_tasks/)
655
+
656
+ ```bash
657
+ pytest tests_tasks/ -v
658
+ ```
659
+
660
+ 133 tasks exercised end-to-end:
661
 
662
+ | File | Tasks |
663
+ |-----------------------------------------------------------------------------------------------------|------:|
664
+ | [test_warmup_tasks.py](tests_tasks/test_warmup_tasks.py) | 25 |
665
+ | [test_beginner_tasks.py](tests_tasks/test_beginner_tasks.py) | 25 |
666
+ | [test_intermediate_tasks.py](tests_tasks/test_intermediate_tasks.py) | 25 |
667
+ | [test_advanced_tasks.py](tests_tasks/test_advanced_tasks.py) | 25 |
668
+ | [test_expert_tasks.py](tests_tasks/test_expert_tasks.py) | 24 |
669
+ | [test_drift_tasks.py](tests_tasks/test_drift_tasks.py) | 9 |
670
+ | **Total** | **133** |
671
+
672
+ These tests double as the source of truth for canonical solutions used by the SFT dataset generator (extracted via AST — see [data/README.md §1](data/README.md#1-sft-dataset-generation)).
673
+
674
+ ---
675
+
676
+ ## 15. Tech stack
677
+
678
+ - **Python 3.12**, [`uv`](https://github.com/astral-sh/uv) for dependency management, multi-stage Docker
679
+ - **FastAPI**, **OpenEnv** (HTTP + WebSocket env protocol), **uvicorn**
680
+ - **TRL ≥ 0.21** (`GRPOTrainer`, `GRPOConfig`)
681
+ - **PEFT** (LoRA), **Unsloth** (4-bit quantized base, fused training kernels)
682
+ - **Transformers ≥ 4.45**, **datasets ≥ 2.20**, **HuggingFace Hub ≥ 0.24**
683
+ - **Optuna ≥ 3.6** (TPE sampler, SQLite study storage)
684
+ - **asyncio** + **websockets** + **httpx** (parallel rollout orchestration)
685
+ - **MiniStack** (vendored at [aws_infra/](aws_infra/), 34 AWS services)
686
+ - **AWS CLI v2** (subprocess invocation against MiniStack endpoint)
687
+ - **matplotlib**, **plotly** (training curves, Optuna visualizations)
688
+ - **pytest** (16 test files, ~250 KB of test code)
689
+
690
+ ---
691
+
692
+ ## 16. Links
693
+
694
+ - **Live demo**: [sizzing-aws-rl-env.hf.space/web](https://sizzing-aws-rl-env.hf.space/web)
695
+ - **HF Space**: [huggingface.co/spaces/Sizzing/aws_rl_env](https://huggingface.co/spaces/Sizzing/aws_rl_env)
696
+ - **API docs**: [/docs](https://sizzing-aws-rl-env.hf.space/docs) · [/redoc](https://sizzing-aws-rl-env.hf.space/redoc)
697
+ - **SFT adapter**: [Sizzing/aws-rl-sft-qwen25coder3b-adapter](https://huggingface.co/Sizzing/aws-rl-sft-qwen25coder3b-adapter)
698
+ - **Dataset**: [Sizzing/aws-rl-sft](https://huggingface.co/datasets/Sizzing/aws-rl-sft)
699
  - **GitHub**: [github.com/udaykiranpadhy/aws-rl-env](https://github.com/udaykiranpadhy/aws-rl-env)
 
 
 
700
  - **Portfolio**: [portfolio.udaykp.dev](https://portfolio.udaykp.dev)
701
+ - **Colab**: <!-- TODO: paste Colab URL here -->
702
+
703
+ ---
704
+
705
+ ## 17. Acknowledgments
706
+
707
+ - **MiniStack** — vendored at [aws_infra/](aws_infra/). Upstream license preserved. Custom modifications attributable to commits `a648c3a`, `a00e981`; periodic upstream syncs `af2e945`, `579597b`.
708
+ - **OpenEnv** — environment protocol and Python client framework.
709
+ - **TRL** (HuggingFace) — `GRPOTrainer` implementation.
710
+ - **Unsloth** — 4-bit quantized model loaders + fused training kernels.
711
+ - **AWS service icons** in [server/static/img/aws/](server/static/img/aws/) — used in the web playground.
712
+
713
+ ---
714
+
715
+ ## Sub-README index
716
+
717
+ For deep technical detail on any subsystem:
718
+
719
+ - [server/README.md](server/README.md) — environment internals (curriculum, reward shaping, anti-hacking, chaos, drift, MiniStack-fork detail)
720
+ - [train/README.md](train/README.md) — SFT + GRPO training pipeline (LoRA config, Optuna search, multi-turn rollouts)
721
+ - [scripts/README.md](scripts/README.md) — parallel-rollout architecture (3 pool layers, all-or-nothing connect, concurrency safety)
722
+ - [data/README.md](data/README.md) — dataset generation (5 trajectory types, AST extraction) + base-model selection summary
723
+ - [data/sft/MODEL_EVALUATION.md](data/sft/MODEL_EVALUATION.md) — full 11-model benchmark report
724
+ - [compare/README.md](compare/README.md) — base vs SFT comparison harness
725
+ - [aws_infra/README.md](aws_infra/README.md) — vendored MiniStack upstream documentation (81 KB)
aws_rl_env_colab.ipynb ADDED
@@ -0,0 +1,233 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "7b80aed985f7",
6
+ "metadata": {},
7
+ "source": [
8
+ "# AWS RL Env \u2014 GRPO Training (multi-turn + parallel envs)\n\nThis notebook trains a Qwen2.5-Coder-3B policy on the AWS RL environment using **GRPO** with:\n\n- **Multi-turn rollouts** \u2014 each task runs up to `MAX_TURNS` steps; each step is one `aws ...` command, the command's output is fed back into the next turn.\n- **Parallel environments** \u2014 `NUM_GENERATIONS` MiniStack-backed env sessions run concurrently, all rolling out the *same* curriculum-picked task.\n- **Curriculum** \u2014 `Curriculum.next_task()` picks one task per GRPO step; group-level reward feeds back via `Curriculum.record_result(...)` driving promotion + spaced repetition.\n- **Optuna** \u2014 TPE search over learning rate, KL coefficient, num_generations, temperature, top-p, LoRA rank, and max_turns. Frozen held-out validation tasks evaluate each trial.\n\nThe heavy lifting lives in [`train_grpo.py`](./train_grpo.py); this notebook is a thin driver that mirrors `kube-sre-gym/kube_sre_gym_colab.ipynb`.\n"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "651b17a160c4",
14
+ "metadata": {},
15
+ "source": [
16
+ "## 1 - Install dependencies"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "id": "41cb8a624696",
22
+ "metadata": {},
23
+ "execution_count": null,
24
+ "outputs": [],
25
+ "source": [
26
+ "%pip install -q --upgrade pip\n%pip install -q \\\n \"trl>=0.21\" \\\n \"transformers>=4.45\" \\\n \"peft>=0.13\" \\\n \"datasets>=2.20\" \\\n \"huggingface_hub>=0.24\" \\\n \"websockets>=13\" \\\n \"openenv-core[core]>=0.2.2\" \\\n \"pyyaml>=6.0\" \\\n \"matplotlib\" \\\n \"optuna>=3.6\" \\\n \"plotly\" \\\n \"kaleido\" \\\n \"httpx\"\n%pip install -q \"unsloth @ git+https://github.com/unslothai/unsloth.git\"\n"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "id": "23979a7833e6",
32
+ "metadata": {},
33
+ "source": [
34
+ "## 2 - Configuration\n\nEverything you'll typically tune lives in this cell."
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "id": "6b425cb11474",
40
+ "metadata": {},
41
+ "execution_count": null,
42
+ "outputs": [],
43
+ "source": [
44
+ "import os\nfrom pathlib import Path\nfrom datetime import datetime\n\n# --- Environment server ---\nENV_URL = os.environ.get(\"AWS_RL_ENV_URL\", \"http://localhost:8000\")\n\n# --- Model & adapter ---\nMODEL_ID = \"unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit\"\nSFT_ADAPTER = \"Sizzing/aws-rl-sft-qwen25coder3b-adapter\" # set to None to skip SFT init\nHUB_REPO = None # e.g. \"your-org/aws-rl-grpo-qwen25coder3b\"\n\n# --- Training defaults (Optuna may override) ---\nNUM_GENERATIONS = 8 # parallel envs == GRPO group size\nMAX_TURNS = 6 # multi-turn cap per episode\nMAX_STEPS = 200 # GRPO optimizer steps\nMAX_TOTAL_TOKENS = 4096 # token budget per episode (anti-OOM)\nMAX_PROMPT_LEN = 2048\nMAX_COMPL_LEN = 256\n\n# --- Optuna ---\nRUN_OPTUNA = True\nN_TRIALS = 6\nTRIAL_MAX_STEPS = 30\nVAL_TASKS_PER_TIER = 2\n\n# --- Output ---\nTIMESTAMP = datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")\nOUTPUT_DIR = Path(f\"outputs/aws-rl-grpo-{TIMESTAMP}\")\nOUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n\nprint(f\"Env URL : {ENV_URL}\")\nprint(f\"Model : {MODEL_ID}\")\nprint(f\"SFT adapter: {SFT_ADAPTER}\")\nprint(f\"Output dir : {OUTPUT_DIR}\")\nprint(f\"Optuna : {'on' if RUN_OPTUNA else 'off'} ({N_TRIALS} trials, {TRIAL_MAX_STEPS} steps each)\")\n"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "markdown",
49
+ "id": "df5456a79dea",
50
+ "metadata": {},
51
+ "source": [
52
+ "## 3 - Authenticate to HF Hub (and optionally W&B)"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "id": "860228b6c968",
58
+ "metadata": {},
59
+ "execution_count": null,
60
+ "outputs": [],
61
+ "source": [
62
+ "import os\n\n# HF Hub\ntry:\n from google.colab import userdata\n os.environ[\"HF_TOKEN\"] = userdata.get(\"HF_TOKEN\")\nexcept (ImportError, KeyError, ModuleNotFoundError):\n pass\nif os.environ.get(\"HF_TOKEN\"):\n from huggingface_hub import login\n login(token=os.environ[\"HF_TOKEN\"], add_to_git_credential=False)\n print(\"HF Hub: logged in\")\nelse:\n print(\"HF Hub: HF_TOKEN not set (push_to_hub will be disabled)\")\n\n# Optional: W&B\ntry:\n from google.colab import userdata\n os.environ.setdefault(\"WANDB_API_KEY\", userdata.get(\"WANDB_API_KEY\"))\nexcept (ImportError, KeyError, ModuleNotFoundError):\n pass\n"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "markdown",
67
+ "id": "e7918896aa17",
68
+ "metadata": {},
69
+ "source": [
70
+ "## 4 - Smoke-test the env URL"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "id": "1356c4b20164",
76
+ "metadata": {},
77
+ "execution_count": null,
78
+ "outputs": [],
79
+ "source": [
80
+ "import httpx\n\nresp = httpx.get(f\"{ENV_URL}/health\", timeout=10.0)\nprint(f\"GET {ENV_URL}/health -> {resp.status_code}\")\nprint(resp.text[:500])\nassert resp.status_code == 200, \"env server is not responding \u2014 start it before training\"\n"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "markdown",
85
+ "id": "32b12707b0e9",
86
+ "metadata": {},
87
+ "source": [
88
+ "## 5 - Imports from `train_grpo`\n\nAll heavy logic (rollout, env pool, reward funcs, Optuna search, training loop) lives in `train_grpo.py` at the repo root."
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "id": "0e989d6fe640",
94
+ "metadata": {},
95
+ "execution_count": null,
96
+ "outputs": [],
97
+ "source": [
98
+ "import json\nimport logging\nfrom pathlib import Path\n\nfrom train_grpo import (\n SYSTEM_PROMPT,\n DEFAULT_CFG,\n SamplingCfg,\n load_policy,\n MultiTurnEnvPool,\n plot_rewards,\n pick_validation_task_ids,\n evaluate_on_validation,\n optuna_search,\n run_training,\n)\n\nlogging.basicConfig(level=logging.INFO, format=\"%(asctime)s %(levelname)s %(name)s %(message)s\")\nprint(\"System prompt (first 200 chars):\")\nprint(SYSTEM_PROMPT[:200])\n"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "markdown",
103
+ "id": "f82fff889024",
104
+ "metadata": {},
105
+ "source": [
106
+ "## 6 - Pick fixed validation task ids\n\nA frozen list of tasks (k per tier) used as the held-out set across **all** Optuna trials and post-training comparisons. Stored to disk for reproducibility."
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "id": "94dd77b1304a",
112
+ "metadata": {},
113
+ "execution_count": null,
114
+ "outputs": [],
115
+ "source": [
116
+ "val_task_ids = pick_validation_task_ids(k_per_tier=VAL_TASKS_PER_TIER, seed=42)\nval_path = OUTPUT_DIR / \"val_task_ids.json\"\nval_path.write_text(json.dumps(val_task_ids))\nprint(f\"Validation task ids ({len(val_task_ids)}): {val_task_ids}\")\nprint(f\"Saved to {val_path}\")\n"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "markdown",
121
+ "id": "ee0704afd204",
122
+ "metadata": {},
123
+ "source": [
124
+ "## 7 - Optuna hyperparameter search\n\nSet `RUN_OPTUNA=False` in the config cell to skip and use `DEFAULT_CFG`."
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "id": "bf36301c7db8",
130
+ "metadata": {},
131
+ "execution_count": null,
132
+ "outputs": [],
133
+ "source": [
134
+ "best_cfg = None\n\nif RUN_OPTUNA:\n study = optuna_search(\n n_trials=N_TRIALS,\n trial_max_steps=TRIAL_MAX_STEPS,\n val_task_ids=val_task_ids,\n base_model=MODEL_ID,\n sft_adapter=SFT_ADAPTER,\n env_url=ENV_URL,\n output_dir=OUTPUT_DIR,\n max_total_tokens=MAX_TOTAL_TOKENS,\n max_completion_length=MAX_COMPL_LEN,\n max_prompt_length=MAX_PROMPT_LEN,\n )\n best_cfg = {**DEFAULT_CFG, **dict(study.best_params)}\n print(f\"\\nBest objective : {study.best_value:.4f}\")\n print(f\"Best params : {dict(study.best_params)}\")\nelse:\n print(\"Skipping Optuna; using DEFAULT_CFG.\")\n best_cfg = dict(DEFAULT_CFG)\n\nwith open(OUTPUT_DIR / \"best_cfg.json\", \"w\") as f:\n json.dump(best_cfg, f, indent=2)\nprint(f\"Saved best_cfg -> {OUTPUT_DIR / 'best_cfg.json'}\")\n"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "id": "f0323086fc29",
140
+ "metadata": {},
141
+ "execution_count": null,
142
+ "outputs": [],
143
+ "source": [
144
+ "# Optional Optuna visualisations (skip silently if Optuna wasn't run)\nif RUN_OPTUNA:\n try:\n import optuna.visualization as vis\n import plotly.io as pio\n pio.renderers.default = \"notebook\"\n vis.plot_optimization_history(study).show()\n vis.plot_param_importances(study).show()\n except Exception as e:\n print(f\"(visualisation skipped: {e})\")\n"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "markdown",
149
+ "id": "ecf4bdc095e0",
150
+ "metadata": {},
151
+ "source": [
152
+ "## 8 - Final GRPO training pass with the best config"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "id": "80fcd2297776",
158
+ "metadata": {},
159
+ "execution_count": null,
160
+ "outputs": [],
161
+ "source": [
162
+ "print(f\"Final config: {best_cfg}\")\n\n# Override via best_cfg, falling back to top-of-notebook defaults\nNUM_GENERATIONS = int(best_cfg[\"num_generations\"])\nMAX_TURNS = int(best_cfg[\"max_turns\"])\n\nrun_training(\n cfg=best_cfg,\n base_model=MODEL_ID,\n sft_adapter=SFT_ADAPTER,\n env_url=ENV_URL,\n output_dir=OUTPUT_DIR,\n max_steps=MAX_STEPS,\n max_total_tokens=MAX_TOTAL_TOKENS,\n max_completion_length=MAX_COMPL_LEN,\n max_prompt_length=MAX_PROMPT_LEN,\n push_to_hub=False,\n hub_repo=HUB_REPO,\n)\n"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "markdown",
167
+ "id": "b2305c48d920",
168
+ "metadata": {},
169
+ "source": [
170
+ "## 9 - Reward curves\n\n`plot_rewards` reads `reward_log.csv` (written incrementally by `EpisodeLogger`), so the chart is meaningful even if training was interrupted."
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "id": "f35ef4ee8206",
176
+ "metadata": {},
177
+ "execution_count": null,
178
+ "outputs": [],
179
+ "source": [
180
+ "from IPython.display import Image, display\n\nreward_csv = OUTPUT_DIR / \"reward_log.csv\"\nplot_path = OUTPUT_DIR / \"reward_plot.png\"\nplot_rewards(reward_csv, plot_path)\nif plot_path.exists():\n display(Image(filename=str(plot_path)))\nelse:\n print(\"No plot generated (no rows in reward_log.csv).\")\n"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "markdown",
185
+ "id": "fb7ed0eab6e6",
186
+ "metadata": {},
187
+ "source": [
188
+ "## 10 - Quick post-training validation re-run (optional)\n\nRun the same held-out tasks again on the freshly trained adapter and compare to whatever each Optuna trial achieved on the same set."
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "id": "9fd1d1dd95ef",
194
+ "metadata": {},
195
+ "execution_count": null,
196
+ "outputs": [],
197
+ "source": [
198
+ "# Re-load policy in inference mode\nmodel, tokenizer = load_policy(MODEL_ID, SFT_ADAPTER, trainable=False)\npool = MultiTurnEnvPool(ENV_URL, size=1)\npool.start()\n\nsampling = SamplingCfg(\n temperature=float(best_cfg[\"temperature\"]),\n top_p=float(best_cfg[\"top_p\"]),\n max_new_tokens=MAX_COMPL_LEN,\n max_prompt_length=MAX_PROMPT_LEN,\n)\n\ntry:\n metrics = evaluate_on_validation(\n model=model,\n tokenizer=tokenizer,\n pool=pool,\n val_task_ids=val_task_ids,\n system_prompt=SYSTEM_PROMPT,\n max_turns=int(best_cfg[\"max_turns\"]),\n max_total_tokens=MAX_TOTAL_TOKENS,\n sampling=sampling,\n )\n print(f\"Post-training validation metrics: {metrics}\")\n with open(OUTPUT_DIR / \"post_train_val.json\", \"w\") as f:\n json.dump(metrics, f, indent=2)\nfinally:\n pool.close()\n"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "markdown",
203
+ "id": "fb00a27401a8",
204
+ "metadata": {},
205
+ "source": [
206
+ "## 11 - Push to Hugging Face Hub (optional)"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "id": "601184d3ddf5",
212
+ "metadata": {},
213
+ "execution_count": null,
214
+ "outputs": [],
215
+ "source": [
216
+ "# Uncomment to push the trained adapter:\n#\n# from huggingface_hub import create_repo, upload_folder\n# create_repo(HUB_REPO, exist_ok=True, private=False)\n# upload_folder(folder_path=str(OUTPUT_DIR), repo_id=HUB_REPO, repo_type=\"model\")\n# print(f\"Pushed: https://huggingface.co/{HUB_REPO}\")\n"
217
+ ]
218
+ }
219
+ ],
220
+ "metadata": {
221
+ "kernelspec": {
222
+ "display_name": "Python (aws-rl-env)",
223
+ "language": "python",
224
+ "name": "aws-rl-env"
225
+ },
226
+ "language_info": {
227
+ "name": "python",
228
+ "version": "3.12"
229
+ }
230
+ },
231
+ "nbformat": 4,
232
+ "nbformat_minor": 5
233
+ }
compare/README.md ADDED
@@ -0,0 +1,230 @@
1
+ # `compare/` — Base Model vs SFT Adapter Benchmark
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory holds the side-by-side benchmark that answers the only question that ultimately matters: **did SFT actually make the model better at the task?**
6
+
7
+ The benchmark compares the base [Qwen2.5-Coder-3B-Instruct](https://huggingface.co/unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit) against our published SFT adapter [Sizzing/aws-rl-sft-qwen25coder3b-adapter](https://huggingface.co/Sizzing/aws-rl-sft-qwen25coder3b-adapter) under two evaluation modes — fast static dataset eval and slow live-environment eval. Both write structured metrics so the deltas are explicit.
8
+
9
+ > ![Dataset comparison: base vs SFT (per-row scores)](../docs/figures/compare_dataset.png)
10
+ > ![RL-env comparison: base vs SFT (per-episode rewards)](../docs/figures/compare_rl_env.png)
11
+
12
+ ---
13
+
14
+ ## Table of contents
15
+
16
+ 1. [What's compared](#1-whats-compared)
17
+ 2. [Two evaluation modes](#2-two-evaluation-modes)
18
+ 3. [Methodology](#3-methodology)
19
+ 4. [Metrics reported](#4-metrics-reported)
20
+ 5. [How to run](#5-how-to-run)
21
+ 6. [Reading the results](#6-reading-the-results)
22
+ 7. [Files in this directory](#7-files-in-this-directory)
23
+
24
+ ---
25
+
26
+ ## 1. What's compared
27
+
28
+ | | Base | SFT |
29
+ |---|---|---|
30
+ | **Model** | `unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit` | Same base + LoRA adapter |
31
+ | **Adapter** | None | `Sizzing/aws-rl-sft-qwen25coder3b-adapter` |
32
+ | **Training data** | Pretraining + Qwen instruction tuning | + 1,500 rows from [data/sft/aws_rl_sft.train.jsonl](../data/sft/aws_rl_sft.train.jsonl) |
33
+ | **Inference** | Same prompt template, same temperature | Identical |
34
+
35
+ The only variable is the LoRA adapter. Same base, same prompts, same decoding parameters, same evaluation set.
36
+
37
+ ---
38
+
39
+ ## 2. Two evaluation modes
40
+
41
+ The notebook runs two separate evaluations because they answer different questions:
42
+
43
+ ### Dataset eval (static)
44
+
45
+ | Question | Does the model emit the *canonical* command for held-out prompts, one-shot? |
46
+ |-----------|-----------------------------------------------------------------------------|
47
+ | Speed | Fast (~minutes) |
48
+ | Needs | HF token + dataset access; **no env server** |
49
+ | Source | [data/sft/aws_rl_sft.val.jsonl](../data/sft/aws_rl_sft.val.jsonl) (150 held-out rows) |
50
+ | Verifies | Format correctness + command-token match against canonical |
51
+
52
+ This is the same kind of pattern-matching benchmark as [data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md) — fast and deterministic. Useful as a regression check.
53
+
54
+ ### RL env eval (live)
55
+
56
+ | Question | Can the model actually *solve* a task end-to-end against a live environment? |
57
+ |-----------|------------------------------------------------------------------------------|
58
+ | Speed | Slow (~tens of minutes per model) |
59
+ | Needs | Dataset eval above + a running env server (HF Space or local) |
60
+ | Source | Same val tasks, but exercised through `client.AwsRlEnv` round-trips |
61
+ | Verifies | Multi-step task completion, partial progress, reward shaping, hint usage |
62
+
63
+ This is closer to what training optimizes for. A model can score well on dataset eval (right command on step 1) but fail RL env eval (can't recover from a step 1 typo, can't continue past the first turn). Both signals matter.
64
+
65
+ ---
66
+
67
+ ## 3. Methodology
68
+
69
+ ### Dataset eval
70
+
71
+ 1. Load `Sizzing/aws-rl-sft` dataset from HF Hub
72
+ 2. For each row in `val`, build the prompt from `messages[:-1]` (system + user, drop assistant)
73
+ 3. Generate the model's response (`max_new_tokens=128`, deterministic decoding)
74
+ 4. **Extract the AWS CLI line**: strip markdown fences, find first line starting with `aws `
75
+ 5. Score against `messages[-1].content` (the canonical assistant response):
76
+ - Format OK (extracted line starts with `aws`)
77
+ - Service match (same first word after `aws`)
78
+ - Operation match (same first two words)
79
+ - Exact match (full token-for-token equality)
80
+
81
+ This mirrors the methodology in [eval_lm_studio_models.py](../data/eval_lm_studio_models.py); the same scoring functions are reused.
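+
+ A hedged sketch of those extraction and scoring checks (the reused implementations live in [eval_lm_studio_models.py](../data/eval_lm_studio_models.py); treat this as illustrative):
+
+ ```python
+ def extract_aws_line(response: str) -> str | None:
+     """Drop markdown ticks/quotes, return the first line starting with 'aws '."""
+     for line in response.replace("`", " ").splitlines():
+         line = line.strip().strip("'\"")
+         if line.startswith("aws "):
+             return line
+     return None
+
+ def score(predicted: str, canonical: str) -> dict[str, bool]:
+     p, c = predicted.split(), canonical.split()
+     return {
+         "format_ok": predicted.startswith("aws "),
+         "service_match": p[:2] == c[:2],    # "aws <service>"
+         "operation_match": p[:3] == c[:3],  # "aws <service> <operation>"
+         "exact_match": p == c,
+     }
+ ```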
82
+
83
+ ### RL env eval
84
+
85
+ 1. Connect to the running env at `ENV_BASE_URL` (default: an HF Space; can be overridden to local)
86
+ 2. For each val task, run a full episode (up to `MAX_STEPS=15` turns):
87
+ - Build the prompt from system + task + observation history (matches [inference.py](../inference.py))
88
+ - Generate one AWS CLI command per turn
89
+ - Step the environment, record `reward`, `task_achieved`, `partial_progress`
90
+ 3. Aggregate per-episode metrics
91
+
92
+ The agent loop is identical to the training-time `rollout_one_episode` in [train_grpo.py](../train_grpo.py) — same prompt structure, same generation parameters, same termination logic. So the RL env eval is genuinely measuring "what would this model do during a GRPO rollout".
93
+
94
+ ---
95
+
96
+ ## 4. Metrics reported
97
+
98
+ ### Dataset eval
99
+
100
+ | Metric | Definition |
101
+ |----------------|-----------------------------------------------------------|
102
+ | `format_ok` | % of responses where the extracted line starts with `aws ` |
103
+ | `svc_match` | % matching the canonical service |
104
+ | `op_match` | % matching service + operation |
105
+ | `exact_match` | % matching the full canonical command token-for-token |
106
+
107
+ ### RL env eval (per episode)
108
+
109
+ | Metric | Definition |
110
+ |-------------------------|------------------------------------------------------------------|
111
+ | `avg_episode_reward` | Mean total reward accumulated per episode (sum of step rewards) |
112
+ | `completion_rate` | % of episodes ending in `task_achieved=True` |
113
+ | `avg_steps_to_complete` | Mean steps used by completed episodes (lower = more efficient) |
114
+ | `avg_max_progress` | Mean of the highest `partial_progress` reached per episode |
115
+ | `hint_usage_rate` | % of episodes where the agent requested at least one hint |
116
+ | `format_failure_rate` | % of agent commands that failed the `aws ` prefix gate |
117
+
118
+ The notebook produces per-tier breakdowns of all six metrics so you can see where SFT helped most (typically: warmup format-locking goes from ~85% → 100%; intermediate completion goes from a small base to a meaningful fraction).
119
+
120
+ ---
121
+
122
+ ## 5. How to run
123
+
124
+ ### Prerequisites
125
+
126
+ - HuggingFace token (`HF_TOKEN`) — needed to load the dataset and adapter
127
+ - A running env server — either:
128
+ - Your own HF Space deployment (set `ENV_BASE_URL` accordingly), or
129
+ - Local server: `make run` from the repo root, then `ENV_BASE_URL=http://localhost:8000`
130
+ - A GPU runtime (Colab T4 or better, A10/A100 ideal)
131
+
132
+ ### Notebooks
133
+
134
+ | Notebook | Open in Colab |
135
+ |---------------------------------------------------------------------|--------------------------------|
136
+ | [compare_base_vs_sft.ipynb](compare_base_vs_sft.ipynb) (clean) | <!-- TODO: paste Colab URL --> |
137
+ | [compare_base_vs_sft_with_outputs.ipynb](compare_base_vs_sft_with_outputs.ipynb) (with outputs) | <!-- TODO: paste Colab URL --> |
138
+
139
+ The two notebooks are functionally identical; the second has cell outputs preserved (18 display widgets, 26 stdout cells) for offline inspection.
140
+
141
+ ### Running steps
142
+
143
+ 1. Open the notebook in Colab (or local Jupyter)
144
+ 2. Edit the **CONFIG** cell:
145
+ ```python
146
+ BASE_MODEL = "unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit"
147
+ SFT_ADAPTER_REPO = "Sizzing/aws-rl-sft-qwen25coder3b-adapter"
148
+ DATASET_REPO = "Sizzing/aws-rl-sft"
149
+ ENV_BASE_URL = "https://your-hf-space.hf.space" # or local
150
+ ```
151
+ 3. Run all cells. Part 1 (dataset eval) finishes first; Part 2 (RL env eval) is the slow one.
152
+ 4. Compare the per-metric deltas between base and SFT.
153
+
154
+ ---
155
+
156
+ ## 6. Reading the results
157
+
158
+ ### Actual numbers from the run
159
+
160
+ From the saved outputs of [compare_base_vs_sft_with_outputs.ipynb](compare_base_vs_sft_with_outputs.ipynb):
161
+
162
+ #### Dataset eval
163
+
164
+ | Metric | Base | Base + SFT | Δ |
165
+ |---------------------------|:------:|:----------:|:----------:|
166
+ | `format_pct` | 33.3% | **100.0%** | **+66.7 pp** |
167
+ | `format_after_extract_pct`| 100.0% | 100.0% | 0 |
168
+ | `exact_pct` | 38.9% | **88.9%** | **+50.0 pp** |
169
+
170
+ #### RL env eval (live multi-step agent loop)
171
+
172
+ | Metric | Base | Base + SFT | Δ |
173
+ |-------------------------|:-----:|:----------:|:---------:|
174
+ | `avg_episode_reward` | 1.187 | **2.011** | **+0.824** |
175
+ | `reward_std` | 1.137 | 1.908 | +0.771 |
176
+ | `avg_steps` | 8.600 | **5.733** | **−2.867** |
177
+ | `avg_reward_per_step` | 0.138 | **0.351** | **+0.213** |
178
+
179
+ > ![RL-env eval: base vs SFT](../docs/figures/rl_env_eval_base_vs_sft.png)
180
+
181
+ The agent **earns more reward per episode while taking fewer steps** — exactly what good fine-tuning should produce. Reward-per-step jumps 2.5× because (a) the agent picks the right command more often (fewer wasted steps), and (b) format compliance is now perfect (no more `aws help` fallbacks).
182
+
183
+ #### Per-tier success in the RL eval
184
+
185
+ From the notebook's per-rollout traces (3 episodes per tier × 5 tiers = 15 episodes per model):
186
+
187
+ | Tier | Base (rollouts ✓ / 3) | Base + SFT (rollouts ✓ / 3) |
188
+ |--------------|:---------------------:|:----------------------------:|
189
+ | warmup | 3 | 3 |
190
+ | beginner | 3 | 3 |
191
+ | intermediate | 1 | 3 |
192
+ | advanced | 0 | 1 |
193
+ | expert | 0 | 2 |
194
+
195
+ SFT moves the **success frontier** up two tiers — the base model could not finish a single advanced or expert episode, while SFT completes 2 of 3 expert tasks (S3 lockdown, IAM least-privilege variants) within 5 steps.
196
+
197
+ ### What counts as a meaningful delta?
198
+
199
+ The val set is small (150 rows / ~10 unique tasks per RL eval), so individual percentage points have meaningful noise. Rules of thumb:
200
+
201
+ | Delta size | Significance |
202
+ |------------|------------------------------------------------|
203
+ | ±2pp | Within noise — don't claim improvement |
204
+ | 5–10pp | Likely real, look at per-tier breakdown |
205
+ | >10pp | Almost certainly real |
206
+
207
+ The deltas above (66.7 pp, 50.0 pp on dataset; 0.82 reward / −2.9 steps on RL eval) are well above the noise floor.
208
+
209
+ ### Going further with GRPO
210
+
211
+ Once the SFT adapter is in hand, the same comparison can be re-run against the GRPO adapter (`out_grpo/grpo_adapter/`). Multi-step results from the GRPO run are documented in the [main README §11](../README.md#11-results--benchmarks); the short version is GRPO@35-steps preserves SFT performance and modestly improves the middle tiers, while the expert tier remains the bottleneck.
212
+
213
+ ---
214
+
215
+ ## 7. Files in this directory
216
+
217
+ | File | Purpose |
218
+ |-----------------------------------------------------------------------------------------------------|------------------------------------------------------------------|
219
+ | [compare_base_vs_sft.ipynb](compare_base_vs_sft.ipynb) | Side-by-side dataset + RL env benchmark — clean version |
220
+ | [compare_base_vs_sft_with_outputs.ipynb](compare_base_vs_sft_with_outputs.ipynb) | Same notebook with cell outputs preserved (18 display widgets) |
221
+
222
+ ---
223
+
224
+ ## See also
225
+
226
+ - [Main README](../README.md) — top-level overview, results section
227
+ - [data/README.md](../data/README.md) — dataset that drives this comparison
228
+ - [data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md) — base-model selection benchmark (same scoring functions reused here)
229
+ - [train/README.md](../train/README.md) — how the SFT adapter being benchmarked here was produced
230
+ - [inference.py](../inference.py) — single-model agent loop (the prototype the RL eval mode is modeled after)
compare/compare_base_vs_sft.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
compare/compare_base_vs_sft_with_outputs.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
data/README.md ADDED
@@ -0,0 +1,238 @@
1
+ # `data/` — SFT Dataset Generation & Base-Model Selection
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory holds the SFT training corpus, the dataset generator that produced it, and the rigorous benchmark we used to pick the base model. Together they answer two questions a hackathon judge should be able to verify in under five minutes:
6
+
7
+ 1. **What did we train on?** A 1,500-row synthetic SFT corpus with five trajectory types covering success, continuation, failure recovery, verification, and hint usage. ([§1](#1-sft-dataset-generation))
8
+ 2. **Why this base model?** A reproducible 11-model benchmark across 27 held-out prompts. **Qwen2.5-Coder-3B-Instruct** wins on every metric that matters. ([§5](#5-base-model-selection-overview))
9
+
10
+ > ![Top 4 candidate models on the held-out benchmark](../docs/figures/model_eval_chart.png)
11
+
12
+ ---
13
+
14
+ ## Table of contents
15
+
16
+ 1. [SFT dataset generation](#1-sft-dataset-generation)
17
+ 2. [Five trajectory types](#2-five-trajectory-types)
18
+ 3. [Tier weighting](#3-tier-weighting)
19
+ 4. [Dataset format & artifacts](#4-dataset-format--artifacts)
20
+ 5. [Base-model selection — overview](#5-base-model-selection-overview)
21
+ 6. [Eval harness](#6-eval-harness)
22
+ 7. [HuggingFace publishing](#7-huggingface-publishing)
23
+ 8. [Files in this directory](#8-files-in-this-directory)
24
+
25
+ ---
26
+
27
+ ## 1. SFT dataset generation
28
+
29
+ [data/build_sft_dataset.py](build_sft_dataset.py) — 27 KB, single-script generator.
30
+
31
+ ### Approach
32
+
33
+ The dataset is **synthetically generated** but grounded in canonical solutions extracted from our integration test suite. Two design decisions worth flagging to judges:
34
+
35
+ #### AST-based extraction, not pytest execution
36
+
37
+ Each `tests_tasks/test_<tier>_tasks.py` file has a top-level constant (`WARMUP_COMMANDS`, `BEGINNER_COMMANDS`, …) mapping `task_id → canonical AWS CLI command`. We extract these via Python's `ast` module — we do **not** execute the test file. Reasons (a sketch of the extraction follows the list):
38
+
39
+ 1. `pytest` fixtures would spin up a MiniStack, hit AWS APIs, and add 30+ seconds of overhead per generation run.
40
+ 2. Static extraction is deterministic — no flake risk. The dataset is reproducible bit-for-bit given a seed.
41
+ 3. The canonical solutions are intentionally simple constant declarations that AST can parse without import side effects.
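+
+ A minimal sketch of that extraction (the real generator in [build_sft_dataset.py](build_sft_dataset.py) may differ in detail):
+
+ ```python
+ import ast
+ from pathlib import Path
+
+ def extract_commands(test_file: str, const_name: str) -> dict[str, str]:
+     """Pull a task_id -> command dict out of a test file without importing it."""
+     tree = ast.parse(Path(test_file).read_text())
+     for node in tree.body:                       # top-level statements only
+         if isinstance(node, ast.Assign) and any(
+             isinstance(t, ast.Name) and t.id == const_name for t in node.targets
+         ):
+             return ast.literal_eval(node.value)  # safe: literal constants only
+     raise KeyError(f"{const_name} not found in {test_file}")
+
+ # e.g. extract_commands("tests_tasks/test_warmup_tasks.py", "WARMUP_COMMANDS")
+ ```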
42
+
43
+ #### Plausible-output simulation
44
+
45
+ When generating multi-step continuations, we don't have a real MiniStack response to feed back into the user message — we have to fabricate one. The generator maps each AWS operation (`list-buckets`, `create-table`, `describe-instances`, …) to a JSON template, then interpolates the right resource names from the task. So an `aws s3api list-buckets` step in the user prompt history has output like:
46
+
47
+ ```json
48
+ {"Buckets":[{"Name":"my-app-data","CreationDate":"2026-04-15T..."}]}
49
+ ```
50
+
51
+ …instead of the empty `{"Buckets":[]}` you'd get from a fresh MiniStack. This is the difference between the SFT model learning "first step, always answer with the canonical command" (degenerate) and "first step depends on what's already been done" (correct).
52
+
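+ A sketch of the templating idea; the operation names and template shapes here are assumptions, not the generator's actual tables:
+
+ ```python
+ import json
+
+ # Hypothetical operation → output-template map (the real generator covers
+ # list-buckets, create-table, describe-instances, and more).
+ OUTPUT_TEMPLATES = {
+     "list-buckets": {"Buckets": [{"Name": "{name}", "CreationDate": "2026-04-15T00:00:00Z"}]},
+     "describe-table": {"Table": {"TableName": "{name}", "TableStatus": "ACTIVE"}},
+ }
+
+ def simulate_output(operation: str, name: str) -> str:
+     """Render a plausible, task-consistent output for a prior step in the prompt history."""
+     return json.dumps(OUTPUT_TEMPLATES[operation]).replace("{name}", name)
+
+ # simulate_output("list-buckets", "my-app-data")
+ # -> '{"Buckets": [{"Name": "my-app-data", "CreationDate": "2026-04-15T00:00:00Z"}]}'
+ ```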
53
+ ### Dynamic-ID filtering
54
+
55
+ Some tests reference resources whose IDs only exist at runtime — security groups (`sg-…`), subnets (`subnet-…`), VPCs (`vpc-…`), instance IDs (`i-…`). These commands cannot be deterministically captured by static extraction. The generator skips any task whose canonical command contains those patterns. The result: 72 unique tasks make it into the train split (out of 134 total tasks), all of which are deterministically reproducible.
56
+
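+ In sketch form, assuming only the ID prefixes listed above:
+
+ ```python
+ import re
+
+ # Resource-ID prefixes that only exist at runtime and therefore cannot be
+ # captured by static extraction.
+ DYNAMIC_ID = re.compile(r"\b(?:sg|subnet|vpc|i)-[0-9a-f]+")
+
+ def is_statically_reproducible(command: str) -> bool:
+     return DYNAMIC_ID.search(command) is None
+ ```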
57
+ ---
58
+
59
+ ## 2. Five trajectory types
60
+
61
+ The SFT corpus mixes five distinct trajectory shapes so the model learns to handle real multi-turn agent behavior, not just one-shot question answering. Actual proportions (from [data/sft/dataset_stats.json](sft/dataset_stats.json)):
62
+
63
+ | Source | Train pct (target) | Train rows | What the model sees |
64
+ |----------------------------|:------------------:|:----------:|-------------------------------------------------------------------------------------------|
65
+ | `success_first_step` | 55.1% (55%) | 826 | User → Task description → assistant emits the canonical command |
66
+ | `multi_step_continuation` | 20.1% (20%) | 301 | User → Task description + a baked-in history of N-1 prior commands and their outputs → assistant emits step N |
67
+ | `failure_recovery` | 15.5% (15%) | 232 | User → Task description + step 1 of a wrong command and its simulated error → assistant emits the recovery command |
68
+ | `verification` | 4.5% (5%) | 67 | User → Task already complete → assistant emits a read-only verification command |
69
+ | `hint_usage` | 4.9% (5%) | 74 | User → Task description → assistant emits `aws help --task-hint` (the agent action that requests a hint) |
70
+
71
+ Why include the last four sources at all?
72
+
73
+ - **`multi_step_continuation`** trains continuation behavior. Without it, the model overfits to step 1 and degrades on later turns.
74
+ - **`failure_recovery`** teaches the model that a typo / wrong command is recoverable. The reward signal during GRPO is dense — the model needs to know what "try again" looks like.
75
+ - **`verification`** trains the model to recognize when a task is done and respond appropriately. Production agents must distinguish "do something" from "confirm it's done".
76
+ - **`hint_usage`** lets the model learn that `aws help --task-hint` is the in-environment way to request help, not just a literal CLI command.
77
+
78
+ ---
79
+
80
+ ## 3. Tier weighting
81
+
82
+ [data/build_sft_dataset.py:54-60](build_sft_dataset.py) — sampling weights:
83
+
84
+ | Tier | Weight | Train rows | Why |
85
+ |--------------|:------:|:----------:|------------------------------------------------------------------------------------|
86
+ | warmup | 0.50 | 456 | Most rows. Format-locks the model on the simplest possible "aws X list" pattern. |
87
+ | beginner | 0.30 | 378 | Single-resource creation — bread and butter. |
88
+ | intermediate | 0.15 | 666 * | Multi-step workflows. Note actual count > target because each task contributes more rows via multi_step_continuation. |
89
+ | advanced | 0.05 | 0 | Cross-service architectures. Filtered out post-extraction (most have dynamic IDs). |
90
+ | expert | 0.00 | 0 | SRE / drift / security-posture. **Intentionally excluded from SFT.** |
91
+
92
+ > **Why expert tier is excluded from SFT.** The expert tasks (drift detection, security audits) have *randomized* state checks — there is no canonical command sequence. Trying to SFT on them would teach the model a particular fix script that is *wrong* on most episodes. These tasks are reserved for GRPO, where the env's `state_checks` reward signal handles the randomization correctly.
93
+
94
+ `*` Intermediate row count exceeds the simple weight because the multi-step trajectory generator naturally produces multiple rows per task (one for step 1, step 2, etc.).
95
+
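+ How the weights translate into sampling, as a minimal sketch (the actual generator also applies the dynamic-ID filter and the per-source trajectory mix):
+
+ ```python
+ import random
+
+ TIER_WEIGHTS = {"warmup": 0.50, "beginner": 0.30, "intermediate": 0.15, "advanced": 0.05}
+
+ def sample_tier(rng: random.Random) -> str:
+     tiers, weights = zip(*TIER_WEIGHTS.items())
+     return rng.choices(tiers, weights=weights, k=1)[0]
+
+ # Seeded, so the draw sequence is reproducible, mirroring the generator's
+ # bit-for-bit reproducibility guarantee.
+ rng = random.Random(42)
+ ```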
96
+ ---
97
+
98
+ ## 4. Dataset format & artifacts
99
+
100
+ ### JSONL chat-message schema
101
+
102
+ ```json
103
+ {
104
+ "messages": [
105
+ {"role": "system", "content": "You are an AWS cloud engineer interacting with a real AWS environment via CLI..."},
106
+ {"role": "user", "content": "TASK: Create an S3 bucket named my-app-data and enable versioning on it.\n\nPREVIOUS COMMANDS:\n[1] $ aws s3 mb s3://my-app-data\n output: make_bucket: my-app-data\n reward: 0.50\n\n---\n\nCURRENT OBSERVATION:\nProgress: 0.50 Achieved: False Step: 2"},
107
+ {"role": "assistant", "content": "aws s3api put-bucket-versioning --bucket my-app-data --versioning-configuration Status=Enabled"}
108
+ ],
109
+ "difficulty": "intermediate",
110
+ "source": "multi_step_continuation",
111
+ "task_id": 42
112
+ }
113
+ ```
114
+
115
+ Every row carries the `difficulty`, `source`, and `task_id` metadata — useful for filtering, ablations, and debugging.
116
+
117
+ ### Artifacts
118
+
119
+ [data/sft/](sft/):
120
+
121
+ | File | Size | Rows | Unique tasks | Use |
122
+ |--------------------------------------------------------------|------:|------:|:------------:|------------------------------------------------|
123
+ | [aws_rl_sft.train.jsonl](sft/aws_rl_sft.train.jsonl) | 2.2 MB | 1,500 | 72 | SFT training |
124
+ | [aws_rl_sft.val.jsonl](sft/aws_rl_sft.val.jsonl) | 218 KB | 150 | 63 | SFT validation; basis for [MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md) |
125
+ | [aws_rl_sft.reserve.jsonl](sft/aws_rl_sft.reserve.jsonl) | 294 KB | 200 | 66 | Held-out reserve for post-SFT regression checks |
126
+ | [dataset_stats.json](sft/dataset_stats.json) | 3.4 KB | — | — | Per-split source/tier/task breakdowns |
127
+ | [MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md) | 15 KB | — | — | Full model-selection writeup ([§5](#5-base-model-selection-overview)) |
128
+ | [model_eval_full.json](sft/model_eval_full.json) | 209 KB | 297 | — | Per-call eval data (11 models × 27 prompts) |
129
+ | [deepseek_r1_rerun.json](sft/deepseek_r1_rerun.json) | 5.3 KB | 27 | — | DeepSeek R1 re-run with `max_tokens=2048` |
130
+
131
+ ---
132
+
133
+ ## 5. Base-model selection — overview
134
+
135
+ This is the most rigorous decision in the whole project. Full reasoning, per-model verdicts, and methodology lives in **[data/sft/MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md)** — a 270-line standalone report. Read it before judging the project's technical depth; it's what convinces us we're training the right thing.
136
+
137
+ The 30-second summary:
138
+
139
+ | Model | exact% | op% | fmt% | Latency | Verdict |
140
+ |--------------------------------|:-----:|:----:|:------:|:-------:|--------------------------------------|
141
+ | **qwen2.5-coder-3b-instruct** | **41%** | **63%** | 85% | **3.1s** | ✅ Train this. Highest exact, fastest viable. |
142
+ | qwen/qwen3-4b-2507 | 33% | 59% | 100% | 10.4s | Fallback. Perfect format, 3× slower. |
143
+ | qwen2.5-coder-1.5b-instruct | 22% | 44% | 81% | 2.5s | Speed play if GRPO budget tight. |
144
+ | smollm2-1.7b-instruct | 7% | 37% | 63% | 2.1s | ❌ Ceiling too low. |
145
+ | (7 more) | 0% | … | … | … | ❌ Format-broken or wrong domain. |
146
+
147
+ > ![Per-model comparison: 5 quality metrics + latency](../docs/figures/model_eval_chart.png)
148
+
149
+ What the metrics mean:
150
+
151
+ - **`fmt%`**: raw output starts with `aws ` (no preamble, fences, or quotes). The agent's [inference.py:93](../inference.py) gate rejects everything else.
152
+ - **`+xtr%`**: `fmt%` after stripping markdown fences. The gap between the two means the model knows the answer but wraps it in junk.
153
+ - **`exact%`**: extracted command matches canonical token-for-token. The hardest metric.
154
+ - **`svc%`**: same AWS service as canonical. Domain orientation.
155
+ - **`op%`**: same service AND operation. The gap SFT closes most reliably.
156
+
157
+ The full table (11 models, 9 metrics, per-call logs) is in [data/sft/model_eval_full.json](sft/model_eval_full.json) — 297 records.
158
+
159
+ ---
160
+
161
+ ## 6. Eval harness
162
+
163
+ [data/eval_lm_studio_models.py](eval_lm_studio_models.py) — 9.9 KB, reusable.
164
+
165
+ - Calls each chat model loaded in LM Studio at `http://localhost:1234/v1/chat/completions` (OpenAI-compatible API)
166
+ - Sends the same 27 held-out prompts to each model
167
+ - Extracts `aws ...` from the response (stripping fences / preamble)
168
+ - Compares against the canonical command from the val split
169
+ - Writes per-call detail + aggregate metrics to JSON
170
+
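+ A minimal sketch of one eval call plus the extraction step; only the OpenAI-compatible endpoint above is assumed, and the real harness adds timing and aggregate metrics:
+
+ ```python
+ import requests
+
+ def ask(model: str, prompt: str) -> str:
+     r = requests.post(
+         "http://localhost:1234/v1/chat/completions",
+         json={"model": model,
+               "messages": [{"role": "user", "content": prompt}],
+               "temperature": 0},
+         timeout=120,
+     )
+     r.raise_for_status()
+     return r.json()["choices"][0]["message"]["content"]
+
+ def extract_aws_command(text: str) -> str | None:
+     # Strip fences and preamble; keep the first line that starts with `aws `.
+     for line in text.splitlines():
+         line = line.strip().strip("`\"'")
+         if line.startswith("aws "):
+             return line
+     return None
+ ```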
171
+ To re-run post-SFT:
172
+
173
+ ```bash
174
+ .venv/bin/python data/eval_lm_studio_models.py \
175
+ --max-per-combo 5 \
176
+ --out data/sft/model_eval_postsft.json
177
+ ```
178
+
179
+ A successful SFT run should see (predictions from [MODEL_EVALUATION.md §11](sft/MODEL_EVALUATION.md), and **actuals from our SFT run committed at [out/delta_summary.json](../out/delta_summary.json)**):
180
+
181
+ | Metric | Base | Target | **Actual (post-SFT)** |
182
+ |-----------|:-----:|:-------:|:---------------------:|
183
+ | `exact%` | 39% | 75%+ | **88.9%** ✅ |
184
+ | `op%` | 61% | 90%+ | **88.9%** ≈ |
185
+ | `svc%` | 78% | — | **88.9%** |
186
+ | `fmt%` | 33% | 100% | **100.0%** ✅ |
187
+ | latency | 2.03s | — | **1.40s** (faster) |
188
+
189
+ Every target from MODEL_EVALUATION.md is met outright or within a hair (op% lands at 88.9% against its 90% target). Format compliance is now perfect; exact-match jumped ~50 pp; the model is faster *and* tighter.
190
+
191
+ > ![Base vs SFT comparison (eval metrics)](../docs/figures/base_vs_sft_success.png)
192
+ > ![Single-step eval base vs SFT](../docs/figures/single_step_eval.png)
193
+
194
+ ---
195
+
196
+ ## 7. HuggingFace publishing
197
+
198
+ [data/upload_sft_to_hf.py](upload_sft_to_hf.py) — pushes the JSONL splits to HuggingFace Hub:
199
+
200
+ | Split | Hub repo |
201
+ |----------|-----------------------------------------------------|
202
+ | train | `Sizzing/aws-rl-sft-qwen25coder3b-train` |
203
+ | val | `Sizzing/aws-rl-sft-qwen25coder3b-val` |
204
+ | reserve | `Sizzing/aws-rl-sft-qwen25coder3b-reserve` |
205
+
206
+ The trained SFT adapter (output of [train/train_sft_lora.ipynb](../train/train_sft_lora.ipynb)) is published separately at:
207
+
208
+ - `Sizzing/aws-rl-sft-qwen25coder3b-adapter`
209
+
210
+ GRPO training picks it up by setting `SFT_ADAPTER = "Sizzing/aws-rl-sft-qwen25coder3b-adapter"` in [aws_rl_env_colab.ipynb](../aws_rl_env_colab.ipynb).
211
+
212
+ ---
213
+
214
+ ## 8. Files in this directory
215
+
216
+ | File | Purpose |
217
+ |--------------------------------------------------------------------|--------------------------------------------------------------------|
218
+ | [build_sft_dataset.py](build_sft_dataset.py) | Generator — AST extraction + 5 trajectory types + plausible outputs |
219
+ | [eval_lm_studio_models.py](eval_lm_studio_models.py) | Base-model benchmark harness (LM Studio API) |
220
+ | [upload_sft_to_hf.py](upload_sft_to_hf.py) | Push the SFT splits to HuggingFace |
221
+ | [sft/aws_rl_sft.train.jsonl](sft/aws_rl_sft.train.jsonl) | 1,500 SFT training rows |
222
+ | [sft/aws_rl_sft.val.jsonl](sft/aws_rl_sft.val.jsonl) | 150 validation rows |
223
+ | [sft/aws_rl_sft.reserve.jsonl](sft/aws_rl_sft.reserve.jsonl) | 200 reserve rows |
224
+ | [sft/dataset_stats.json](sft/dataset_stats.json) | Per-split source / tier / task counts |
225
+ | [sft/MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md) | **The base-model selection report (read this)** |
226
+ | [sft/model_eval_full.json](sft/model_eval_full.json) | Per-call eval data (11 models × 27 prompts) |
227
+ | [sft/deepseek_r1_rerun.json](sft/deepseek_r1_rerun.json) | R1 re-run with extended `max_tokens` |
228
+
229
+ ---
230
+
231
+ ## See also
232
+
233
+ - [Main README](../README.md)
234
+ - [data/sft/MODEL_EVALUATION.md](sft/MODEL_EVALUATION.md) — full base-model selection writeup
235
+ - [train/README.md](../train/README.md) — how this dataset is consumed by SFT training
236
+ - [compare/README.md](../compare/README.md) — how the trained model is benchmarked vs the base
237
+ - [server/services/tasks/](../server/services/tasks/) — source of truth for task definitions (the YAML the generator reads)
238
+ - [tests_tasks/](../tests_tasks/) — canonical solutions the generator extracts via AST
docs/figures/base_vs_sft_success.png ADDED
docs/figures/compare_dataset.png ADDED

Git LFS Details

  • SHA256: 0192c7b5d9d57f278aac1a09d776329757ebaff2d3a29d791c3f5cda7258e724
  • Pointer size: 131 Bytes
  • Size of remote file: 280 kB
docs/figures/compare_rl_env.png ADDED

Git LFS Details

  • SHA256: eda0c69c8c28515195d005f0a4431b7c6e7959d1f99f5b7c44ed448ede523374
  • Pointer size: 131 Bytes
  • Size of remote file: 201 kB
docs/figures/env_init_screenshot.png ADDED

Git LFS Details

  • SHA256: 51a633c9058297eae3575abd5a4cb093d9204337bca4b69fd141f471d38ad5c8
  • Pointer size: 131 Bytes
  • Size of remote file: 372 kB
docs/figures/grpo_final_per_step.png ADDED

Git LFS Details

  • SHA256: f6d5d210de9f473d638cb75cf221e3e703eae9a3d00faa8fbcd122c17919e6ce
  • Pointer size: 131 Bytes
  • Size of remote file: 243 kB
docs/figures/grpo_optuna_history.png ADDED
docs/figures/grpo_optuna_history_v0.png ADDED
docs/figures/grpo_optuna_hparams.png ADDED
docs/figures/grpo_optuna_importances.png ADDED
docs/figures/grpo_optuna_parallel.png ADDED
docs/figures/grpo_optuna_trial_curves.png ADDED

Git LFS Details

  • SHA256: 8254a87ffe69f2c818b5b403dae41f32dc36c301ca491d8618c41164333f43c6
  • Pointer size: 131 Bytes
  • Size of remote file: 277 kB
docs/figures/grpo_optuna_trials_comparison.png ADDED

Git LFS Details

  • SHA256: 231ca2e7ecae1114a7e61d808f0b3736a22f4ddec7b90d7626cb0fb4d608c4c5
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
docs/figures/grpo_per_tier_curve.png ADDED
docs/figures/grpo_reward_by_tier.png ADDED
docs/figures/grpo_reward_curve.png ADDED

Git LFS Details

  • SHA256: 1d1222b3510873dadb8da9be7066e17220c5dab5c6456d11385f4e9f5c99b885
  • Pointer size: 131 Bytes
  • Size of remote file: 260 kB
docs/figures/ministack_logo.png ADDED

Git LFS Details

  • SHA256: d6ee9620212659d7f7e2da8dcc9ff39cf522d3f34ea07728d6e6ab00df876de5
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
docs/figures/model_eval_chart.png ADDED
docs/figures/optuna_history.png ADDED
docs/figures/optuna_parallel.png ADDED

Git LFS Details

  • SHA256: a235e7fc7050edfdf8f547a31d5630d737c5b85fd5e4f2bcdd0abf1677058926
  • Pointer size: 131 Bytes
  • Size of remote file: 218 kB
docs/figures/optuna_param_importance.png ADDED
docs/figures/optuna_slice.png ADDED

Git LFS Details

  • SHA256: b743ec4e945f9ee5239694224d587ee1c912a8d415910e924218c9b5074003fc
  • Pointer size: 131 Bytes
  • Size of remote file: 107 kB
docs/figures/optuna_trial_curves.png ADDED
docs/figures/qualitative_rollouts.png ADDED
docs/figures/rl_env_eval_base_vs_sft.png ADDED
docs/figures/sft_loss_curve.png ADDED

Git LFS Details

  • SHA256: e0c0d8d74358a2f95feee6e685e2d512f5ee5bda8ce869686c951114278c9a1a
  • Pointer size: 131 Bytes
  • Size of remote file: 178 kB
docs/figures/sft_optuna_trials_table.png ADDED
docs/figures/sft_vs_grpo_by_tier.png ADDED
docs/figures/sft_vs_grpo_metrics_grid.png ADDED
docs/figures/sft_vs_grpo_scalar.png ADDED
docs/figures/single_step_eval.png ADDED
images/compare_dataset.png ADDED

Git LFS Details

  • SHA256: 0192c7b5d9d57f278aac1a09d776329757ebaff2d3a29d791c3f5cda7258e724
  • Pointer size: 131 Bytes
  • Size of remote file: 280 kB
images/compare_rl_env.png ADDED

Git LFS Details

  • SHA256: eda0c69c8c28515195d005f0a4431b7c6e7959d1f99f5b7c44ed448ede523374
  • Pointer size: 131 Bytes
  • Size of remote file: 201 kB
pyproject.toml CHANGED
@@ -34,7 +34,16 @@ train = [
34
  "ipykernel",
35
  "ipywidgets>=8.1.0",
36
  "datasets>=4.8.4",
37
- "huggingface-hub>=1.9.0",
38
  ]
39
 
40
 
 
34
  "ipykernel",
35
  "ipywidgets>=8.1.0",
36
  "datasets>=4.8.4",
37
+ "huggingface-hub>=0.34,<1.0",
38
+ # GRPO training stack (versions mirror train/train_grpo_lora.ipynb)
39
+ "unsloth",
40
+ "trl>=0.18.2,<=0.24.0,!=0.19.0",
41
+ "peft",
42
+ "accelerate",
43
+ "bitsandbytes",
44
+ "transformers>=4.50,<5.0",
45
+ "optuna",
46
+ "matplotlib",
47
  ]
48
 
49
 
scripts/README.md ADDED
@@ -0,0 +1,260 @@
1
+ # `scripts/` — Parallel Rollout Architecture
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory holds the helper modules that make **8 concurrent multi-turn rollouts** against the AWS RL environment possible — the scaling trick that turns GRPO from a thought experiment into something you can actually train on a single GPU.
6
+
7
+ If you only read one section, read [§2 — Three coordinated pool layers](#2-three-coordinated-pool-layers). It explains the architecture in one page.
8
+
9
+ ---
10
+
11
+ ## Table of contents
12
+
13
+ 1. [Why parallel rollouts matter](#1-why-parallel-rollouts-matter)
14
+ 2. [Three coordinated pool layers](#2-three-coordinated-pool-layers)
15
+ 3. [Walking through one GRPO step](#3-walking-through-one-grpo-step)
16
+ 4. [The all-or-nothing connect protocol](#4-the-all-or-nothing-connect-protocol)
17
+ 5. [Concurrency-safety guarantees](#5-concurrency-safety-guarantees)
18
+ 6. [Configuration](#6-configuration)
19
+ 7. [Running the multi-connection demo](#7-running-the-multi-connection-demo)
20
+ 8. [Files in this directory](#8-files-in-this-directory)
21
+
22
+ ---
23
+
24
+ ## 1. Why parallel rollouts matter
25
+
26
+ GRPO computes **group-relative advantages**: every gradient step needs `G` rollouts on the *same* prompt so the algorithm can normalize rewards within the group. With `G = 8`, multi-turn episodes (≤ 6 turns), and an env step that round-trips an AWS CLI invocation through MiniStack (~50 ms), the math is:
27
+
28
+ ```
29
+ Serial: 8 rollouts × 6 turns × 50 ms = 2,400 ms env-time per GRPO step
30
+ Parallel: 6 turns × 50 ms (all 8 envs in flight) = 300 ms env-time per GRPO step
31
+ ```
32
+
33
+ That's an 8× speedup on the env side. The model forward pass still serialises (single GPU), so the practical end-to-end gain depends on the env/compute ratio — but for an env that takes ~50 ms per step, parallelism is the difference between a tractable training run and a 24-hour one.
34
+
35
+ The parallelism isn't free: each rollout needs **state isolation**. If two rollouts share an AWS world, rollout 1's S3 buckets bleed into rollout 2's view, the curriculum mastery numbers go to garbage, and the agent can hack the reward by piggy-backing off siblings. The three coordinated pools below exist to make state isolation cheap and automatic.
36
+
37
+ > ![8 simultaneous WebSocket sessions established to the env server](../docs/figures/env_init_screenshot.png)
38
+
39
+ ---
40
+
41
+ ## 2. Three coordinated pool layers
42
+
43
+ The system has **three pools** that work together. They look similar at first glance — all of them deal with N concurrent envs — but each operates at a different layer of the stack:
44
+
45
+ ```
46
+ ┌─────────────────────────────────────────────────────────────────────────────┐
47
+ │ Layer 3 — Trainer-process pool │
48
+ │ MultiTurnEnvPool (train_grpo.py) │
49
+ │ • owns a background asyncio loop │
50
+ │ • exposes a sync run_group() that the GRPO trainer can call │
51
+ │ • used by the in-process trainer (CLI: python train_grpo.py) │
52
+ └────────────────────────────────────┬────────────────────────────────────────┘
53
+ │ N WebSocket clients
54
+ ┌────────────────────────────────────▼────────────────────────────────────────┐
55
+ │ Layer 3 alt — Notebook-friendly pool │
56
+ │ GrpoPool (scripts/grpo_pool.py) │
57
+ │ • async-native API (async with GrpoPool(...) as pool: ...) │
58
+ │ • used by Colab notebooks where the cell IS the asyncio loop │
59
+ │ • simpler interface (no background thread) │
60
+ └────────────────────────────────────┬────────────────────────────────────────┘
61
+ │ N WebSocket clients
62
+ ┌────────────────────────────────────▼────────────────────────────────────────┐
63
+ │ Layer 2 — OpenEnv max_concurrent_envs │
64
+ │ create_app(env_factory, ..., max_concurrent_envs=POOL_SIZE) │
65
+ │ • OpenEnv reserves up to N env instances at once │
66
+ │ • returns 503 if a 9th client tries to connect when POOL_SIZE=8 │
67
+ └────────────────────────────────────┬────────────────────────────────────────┘
68
+ │ env_factory() invoked per session
69
+ ┌────────────────────────────────────▼────────────────────────────────────────┐
70
+ │ Layer 1 — Server-side MiniStack pool │
71
+ │ MiniStackPool (server/app.py) │
72
+ │ • free-list of MiniStack ports (BASE..BASE+POOL_SIZE-1) │
73
+ │ • acquire()/release() under a threading.Lock │
74
+ │ • each WS session binds to ONE port for its lifetime → state isolation │
75
+ └─────────────────────────────────────────────────────────────────────────────┘
76
+
77
+
78
+ N independent MiniStack processes
79
+ (started by Dockerfile / Makefile)
80
+ ```
81
+
82
+ ### Layer 1 — Server-side `MiniStackPool`
83
+
84
+ Lives in [server/app.py:75–138](../server/app.py). Documented in detail in [server/README.md §6](../server/README.md#6-server-side-ministack-pool-parallel-rollouts).
85
+
86
+ - A `threading.Lock`-guarded free list of port numbers
87
+ - `acquire()` returns a port; `release(port)` puts it back
88
+ - `RuntimeError("MiniStack pool exhausted")` if depleted
89
+ - The Dockerfile launches `POOL_SIZE` MiniStack processes on consecutive ports before the FastAPI server starts accepting connections
90
+
91
+ ### Layer 2 — OpenEnv `max_concurrent_envs`
92
+
93
+ When `create_app()` is called with `max_concurrent_envs=POOL_SIZE`, OpenEnv enforces the cap upstream — clients beyond the cap get a clean 503 instead of `RuntimeError`. Defence in depth.
94
+
95
+ ### Layer 3 — Client pools
96
+
97
+ Two flavours, same parallelism model, different ergonomics:
98
+
99
+ | | `MultiTurnEnvPool` ([train_grpo.py](../train_grpo.py)) | `GrpoPool` ([scripts/grpo_pool.py](grpo_pool.py)) |
100
+ |---|---|---|
101
+ | API | Sync — `pool.run_group(task, ...)` | Async — `await pool.run_group(rollout_fn)` |
102
+ | Loop | Owns a background thread + asyncio loop | Caller is the asyncio loop (Colab cell) |
103
+ | Use case | In-process trainer (`python train_grpo.py`) | Notebooks driving training from Colab |
104
+ | Connection | `await asyncio.gather(*(e.connect() for e in envs))` on background thread | Same, but on the caller's loop |
105
+ | `record_result()` | Trainer calls `Curriculum.record_result()` directly | `pool.record_group_result(task, rewards)` helper baked in |
106
+
107
+ Both share the **all-or-nothing connect protocol** described in §4.
108
+
109
+ ### Why two client pools?
110
+
111
+ Real life: the trainer process (`python train_grpo.py`) runs synchronously — TRL's `GRPOTrainer.train()` blocks. To use `await asyncio.gather` from inside that, we need a background asyncio loop on a separate thread. That's `MultiTurnEnvPool`.
112
+
113
+ Colab cells, on the other hand, *are* the asyncio loop (Jupyter ≥ 7 ships nest_asyncio under the hood). Running a background thread + loop there is overkill and creates ordering bugs. `GrpoPool` is the simpler async-native variant for that case.
114
+
115
+ The two pools share semantic invariants — same N, same all-or-nothing connect, same task scoping — so behaviour is identical regardless of which entry point you use.
116
+
117
+ ---
118
+
119
+ ## 3. Walking through one GRPO step
120
+
121
+ ```
122
+ 1. trainer picks one task from the Curriculum (1 task)
123
+ 2. pool.run_group(task) (asyncio.gather over N envs)
124
+ 3. for turn in 0..MAX_TURNS:
125
+ prompts = build_prompts(observations) (CPU)
126
+ completions = policy.generate(prompts) (1 batched fwd, GPU)
127
+ actions = parse_completions(completions) (CPU; extract `aws ...` line)
128
+ observations = await pool.run_group_step(actions) (N concurrent env.step)
129
+ 4. rewards = sum_per_episode(rewards_lists) (N floats)
130
+ 5. GRPO computes group-relative advantages, KL, loss (1 backward, GPU)
131
+ 6. Curriculum.record_result(task, mean(rewards)) (1 update)
132
+ ```
133
+
134
+ A couple of subtleties:
135
+
136
+ ### Generation is serialised, env-step is not
137
+
138
+ [train_grpo.py:_GENERATE_LOCK](../train_grpo.py) — a `threading.Lock` around `model.generate()`. The model lives on a single GPU; concurrent `generate()` calls would clobber each other. We let env step calls run concurrently (the slow part — WebSocket round-trip + MiniStack execution); only generation serialises.
139
+
140
+ ### Per-turn token accumulation
141
+
142
+ `rollout_one_episode()` accumulates `prompt_ids`, `completion_ids`, and `logprobs` across turns into a single sequence. GRPO then assigns the episode-level reward to that full sequence. This matches the multi-turn structure of the underlying decision problem.
143
+
144
+ ### Why every rollout in a group runs the same task
145
+
146
+ GRPO's group-relative advantage is `(reward_i − group_mean) / group_std`. If different rollouts ran different tasks, group statistics would mean nothing. The curriculum picks one task per GRPO step; the pool's `reset_group(task)` forces every env to that task; only then can the group statistics be meaningful.
147
+
148
+ ---
149
+
150
+ ## 4. The all-or-nothing connect protocol
151
+
152
+ [scripts/grpo_pool.py:58-82](grpo_pool.py) — the most non-obvious correctness detail in the whole pool stack.
153
+
154
+ ```python
155
+ async def connect(self) -> None:
156
+ if self.envs:
157
+ return
158
+ envs = [AwsRlEnv(base_url=self.base_url) for _ in range(self.size)]
159
+ try:
160
+ await asyncio.gather(*(e.connect() for e in envs))
161
+ except BaseException:
162
+ # Roll back: close every env (successful or not). return_exceptions
163
+ # so a close() failure doesn't mask the original connect error.
164
+ await asyncio.gather(
165
+ *(e.close() for e in envs),
166
+ return_exceptions=True,
167
+ )
168
+ raise
169
+ # Only publish the pool after the entire group connected successfully.
170
+ self.envs = envs
171
+ ```
172
+
173
+ What makes this important:
174
+
175
+ 1. **`asyncio.gather` raises on the first failure**. If 3 of 8 connects succeed and the 4th raises, the other 4 may or may not have connected yet. Their state is undefined.
176
+ 2. **Server-side state matters**. Each successful connect acquired a MiniStack port from the server pool. If we just `raise` without cleanup, those ports stay held until the WebSocket times out — typically minutes. The next training run hits "pool exhausted".
177
+ 3. **`self.envs` is published only after success**. If any partial state were exposed, callers might call `pool.run_group()` on a half-initialised pool and get N/M valid results.
178
+ 4. **`return_exceptions=True` on the rollback**. A close error must not mask the original connect error — the user needs to know the *real* reason connect failed, not a downstream cleanup failure.
179
+
180
+ These four invariants are the difference between "training reliably resumes after a flake" and "every flake leaks 7 ports and you're rebuilding the container at 3 AM".
181
+
182
+ `MultiTurnEnvPool._connect_all()` in [train_grpo.py:473-480](../train_grpo.py) implements the same pattern.
183
+
184
+ ---
185
+
186
+ ## 5. Concurrency-safety guarantees
187
+
188
+ | Concern | Guarantee | Where enforced |
189
+ |------------------------------|---------------------------------------------------------------------------------------------|-----------------------------------------------------------|
190
+ | Cross-rollout state isolation | Each WebSocket session holds its own MiniStack port for its lifetime | `MiniStackPool.acquire/release` ([server/app.py](../server/app.py)) |
191
+ | Curriculum coherence | One curriculum instance per training run; `record_result()` is the only mutation point | `make_rollout_func` in [train_grpo.py](../train_grpo.py) |
192
+ | GPU contention | `model.generate()` calls serialised behind `_GENERATE_LOCK` | [train_grpo.py:_GENERATE_LOCK](../train_grpo.py) |
193
+ | Pool slot leakage on flake | All-or-nothing connect with rollback close | `GrpoPool.connect`, `MultiTurnEnvPool._connect_all` |
194
+ | Hung shutdown | Pool close runs `asyncio.gather(..., return_exceptions=True)` then stops the loop with timeout | `MultiTurnEnvPool.close()` |
195
+ | Web playground vs pool collisions | Web routes refuse to mount when `POOL_SIZE > 1` | [server/app.py:171](../server/app.py) |
196
+
197
+ Tests covering these:
198
+
199
+ - [tests/test_pool.py](../tests/test_pool.py) — server-side `MiniStackPool` acquire/release, exhaustion behaviour
200
+ - [tests/test_grpo_pool.py](../tests/test_grpo_pool.py) — `GrpoPool` connect/close lifecycle, partial-connect rollback, group-result aggregation
201
+
202
+ ---
203
+
204
+ ## 6. Configuration
205
+
206
+ | Variable | Default | Purpose |
207
+ |-------------------------------------|---------|-------------------------------------------------------------------------------------|
208
+ | `AWS_RL_ENV_POOL_SIZE` | `1` | Server-side MiniStack pool size. Set to `8` for GRPO training. Must be ≥ training-time `num_generations`. |
209
+ | `AWS_RL_ENV_MINISTACK_BASE_PORT` | `4566` | First MiniStack port; the pool covers `[BASE, BASE + POOL_SIZE)` |
210
+ | `BACKEND_TYPE` | `simulator` | `simulator` (default; pool is meaningful) or `aws` (real AWS; pool disabled) |
211
+ | `NUM_GENERATIONS` (in trainer cfg) | `8` | Number of WebSocket clients the pool opens. Should equal `AWS_RL_ENV_POOL_SIZE` for full parallelism. |
212
+ | `MAX_TURNS` (in trainer cfg) | `6` | Per-rollout episode length cap |
213
+ | `MAX_TOTAL_TOKENS` (in trainer cfg) | `4096` | Per-episode token budget (anti-OOM) |
214
+
215
+ When deploying to HuggingFace Spaces, pool size is constrained by container memory — each MiniStack process is ~50–100 MB resident.
216
+
217
+ ---
218
+
219
+ ## 7. Running the multi-connection demo
220
+
221
+ [scripts/TestMultipleConnects.ipynb](TestMultipleConnects.ipynb) is a hands-on notebook that proves all 8 sessions stay isolated.
222
+
223
+ ```bash
224
+ # 1. Start the env server with pool size 8
225
+ AWS_RL_ENV_POOL_SIZE=8 make run
226
+
227
+ # 2. Run the notebook
228
+ jupyter notebook scripts/TestMultipleConnects.ipynb
229
+ ```
230
+
231
+ Expected output: 8 simultaneous "connection open" lines, 8 independent reset/step traces, no resource bleed across sessions.
232
+
233
+ The screenshot at [docs/figures/env_init_screenshot.png](../docs/figures/env_init_screenshot.png) captures one such run.
234
+
235
+ ---
236
+
237
+ ## 8. Files in this directory
238
+
239
+ | File | Purpose |
240
+ |-------------------------------------------------------|--------------------------------------------------------------------------|
241
+ | [grpo_pool.py](grpo_pool.py) (139 LOC) | Async-native `GrpoPool` — N persistent WebSockets, `asyncio.gather`, all-or-nothing connect, group-result aggregation |
242
+ | [grpo_train.py](grpo_train.py) (~430 LOC) | Alternative training entry point that uses `GrpoPool` directly (vs `train_grpo.py` which embeds `MultiTurnEnvPool`) |
243
+ | [TestMultipleConnects.ipynb](TestMultipleConnects.ipynb) | Hands-on demo proving 8 concurrent WebSockets stay isolated |
244
+
245
+ Related code outside this directory:
246
+
247
+ - [train_grpo.py](../train_grpo.py) — `MultiTurnEnvPool` class, the canonical in-process pool
248
+ - [server/app.py](../server/app.py) — `MiniStackPool`, `make_env_factory`, the server-side pool layer
249
+ - [client.py](../client.py) — `AwsRlEnv` WebSocket client used by both pools
250
+ - [tests/test_pool.py](../tests/test_pool.py), [tests/test_grpo_pool.py](../tests/test_grpo_pool.py) — concurrency tests
251
+
252
+ ---
253
+
254
+ ## See also
255
+
256
+ - [Main README](../README.md) — project overview
257
+ - [server/README.md](../server/README.md) — environment internals (server-side pool detail in §6)
258
+ - [train/README.md](../train/README.md) — SFT + GRPO training pipeline (this pool plugs into the GRPO loop)
259
+ - [tests/test_pool.py](../tests/test_pool.py) — server-side pool acquire/release tests
260
+ - [tests/test_grpo_pool.py](../tests/test_grpo_pool.py) — client-side pool lifecycle tests
server/README.md ADDED
@@ -0,0 +1,596 @@
1
+ # `server/` — AWS RL Environment Internals
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory implements the **OpenEnv-compatible FastAPI server** that powers the AWS RL Environment. The server exposes HTTP and WebSocket endpoints to a training agent, executes AWS CLI commands against a backing simulator (or real AWS), runs a reward / curriculum stack, and returns shaped observations.
6
+
7
+ If you only have time for the headline numbers, read [the main README](../README.md). This document is the reference for **how** the environment actually works — every defended invariant, every edge case, every config knob.
8
+
9
+ ---
10
+
11
+ ## Table of contents
12
+
13
+ 1. [Architecture overview](#1-architecture-overview)
14
+ 2. [HTTP / WebSocket endpoints](#2-http--websocket-endpoints)
15
+ 3. [Episode lifecycle](#3-episode-lifecycle)
16
+ 4. [Strategy pattern: Simulator vs Real AWS](#4-strategy-pattern-simulator-vs-real-aws)
17
+ 5. [MiniStack: vendored fork & customizations](#5-ministack-vendored-fork--customizations)
18
+ 6. [Server-side MiniStack pool (parallel rollouts)](#6-server-side-ministack-pool-parallel-rollouts)
19
+ 7. [Curriculum manager](#7-curriculum-manager)
20
+ 8. [Reward shaping & TaskGrader](#8-reward-shaping--taskgrader)
21
+ 9. [Anti-reward-hacking — 8 defense layers](#9-anti-reward-hacking--8-defense-layers)
22
+ 10. [Resource verifier](#10-resource-verifier)
23
+ 11. [Chaos engine](#11-chaos-engine)
24
+ 12. [Drift engine](#12-drift-engine)
25
+ 13. [Hint provider](#13-hint-provider)
26
+ 14. [Episode tracker](#14-episode-tracker)
27
+ 15. [Environment designer](#15-environment-designer)
28
+ 16. [Task definitions (YAML schema)](#16-task-definitions-yaml-schema)
29
+ 17. [Security-posture audit examples](#17-security-posture-audit-examples)
30
+ 18. [Curriculum stats API](#18-curriculum-stats-api)
31
+ 19. [Web playground](#19-web-playground)
32
+
33
+ ---
34
+
35
+ ## 1. Architecture overview
36
+
37
+ ```
38
+ ┌──────────────────────────────── server/ process ────────────────────────────────┐
39
+ │ │
40
+ │ FastAPI app (server/app.py) │
41
+ │ ├── OpenEnv router /reset /step /state /schema /ws /health │
42
+ │ ├── Web router /web /web/reset /web/step /web/state /web/solution │
43
+ │ └── env_factory ──► AwsRlEnvironment(strategy=…) │
44
+ │ │ │
45
+ │ ├── EpisodeTracker (per-episode state) │
46
+ │ ├── Curriculum (priority + mastery) │
47
+ │ ├── EnvironmentDesigner (setup commands) │
48
+ │ ├── HintProvider (3-level hints) │
49
+ │ ├── ChaosEngine (mid-episode mutations) │
50
+ │ ├── DriftEngine (drift-task injection) │
51
+ │ ├── TaskGrader (5-strategy dispatcher) │
52
+ │ ├── ResourceVerifier (ground-truth state) │
53
+ │ └── EnvironmentStrategy ──► SimulatorStrategy │
54
+ │ ╲ (talks to MiniStack) │
55
+ │ ╲ AwsStrategy │
56
+ │ (talks to real AWS) │
57
+ └─────────────────────────────────────────────────────────────────────────────────┘
58
+
59
+
60
+ MiniStack process(es) on :4566+
61
+ (own port per pool slot when AWS_RL_ENV_POOL_SIZE > 1)
62
+ ```
63
+
64
+ Files:
65
+
66
+ - [server/app.py](app.py) — FastAPI app, OpenEnv integration, MiniStack pool, web routes
67
+ - [server/aws_rl_env_environment.py](aws_rl_env_environment.py) — main `AwsRlEnvironment` orchestrator
68
+ - [server/services/](services/) — pluggable services (one concern per file, listed in §7–§16)
69
+ - [server/services/tasks/](services/tasks/) — YAML task definitions, one file per tier
70
+ - [server/templates/index.html](templates/index.html) — playground HTML
71
+ - [server/static/](static/) — playground JS/CSS, 40 AWS service icons
72
+
73
+ ---
74
+
75
+ ## 2. HTTP / WebSocket endpoints
76
+
77
+ OpenEnv-compatible (created via `openenv.core.env_server.http_server.create_app`):
78
+
79
+ | Method | Path | Purpose |
80
+ |--------|----------|-----------------------------------------------------------------|
81
+ | POST | `/reset` | Wipe infra, pick next task from curriculum, return observation |
82
+ | POST | `/step` | Execute action, grade, optionally inject chaos, return obs |
83
+ | GET | `/state` | Full `AwsRlState` snapshot (current task, tracker, infra state) |
84
+ | GET | `/schema`| JSON schemas for `AwsRlAction` / `AwsRlObservation` |
85
+ | GET | `/health`| Liveness probe |
86
+ | WS | `/ws` | Persistent session (one MiniStack acquired per connection) |
87
+
88
+ Web playground (always mounted; backed by a dedicated lazy MiniStack — see §6):
89
+
90
+ | Method | Path | Purpose |
91
+ |--------|------------------|-----------------------------------------------------------|
92
+ | GET | `/` | Redirect → `/web` |
93
+ | GET | `/web` | HTML playground (Jinja2 template `index.html`) |
94
+ | POST | `/web/reset` | Stateful reset for the playground's shared env |
95
+ | POST | `/web/step` | Stateful step for the playground's shared env |
96
+ | GET | `/web/state` | Current `AwsRlState` for the shared env |
97
+ | GET | `/web/solution` | Reveal next canonical solution command (debug aid) |
98
+
99
+ Auto-generated docs: `/docs` (Swagger), `/redoc` (ReDoc).
100
+
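+ A quick smoke test against a locally running server; the port and the `/step` payload shape are assumptions, and `/schema` is the authoritative contract:
+
+ ```python
+ import requests
+
+ BASE = "http://localhost:8000"  # assumed local port
+
+ print(requests.post(f"{BASE}/reset").json())    # masked TaskInfo + first observation
+ print(requests.post(f"{BASE}/step",
+                     json={"command": "aws s3 mb s3://my-app-data"}).json())
+ print(requests.get(f"{BASE}/health").json())    # liveness
+ ```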
101
+ ---
102
+
103
+ ## 3. Episode lifecycle
104
+
105
+ 1. **`reset()`**
106
+ 1. `EnvironmentStrategy.reset_environment()` — wipes simulator state (no-op for real AWS)
107
+ 2. `Curriculum.next_task()` — picks the next task (see §7 priority scoring)
108
+ 3. `EnvironmentDesigner.provision(task.setup_commands)` — runs preflight CLI commands to create the broken / insecure infra the agent must fix (used by SRE, drift, security-posture tasks)
109
+ 4. `DriftEngine.inject(task)` — for drift tasks, randomly applies 2–3 mutations from `task.possible_drifts`
110
+ 5. `EpisodeTracker.start(task)` — fresh tracker
111
+ 6. Returns initial `AwsRlObservation` with the masked `TaskInfo` (task description but **not** success criteria)
112
+
113
+ 2. **`step(action)`**
114
+ 1. **Validate** — only commands starting with `aws ` are accepted (see §9 layer 4)
115
+ 2. **Intercept hint requests** — `aws help --task-hint` returns next-level hint, increments `hints_used`, never reaches the simulator
116
+ 3. `EnvironmentStrategy.execute(command)` — runs the AWS CLI invocation, returns stdout / stderr / exit_code
117
+ 4. `EpisodeTracker.record(...)` — parses command, dedup-checks, updates `partial_progress`
118
+ 5. `TaskGrader.grade(...)` — returns shaped reward (see §8)
119
+ 6. `ChaosEngine.maybe_inject(...)` — at tier-scaled probability, executes a destructive mutation on a resource the agent just touched
120
+ 7. `Curriculum.record_step(...)` — accumulates step-level signal
121
+ 8. Returns updated `AwsRlObservation`
122
+
123
+ 3. **Termination**
124
+ - `obs.task_achieved == True`, **or**
125
+ - `step_count >= MAX_STEPS` (default 15, configurable via env var)
126
+ - On terminate: `Curriculum.record_result(task, achieved, reward)` updates per-task mastery and may promote the agent's tier
127
+
128
+ ---
129
+
130
+ ## 4. Strategy pattern: Simulator vs Real AWS
131
+
132
+ The environment supports two backends, swapped via the `BACKEND_TYPE` env var (default `simulator`):
133
+
134
+ ### `SimulatorStrategy` — [services/simulator_strategy.py](services/simulator_strategy.py)
135
+
136
+ - Talks to a MiniStack instance over HTTP (`AWS_INFRA_URL`, default `http://localhost:4566`)
137
+ - AWS CLI invocations are subprocessed with `AWS_ENDPOINT_URL` set so they hit MiniStack
138
+ - `reset_environment()` calls MiniStack's `/_ministack/reset` endpoint to wipe state
139
+ - `get_state()` reads the **custom** `/_ministack/state` endpoint (see §5) — one HTTP call returns the entire infra inventory used by `ResourceVerifier`
140
+
141
+ ### `AwsStrategy` — [services/aws_strategy.py](services/aws_strategy.py)
142
+
143
+ - Uses ambient AWS credentials (whatever the standard AWS CLI credential chain finds)
144
+ - No `AWS_ENDPOINT_URL` override — commands hit real AWS
145
+ - `reset_environment()` is a **no-op** (we cannot wipe a real AWS account; expert-level task scenarios assume a clean / sandboxed sub-account)
146
+ - Useful for end-to-end demonstrations, less so for RL training
147
+
148
+ Switching backends:
149
+
150
+ ```bash
151
+ export BACKEND_TYPE=aws # or "simulator" (default)
152
+ make run
153
+ ```
154
+
155
+ The factory in [server/app.py](app.py) wires the right strategy at startup.
156
+
157
+ ---
158
+
159
+ ## 5. MiniStack: vendored fork & customizations
160
+
161
+ > **Why this matters:** the simulator that the grader queries is not a black-box pip dependency — it's vendored in-tree as a git subtree at [aws_infra/](../aws_infra/) so we can extend it. The custom endpoints we added there are how `ResourceVerifier` and the grader can read full infra state in a single round-trip.
162
+
163
+ ### Vendored as a git subtree
164
+
165
+ `aws_infra/` was imported via `git subtree add` in commit **[`2c38c0b` "Bring mini stack to local"](../aws_infra/)** (PR #5). Upstream is the public MiniStack project. The full upstream README is preserved at [aws_infra/README.md](../aws_infra/README.md) (81 KB).
166
+
167
+ Why we vendored instead of taking a pip dependency:
168
+
169
+ 1. **Custom endpoints**: we needed JSON state-introspection endpoints (`/_ministack/state`, `/_ministack/actions`) that upstream did not ship. These are the integration seams between our env grader and the simulator.
170
+ 2. **Reproducible builds**: the Docker image ships a specific MiniStack revision; no runtime network fetch, identical behavior across environments.
171
+ 3. **Service-coverage extensions**: occasional patches to individual service handlers (e.g. RDS state retrieval used by `ResourceVerifier`).
172
+
173
+ ### Custom modifications on top of upstream
174
+
175
+ Each modification is a separate, cleanly-cherry-pickable commit so future upstream syncs are low-conflict.
176
+
177
+ | Commit | Title | What it adds |
178
+ |-----------|----------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------|
179
+ | `a648c3a` | feat: Add support for service state retrieval and action listing across multiple AWS services | `/_ministack/state` returns the entire infra inventory as JSON in one call (the grader's primary read path). `/_ministack/actions` lists every supported operation per service — used by tooling and tests. |
180
+ | `a00e981` | chor: Small Fixes | Tightening / typo fixes on top of `a648c3a`. |
181
+ | `af2e945` | Sync MiniStack with latest changes | Periodic upstream sync. Replays our custom commits cleanly because they are isolated and well-scoped. |
182
+ | `579597b` | Sync MiniStack with latest changes | Subsequent upstream sync. |
183
+
184
+ To inspect any of these:
185
+
186
+ ```bash
187
+ git show a648c3a # see the full diff for the state endpoint
188
+ git log --oneline -- aws_infra/ # see only the aws_infra/ history
189
+ ```
190
+
191
+ ### Build integration
192
+
193
+ - [aws_infra/pyproject.toml](../aws_infra/pyproject.toml) declares MiniStack as its own package; we install it as an editable dependency via `make install-all`.
194
+ - The [Dockerfile](../Dockerfile) stages MiniStack explicitly so the resulting container has no external network requirement at runtime.
195
+ - The [aws_infra/Makefile](../aws_infra/Makefile) provides `make build` and `make test` targets if you want to work on MiniStack itself.
196
+ - `aws_infra/docker-compose.yml` lets you run MiniStack alone for debugging.
197
+
198
+ ### Upstream sync workflow
199
+
200
+ ```bash
201
+ # From the repo root
202
+ git subtree pull --prefix=aws_infra <upstream-remote> main --squash
203
+ # Resolve any conflicts (rare, because our patches live in identifiable commits)
204
+ # Test:
205
+ pytest tests/ -k "verifier or grader"
206
+ ```
207
+
208
+ ---
209
+
210
+ ## 6. Server-side MiniStack pool (parallel rollouts)
211
+
212
+ > **Why:** GRPO training generates `G=8` rollouts per step on the same task and computes group-relative advantages. To run those 8 rollouts truly in parallel **without state bleed**, every rollout needs its own AWS world. The server-side pool makes that possible.
213
+
214
+ ### Design — [server/app.py:75–138](app.py)
215
+
216
+ When the server boots, `make_env_factory(POOL_SIZE, BASE_PORT, BACKEND_TYPE)` decides which factory to install:
217
+
218
+ | Mode | What gets created |
219
+ |-------------------------------------------------|--------------------------------------------------------------------------------|
220
+ | `BACKEND_TYPE=aws` | No pool. All sessions share `AwsStrategy`. Pool would be meaningless on real AWS. |
221
+ | `AWS_RL_ENV_POOL_SIZE=1` (default) | No pool object; one shared `SimulatorStrategy` on the default port. |
222
+ | `AWS_RL_ENV_POOL_SIZE=N` (`N>1`, simulator) | A `MiniStackPool` (thread-safe free-list of ports `BASE..BASE+N-1`). Each WebSocket session calls `pool.acquire()` to get its own MiniStack port; on disconnect `env.close()` triggers `pool.release(port)`. |
223
+
224
+ The pool's `acquire()` raises `RuntimeError("MiniStack pool exhausted")` if a 9th client tries to connect when `POOL_SIZE=8`. OpenEnv's `create_app(..., max_concurrent_envs=POOL_SIZE)` enforces the same cap upstream so callers see a clean 503 instead.
225
+
226
+ ### The Dockerfile launches N MiniStacks
227
+
228
+ The container's entrypoint starts `POOL_SIZE` MiniStack processes on ports `4566..4566+POOL_SIZE-1` before the FastAPI server is ready to accept connections. Each MiniStack runs the same image but has its own in-memory state — so the 8 rollouts cannot accidentally see each other's S3 buckets, IAM roles, etc.
229
+
230
+ ### Web playground gets its own MiniStack (lazy, on a constant port)
231
+
232
+ The pool owns `[BASE..BASE+N-1]` for WebSocket sessions. The web playground's shared `_env` cannot share those ports — a `/web/step` would clobber whichever rollout currently holds the same MiniStack. Instead, the web UI uses a **dedicated MiniStack on a constant port outside the pool's range** (`AWS_RL_ENV_WEB_MINISTACK_PORT`, default `4565`). The pool is constructed as `range(BASE, BASE+N)`, so `pool.acquire()` can never hand out the web port.
233
+
234
+ That dedicated MiniStack is **spawned lazily** by the FastAPI server on the first `/web/*` request (`subprocess.Popen(["ministack", "-d"], env={"GATEWAY_PORT": "4565", ...})`). Training-only deployments — the common case — pay zero cost: the extra MiniStack only exists if a user actually opens the playground. First request takes ~1–3s for the bind; subsequent requests are fast (cached `_env`). A startup assertion refuses to boot if `AWS_RL_ENV_WEB_MINISTACK_PORT` falls inside the pool's range.
235
+
236
+ `POOL_SIZE=1` keeps the legacy single-MiniStack path: the web env shares `:4566` with the lone pool MiniStack — no extra process, no extra port.
237
+
238
+ ### Configuration
239
+
240
+ | Env var | Default | Purpose |
241
+ |------------------------------------|---------|---------------------------------------------------------------|
242
+ | `AWS_RL_ENV_POOL_SIZE` | `1` | Number of MiniStack instances + WebSocket session capacity |
243
+ | `AWS_RL_ENV_MINISTACK_BASE_PORT` | `4566` | First MiniStack port; pool covers `[BASE, BASE + N)` |
244
+ | `AWS_RL_ENV_WEB_MINISTACK_PORT` | `4565` | Web playground's dedicated MiniStack port (lazy spawn; must lie outside the pool's range when `POOL_SIZE>1`) |
245
+ | `BACKEND_TYPE` | `simulator` | `simulator` (default, MiniStack) or `aws` (real AWS, pool disabled) |
246
+
247
+ ### Cross-link
248
+
249
+ The **client side** of this pool — the `GrpoPool` and `MultiTurnEnvPool` that open N persistent WebSocket connections and run rollouts concurrently — is documented in [scripts/README.md](../scripts/README.md). Read that doc for the full multi-turn + multi-rollout walkthrough.
250
+
251
+ ---
252
+
253
+ ## 7. Curriculum manager
254
+
255
+ [services/curriculum.py](services/curriculum.py) — 536 LOC. Adaptive task selection with mastery tracking, spaced repetition, and tier promotion.
256
+
257
+ ### Per-tier configuration
258
+
259
+ | Tier | min_episodes | advance_rate | mastery_window | mastery_threshold | fast_track_rate | chaos_probability |
260
+ |--------------|:------------:|:------------:|:--------------:|:-----------------:|:---------------:|:-----------------:|
261
+ | warmup | 5 | 0.6 | 10 | 0.7 | 0.9 | 0.0 |
262
+ | beginner | 10 | 0.65 | 10 | 0.7 | 0.9 | 0.0 |
263
+ | intermediate | 15 | 0.65 | 10 | 0.7 | 0.9 | 0.10 |
264
+ | advanced | 15 | 0.7 | 10 | 0.7 | 0.9 | 0.20 |
265
+ | expert | 20 | 0.7 | 10 | 0.7 | 0.9 | 0.30 |
266
+
267
+ ### Priority scoring
268
+
269
+ For each episode the curriculum picks the highest-scored task within the agent's current tier:
270
+
271
+ ```
272
+ score = novelty_bonus # +100 if never attempted
273
+ + weakness_weight # +50 × (1 − task_success_rate)
274
+ + spaced_rep_bonus # +30 if a graduated task is "due" for re-test
275
+ − recency_penalty # −20 if attempted in the last 2 episodes
276
+ ```
277
+
278
+ This single formula simultaneously enforces exploration (novelty), targets weak spots (weakness), prevents forgetting (spaced rep), and avoids rut behavior (recency). No hand-coded scheduling — it falls out of the score.
279
+
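+ The same formula transcribed to Python; field names are assumptions, and the real implementation lives in [services/curriculum.py](services/curriculum.py):
+
+ ```python
+ def priority(task, recent_task_ids) -> float:
+     score = 0.0
+     if task.attempts == 0:
+         score += 100                             # novelty bonus
+     score += 50 * (1.0 - task.success_rate)     # weakness weighting
+     if task.graduated and task.due_for_retest:
+         score += 30                              # spaced-repetition bonus
+     if task.task_id in recent_task_ids[-2:]:
+         score -= 20                              # recency penalty
+     return score
+ ```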
280
+ ### Mastery model
281
+
282
+ - **Window**: the last 10 episodes for each task
283
+ - **Threshold**: a task graduates when its weighted success rate crosses 0.7
284
+ - **Decay**: `0.85` exponential — recent results count for more
285
+ - **Un-graduation**: if a graduated task drops back below threshold, it loses graduation and re-enters the rotation
286
+
287
+ ### Spaced repetition
288
+
289
+ Graduated tasks resurface at intervals `[3, 6, 12, 24, 48]` episodes. Pass on re-test → interval doubles (capped at 48). Fail → interval resets to 3. The `+30` priority bonus in the scoring formula is what surfaces them.
290
+
291
+ ### Tier promotion
292
+
293
+ Two paths:
294
+
295
+ - **Standard**: `tier_episodes >= min_episodes` and `tier_success_rate >= advance_rate`
296
+ - **Fast-track**: 3 consecutive episodes at ≥ `fast_track_rate` (0.9) — bypasses the minimum
297
+
298
+ Demotion is **not** supported — the agent's "ratchet" only goes up. (Mastery on individual tasks does decay; the *tier* does not.)
299
+
300
+ ### Notable APIs
301
+
302
+ - `Curriculum.next_task() -> Task` — selection
303
+ - `Curriculum.record_result(task, achieved, reward)` — episode-level callback
304
+ - `Curriculum.get_task_by_id(task_id) -> Task` — used by the GRPO validation harness for frozen held-out tasks
305
+ - `Curriculum.get_stats() -> dict` — see §18
306
+
307
+ ---
308
+
309
+ ## 8. Reward shaping & TaskGrader
310
+
311
+ [services/task_grader.py](services/task_grader.py) — 264 LOC. The grader is the single source of reward truth.
312
+
313
+ ### Reward formula
314
+
315
+ ```
316
+ if task_achieved:
317
+ reward = 1.0
318
+ if survived_chaos: reward *= 1.05 # ≤ 1.05 cap
319
+ else:
320
+ reward = partial_progress * 0.8 # ≤ 0.8 from steps alone
321
+ if progress_increased: reward += 0.1 # dense progress signal
322
+ if command_failed: reward *= 0.5 # error penalty
323
+ reward -= 0.1 * rollback_count # create→delete pairs
324
+ reward += 0.02 * idempotent_retries # graceful "already exists"
325
+ reward = clamp(reward, 0.0, 0.99) # 1.0 reserved for completion
326
+
327
+ reward *= 0.85 ** hints_used # hint decay applied last
328
+ ```
329
+
330
+ This is **dense by design** — the agent gets meaningful feedback on every step, not just at episode end.
331
+
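+ A worked example of the shaping: a mid-episode step where progress sits at 0.5 and just increased, the command succeeded, there are no rollbacks, and one hint has been used:
+
+ ```python
+ reward = 0.5 * 0.8                      # partial progress   -> 0.40
+ reward += 0.1                           # progress increased -> 0.50
+ reward = min(max(reward, 0.0), 0.99)    # 1.0 reserved for verified completion
+ reward *= 0.85 ** 1                     # one hint used      -> 0.425
+ ```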
332
+ ### Five grading strategies (dispatcher pattern)
333
+
334
+ `TaskGrader.grade()` dispatches on `task.success_criteria.grading_strategy`:
335
+
336
+ | Tier | Strategy | Mechanism | Partial-progress source |
337
+ |--------------|---------------------------|--------------------------------------------------------------------------------------------|--------------------------------------|
338
+ | Warmup | `command_match` | Latest command contains correct service + operation | Binary 0 or 1.0 |
339
+ | Beginner | `resource_creation` | Command match (0.5) + `ResourceVerifier` confirms exact resource exists in state (1.0) | Two-stage (0.5 → 1.0) |
340
+ | Intermediate | `multi_step` | Ordered list of `(operation, resource)` pairs; credit each new step | `completed_steps / total_steps` |
341
+ | Advanced | `multi_step + services` | Same as multi_step **and** all `services_required` must be touched | `completed_steps / total_steps` (capped until services satisfied) |
342
+ | Expert | `state_checks` | `ResourceVerifier` runs arbitrary AWS CLI commands at grading time and asserts on output | `0.7 × steps + 0.3 × state_checks` |
343
+
344
+ State-check assertions support two forms:
345
+ - `output_contains: <substring>` — substring match on stdout
346
+ - `json_path: <jq-style path>` + `expected: <value>` — JSON extraction with expected value
347
+
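+ Two illustrative `state_checks` entries, one per assertion form (the dict shape is an assumption; the YAML schema in §16 is authoritative):
+
+ ```python
+ state_checks = [
+     {   # substring assertion on stdout
+         "command": "aws s3api list-buckets",
+         "output_contains": "app-config-store",
+     },
+     {   # JSON-path assertion with an expected value
+         "command": "aws s3api get-bucket-versioning --bucket app-config-store",
+         "json_path": "Status",
+         "expected": "Enabled",
+     },
+ ]
+ ```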
348
+ This per-tier polymorphism is critical: a single grading rule would be too lax for warmup or too crude for SRE tasks.
349
+
350
+ ### Chaos survival bonus
351
+
352
+ If `ChaosEngine` injected a mutation during the episode and the agent still completed, reward is `1.05` instead of `1.0` (5% bonus) — and that bonus *stacks under* hint decay (so the agent that solves a chaotic task without hints gets the maximum).
353
+
354
+ ### Rollback penalty & idempotency bonus
355
+
356
+ - **Rollback** (`-0.1` per pair): `EpisodeTracker.detect_rollbacks()` scans the command history for `(create-X, … , delete-X)` pairs on the same resource. Production-style waste — heavily penalized.
357
+ - **Idempotency** (`+0.02`): if a command fails with a known "already exists" pattern (`BucketAlreadyExists`, `ResourceInUseException`, etc.) and the next command continues productively, the agent is rewarded for graceful retry behavior.
358
+
359
+ This is the first RL environment we know of that rewards *operational discipline* directly.
360
+
361
+ ---
362
+
363
+ ## 9. Anti-reward-hacking — 8 defense layers
364
+
365
+ The agent's only loss surface is the reward signal. We harden it so that the cheapest path to a high reward is *actually doing the task* — not gaming the grader.
366
+
367
+ ### Layer 1 — Ground-truth verification via MiniStack
368
+
369
+ The grader **never trusts agent command output**. For every resource check it asks `ResourceVerifier` to query MiniStack (or AWS) directly. Even if the agent crafts a perfectly-formed fake JSON response, the grader looks at server-side state.
370
+
371
+ > ResourceVerifier covers 20+ services with bespoke verification methods (S3, DynamoDB, Lambda, SQS, SNS, IAM, Secrets Manager, API Gateway, Cognito, RDS, EFS, ElastiCache, EC2, Step Functions, Glue, Athena, EMR, ECS, EKS, EventBridge, Kinesis, …).
372
+
373
+ ### Layer 2 — Deduplication
374
+
375
+ `EpisodeTracker.has_executed_operation()` records every `(operation, resource)` pair that earned credit. Re-running the same successful command does **not** re-earn `partial_progress`. Each unique operation pays out exactly once.
376
+
377
+ ### Layer 3 — Grader invisibility
378
+
379
+ The CLI commands run by `ResourceVerifier` and `state_checks` happen **server-side** and are not echoed into the agent's observation. The agent never sees which queries the grader is making, so it cannot reverse-engineer "fake outputs" that match the grader's expectations.
380
+
381
+ ### Layer 4 — Command allow-listing
382
+
383
+ `step()` rejects anything that doesn't start with `aws ` (`success=False`, no execution). No shell metacharacters, no piping, no redirection, no escape from the AWS CLI sandbox.
384
+
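+ The gate itself is tiny. A sketch, with the response type assumed:
+
+ ```python
+ # Reject anything that is not an AWS CLI invocation before it executes.
+ if not command.strip().startswith("aws "):
+     return StepResult(success=False, stderr="only `aws ...` commands are accepted")
+ ```
+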
385
+ ### Layer 5 — No verification reward
386
+
387
+ If the agent's command exactly matches one of the task's `state_checks` commands (e.g. `aws s3api get-bucket-versioning --bucket app-config-store`), it gets **zero** progress credit. Only mutating commands (create / put / update / delete) earn credit. Read-only auditing is freely allowed but never rewarded, so replaying the grader's own checks earns the agent nothing.
388
+
389
+ ### Layer 6 — Monotonic progress
390
+
391
+ `partial_progress` only ever increases within an episode. It is clamped at `0.99`; reaching `1.0` requires fully verified completion. The agent cannot lose progress, and a resource it deletes and recreates never pays out twice, so cycling strategies (create → delete → create) yield zero net gain.
392
+
393
+ ### Layer 7 — Resource-name validation
394
+
395
+ `ResourceVerifier` checks the **exact** resource name from the task definition. Creating `my-test-bucket-2` does not satisfy a check for `my-test-bucket`. The agent cannot creatively name its way around the spec.
396
+
397
+ ### Layer 8 — State checks verify the final state
398
+
399
+ For expert SRE tasks, the grader runs the canonical `state_checks` commands at grading time against the live MiniStack. The grade is "what is true now?", not "what did the agent claim?". This is the single hardest layer to circumvent.
400
+
401
+ These layers compose: even if one is bypassed (say, a resource named precisely to fool a single matcher), the others still independently produce the right reward.
402
+
403
+ ---
404
+
405
+ ## 10. Resource verifier
406
+
407
+ [services/resource_verifier.py](services/resource_verifier.py) — 362 LOC.
408
+
409
+ - **Per-service `verify_*` methods** for 20+ AWS services. Each method knows which API calls expose state for that service and how to read the response (e.g. `verify_s3_bucket(name)` calls `s3api list-buckets`, `verify_dynamodb_table(name)` calls `dynamodb describe-table`, etc.).
410
+ - **Single-shot state path**: when called via `SimulatorStrategy.get_state()`, the verifier reads MiniStack's custom `/_ministack/state` endpoint (added in commit `a648c3a`, see §5) which returns the full infra inventory in one HTTP call. This is dramatically faster than iterating 20+ list APIs per grading pass.
411
+ - **State-check evaluator**: handles `output_contains` (substring) and `json_path` + `expected` (JSON extraction with deep-path support) assertion types used by expert-tier tasks; a minimal sketch follows this list.
412
+ - **Live ground-truth source** — the verifier never consumes the agent's stdout. Always fresh state from the simulator.
413
+
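+ A minimal sketch of the two assertion forms (the function shape is an assumption):
+
+ ```python
+ import json
+
+ def check_assertion(stdout: str, assertion: dict) -> bool:
+     # Form 1: substring match on stdout.
+     if "output_contains" in assertion:
+         return assertion["output_contains"] in stdout
+     # Form 2: jq-style path into the JSON output, compared to an expected value.
+     value = json.loads(stdout)
+     for key in assertion["json_path"].lstrip(".").split("."):
+         value = value[int(key)] if isinstance(value, list) else value[key]
+     return value == assertion["expected"]
+ ```
+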
414
+ ---
415
+
416
+ ## 11. Chaos engine
417
+
418
+ [services/chaos_engine.py](services/chaos_engine.py) — 168 LOC.
419
+
420
+ Probabilistically perturbs AWS resource state mid-episode. Tests whether the agent can detect and recover from unexpected drift — a critical SRE skill.
421
+
422
+ - **Tier-scaled probability**: 0% warmup/beginner, 10% intermediate, 20% advanced, 30% expert
423
+ - **Service-scoped templates**: a chaos roll only fires on services the current task is touching. Resource names are extracted from the agent's recent successful commands via service-specific regex (e.g. `aws s3 mb s3://(\S+)` → bucket name).
424
+ - **Five service templates**: S3 policy / versioning changes, DynamoDB throughput modifications, Lambda configuration alterations, IAM detach-role-policy, SNS subscription mutations
425
+ - **Silent**: chaos commands run server-side; the agent observes only the *consequence* (a state inconsistency), never the cause
426
+ - **Reward bonus**: surviving chaos and completing the task pays `1.05` instead of `1.0`
427
+
428
+ The combination of "tier-scaled probability" + "task-scoped resource selection" means chaos is rare for warmup tasks (0%) and frequent for SRE tasks (30%) — exactly where it matters.
429
+
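+ The tier scaling reduces to a lookup plus one random draw, roughly:
+
+ ```python
+ import random
+
+ # Per-tier probabilities from the list above.
+ CHAOS_PROB = {"warmup": 0.0, "beginner": 0.0, "intermediate": 0.1,
+               "advanced": 0.2, "expert": 0.3}
+
+ def chaos_fires(difficulty: str) -> bool:
+     return random.random() < CHAOS_PROB[difficulty]
+ ```
+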
430
+ ---
431
+
432
+ ## 12. Drift engine
433
+
434
+ [services/drift_engine.py](services/drift_engine.py) — 67 LOC.
435
+
436
+ Specialised for the 6 drift-detection expert tasks defined in [services/tasks/drift.yaml](services/tasks/drift.yaml).
437
+
438
+ - Each drift task ships a pool of `possible_drifts` (each a small list of CLI commands that mutates a resource away from the desired spec).
439
+ - On `reset()`, the engine **randomly selects 2–3 drifts** from that pool and applies them after the setup-command phase (see the sketch after this list).
440
+ - The agent sees a `desired_state_spec` (natural language) and must audit the environment, identify which resources drifted, and fix only those.
441
+ - Random selection per episode means **no memorization** — the agent must reason about desired vs actual state, not recall a fix script.
442
+ - Examples: S3 versioning/encryption drift, DynamoDB throughput changes, SNS subscription modifications, Lambda env-var tampering.
443
+
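+ The per-episode selection is a two-line sample, roughly:
+
+ ```python
+ import random
+
+ # Pick 2-3 mutations from the task's pool; applied after setup commands.
+ def pick_drifts(possible_drifts: list) -> list:
+     k = random.randint(2, min(3, len(possible_drifts)))
+     return random.sample(possible_drifts, k)
+ ```
+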
444
+ ---
445
+
446
+ ## 13. Hint provider
447
+
448
+ [services/hint_provider.py](services/hint_provider.py) — 137 LOC.
449
+
450
+ Three-level progressive hints, requested via the special action `aws help --task-hint`:
451
+
452
+ | Level | What it reveals | Example |
453
+ |-------|---------------------------------------|----------------------------------------------------------|
454
+ | 1 | Required AWS services | "You'll need IAM and Lambda" |
455
+ | 2 | Operation sequence | "Start with `create-role`, then `put-role-policy`" |
456
+ | 3 | Near-complete command structure | "Use: `aws iam create-role --role-name …`" |
457
+
458
+ - Hints are **auto-derived** from the `SuccessCriteria` fields (services list, ordered steps, operation names) — no hand-written hint text per task.
459
+ - Reward decay: `final_reward *= 0.85 ** hints_used`. With three hints (max), the agent caps at `0.85³ ≈ 0.614` of normal reward; the full decay table is sketched after this list.
460
+ - The hint command is **intercepted before reaching MiniStack** so it does not consume an episode step nor affect simulator state.
461
+
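+ The decay schedule, tabulated (the values are just `0.85 ** n`):
+
+ ```python
+ for hints_used in range(4):
+     print(hints_used, 0.85 ** hints_used)
+ # 0: 1.0 · 1: 0.85 · 2: 0.7225 · 3: ~0.614
+ ```
+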
462
+ ---
463
+
464
+ ## 14. Episode tracker
465
+
466
+ [services/episode_tracker.py](services/episode_tracker.py) — 241 LOC.
467
+
468
+ Single source of per-episode state. Maintains:
469
+
470
+ - Step count, hint count, command history (raw + parsed)
471
+ - `partial_progress: float ∈ [0, 1]` (monotonic — see anti-hack layer 6)
472
+ - `credited_operations: set[(operation, resource)]` (for dedup — anti-hack layer 2)
473
+ - Rollback detection: scans history for `(create-X, …, delete-X)` pairs on same resource
474
+ - Idempotency detection: looks for known "already exists" error patterns
475
+
476
+ Parses each AWS CLI invocation into a structured tuple `(service, operation, resource_name)` for downstream services to query without re-parsing.
477
+
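+ A hypothetical version of that parse (the real flag list is longer):
+
+ ```python
+ import shlex
+
+ def parse_command(command: str):
+     tokens = shlex.split(command)   # e.g. "aws s3api create-bucket --bucket x"
+     service, operation = tokens[1], tokens[2]
+     resource = None
+     for flag in ("--bucket", "--table-name", "--function-name", "--role-name"):
+         if flag in tokens:
+             resource = tokens[tokens.index(flag) + 1]
+     return service, operation, resource
+ ```
+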
478
+ ---
479
+
480
+ ## 15. Environment designer
481
+
482
+ [services/environment_designer.py](services/environment_designer.py) — 99 LOC.
483
+
484
+ Provisioning helper for SRE / security-posture / drift tasks. A task can declare `setup_commands: list[SetupCommand]` — these are executed (server-side) **before** the agent starts so the world begins in a deliberately broken / insecure / over-provisioned state. Examples:
485
+
486
+ - "Public S3 bucket lockdown" (§17): creates `public-assets` with a wide-open bucket policy
487
+ - "IAM least-privilege": creates `app-role` with `Action: *` / `Resource: *`
488
+ - Drift tasks: provision the *correct* infra so the drift engine can mutate it
489
+
490
+ Setup failures abort the reset — partial setup is never exposed to the agent.
491
+
492
+ ---
493
+
494
+ ## 16. Task definitions (YAML schema)
495
+
496
+ [services/tasks/](services/tasks/) — one YAML file per tier:
497
+
498
+ - [warmup.yaml](services/tasks/warmup.yaml) — 25 listing tasks
499
+ - [beginner.yaml](services/tasks/beginner.yaml) — 25 single-resource creation tasks
500
+ - [intermediate.yaml](services/tasks/intermediate.yaml) — 25 multi-step workflows
501
+ - [advanced.yaml](services/tasks/advanced.yaml) — 25 cross-service architectures
502
+ - [expert.yaml](services/tasks/expert.yaml) — 24 SRE / security tasks
503
+ - [drift.yaml](services/tasks/drift.yaml) — 9 drift detection tasks
504
+
505
+ Sample task:
506
+
507
+ ```yaml
508
+ - task_id: 42
509
+ description: Create an S3 bucket named my-app-data and enable versioning on it.
510
+ difficulty: intermediate
511
+ success_criteria:
512
+ grading_strategy: multi_step
513
+ steps:
514
+ - operation: create-bucket
515
+ resource: my-app-data
516
+ - operation: put-bucket-versioning
517
+ resource: my-app-data
518
+ services: [s3]
519
+ setup_commands: []
520
+ possible_drifts: []
521
+ ```
522
+
523
+ Expert / drift tasks add `state_checks`, `desired_state_spec`, and `setup_commands`.
524
+
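+ Loading a tier file follows directly from the schema above (a sketch, assuming PyYAML):
+
+ ```python
+ import yaml
+
+ with open("services/tasks/intermediate.yaml") as f:
+     tasks = yaml.safe_load(f)      # top level is a list of task dicts
+
+ first = tasks[0]
+ print(first["task_id"], first["success_criteria"]["grading_strategy"])
+ ```
+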
525
+ ---
526
+
527
+ ## 17. Security-posture audit examples
528
+
529
+ These three expert-tier tasks test reasoning about *configuration state* — the infra is functional but insecure. The agent must read existing config and recognize the vulnerability.
530
+
531
+ ### Public S3 bucket lockdown
532
+
533
+ - **Setup**: bucket `public-assets` is provisioned with a bucket policy granting `Principal: *` access
534
+ - **Task**: replace the policy so only IAM role `app-role` can `s3:GetObject`
535
+ - **State checks**: bucket policy denies `Principal: *`, allows only `app-role`
536
+
537
+ ### IAM least privilege
538
+
539
+ - **Setup**: role `app-role` exists with an inline policy `Action: *, Resource: *`
540
+ - **Task**: replace with a least-privilege policy allowing only `dynamodb:GetItem` and `dynamodb:PutItem` on the users table
541
+ - **State checks**: policy document matches the expected ARN-scoped permissions
542
+
543
+ ### Lambda secret rotation
544
+
545
+ - **Setup**: Lambda `data-processor` has env var `DB_PASSWORD=hunter2` (plaintext)
546
+ - **Task**: create a Secrets Manager secret, add `SECRET_ARN` env var, remove `DB_PASSWORD`
547
+ - **State checks**: secret exists, Lambda has `SECRET_ARN`, no `DB_PASSWORD` remains
548
+
549
+ These are not hypothetical scenarios — they're the most common cloud-misconfiguration findings in real audits.
550
+
551
+ ---
552
+
553
+ ## 18. Curriculum stats API
554
+
555
+ `Curriculum.get_stats()` returns:
556
+
557
+ ```python
558
+ {
559
+ "episode_count": 42,
560
+ "tier": "intermediate",
561
+ "tier_episodes": 12,
562
+ "tier_success_rate": 0.75,
563
+ "graduated_tasks": [0, 2, 4],
564
+ "weak_spots": [11, 12],
565
+ "skill_profile": {0: 0.95, 1: 0.8, ...}, # per-task weighted success
566
+ "spaced_rep_due": [0, 2], # graduated tasks due for re-test
567
+ "avg_reward_last_10": 0.65,
568
+ }
569
+ ```
570
+
571
+ Useful for:
572
+ - Dashboarding training progress
573
+ - Logging to the GRPO `EpisodeLogger` CSV (see [train_grpo.py:635](../train_grpo.py))
574
+ - Driving the web playground's progress bar
575
+
576
+ ---
577
+
578
+ ## 19. Web playground
579
+
580
+ Always mounted at [http://localhost:8000/web](http://localhost:8000/web). When `POOL_SIZE>1` the playground is backed by a **dedicated lazy-spawned MiniStack** on `AWS_RL_ENV_WEB_MINISTACK_PORT` (default `4565`) — see §6. First request takes ~1–3s while that MiniStack binds; subsequent requests are fast.
581
+
582
+ - HTML: [server/templates/index.html](templates/index.html)
583
+ - Static assets: [server/static/](static/) — CSS, JS, and **40 AWS service icons** in [server/static/img/aws/](static/img/aws/)
584
+ - The playground talks to `/web/reset`, `/web/step`, `/web/state`, and `/web/solution` (the last one reveals the next canonical solution command — handy for demos and debugging task definitions).
585
+
586
+ The playground runs a **single shared environment instance** on its own MiniStack (or, with `POOL_SIZE=1`, the lone pool MiniStack on `:4566`). It is intentionally separate from the per-WebSocket sessions used during training so a curious user clicking around the web UI cannot interfere with an active GRPO rollout.
587
+
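+ Driving the playground from a script is just two POSTs. A sketch (the `command` field name inside the action is an assumption about `AwsRlAction`):
+
+ ```python
+ import requests
+
+ BASE = "http://localhost:8000"
+
+ obs = requests.post(f"{BASE}/web/reset").json()
+ print(obs["reward"], obs["done"])
+
+ step = requests.post(f"{BASE}/web/step",
+                      json={"action": {"command": "aws s3 ls"}}).json()
+ print(step["reward"], step["done"])
+ ```
+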
588
+ ---
589
+
590
+ ## See also
591
+
592
+ - [Main README](../README.md) — project overview, results, Colab links
593
+ - [scripts/README.md](../scripts/README.md) — client-side parallel rollout pool (`GrpoPool`, `MultiTurnEnvPool`, asyncio orchestration)
594
+ - [train/README.md](../train/README.md) — SFT + GRPO training pipeline
595
+ - [data/README.md](../data/README.md) — dataset generation + base-model selection
596
+ - [aws_infra/README.md](../aws_infra/README.md) — vendored MiniStack upstream docs (81 KB)
server/app.py CHANGED
@@ -28,8 +28,14 @@ Usage:
28
  python -m server.app
29
  """
30
 
 
31
  import os
 
 
 
 
32
  import threading
 
33
  from pathlib import Path
34
  from typing import Any, Callable, Dict, Iterable
35
 
@@ -71,6 +77,22 @@ POOL_SIZE = max(int(os.getenv("AWS_RL_ENV_POOL_SIZE", "1")), 1)
71
  BASE_MINISTACK_PORT = int(os.getenv("AWS_RL_ENV_MINISTACK_BASE_PORT", "4566"))
72
  BACKEND_TYPE = os.getenv("BACKEND_TYPE", "simulator") # "simulator" | "aws"
73
 
74
 
75
  class MiniStackPool:
76
  """Thread-safe free-list of MiniStack ports.
@@ -156,84 +178,147 @@ app = create_app(
156
  # The web playground needs state across requests, so we maintain a shared
157
  # environment instance and expose /web/reset and /web/step.
158
  #
159
- # Only mounted when POOL_SIZE <= 1. With a pool active, port 4566 is
160
- # claimed by the pool and a shared web _env would collide with the
161
- # per-session MiniStacks.
162
- # If POOL_SIZE=8 and web mounts anyway, the module-level _env = AwsRlEnvironment()
163
- # defaults to http://localhost:4566 which is also in the pool's range.
164
- # Any /web/step clobbers the MiniStack currently held by a WS session that
165
- # acquired port 4566. State corrupts both ways: web user's bucket appears in a
166
- # GRPO rollout; pool rollout's drift mutations show up in the web UI.
167
 
 
 
168
 
169
- # ---------------------------------------------------------------------------
170
 
171
- if POOL_SIZE <= 1:
172
- _env = AwsRlEnvironment()
 
 
173
 
174
- class WebStepRequest(BaseModel):
175
- action: Dict[str, Any]
176
 
177
- @app.post("/web/reset", include_in_schema=False)
178
- async def web_reset():
179
- obs = _env.reset()
180
- return {
181
- "observation": obs.model_dump(),
182
- "reward": obs.reward,
183
- "done": obs.done,
184
- }
185
 
186
- @app.get("/web/solution", include_in_schema=False)
187
- async def web_solution():
188
- """Return the next solution command for the current task step."""
189
- if not _env._current_task:
190
- return {
191
- "command": None,
192
- "error": "No active task. Start a new episode first.",
193
- }
194
-
195
- from server.services.task_solutions import get_next_solution
196
-
197
- result = get_next_solution(
198
- task_id=_env._current_task.task_id,
199
- backend=_env._backend,
200
- tracker=_env._tracker,
201
- )
202
- result["task_id"] = _env._current_task.task_id
203
- return result
204
-
205
- @app.get("/web/state", include_in_schema=False)
206
- async def web_state():
207
- """Return the full AwsRlState for the web UI."""
208
- return _env.state.model_dump()
209
-
210
- @app.post("/web/step", include_in_schema=False)
211
- async def web_step(request: WebStepRequest = Body(...)):
212
- action = AwsRlAction(**request.action)
213
- obs = _env.step(action)
214
  return {
215
- "observation": obs.model_dump(),
216
- "reward": obs.reward,
217
- "done": obs.done,
218
  }
219
 
220
- # ---------------------------------------------------------------------------
221
- # Custom web UI
222
- # ---------------------------------------------------------------------------
223
 
224
- _server_dir = Path(__file__).parent
225
- _templates = Jinja2Templates(directory=str(_server_dir / "templates"))
226
- app.mount(
227
- "/static", StaticFiles(directory=str(_server_dir / "static")), name="static"
228
  )
 
 
229
 
230
- @app.get("/", response_class=RedirectResponse, include_in_schema=False)
231
- async def root_redirect():
232
- return RedirectResponse(url="/web")
233
 
234
- @app.get("/web", response_class=HTMLResponse, include_in_schema=False)
235
- async def web_ui(request: Request):
236
- return _templates.TemplateResponse(request=request, name="index.html")
237
 
238
 
239
  def main(host: str = "0.0.0.0", port: int = 8000):
 
28
  python -m server.app
29
  """
30
 
31
+ import asyncio
32
  import os
33
+ import shutil
34
+ import socket
35
+ import subprocess
36
+ import sys
37
  import threading
38
+ import time
39
  from pathlib import Path
40
  from typing import Any, Callable, Dict, Iterable
41
 
 
77
  BASE_MINISTACK_PORT = int(os.getenv("AWS_RL_ENV_MINISTACK_BASE_PORT", "4566"))
78
  BACKEND_TYPE = os.getenv("BACKEND_TYPE", "simulator") # "simulator" | "aws"
79
 
80
+ # Constant, dedicated MiniStack port for the web playground. Kept outside the
81
+ # pool's range so a WebSocket session can never acquire it, eliminating the
82
+ # state-bleed risk that previously gated the web UI when POOL_SIZE > 1.
83
+ WEB_MINISTACK_PORT = int(os.getenv("AWS_RL_ENV_WEB_MINISTACK_PORT", "4565"))
84
+
85
+ if (
86
+ BACKEND_TYPE != "aws"
87
+ and POOL_SIZE > 1
88
+ and BASE_MINISTACK_PORT <= WEB_MINISTACK_PORT < BASE_MINISTACK_PORT + POOL_SIZE
89
+ ):
90
+ raise RuntimeError(
91
+ f"AWS_RL_ENV_WEB_MINISTACK_PORT={WEB_MINISTACK_PORT} collides with pool range "
92
+ f"[{BASE_MINISTACK_PORT}..{BASE_MINISTACK_PORT + POOL_SIZE - 1}]. "
93
+ f"Pick a port outside the pool's range."
94
+ )
95
+
96
 
97
  class MiniStackPool:
98
  """Thread-safe free-list of MiniStack ports.
 
178
  # The web playground needs state across requests, so we maintain a shared
179
  # environment instance and expose /web/reset and /web/step.
180
  #
181
+ # When POOL_SIZE > 1 the pool owns [BASE..BASE+N-1]; the web UI uses a
182
+ # dedicated MiniStack on WEB_MINISTACK_PORT (constant, outside the pool's
183
+ # range) so it can never collide with a WebSocket session. That MiniStack is
184
+ # spawned lazily on the first /web/* request — training-only deployments pay
185
+ # zero cost. Subsequent requests reuse the cached _web_env.
186
+ # ---------------------------------------------------------------------------
 
 
187
 
188
+ _web_env: AwsRlEnvironment | None = None
189
+ _web_env_lock = threading.Lock()
190
 
 
191
 
192
+ def _port_listening(port: int) -> bool:
193
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
194
+ s.settimeout(0.2)
195
+ return s.connect_ex(("127.0.0.1", port)) == 0
196
 
 
 
197
 
198
+ def _resolve_ministack_bin() -> str:
199
+ """Find the ministack entry point. Prefer the same venv as the running
200
+ Python (sys.executable's bin dir) before falling back to PATH — uvicorn
201
+ invoked via /full/path/to/.venv/bin/uvicorn doesn't always have the venv
202
+ on PATH, so a bare "ministack" lookup would FileNotFoundError.
203
+ """
204
+ candidate = Path(sys.executable).parent / "ministack"
205
+ if candidate.exists():
206
+ return str(candidate)
207
+ on_path = shutil.which("ministack")
208
+ if on_path:
209
+ return on_path
210
+ raise RuntimeError(
211
+ "Could not find the 'ministack' executable. Install with `uv sync` "
212
+ "or ensure the active venv's bin directory is on PATH."
213
+ )
214
+
215
+
216
+ def _spawn_web_ministack(port: int, timeout_s: float = 10.0) -> None:
217
+ if _port_listening(port):
218
+ return
219
+ subprocess.Popen(
220
+ [_resolve_ministack_bin(), "-d"],
221
+ env={**os.environ, "GATEWAY_PORT": str(port)},
222
+ stdout=subprocess.DEVNULL,
223
+ stderr=subprocess.DEVNULL,
224
+ )
225
+ deadline = time.monotonic() + timeout_s
226
+ while time.monotonic() < deadline:
227
+ if _port_listening(port):
228
+ return
229
+ time.sleep(0.1)
230
+ raise RuntimeError(f"Web MiniStack failed to bind {port} within {timeout_s}s")
231
+
232
+
233
+ def _get_web_env() -> AwsRlEnvironment:
234
+ global _web_env
235
+ if _web_env is not None:
236
+ return _web_env
237
+ with _web_env_lock:
238
+ if _web_env is not None:
239
+ return _web_env
240
+ if BACKEND_TYPE == "aws":
241
+ _web_env = AwsRlEnvironment(strategy=AwsStrategy())
242
+ elif POOL_SIZE > 1:
243
+ _spawn_web_ministack(WEB_MINISTACK_PORT)
244
+ _web_env = AwsRlEnvironment(
245
+ strategy=SimulatorStrategy(f"http://localhost:{WEB_MINISTACK_PORT}")
246
+ )
247
+ else:
248
+ _web_env = AwsRlEnvironment()
249
+ return _web_env
250
+
251
 
252
+ class WebStepRequest(BaseModel):
253
+ action: Dict[str, Any]
254
+
255
+
256
+ @app.post("/web/reset", include_in_schema=False)
257
+ async def web_reset():
258
+ env = await asyncio.to_thread(_get_web_env)
259
+ obs = env.reset()
260
+ return {
261
+ "observation": obs.model_dump(),
262
+ "reward": obs.reward,
263
+ "done": obs.done,
264
+ }
265
+
266
+
267
+ @app.get("/web/solution", include_in_schema=False)
268
+ async def web_solution():
269
+ """Return the next solution command for the current task step."""
270
+ env = await asyncio.to_thread(_get_web_env)
271
+ if not env._current_task:
 
272
  return {
273
+ "command": None,
274
+ "error": "No active task. Start a new episode first.",
 
275
  }
276
 
277
+ from server.services.task_solutions import get_next_solution
 
 
278
 
279
+ result = get_next_solution(
280
+ task_id=env._current_task.task_id,
281
+ backend=env._backend,
282
+ tracker=env._tracker,
283
  )
284
+ result["task_id"] = env._current_task.task_id
285
+ return result
286
+
287
+
288
+ @app.get("/web/state", include_in_schema=False)
289
+ async def web_state():
290
+ """Return the full AwsRlState for the web UI."""
291
+ env = await asyncio.to_thread(_get_web_env)
292
+ return env.state.model_dump()
293
+
294
+
295
+ @app.post("/web/step", include_in_schema=False)
296
+ async def web_step(request: WebStepRequest = Body(...)):
297
+ env = await asyncio.to_thread(_get_web_env)
298
+ action = AwsRlAction(**request.action)
299
+ obs = env.step(action)
300
+ return {
301
+ "observation": obs.model_dump(),
302
+ "reward": obs.reward,
303
+ "done": obs.done,
304
+ }
305
+
306
+
307
+ _server_dir = Path(__file__).parent
308
+ _templates = Jinja2Templates(directory=str(_server_dir / "templates"))
309
+ app.mount(
310
+ "/static", StaticFiles(directory=str(_server_dir / "static")), name="static"
311
+ )
312
+
313
+
314
+ @app.get("/", response_class=RedirectResponse, include_in_schema=False)
315
+ async def root_redirect():
316
+ return RedirectResponse(url="/web")
317
 
 
 
 
318
 
319
+ @app.get("/web", response_class=HTMLResponse, include_in_schema=False)
320
+ async def web_ui(request: Request):
321
+ return _templates.TemplateResponse(request=request, name="index.html")
322
 
323
 
324
  def main(host: str = "0.0.0.0", port: int = 8000):
tests/test_pool.py CHANGED
@@ -360,3 +360,328 @@ class TestFactoryConcurrencyIntegration:
360
  t.join()
361
 
362
  assert pool.free_count == 20
 
360
  t.join()
361
 
362
  assert pool.free_count == 20
363
+
364
+
365
+ # ---------------------------------------------------------------------------
366
+ # Web playground coexistence with the MiniStack pool
367
+ # ---------------------------------------------------------------------------
368
+
369
+
370
+ def _run_in_subprocess(env_overrides: dict[str, str], code: str) -> tuple[int, str, str]:
371
+ """Run `code` in a fresh subprocess with the given env overrides.
372
+
373
+ Mirrors the pattern used by TestServerAppImportIsSafeForLegacyPoolSizes
374
+ to avoid module-cache pollution across env-var changes.
375
+ """
376
+ import os
377
+ import subprocess
378
+ import sys
379
+
380
+ env = {**os.environ, **env_overrides}
381
+ result = subprocess.run(
382
+ [sys.executable, "-c", code],
383
+ env=env,
384
+ capture_output=True,
385
+ text=True,
386
+ check=False,
387
+ )
388
+ return result.returncode, result.stdout, result.stderr
389
+
390
+
391
+ class TestWebRoutesMountUnconditionally:
392
+ """The web playground used to be gated on POOL_SIZE <= 1. It now mounts
393
+ regardless of pool size, with a dedicated lazy MiniStack on
394
+ AWS_RL_ENV_WEB_MINISTACK_PORT.
395
+ """
396
+
397
+ def test_web_routes_present_when_pool_size_8(self) -> None:
398
+ code = (
399
+ "import server.app as m;"
400
+ "paths = {getattr(r, 'path', None) for r in m.app.routes};"
401
+ "import sys;"
402
+ "missing = {'/web', '/web/reset', '/web/state', '/web/step', '/web/solution'} - paths;"
403
+ "sys.stdout.write('MISSING=' + repr(missing))"
404
+ )
405
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "8"}, code)
406
+ assert rc == 0, f"import failed: {err}"
407
+ assert "MISSING=set()" in out, out
408
+
409
+ def test_web_routes_present_when_pool_size_1(self) -> None:
410
+ code = (
411
+ "import server.app as m;"
412
+ "paths = {getattr(r, 'path', None) for r in m.app.routes};"
413
+ "import sys;"
414
+ "missing = {'/web', '/web/reset', '/web/state', '/web/step', '/web/solution'} - paths;"
415
+ "sys.stdout.write('MISSING=' + repr(missing))"
416
+ )
417
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "1"}, code)
418
+ assert rc == 0, f"import failed: {err}"
419
+ assert "MISSING=set()" in out, out
420
+
421
+
422
+ class TestWebMiniStackPortConflictDetection:
423
+ """The startup-time guard refuses to boot if the configured web port falls
424
+ inside the pool's port range. Without it, a WebSocket session could acquire
425
+ the same port the web _env writes to and corrupt state in both directions.
426
+ """
427
+
428
+ def test_collision_inside_pool_range_raises(self) -> None:
429
+ code = "import server.app"
430
+ rc, _, err = _run_in_subprocess(
431
+ {
432
+ "AWS_RL_ENV_POOL_SIZE": "8",
433
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
434
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4570", # inside [4566..4573]
435
+ },
436
+ code,
437
+ )
438
+ assert rc != 0
439
+ assert "collides with pool range" in err
440
+
441
+ def test_web_port_just_below_pool_range_is_allowed(self) -> None:
442
+ code = "import server.app"
443
+ rc, _, err = _run_in_subprocess(
444
+ {
445
+ "AWS_RL_ENV_POOL_SIZE": "8",
446
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
447
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4565", # default
448
+ },
449
+ code,
450
+ )
451
+ assert rc == 0, err
452
+
453
+ def test_web_port_just_above_pool_range_is_allowed(self) -> None:
454
+ code = "import server.app"
455
+ rc, _, err = _run_in_subprocess(
456
+ {
457
+ "AWS_RL_ENV_POOL_SIZE": "8",
458
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
459
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4574", # one past 4573
460
+ },
461
+ code,
462
+ )
463
+ assert rc == 0, err
464
+
465
+ def test_collision_check_skipped_when_pool_size_1(self) -> None:
466
+ """POOL_SIZE=1 means no pool object exists, so the constant web port
467
+ is allowed to coincide with BASE_PORT (it just means the web env
468
+ shares the lone MiniStack). Backward-compat for legacy single-mode.
469
+ """
470
+ code = "import server.app"
471
+ rc, _, err = _run_in_subprocess(
472
+ {
473
+ "AWS_RL_ENV_POOL_SIZE": "1",
474
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
475
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4566",
476
+ },
477
+ code,
478
+ )
479
+ assert rc == 0, err
480
+
481
+ def test_collision_check_skipped_when_backend_aws(self) -> None:
482
+ """BACKEND_TYPE=aws skips the pool entirely (all sessions share
483
+ AwsStrategy), so a "collision" with the pool's range is hypothetical
484
+ — the pool object is never constructed. Refusing to boot here would
485
+ be a false positive.
486
+ """
487
+ code = "import server.app"
488
+ rc, _, err = _run_in_subprocess(
489
+ {
490
+ "AWS_RL_ENV_POOL_SIZE": "8",
491
+ "AWS_RL_ENV_MINISTACK_BASE_PORT": "4566",
492
+ "AWS_RL_ENV_WEB_MINISTACK_PORT": "4570", # would collide if simulator
493
+ "BACKEND_TYPE": "aws",
494
+ },
495
+ code,
496
+ )
497
+ assert rc == 0, err
498
+
499
+
500
+ class TestWebEnvLazyConstruction:
501
+ def test_web_env_is_none_immediately_after_import(self) -> None:
502
+ """Lazy: the dedicated MiniStack should NOT spawn until a /web/*
503
+ request arrives. Importing the module must not subprocess anything.
504
+ """
505
+ code = (
506
+ "import server.app as m;"
507
+ "import sys;"
508
+ "sys.stdout.write('\\nRESULT=' + ('NONE' if m._web_env is None else 'NOT_NONE'))"
509
+ )
510
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "8"}, code)
511
+ assert rc == 0, err
512
+ assert out.strip().splitlines()[-1] == "RESULT=NONE"
513
+
514
+ def test_get_web_env_legacy_uses_default_port_for_pool_size_1(self) -> None:
515
+ """POOL_SIZE=1: web env shares the single MiniStack on :4566 — the
516
+ original behavior, locked down so it doesn't drift.
517
+ """
518
+ code = (
519
+ "import server.app as m;"
520
+ "env = m._get_web_env();"
521
+ "import sys;"
522
+ "sys.stdout.write('\\nRESULT=' + env._backend._aws_infra_url)"
523
+ )
524
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "1"}, code)
525
+ assert rc == 0, err
526
+ assert out.strip().splitlines()[-1] == "RESULT=http://localhost:4566"
527
+
528
+ def test_get_web_env_uses_aws_strategy_when_backend_aws(self) -> None:
529
+ """BACKEND_TYPE=aws: web env wires AwsStrategy too. No MiniStack spawn.
530
+ Fixes the latent inconsistency where the web playground always used
531
+ the simulator regardless of training backend.
532
+ """
533
+ code = (
534
+ "import server.app as m;"
535
+ "from server.services.aws_strategy import AwsStrategy;"
536
+ "env = m._get_web_env();"
537
+ "import sys;"
538
+ "sys.stdout.write('\\nRESULT=' + ('AWS' if isinstance(env._backend, AwsStrategy) else 'NOT_AWS'))"
539
+ )
540
+ rc, out, err = _run_in_subprocess(
541
+ {"AWS_RL_ENV_POOL_SIZE": "8", "BACKEND_TYPE": "aws"},
542
+ code,
543
+ )
544
+ assert rc == 0, err
545
+ assert out.strip().splitlines()[-1] == "RESULT=AWS"
546
+
547
+
548
+ class TestSpawnWebMiniStackShortCircuit:
549
+ """`_spawn_web_ministack` must not subprocess if the port is already
550
+ listening — otherwise a server restart would race against the existing
551
+ detached MiniStack and stall on the bind check.
552
+ """
553
+
554
+ def test_does_not_spawn_when_port_already_listening(self) -> None:
555
+ import socket
556
+
557
+ from server.app import _spawn_web_ministack
558
+
559
+ # Bind an ephemeral port to simulate a MiniStack already running.
560
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sentinel:
561
+ sentinel.bind(("127.0.0.1", 0))
562
+ sentinel.listen(1)
563
+ port = sentinel.getsockname()[1]
564
+
565
+ with patch("server.app.subprocess.Popen") as popen:
566
+ _spawn_web_ministack(port, timeout_s=0.5)
567
+
568
+ popen.assert_not_called()
569
+
570
+ def test_raises_on_bind_timeout(self) -> None:
571
+ """If the spawned MiniStack never binds, raise instead of hanging."""
572
+ from server.app import _spawn_web_ministack
573
+
574
+ # Pick a port that is almost certainly free; mock Popen so nothing
575
+ # actually starts. _spawn_web_ministack should poll and time out.
576
+ with patch("server.app.subprocess.Popen"):
577
+ with pytest.raises(RuntimeError, match="failed to bind"):
578
+ _spawn_web_ministack(port=1, timeout_s=0.3)
579
+
580
+
581
+ class TestGetWebEnvAdversarial:
582
+ """Stress-test _get_web_env against the failure modes a real deployment
583
+ will eventually hit: concurrent first-request races, ministack-not-installed,
584
+ and spawn timeouts.
585
+
586
+ Each test patches at the module level inside an isolated subprocess so
587
+ real ministacks are never spawned.
588
+ """
589
+
590
+ def test_concurrent_first_requests_spawn_at_most_once(self) -> None:
591
+ """N threads racing on the cold start must result in exactly one
592
+ Popen call. The double-checked lock + cached _web_env enforce this.
593
+ Otherwise a busy /web/* moment at boot would spawn N ministacks all
594
+ fighting for the same port.
595
+ """
596
+ code = """
597
+ import sys, threading
598
+ from unittest.mock import patch
599
+ import server.app as m
600
+ with patch('server.app._spawn_web_ministack') as spawn:
601
+ spawn.return_value = None
602
+ def call():
603
+ m._get_web_env()
604
+ threads = [threading.Thread(target=call) for _ in range(20)]
605
+ for t in threads: t.start()
606
+ for t in threads: t.join()
607
+ sys.stdout.write('\\nRESULT=' + str(spawn.call_count))
608
+ """
609
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "8"}, code)
610
+ assert rc == 0, err
611
+ assert out.strip().splitlines()[-1] == "RESULT=1"
612
+
613
+ def test_get_web_env_does_not_spawn_when_backend_aws(self) -> None:
614
+ """BACKEND_TYPE=aws path takes the AwsStrategy branch and never
615
+ subprocesses ministack — even with POOL_SIZE=8.
616
+ """
617
+ code = """
618
+ import sys
619
+ from unittest.mock import patch
620
+ import server.app as m
621
+ with patch('server.app.subprocess.Popen') as popen:
622
+ m._get_web_env()
623
+ sys.stdout.write('\\nRESULT=' + str(popen.call_count))
624
+ """
625
+ rc, out, err = _run_in_subprocess(
626
+ {"AWS_RL_ENV_POOL_SIZE": "8", "BACKEND_TYPE": "aws"},
627
+ code,
628
+ )
629
+ assert rc == 0, err
630
+ assert out.strip().splitlines()[-1] == "RESULT=0"
631
+
632
+ def test_get_web_env_does_not_spawn_when_pool_size_1(self) -> None:
633
+ """Legacy POOL_SIZE=1 path shares the lone pool MiniStack on :4566
634
+ and never spawns a separate web MiniStack.
635
+ """
636
+ code = """
637
+ import sys
638
+ from unittest.mock import patch
639
+ import server.app as m
640
+ with patch('server.app.subprocess.Popen') as popen:
641
+ m._get_web_env()
642
+ sys.stdout.write('\\nRESULT=' + str(popen.call_count))
643
+ """
644
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "1"}, code)
645
+ assert rc == 0, err
646
+ assert out.strip().splitlines()[-1] == "RESULT=0"
647
+
648
+ def test_get_web_env_retries_after_spawn_failure(self) -> None:
649
+ """If the first spawn fails (e.g., ministack not installed yet, or
650
+ the bind timed out), _web_env stays None so a later request can
651
+ retry instead of permanently caching the failure.
652
+ """
653
+ code = """
654
+ import sys
655
+ from unittest.mock import patch
656
+ import server.app as m
657
+ with patch('server.app._spawn_web_ministack', side_effect=RuntimeError('boom')):
658
+ failed = False
659
+ try:
660
+ m._get_web_env()
661
+ except RuntimeError:
662
+ failed = True
663
+ assert failed, 'expected first call to raise'
664
+ assert m._web_env is None, '_web_env must stay None after spawn failure'
665
+ sys.stdout.write('\\nRESULT=ok')
666
+ """
667
+ rc, out, err = _run_in_subprocess({"AWS_RL_ENV_POOL_SIZE": "8"}, code)
668
+ assert rc == 0, err
669
+ assert out.strip().splitlines()[-1] == "RESULT=ok"
670
+
671
+ def test_pool_factory_capacity_independent_of_web_env(self) -> None:
672
+ """The web _env is a module-level singleton, NOT produced by the
673
+ WebSocket factory. So a pool of 8 still hands out 8 distinct ports;
674
+ the web env doesn't steal a slot. Critical for the user's "8 WS +
675
+ web UI" goal.
676
+ """
677
+ pool, factory = make_env_factory(pool_size=8, base_port=4566)
678
+ assert pool is not None
679
+ envs = [factory() for _ in range(8)]
680
+ assert pool.free_count == 0
681
+ # 9th must fail — same as before this change
682
+ with pytest.raises(RuntimeError, match="exhausted"):
683
+ factory()
684
+ # Sanity: all 8 ports distinct, none equal to 4565 (web port)
685
+ ports = {int(e._backend._aws_infra_url.rsplit(":", 1)[-1]) for e in envs}
686
+ assert len(ports) == 8
687
+ assert 4565 not in ports
train/README.md ADDED
@@ -0,0 +1,545 @@
1
+ # `train/` — SFT + GRPO Training Pipeline
2
+
3
+ [← back to main README](../README.md)
4
+
5
+ This directory holds the **training notebooks** for the AWS RL agent. Heavy logic for the GRPO loop lives at the repo root in [train_grpo.py](../train_grpo.py); the notebooks here are thin drivers that you can run end-to-end on Colab.
6
+
7
+ The training pipeline has two stages:
8
+
9
+ ```
10
+ ┌────────── data/sft/ ──────────┐
11
+ │ 1,500 train · 150 val rows │
12
+ │ 5 trajectory types │
13
+ └───────────────┬───────────────┘
14
+
15
+ ┌──────────────────────────────────▼──────────────────────────────────┐
16
+ │ STAGE 1 — Supervised Fine-Tuning (train_sft_lora.ipynb) │
17
+ │ Qwen2.5-Coder-3B-Instruct + LoRA r=8/16/32 (Optuna) → SFT adapter │
18
+ └──────────────────────────────────┬──────────────────────────────────┘
19
+ │ Sizzing/aws-rl-sft-qwen25coder3b-adapter
20
+ ┌──────────────────────────────────▼──────────────────────────────────┐
21
+ │ STAGE 2 — GRPO RL (train_grpo_lora.ipynb) │
22
+ │ G=8 parallel rollouts · multi-turn · reward = env return │
23
+ │ Optuna over (lr, β, G, T, top_p, lora_r, max_turns) │
24
+ └─────────────────────────────────────────────────────────────────────┘
25
+ ```
26
+
27
+ The two stages are intentionally separable: the SFT adapter is published to the Hugging Face Hub so anyone can pull it and start GRPO without re-running SFT.
28
+
29
+ ---
30
+
31
+ ## Table of contents
32
+
33
+ 1. [SFT stage — supervised LoRA](#1-sft-stage--supervised-lora)
34
+ 2. [GRPO stage — reinforcement learning](#2-grpo-stage--reinforcement-learning)
35
+ 3. [Optuna hyperparameter search](#3-optuna-hyperparameter-search)
36
+ 4. [Multi-turn rollouts + parallel envs](#4-multi-turn-rollouts--parallel-envs)
37
+ 5. [Training modes (CLI)](#5-training-modes-cli)
38
+ 6. [How to run](#6-how-to-run)
39
+ 7. [Logging and artifacts](#7-logging-and-artifacts)
40
+ 8. [Reproducing results](#8-reproducing-results)
41
+ 9. [Files in this directory](#9-files-in-this-directory)
42
+
43
+ ---
44
+
45
+ ## 1. SFT stage — supervised LoRA
46
+
47
+ [train/train_sft_lora.ipynb](train_sft_lora.ipynb) — primary SFT notebook.
48
+
49
+ ### Why SFT before GRPO?
50
+
51
+ Two reasons — both showed up in our base-model evaluation ([data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md)):
52
+
53
+ 1. **Format-locking**. Even strong coder models occasionally wrap commands in markdown fences or quotes. SFT removes that surface noise in one epoch.
54
+ 2. **Bootstrap the GRPO reward signal**. GRPO with a base model that's only 41% exact-match starts from a low-density reward landscape. Pre-training on canonical commands raises the baseline so GRPO can spend its compute on optimization, not search.
55
+
56
+ ### Base model
57
+
58
+ | Choice | `unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit` |
59
+ |--------|--|
60
+ | Why | Highest exact-match (41%) of 11 candidates we benchmarked, fastest viable inference (3.1 s/call), tightest output (86 chars). Full reasoning in [data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md). |
61
+ | Loader | Unsloth's 4-bit quantized variant — fits comfortably on a single 24 GB GPU, 2× faster training kernels |
62
+
63
+ ### LoRA config
64
+
65
+ ```python
66
+ r = trial.suggest_categorical("lora_r", [8, 16, 32])  # sampled first so alpha can scale with it
+ config = LoraConfig(
+     r = r,
+     lora_alpha = r * trial.suggest_categorical("lora_alpha_mul", [1, 2, 4]),
+     lora_dropout = trial.suggest_float("lora_dropout", 0.005, 0.031),
+     bias = "none",
+     task_type = "CAUSAL_LM",
+     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
+ )
74
+ ```
75
+
76
+ - Only attention projections are adapted — MLP / output heads stay frozen, keeping the trainable parameter count tiny (~10–40 M depending on rank).
77
+ - `lora_alpha = r × multiplier` keeps the effective scaling stable across rank variations during the Optuna search.
78
+
79
+ ### Optimization
80
+
81
+ | Hyperparameter | Value / Range |
82
+ |--------------------------|------------------------------------------|
83
+ | Optimizer | AdamW (Unsloth's fused implementation) |
84
+ | Learning rate | `[1e-4, 5e-4]` log-scale (Optuna) |
85
+ | Schedule | Cosine annealing |
86
+ | Warmup ratio | `{0.03, 0.1}` (Optuna; best 0.1) |
87
+ | Batch size | 2 per GPU |
88
+ | Epochs | 2 |
89
+ | Max sequence length | 512 |
90
+ | Packing | **Disabled** (we keep chat-template separators intact) |
91
+ | Loss masking | Assistant-only (user message tokens are masked from the loss) |
92
+
93
+ ### Dataset
94
+
95
+ [data/sft/aws_rl_sft.train.jsonl](../data/sft/aws_rl_sft.train.jsonl) — 1,500 examples. Format:
96
+
97
+ ```json
98
+ {
99
+ "messages": [
100
+ {"role": "system", "content": "You are an AWS cloud engineer..."},
101
+ {"role": "user", "content": "TASK: ...\n\nCURRENT OBSERVATION:\nProgress: 0.00 ..."},
102
+ {"role": "assistant", "content": "aws s3 mb s3://my-app-data"}
103
+ ],
104
+ "difficulty": "intermediate",
105
+ "source": "success_first_step",
106
+ "task_id": 42
107
+ }
108
+ ```
109
+
110
+ The dataset is a careful mix of **5 trajectory types** (success, multi-step continuation, failure recovery, verification, hint usage). Full generation methodology in [data/README.md](../data/README.md).
111
+
112
+ ### Training graphs
113
+
114
+ The actual SFT run shipped in [`out/`](../out/) achieved validation loss `0.052` after 188 training steps with the best Optuna trial.
115
+
116
+ > ![SFT loss curve](../docs/figures/sft_loss_curve.png)
117
+
118
+ ---
119
+
120
+ ## 2. GRPO stage — reinforcement learning
121
+
122
+ The core trainer lives at [train_grpo.py](../train_grpo.py) (1,283 LOC). Notebooks call into it:
123
+
124
+ - [train/train_grpo_lora.ipynb](train_grpo_lora.ipynb) — clean
125
+ - [train/train_grpo_lora_with_outputs.ipynb](train_grpo_lora_with_outputs.ipynb) — with execution outputs preserved
126
+ - [aws_rl_env_colab.ipynb](../aws_rl_env_colab.ipynb) — Colab driver wrapping the entire pipeline
127
+
128
+ ### What GRPO is, briefly
129
+
130
+ **GRPO** (Group Relative Policy Optimization) is the algorithm introduced by DeepSeekMath and adopted by TRL ≥ 0.18. Unlike PPO, GRPO does **not** train a critic. Instead:
131
+
132
+ 1. For one prompt (here, one curriculum-picked task), generate `G` completions
133
+ 2. Score each with the reward function(s)
134
+ 3. Compute group-relative advantage: `(reward_i − group_mean) / group_std`
135
+ 4. Backpropagate the policy gradient with that advantage
136
+ 5. Apply a KL penalty to the SFT reference model (coefficient `β`) to prevent drift
137
+
138
+ This is dramatically simpler than PPO (no value head, no GAE), more sample-efficient for verifier-style rewards, and a natural fit for our setup — the AWS RL env *is* the reward function.
139
+
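+ A toy illustration of steps 2–4 (numbers invented):
+
+ ```python
+ import statistics
+
+ rewards = [0.2, 0.9, 0.4, 0.9]      # G = 4 completions of one prompt
+ mean = statistics.mean(rewards)     # 0.6
+ std = statistics.pstdev(rewards)    # ~0.31
+ advantages = [(r - mean) / (std + 1e-8) for r in rewards]
+ # Completions above the group mean get positive advantage, those below negative.
+ ```
+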
140
+ ### TRL GRPOTrainer config
141
+
142
+ From [train_grpo.py:_build_grpo_config()](../train_grpo.py):
143
+
144
+ | Parameter | Default value | Notes |
145
+ |------------------------------------|---------------|-------------------------------------------------------------|
146
+ | `learning_rate` | `5e-6` | Optuna range `[1e-6, 1e-4]` log-scale |
147
+ | `beta` (KL coefficient) | `0.04` | Optuna range `[0.0, 0.1]` |
148
+ | `num_generations` (G) | `8` | Optuna `{4, 8}` |
149
+ | `temperature` | `0.9` | Optuna `[0.7, 1.0]` |
150
+ | `top_p` | `0.95` | Optuna `[0.85, 0.98]` |
151
+ | `per_device_train_batch_size` | `1` | |
152
+ | `gradient_accumulation_steps` | `8` | Effective batch 8 |
153
+ | `gradient_checkpointing` | `True` | `use_reentrant=False` — VRAM optimization |
154
+ | `max_completion_length` | `256` | Per-turn; one AWS CLI command fits comfortably |
155
+ | `max_prompt_length` | `2048` | Holds task + history + observation |
156
+ | `loss_type` | `"dapo"` | DAPO ("Decoupled Clip and Dynamic Sampling Policy Optimization"; the TRL default for GRPO) |
157
+ | `mask_truncated_completions` | `True` | Drop samples that hit `max_completion_length` |
158
+ | `warmup_ratio` | `0.05` | |
159
+ | `lr_scheduler_type` | `"cosine"` | |
160
+ | `max_grad_norm` | `1.0` | |
161
+ | `use_vllm` | `False` | Plain `model.generate()` — vLLM integration is future work |
162
+
163
+ ### Reward functions (TRL convention)
164
+
165
+ Three reward functions are registered, summed by GRPO:
166
+
167
+ ```python
168
+ reward_funcs=[reward_task, reward_achieved, reward_progress]
169
+ ```
170
+
171
+ - `reward_task(completions, **kwargs)` → episode return (sum of per-step env rewards). The dominant signal.
172
+ - `reward_achieved(completions, **kwargs)` → 1.0 if `task.task_achieved` at end of episode, else 0.0. Sparse but unambiguous.
173
+ - `reward_progress(completions, **kwargs)` → final `partial_progress` ∈ [0, 1]. Densifies the credit assignment for partial completions.
174
+
175
+ The env's reward shaping (see [server/README.md §8](../server/README.md#8-reward-shaping--taskgrader)) does most of the work — these three TRL functions are a thin façade.
176
+
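+ Each follows the TRL reward-function convention: take the batch of completions plus extra kwargs, return one float per completion. Shape only; the body here is illustrative:
+
+ ```python
+ # Illustrative shape: real implementations read results from the rollout pool.
+ def reward_progress(completions, **kwargs):
+     episodes = kwargs["episode_results"]   # hypothetical kwarg name
+     return [ep["final_progress"] for ep in episodes]
+ ```
+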
177
+ ### Episode = one rollout
178
+
179
+ - Each rollout runs **up to `MAX_TURNS=6` sequential AWS CLI commands**
180
+ - Each command's stdout/stderr/progress is fed back as the user message for the next turn (see `build_user_prompt()` and `format_observation()` in [train_grpo.py](../train_grpo.py)); a loop sketch follows this list
181
+ - The episode terminates on `task_achieved`, max turns, or `max_total_tokens` (per-episode token budget)
182
+ - Token sequences (prompt_ids, completion_ids, logprobs) are accumulated **across turns**, so GRPO assigns the episode-level reward to the full multi-turn token sequence — not just the last turn
183
+
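+ The loop, as a sketch (the `generate` callable and the env handle stand in for the real pool machinery):
+
+ ```python
+ def rollout(env, task, generate, max_turns=6):
+     obs = env.reset(task=task)
+     messages = [{"role": "user", "content": build_user_prompt(task, obs)}]
+     for _ in range(max_turns):
+         command = generate(messages)              # one AWS CLI command
+         obs = env.step(command)
+         messages.append({"role": "assistant", "content": command})
+         messages.append({"role": "user", "content": format_observation(obs)})
+         if obs.done:                              # task_achieved or terminal failure
+             break
+     return obs, messages
+ ```
+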
184
+ ### Curriculum integration
185
+
186
+ ```
187
+ trainer step:
188
+ 1. task = curriculum.next_task() # one task per GRPO step
189
+ 2. results = pool.run_group(task, ...) # G rollouts on that task
190
+ 3. mean_r = sum(group_rewards) / G
191
+ 4. curriculum.record_result(task, achieved=any_achieved, reward=mean_r)
192
+ 5. trainer applies group-relative advantages # standard GRPO
193
+ ```
194
+
195
+ The curriculum drives task selection — every rollout in a group runs the *same* task, forced through `env.reset(task=task)`. This matches GRPO's group-relative semantics: the whole group must share one prompt for the baseline to be meaningful.
196
+
197
+ Full curriculum mechanics (priority scoring, mastery, spaced rep, tier promotion) live in [server/README.md §7](../server/README.md#7-curriculum-manager).
198
+
199
+ ### Training graphs
200
+
201
+ The actual GRPO run shipped in [`out_grpo/`](../out_grpo/) ran 35 steps with the best Optuna config (`lr=1.6e-5`, `β=0.0021`, `T=0.99`). Per-step signals from [`out_grpo/final_grpo/checkpoint-35/trainer_state.json`](../out_grpo/final_grpo/checkpoint-35/trainer_state.json):
202
+
203
+ > ![GRPO final per-step training signals](../docs/figures/grpo_final_per_step.png)
204
+ > ![GRPO env reward over training](../docs/figures/grpo_reward_curve.png)
205
+ > ![Success by tier (multi-step)](../docs/figures/grpo_per_tier_curve.png)
206
+ > ![Reward by tier (multi-step)](../docs/figures/grpo_reward_by_tier.png)
207
+
208
+ Notable signals from the run:
209
+
210
+ | | |
211
+ |---|---|
212
+ | `env_reward/mean` | 0.31 (mean over 16 reward-logged steps), max 0.94, min 0.13 |
213
+ | `kl` | 0.15 (mean) — KL stays small despite tiny β |
214
+ | `completion_length` | 87 tokens (mean) — agent emits compact AWS CLI commands |
215
+ | Format compliance | **100%** (`format_reward/mean = 1.0` every step) |
216
+
217
+ Multi-step end-to-end re-eval after GRPO ([out_grpo/grpo_multi_step.json](../out_grpo/grpo_multi_step.json)):
218
+
219
+ > ![SFT vs GRPO multi-step metrics grid](../docs/figures/sft_vs_grpo_metrics_grid.png)
220
+
221
+ These are produced by [`plot_rewards()`](../train_grpo.py) reading `reward_log.csv` written by `EpisodeLogger`, plus the post-hoc plots generated during the GRPO notebook run.
222
+
223
+ ---
224
+
225
+ ## 3. Optuna hyperparameter search
226
+
227
+ [train_grpo.py:optuna_search()](../train_grpo.py)
228
+
229
+ ### Search space
230
+
231
+ | Parameter | Range | Reason |
232
+ |-------------------|------------------------------------|------------------------------------------------------------------------|
233
+ | `learning_rate` | `[1e-6, 1e-4]` log | GRPO is sensitive to LR; log-scale is the right prior |
234
+ | `beta` | `[0.0, 0.1]` | KL coefficient. 0 = pure RL (drift risk), 0.1 = anchored to SFT |
235
+ | `num_generations` | `{4, 8}` | Group size. Larger → tighter advantage estimates but slower |
236
+ | `temperature` | `[0.7, 1.0]` | Exploration knob |
237
+ | `top_p` | `[0.85, 0.98]` | Nucleus sampling |
238
+ | `lora_r` | `{8, 16, 32}` | Adapter capacity |
239
+ | `lora_alpha_mul` | `{1, 2, 4}` | `lora_alpha = lora_r × multiplier` |
240
+ | `max_turns` | `{4, 6, 8}` | Episode length cap |
241
+
242
+ ### Objective
243
+
244
+ ```
245
+ objective = 0.7 × achieved_rate + 0.3 × mean_progress
246
+ ```
247
+
248
+ Calculated on the held-out validation tasks at the end of each trial. Weighting `achieved_rate` higher matches the project goal — actual task completion matters more than partial progress.
249
+
250
+ ### Sampler
251
+
252
+ `optuna.samplers.TPESampler(seed=42)` — Tree-structured Parzen Estimator. TPE outperforms random search on 8-dim spaces with ~6 trials in our experience.
253
+
254
+ Persisted to `outputs/.../optuna.db` (SQLite), so trials can be resumed if a Colab session disconnects.
255
+
256
+ ### Frozen validation set
257
+
258
+ `pick_validation_task_ids(k_per_tier=2, seed=42)` picks 2 tasks per tier (≈10 tasks total) at the start of training. The same set is used by every Optuna trial and the final post-training eval — no benchmark leakage between trials.
259
+
260
+ ### SFT-stage Optuna results (6 trials)
261
+
262
+ The SFT-stage Optuna run shipped in [`out/optuna_study.json`](../out/optuna_study.json) explored a 5-parameter space (`lora_r`, `lora_alpha_mul`, `lora_dropout`, `learning_rate`, `warmup_ratio`). 6 trials, validation loss as objective (lower = better):
263
+
264
+ | Trial | r | α | dropout | lr | warmup | val_loss |
265
+ |------:|---:|---:|:-------:|:---------:|:------:|:--------:|
266
+ | **0** | 16 | 16 | 0.006 | 4.03e-4 | 0.10 | **0.0523** ★ |
267
+ | 1 | 16 | 16 | 0.030 | 2.33e-4 | 0.03 | 0.0790 |
268
+ | 2 | 8 | 32 | 0.020 | 2.29e-4 | 0.03 | 0.0587 |
269
+ | 3 | 8 | 16 | 0.030 | 1.17e-4 | 0.03 | 0.1199 |
270
+ | 4 | 16 | 16 | 0.031 | 2.31e-4 | 0.03 | 0.0793 |
271
+ | 5 | 8 | 32 | 0.009 | 1.37e-4 | 0.10 | 0.0828 |
272
+
273
+ > ![SFT Optuna trial comparison table](../docs/figures/sft_optuna_trials_table.png)
274
+
275
+ ```json
276
+ {
277
+ "best_value": 0.052,
278
+ "best_params": {
279
+ "lora_r": 16,
280
+ "lora_alpha_mul": 1, // → lora_alpha = 16
281
+ "lora_dropout": 0.005808,
282
+ "learning_rate": 4.03e-4,
283
+ "warmup_ratio": 0.1
284
+ }
285
+ }
286
+ ```
287
+
288
+ Visualized:
289
+
290
+ > ![Optuna parameter importances](../docs/figures/optuna_param_importance.png)
291
+ > ![Optuna optimization history](../docs/figures/optuna_history.png)
292
+ > ![Optuna parallel coordinate plot](../docs/figures/optuna_parallel.png)
293
+ > ![Optuna slice plot](../docs/figures/optuna_slice.png)
294
+ > ![Optuna trial training curves](../docs/figures/optuna_trial_curves.png)
295
+
296
+ ### GRPO-stage Optuna results (4 trials)
297
+
298
+ The GRPO-stage Optuna run shipped in [`out_grpo/optuna_best.json`](../out_grpo/optuna_best.json) explored a 3-parameter space (`learning_rate`, `beta`, `temperature`). 4 trials, single-step env reward as objective (higher = better):
299
+
300
+ | Trial | lr | β | T | env_reward | success |
301
+ |------:|:---------:|:--------:|:-----:|:----------:|:-------:|
302
+ | 0 | varied | varied | varied| 0.473 | 25.0% |
303
+ | 1 | varied | varied | varied| 0.469 | 25.0% |
304
+ | 2 | varied | varied | varied| 0.469 | 25.0% |
305
+ | **3** | 1.60e-5 | 0.0021 | 0.99 | **0.552** | **33.3%** ★ |
306
+
307
+ > ![GRPO Optuna trial comparison](../docs/figures/grpo_optuna_trials_comparison.png)
308
+ > ![GRPO Optuna importances](../docs/figures/grpo_optuna_importances.png)
309
+ > ![GRPO Optuna parallel coordinate](../docs/figures/grpo_optuna_parallel.png)
310
+ > ![GRPO Optuna hparams](../docs/figures/grpo_optuna_hparams.png)
311
+ > ![GRPO Optuna trial curves](../docs/figures/grpo_optuna_trial_curves.png)
312
+
313
+ The winning GRPO config uses a **much smaller learning rate** (1.6e-5, vs 4.0e-4 for SFT) and a **tiny KL coefficient** (β=0.0021) — both expected for an RL phase that is only correcting the SFT-bootstrapped policy, not retraining it.
314
+
315
+ ---
316
+
317
+ ## 4. Multi-turn rollouts + parallel envs
318
+
319
+ This section is a quick overview — the full mechanics, including the three pool layers and asyncio orchestration, are in [scripts/README.md](../scripts/README.md).
320
+
321
+ ### MultiTurnEnvPool
322
+
323
+ [train_grpo.py:MultiTurnEnvPool](../train_grpo.py) — owns a background thread running an asyncio loop, opens N WebSocket sessions on startup, exposes a synchronous `run_group(task, ...)` API.
324
+
325
+ - One pool instance lives for the duration of training
326
+ - `run_group()` calls `asyncio.gather()` over `rollout_one_episode(env, task, ...)` for each of the N envs — every rollout runs the same task in its own MiniStack (see server-side pool in [server/README.md §6](../server/README.md#6-server-side-ministack-pool-parallel-rollouts))
327
+ - Returns a list of `{prompt_ids, completion_ids, logprobs, task_reward, task_achieved, final_progress, num_steps, transcript, task_id, difficulty}`
328
+
329
+ ### Why parallelism matters here
330
+
331
+ GRPO's group-relative advantage requires `G` rollouts before any gradient. Run serially, MAX_TURNS=6 turns × ~50 ms per env step ≈ 300 ms per rollout, or ~2.4 s of env time per training step at G=8. With parallel rollouts that drops to ~300 ms (the slowest of the 8), so the model forward pass dominates, exactly as desired.
332
+
333
+ ### Generation lock
334
+
335
+ Because the policy lives on a single GPU, `model.generate()` calls across the asyncio.gather group are serialised behind a `_GENERATE_LOCK` (`threading.Lock`). The env step calls — the slow part — happily overlap. This is the single non-obvious detail that makes the parallel rollout approach actually work.
336
+
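+ A sketch of the pattern (names beyond `_GENERATE_LOCK` are illustrative):
+
+ ```python
+ import asyncio
+ import threading
+
+ _GENERATE_LOCK = threading.Lock()      # one generate() on the GPU at a time
+
+ async def generate_locked(model, inputs):
+     def _generate():
+         with _GENERATE_LOCK:
+             return model.generate(**inputs)
+     # Run off the event loop so the other rollouts' env I/O keeps overlapping.
+     return await asyncio.to_thread(_generate)
+ ```
+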
+ ---
+
+ ## 5. Training modes (CLI)
+
+ ```bash
+ # Optuna search only — produces best_cfg.json
+ python train_grpo.py --mode optuna --n-trials 6 --trial-max-steps 30
+
+ # Train once with explicit hyperparams (no search)
+ python train_grpo.py --mode train \
+     --env-url http://localhost:8000 \
+     --num-generations 8 --max-turns 6 --max-steps 200
+
+ # Search → train: Optuna trials, then a full-length run with the best config
+ python train_grpo.py --mode full --n-trials 6 --max-steps 200
+ ```
+
+ All modes write to `outputs/aws-rl-grpo-<TIMESTAMP>/`.
+
+ ---
+
+ ## 6. How to run
+
+ ### Prerequisites
+
+ - A running env server: `make run` from the repo root (starts MiniStack + FastAPI on `http://localhost:8000`)
+   - For pool size > 1: `AWS_RL_ENV_POOL_SIZE=8 make run`
+ - A GPU with ≥ 24 GB VRAM (A10, T4×2, A100, L4 all confirmed working)
+ - A HuggingFace token (`HF_TOKEN`) if you want to push the trained adapter
+
+ ### Local
+
+ ```bash
+ # 1. Start the env server in one terminal
+ AWS_RL_ENV_POOL_SIZE=8 make run
+
+ # 2. Run training in another terminal
+ python train_grpo.py --mode full --n-trials 6 --max-steps 200
+ ```
+
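+ Before launching a long run, it's worth smoke-testing the env connection. A minimal sketch using the repo's client, with the same calls `train_grpo.py` makes (the `aws s3 ls` command is just an example):
+
+ ```python
+ import asyncio
+
+ from client import AwsRlEnv
+ from models import AwsRlAction
+ from server.services.curriculum import Curriculum
+
+ async def smoke_test() -> None:
+     env = AwsRlEnv(base_url="http://localhost:8000")
+     await env.connect()
+     try:
+         await env.reset(task=Curriculum().next_task())
+         res = await env.step(AwsRlAction(command="aws s3 ls"))
+         print("reward:", res.reward)
+         print("output:", res.observation.command_output)
+     finally:
+         await env.close()
+
+ asyncio.run(smoke_test())
+ ```
+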
+ ### Colab
+
+ The notebook [aws_rl_env_colab.ipynb](../aws_rl_env_colab.ipynb) wraps the full pipeline (env URL config, HF login, val set, Optuna, training, plotting, optional push-to-Hub):
+
+ | Notebook | Open in Colab |
+ |----------|---------------|
+ | GRPO end-to-end driver | <!-- TODO: paste Colab URL here --> |
+ | SFT-only ([train/train_sft_lora.ipynb](train_sft_lora.ipynb)) | <!-- TODO: paste Colab URL here --> |
+ | GRPO-only ([train/train_grpo_lora.ipynb](train_grpo_lora.ipynb)) | <!-- TODO: paste Colab URL here --> |
+
+ Note: the Colab notebooks expect the env server to be reachable. Two options:
+
+ 1. **HF Space tunnel**: deploy the env to your own HF Space and point `ENV_URL` at it (see the main README's deployment section)
+ 2. **ngrok**: run the env locally and expose it via ngrok / cloudflared so Colab can reach it (example commands below)
+
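+ Hedged example commands for option 2 (both assume the env is already listening on port 8000; check each tool's docs for install/auth):
+
+ ```bash
+ # ngrok (needs an ngrok account + auth token configured)
+ ngrok http 8000
+
+ # cloudflared quick tunnel (no account needed)
+ cloudflared tunnel --url http://localhost:8000
+ ```
+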
+ ---
+
+ ## 7. Logging and artifacts
+
+ ### Reference SFT output: [`out/`](../out/)
+
+ A complete SFT training run is committed (small files only) at the repo root for reproducibility:
+
+ ```
+ out/
+ ├── baseline_metrics.json   # eval scores BEFORE SFT (33% fmt, 39% exact, ...)
+ ├── delta_summary.json      # base vs post-SFT delta (the headline numbers)
+ ├── optuna_study.json       # SFT Optuna study summary (all 6 trials + best)
+ ├── optuna/                 # per-trial workspaces (trial-0..trial-5)
+ ├── final_sft/              # final TRL SFT trainer checkpoints (gitignored)
+ │   ├── checkpoint-100/     # adapter + optimizer + tokenizer at step 100
+ │   ├── checkpoint-150/
+ │   └── checkpoint-188/     # last checkpoint (final adapter)
+ └── plots/                  # 7 ready PNGs (loss curves, Optuna plots, eval comparison)
+ ```
+
+ The contents of `out/plots/` are mirrored into [`docs/figures/`](../docs/figures/) so the READMEs render them. The full TRL checkpoints in `out/final_sft/` are kept locally for reproducibility but are gitignored (each is ~50 MB; total ~175 MB).
+
+ ### Reference GRPO output: [`out_grpo/`](../out_grpo/)
+
+ A complete GRPO training run is also committed at the repo root:
+
+ ```
+ out_grpo/
+ ├── baseline_single_step.json   # post-SFT single-step eval (90% reward, 85% success)
+ ├── baseline_multi_step.json    # post-SFT multi-step eval (86.8% success, 0.88 reward, by tier)
+ ├── grpo_multi_step.json        # post-GRPO multi-step eval (86.2% success, 0.88 reward, by tier)
+ ├── optuna_best.json            # GRPO Optuna best params + resolved config
+ ├── optuna.db                   # SQLite Optuna study (4 trials)
+ ├── optuna/trial-0..3/          # per-trial trainer_state.json + single_step_metrics.json
+ ├── qualitative_rollouts.json   # 5 hand-picked sample rollouts (one per tier, post-GRPO)
+ ├── final_grpo/                 # final TRL GRPO checkpoints (gitignored)
+ │   ├── checkpoint-25/
+ │   └── checkpoint-35/          # last checkpoint (final GRPO adapter)
+ ├── grpo_adapter/               # exported final adapter for HF Hub upload (gitignored)
+ ├── graphs/                     # 10 ready PNGs (Optuna views, training curves, by-tier breakdowns)
+ └── graphs.zip
+ ```
+
+ The 10 graphs from `out_grpo/graphs/` are mirrored into [`docs/figures/`](../docs/figures/) under descriptive names (`grpo_optuna_history.png`, `grpo_reward_curve.png`, `grpo_per_tier_curve.png`, `sft_vs_grpo_scalar.png`, `grpo_reward_by_tier.png`, etc.). The full TRL checkpoints in `out_grpo/final_grpo/` and the exported adapter in `out_grpo/grpo_adapter/` are gitignored (~160 MB total).
+
+ ### GRPO output layout
+
+ Each GRPO run writes to a fresh `outputs/aws-rl-grpo-<TIMESTAMP>/`:
+
+ | File | Written by | Contents |
+ |------|------------|----------|
+ | `reward_log.csv` | `EpisodeLogger` | One row per rollout: `step, rollout_idx, task_id, difficulty, task_reward, task_achieved, final_progress, num_steps, tier, tier_success_rate, timestamp` |
+ | `transcripts.jsonl` | `EpisodeLogger` | Same rows + the full multi-turn transcript per rollout (commands, outputs, rewards) |
+ | `optuna.db` | Optuna | SQLite study (resumable) |
+ | `best_cfg.json` | `optuna_search()` | Final winning hyperparameters |
+ | `trial_NNN/` | `_run_one_trial()` | Per-trial trainer checkpoints + `trial_metrics.json` |
+ | `val_task_ids.json` | Notebook driver | Frozen held-out validation set (for reproducibility) |
+ | `post_train_val.json` | Notebook §10 | Final post-training validation metrics |
+ | `reward_plot.png` | `plot_rewards()` | Group mean reward + per-tier scatter |
+ | `<adapter_dir>/` | TRL `GRPOTrainer.save` | Trained LoRA adapter (`adapter_config.json`, `adapter_model.safetensors`, etc.) |
+
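+ Because `reward_log.csv` is deliberately flat, it can be analysed with standard tools. A minimal sketch (assumes `pandas` is installed; column names are the ones in the table above):
+
+ ```python
+ import pandas as pd
+
+ df = pd.read_csv("outputs/aws-rl-grpo-<TIMESTAMP>/reward_log.csv")  # fill in your run dir
+
+ # Mean group reward per GRPO step, and success rate per curriculum tier.
+ print(df.groupby("step")["task_reward"].mean().tail())
+ print(df.groupby("tier")["task_achieved"].mean())
+ ```
+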
+ Push to HF Hub:
+
+ ```python
+ from huggingface_hub import create_repo, upload_folder
+
+ create_repo("your-org/aws-rl-grpo-qwen25coder3b", exist_ok=True, private=False)
+ upload_folder(folder_path=str(OUTPUT_DIR), repo_id="your-org/aws-rl-grpo-qwen25coder3b")
+ ```
+
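+ To consume a pushed adapter later, load it on top of the 4-bit base the same way `load_policy` does. A minimal inference-side sketch (the adapter id shown is the published SFT one; swap in your own GRPO repo id):
+
+ ```python
+ from peft import PeftModel
+ from unsloth import FastLanguageModel
+
+ base, tokenizer = FastLanguageModel.from_pretrained(
+     model_name="unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit",
+     max_seq_length=3072,
+     load_in_4bit=True,
+ )
+ model = PeftModel.from_pretrained(base, "Sizzing/aws-rl-sft-qwen25coder3b-adapter")
+ FastLanguageModel.for_inference(model)
+ ```
+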
+ ---
+
+ ## 8. Reproducing results
+
+ ### Actual SFT result (committed at [`out/`](../out/))
+
+ ```
+ SFT (188 steps, best Optuna trial, ~30 min on A10):
+   best val_loss    : 0.052
+   best lora_r      : 16
+   best lora_alpha  : 16   (alpha_mul=1)
+   best lora_dropout: 0.0058
+   best lr          : 4.03e-4
+   best warmup      : 0.10
+
+ Held-out eval (post-SFT, same prompts as base):
+   format_pct    : 33.3% → 100.0%   (+66.7 pp)
+   exact_pct     : 38.9% → 88.9%    (+50.0 pp)
+   service_pct   : 77.8% → 88.9%    (+11.1 pp)
+   operation_pct : 61.1% → 88.9%    (+27.8 pp)
+   avg_latency   : 2.03s → 1.40s    (−0.63s)
+   avg_len       : 85.8  → 74.7     (tighter outputs)
+ ```
+
+ Every target from [data/sft/MODEL_EVALUATION.md §11](../data/sft/MODEL_EVALUATION.md) is met or exceeded.
+
+ ### Actual GRPO result (committed at [`out_grpo/`](../out_grpo/))
+
+ ```
+ GRPO (35 steps from best Optuna trial, ~1.5 hr on A10):
+   best lr          : 1.60e-5
+   best beta        : 0.0021
+   best temperature : 0.99
+   num_generations  : 8
+
+ Per-step training signals (16 reward-logged steps):
+   env_reward (mean): 0.31   max: 0.94   min: 0.13
+   KL to SFT ref    : 0.15 mean       (small β = 0.0021 keeps drift in check)
+   format_reward    : 1.00 every step (perfect format compliance)
+   completion length: 87 tokens mean  (compact AWS CLI commands)
+
+ Multi-step end-to-end eval (n≈108 episodes):
+                        Base+SFT   Base+SFT+GRPO      Δ
+   overall_success        86.8%        86.2%       −0.5 pp
+   overall_reward         0.883        0.877       −0.006
+   beginner_success       96.2%       100.0%       +3.8 pp ✓
+   intermediate_success   81.0%        87.0%       +6.0 pp ✓
+   warmup_success         96.0%        90.2%       −5.8 pp
+   expert_success         22.2%        22.2%       flat (bottleneck)
+   drift_repair           22.2%        22.2%       flat
+   destructive_fail       15.1%        14.7%       −0.4 pp
+   steps_to_solve          1.45         1.55       +0.10
+ ```
+
+ **Honest reading.** A 35-step GRPO run from a strong SFT starting point (already 86.8% success) is short by RL standards. It preserves the SFT gains and modestly improves the middle tiers, but it does not crack the expert-tier ceiling — the 22% expert / 22% drift-repair numbers stay flat because 35 GRPO steps × G=8 = 280 rollouts contain too few expert episodes, with the curriculum focusing primarily on warmup/beginner/intermediate.
+
+ Variance comes mostly from Optuna trial composition. The published SFT adapter (`Sizzing/aws-rl-sft-qwen25coder3b-adapter`) is the SFT result; the GRPO adapter is re-exported on each run into `out_grpo/grpo_adapter/`.
+
+ ---
+
+ ## 9. Files in this directory
+
+ | File | Purpose |
+ |------|---------|
+ | [train_sft_lora.ipynb](train_sft_lora.ipynb) | Stage 1 — supervised LoRA fine-tuning |
+ | [train_grpo_lora.ipynb](train_grpo_lora.ipynb) | Stage 2 — GRPO RL training (clean) |
+ | [train_grpo_lora_with_outputs.ipynb](train_grpo_lora_with_outputs.ipynb) | Same notebook with cell outputs preserved |
+
+ Heavy logic referenced from these notebooks:
+
+ - [train_grpo.py](../train_grpo.py) — the `MultiTurnEnvPool`, GRPO config, Optuna search, `plot_rewards`, and the `run_training` entry point (sketched below)
+ - [aws_rl_env_colab.ipynb](../aws_rl_env_colab.ipynb) — Colab driver that imports from `train_grpo.py`
+ - [scripts/grpo_pool.py](../scripts/grpo_pool.py) and [scripts/grpo_train.py](../scripts/grpo_train.py) — alternative client-side pool entry point (covered in [scripts/README.md](../scripts/README.md))
+
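+ `run_training` can also be driven programmatically instead of via the CLI. A minimal sketch using the defaults from `train_grpo.py` (assumes the env server is up and a suitable GPU is available):
+
+ ```python
+ from pathlib import Path
+
+ from train_grpo import DEFAULT_CFG, PolicySpec, run_training
+
+ run_training(
+     dict(DEFAULT_CFG),
+     base_model=PolicySpec.base_model,
+     sft_adapter=PolicySpec.sft_adapter,
+     env_url="http://localhost:8000",
+     output_dir=Path("outputs/aws-rl-grpo-manual"),
+     max_steps=200,
+ )
+ ```
+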
+ ---
+
+ ## See also
+
+ - [Main README](../README.md)
+ - [data/README.md](../data/README.md) — dataset generation, base-model selection
+ - [data/sft/MODEL_EVALUATION.md](../data/sft/MODEL_EVALUATION.md) — full 11-model benchmark
+ - [scripts/README.md](../scripts/README.md) — parallel-rollout architecture deep-dive
+ - [server/README.md](../server/README.md) — environment internals (curriculum, reward shaping, anti-hacking)
+ - [compare/README.md](../compare/README.md) — base vs SFT comparison harness
train/train_grpo_lora.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
train/train_sft_lora.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
train_grpo.py ADDED
@@ -0,0 +1,1283 @@
+ """GRPO training for the AWS RL environment — multi-turn rollouts + parallel envs.
+
+ Mirrors the kube-sre-gym training pattern (heavy logic in this module, thin
+ notebook on top):
+ - Each "episode" runs up to MAX_TURNS steps.
+ - Each step = one ``aws ...`` command; the command's stdout/stderr is fed
+   back into the next turn's prompt as the user message.
+ - Each GRPO step picks ONE curriculum task and runs G concurrent rollouts
+   (one per env in MultiTurnEnvPool) sharing that task.
+ - prompt_ids / completion_ids / logprobs are accumulated across turns so
+   GRPO assigns episode-level reward to the full token sequence.
+
+ Usage (CLI)::
+
+     # Single training pass with explicit hyperparams
+     python train_grpo.py --mode train \\
+         --env-url http://localhost:8000 \\
+         --num-generations 8 --max-turns 6 --max-steps 200
+
+     # Optuna search over hyperparams, then dump best_cfg.json
+     python train_grpo.py --mode optuna --n-trials 6
+
+     # Optuna search, then full-length retrain using the best config
+     python train_grpo.py --mode full --n-trials 6 --max-steps 200
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import asyncio
+ import csv
+ import gc
+ import json
+ import logging
+ import re
+ import threading
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Callable, Optional
+
+ import torch
+ from datasets import Dataset
+ from peft import LoraConfig, PeftModel
+ from transformers import AutoTokenizer
+ from trl import GRPOConfig, GRPOTrainer
+
+ from client import AwsRlEnv
+ from models import AwsRlAction, AwsRlObservation, Task, TaskDifficulty, TaskID
+ from server.services.curriculum import Curriculum
+
+ logger = logging.getLogger(__name__)
+
+
+ # ============================================================
+ # System prompt — multi-turn AWS CLI agent
+ # ============================================================
+
+ SYSTEM_PROMPT = """You are an expert AWS Operations agent. You operate a simulated AWS cloud by emitting ONE AWS CLI command per turn.
+
+ The user message contains:
+ - The task description.
+ - (Optional) A history of your previous commands and their outputs from earlier in this episode — use them to decide your next move.
+ - The most recent observation (last command's stdout / stderr / progress).
+
+ Each turn:
+ 1. Optionally reason inside a single <think>...</think> block. Keep it concise.
+ 2. After </think>, on a NEW LINE, output EXACTLY ONE AWS CLI command starting with "aws ".
+
+ Hard rules:
+ - The command line must contain ONLY the command — no markdown, no backticks, no quotes around it, no trailing commentary.
+ - If a command failed last turn, try a DIFFERENT approach. Do not repeat the exact same command twice in a row.
+ - When the task description names a specific resource (a bucket, table, queue, etc.), use that exact name.
+ """
+
+
+ DEFAULT_CFG: dict[str, Any] = {
+     "learning_rate": 5e-6,
+     "beta": 0.04,
+     "num_generations": 8,
+     "temperature": 0.9,
+     "top_p": 0.95,
+     "lora_r": 16,
+     "lora_alpha_mul": 2,
+     "max_turns": 6,
+ }
+
+
+ # ============================================================
+ # Helpers — prompt formatting + command parsing
+ # ============================================================
+
+ _THINK_BLOCK = re.compile(r"<think\b[^>]*>.*?</think>", re.DOTALL | re.IGNORECASE)
+ _OPEN_THINK = re.compile(r"<think\b[^>]*>.*", re.DOTALL | re.IGNORECASE)
+
+
+ def extract_aws_command(raw: str) -> str:
+     """Strip <think> blocks + markdown fences, return the first ``aws ...`` line.
+
+     Falls back to ``aws help`` so the env always gets a syntactically valid
+     command (the env will just produce a help-text observation, which is a
+     better RL signal than a parse error).
+     """
+     cleaned = _THINK_BLOCK.sub("", raw)
+     cleaned = _OPEN_THINK.sub("", cleaned)
+     for line in cleaned.splitlines():
+         line = line.strip().strip("`").strip()
+         if line.startswith("aws "):
+             return line
+     return "aws help"
+
+
+ def _truncate(text: str, n: int) -> str:
+     if not text:
+         return ""
+     if len(text) <= n:
+         return text
+     return text[: n - 3] + "..."
+
+
+ def format_observation(obs: AwsRlObservation) -> str:
+     """Render the latest env observation as a compact text block."""
+     parts: list[str] = []
+     if obs.command_output:
+         parts.append(f"Output:\n{_truncate(obs.command_output, 800)}")
+     if obs.error:
+         parts.append(f"Error:\n{_truncate(obs.error, 400)}")
+     parts.append(
+         f"Progress: {obs.partial_progress:.2f} "
+         f"Achieved: {obs.task_achieved} Step: {obs.step_count}"
+     )
+     if obs.hint_text:
+         parts.append(f"Hint: {_truncate(obs.hint_text, 200)}")
+     return "\n".join(parts)
+
+
+ def format_history(history: list[dict], keep_last: int = 6) -> str:
+     """Render the last ``keep_last`` (cmd, output, reward) tuples for context."""
+     if not history:
+         return ""
+     recent = history[-keep_last:]
+     rendered: list[str] = ["PREVIOUS COMMANDS:"]
+     for i, h in enumerate(recent, start=max(1, len(history) - keep_last + 1)):
+         rendered.append(
+             f"[{i}] $ {h['command']}\n"
+             f"    output: {_truncate(h['output'], 300)}\n"
+             f"    reward: {h['reward']:.2f}"
+         )
+     return "\n".join(rendered)
+
+
+ def apply_chat_template(tokenizer: AutoTokenizer, messages: list[dict]) -> str:
+     """Apply a chat template; fall back to a plain rendering if none is set."""
+     if getattr(tokenizer, "chat_template", None):
+         try:
+             return tokenizer.apply_chat_template(
+                 messages, add_generation_prompt=True, tokenize=False
+             )
+         except TypeError:
+             return tokenizer.apply_chat_template(messages, tokenize=False)
+     parts: list[str] = []
+     for m in messages:
+         parts.append(f"<|{m['role']}|>\n{m['content']}\n")
+     parts.append("<|assistant|>\n")
+     return "".join(parts)
+
+
+ def build_user_prompt(task: Task, obs: AwsRlObservation, history: list[dict]) -> str:
+     desc = task.description
+     if task.desired_state_spec:
+         desc = f"{desc}\n\nDesired end state:\n{task.desired_state_spec}"
+     history_text = format_history(history)
+     obs_text = format_observation(obs)
+     if history_text:
+         return f"TASK: {desc}\n\n{history_text}\n\n---\n\nCURRENT OBSERVATION:\n{obs_text}"
+     return f"TASK: {desc}\n\nCURRENT OBSERVATION:\n{obs_text}"
+
+
+ # ============================================================
+ # Policy loading — Unsloth 4-bit base + LoRA-from-SFT-adapter
+ # ============================================================
+
+
+ @dataclass
+ class PolicySpec:
+     base_model: str = "unsloth/Qwen2.5-Coder-3B-Instruct-bnb-4bit"
+     sft_adapter: str = "Sizzing/aws-rl-sft-qwen25coder3b-adapter"
+     max_seq_length: int = 3072
+
+
+ def load_policy(
+     base_model: str,
+     sft_adapter: Optional[str] = None,
+     max_seq_length: int = 3072,
+     trainable: bool = True,
+ ):
+     """Load Unsloth 4-bit base + (optional) LoRA adapter from the SFT run.
+
+     ``trainable=True`` returns a PeftModel ready for GRPO training (Unsloth's
+     training kernels enabled, input require-grads hook installed).
+     ``trainable=False`` returns the same stack in inference mode for eval.
+     """
+     from unsloth import FastLanguageModel
+
+     base, tokenizer = FastLanguageModel.from_pretrained(
+         model_name=base_model,
+         max_seq_length=max_seq_length,
+         load_in_4bit=True,
+     )
+     if sft_adapter:
+         model = PeftModel.from_pretrained(base, sft_adapter, is_trainable=trainable)
+     else:
+         # No adapter: GRPOTrainer can attach a fresh LoRA via peft_config later.
+         model = base
+
+     if trainable:
+         FastLanguageModel.for_training(model)
+         if hasattr(model, "enable_input_require_grads"):
+             model.enable_input_require_grads()
+     else:
+         FastLanguageModel.for_inference(model)
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     return model, tokenizer
+
+
+ def free_model(model) -> None:
+     """Release VRAM held by ``model`` and any captured optimizer state."""
+     try:
+         del model
+     except Exception:
+         pass
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+
+ # ============================================================
+ # Multi-turn rollout — one episode in one env
+ # ============================================================
+
+
+ @dataclass
+ class SamplingCfg:
+     temperature: float = 0.9
+     top_p: float = 0.95
+     max_new_tokens: int = 256
+     max_prompt_length: int = 2048
+
+
+ _GENERATE_LOCK = threading.Lock()
+ """Serialise model.generate() calls across the asyncio.gather rollout group.
+
+ The model lives on a single GPU; concurrent generate() calls would collide.
+ We let the env step run concurrently (the slow part — WebSocket round-trip +
+ MiniStack execution); only the generation is serialised.
+ """
+
+
+ def _generate_with_logprobs(
+     model,
+     tokenizer,
+     prompt_text: str,
+     sampling: SamplingCfg,
+ ) -> tuple[list[int], list[int], list[float]]:
+     """Generate one completion + return per-token logprobs.
+
+     Returns: (prompt_ids, completion_ids, completion_logprobs).
+     """
+     with _GENERATE_LOCK:
+         prompt_input = tokenizer(
+             prompt_text,
+             return_tensors="pt",
+             truncation=True,
+             max_length=sampling.max_prompt_length,
+         ).to(model.device)
+
+         was_training = model.training
+         model.eval()
+         try:
+             with torch.no_grad():
+                 gen_out = model.generate(
+                     **prompt_input,
+                     max_new_tokens=sampling.max_new_tokens,
+                     do_sample=True,
+                     temperature=sampling.temperature,
+                     top_p=sampling.top_p,
+                     return_dict_in_generate=True,
+                     output_scores=True,
+                     pad_token_id=tokenizer.pad_token_id,
+                 )
+         finally:
+             if was_training:
+                 model.train()
+
+         prompt_ids = prompt_input.input_ids[0].tolist()
+         prompt_len = len(prompt_ids)
+         completion_seq = gen_out.sequences[0, prompt_len:].tolist()
+
+         # Per-token logprobs from raw logits.
+         logprobs: list[float] = []
+         for i, scores_t in enumerate(gen_out.scores):
+             if i >= len(completion_seq):
+                 break
+             lp = torch.log_softmax(scores_t[0].float(), dim=-1)
+             logprobs.append(float(lp[completion_seq[i]].item()))
+
+         return prompt_ids, completion_seq, logprobs
+
+
+ async def rollout_one_episode(
+     env: AwsRlEnv,
+     task: Task,
+     model,
+     tokenizer,
+     system_prompt: str,
+     max_turns: int,
+     max_total_tokens: int,
+     sampling: SamplingCfg,
+ ) -> dict:
+     """Run one multi-turn episode in one env, accumulating tokens across turns."""
+     try:
+         res = await env.reset(task=task)
+     except Exception as e:
+         logger.warning("reset() failed for task=%s: %s", task.task_id, e)
+         return {
+             "prompt_ids": [],
+             "completion_ids": [],
+             "logprobs": [],
+             "task_reward": -1.0,
+             "task_achieved": False,
+             "final_progress": 0.0,
+             "num_steps": 0,
+             "transcript": [{"error": f"reset failed: {e!r}"}],
+         }
+     obs: AwsRlObservation = res.observation
+
+     prompt_ids: list[int] = []
+     completion_ids: list[int] = []
+     logprobs: list[float] = []
+     step_rewards: list[float] = []
+     history: list[dict] = []
+     final_progress = float(getattr(obs, "partial_progress", 0.0) or 0.0)
+     final_achieved = bool(getattr(obs, "task_achieved", False))
+
+     for _turn in range(max_turns):
+         if res.done:
+             break
+         if len(completion_ids) >= max_total_tokens:
+             break
+
+         user_text = build_user_prompt(task, obs, history)
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": user_text},
+         ]
+         prompt_text = apply_chat_template(tokenizer, messages)
+
+         # Generation runs on the calling thread (blocking) but env.step calls
+         # for other rollouts in this group can overlap because they're all
+         # awaiting in the same loop.
+         loop = asyncio.get_running_loop()
+         turn_prompt_ids, turn_completion_ids, turn_logprobs = await loop.run_in_executor(
+             None, _generate_with_logprobs, model, tokenizer, prompt_text, sampling
+         )
+         completion_text = tokenizer.decode(turn_completion_ids, skip_special_tokens=True)
+         cmd = extract_aws_command(completion_text)
+
+         try:
+             res = await env.step(AwsRlAction(command=cmd))
+             step_reward = float(res.reward or 0.0)
+         except Exception as e:
+             logger.warning("step() error on cmd=%r: %s", cmd[:80], e)
+             step_reward = -0.1
+             history.append(
+                 {
+                     "command": cmd,
+                     "output": f"ERROR: {e!r}",
+                     "reward": step_reward,
+                 }
+             )
+             prompt_ids.extend(turn_prompt_ids)
+             completion_ids.extend(turn_completion_ids)
+             logprobs.extend(turn_logprobs)
+             step_rewards.append(step_reward)
+             break
+
+         prompt_ids.extend(turn_prompt_ids)
+         completion_ids.extend(turn_completion_ids)
+         logprobs.extend(turn_logprobs)
+         step_rewards.append(step_reward)
+         obs = res.observation
+         final_progress = float(getattr(obs, "partial_progress", 0.0) or 0.0)
+         final_achieved = bool(getattr(obs, "task_achieved", False))
+         history.append(
+             {
+                 "command": cmd,
+                 "output": _truncate(getattr(obs, "command_output", "") or "", 500),
+                 "reward": step_reward,
+             }
+         )
+
+     return {
+         "prompt_ids": prompt_ids,
+         "completion_ids": completion_ids,
+         "logprobs": logprobs,
+         "task_reward": float(sum(step_rewards)) if step_rewards else -1.0,
+         "task_achieved": final_achieved,
+         "final_progress": final_progress,
+         "num_steps": len(history),
+         "transcript": history,
+         "task_id": int(task.task_id),
+         "difficulty": task.difficulty.value,
+     }
+
+
+ # ============================================================
+ # MultiTurnEnvPool — sync wrapper around N async env sessions
+ # ============================================================
+
+
+ class MultiTurnEnvPool:
+     """N persistent WebSocket env sessions, exposed via a sync ``run_group`` API.
+
+     Owns a background thread running an asyncio loop. Connect / close happens
+     once for the lifetime of training. Submitted coroutines run in the
+     background loop via ``asyncio.run_coroutine_threadsafe`` and the calling
+     thread blocks on the resulting concurrent.futures.Future.
+     """
+
+     def __init__(self, base_url: str, size: int, timeout_s: float = 120.0) -> None:
+         if size < 1:
+             raise ValueError("size must be >= 1")
+         self.base_url = base_url
+         self.size = size
+         self.timeout_s = timeout_s
+         self._envs: list[AwsRlEnv] = []
+         self._loop: Optional[asyncio.AbstractEventLoop] = None
+         self._thread: Optional[threading.Thread] = None
+         self._ready = threading.Event()
+         self._setup_error: Optional[BaseException] = None
+
+     def start(self) -> None:
+         """Open N WebSocket sessions on the background loop."""
+         if self._thread is not None:
+             return
+
+         def run() -> None:
+             loop = asyncio.new_event_loop()
+             self._loop = loop
+             asyncio.set_event_loop(loop)
+             try:
+                 loop.run_until_complete(self._connect_all())
+             except BaseException as e:
+                 self._setup_error = e
+                 self._ready.set()
+                 return
+             self._ready.set()
+             loop.run_forever()
+
+         self._thread = threading.Thread(target=run, daemon=True, name="env-pool")
+         self._thread.start()
+         self._ready.wait()
+         if self._setup_error is not None:
+             raise RuntimeError(
+                 f"MultiTurnEnvPool failed to connect {self.size} sessions to "
+                 f"{self.base_url}: {self._setup_error!r}"
+             )
+         logger.info("MultiTurnEnvPool: %d sessions on %s", self.size, self.base_url)
+
+     async def _connect_all(self) -> None:
+         envs = [AwsRlEnv(base_url=self.base_url) for _ in range(self.size)]
+         try:
+             await asyncio.gather(*(e.connect() for e in envs))
+         except BaseException:
+             await asyncio.gather(*(e.close() for e in envs), return_exceptions=True)
+             raise
+         self._envs = envs
+
+     def close(self) -> None:
+         if self._thread is None or self._loop is None:
+             return
+         loop = self._loop
+
+         async def _shutdown() -> None:
+             await asyncio.gather(
+                 *(e.close() for e in self._envs), return_exceptions=True
+             )
+
+         try:
+             fut = asyncio.run_coroutine_threadsafe(_shutdown(), loop)
+             fut.result(timeout=10.0)
+         except Exception as e:
+             logger.warning("Pool shutdown error (ignored): %s", e)
+         finally:
+             loop.call_soon_threadsafe(loop.stop)
+             self._thread.join(timeout=5.0)
+             self._thread = None
+             self._loop = None
+             self._envs = []
+
+     def run_group(
+         self,
+         task: Task,
+         model,
+         tokenizer,
+         system_prompt: str,
+         max_turns: int,
+         max_total_tokens: int,
+         sampling: SamplingCfg,
+     ) -> list[dict]:
+         """Run N concurrent multi-turn rollouts on the same task. Sync; blocks."""
+         assert self._loop is not None and self._envs, "call start() first"
+
+         async def _gather() -> list[dict]:
+             return list(
+                 await asyncio.gather(
+                     *(
+                         rollout_one_episode(
+                             env,
+                             task,
+                             model,
+                             tokenizer,
+                             system_prompt,
+                             max_turns,
+                             max_total_tokens,
+                             sampling,
+                         )
+                         for env in self._envs
+                     )
+                 )
+             )
+
+         fut = asyncio.run_coroutine_threadsafe(_gather(), self._loop)
+         return fut.result(timeout=self.timeout_s * max(1, max_turns))
+
+     def __enter__(self) -> "MultiTurnEnvPool":
+         self.start()
+         return self
+
+     def __exit__(self, *exc) -> None:
+         self.close()
+
+
+ # ============================================================
+ # Reward functions (TRL convention) + rollout_func factory
+ # ============================================================
+
+
+ def reward_task(completions: list[str], **kwargs) -> list[float]:
+     rewards = kwargs.get("task_reward")
+     if rewards is None:
+         return [0.0 for _ in completions]
+     return [float(r) for r in rewards]
+
+
+ def reward_achieved(completions: list[str], **kwargs) -> list[float]:
+     flags = kwargs.get("task_achieved")
+     if flags is None:
+         return [0.0 for _ in completions]
+     return [float(f) for f in flags]
+
+
+ def reward_progress(completions: list[str], **kwargs) -> list[float]:
+     progress = kwargs.get("final_progress")
+     if progress is None:
+         return [0.0 for _ in completions]
+     return [float(p) for p in progress]
+
+
+ def make_rollout_func(
+     curriculum: Curriculum,
+     pool: MultiTurnEnvPool,
+     model,
+     tokenizer,
+     system_prompt: str,
+     max_turns: int,
+     max_total_tokens: int,
+     sampling: SamplingCfg,
+     log_episode: Callable[[Task, list[dict]], None],
+ ) -> Callable:
+     """Build the closure GRPO calls each step.
+
+     ``prompts`` length equals ``num_generations``. We ignore the prompt strings
+     because the curriculum drives task selection — every rollout in the group
+     runs the same task forced through ``env.reset(task=...)``.
+     """
+
+     def rollout_func(prompts: list[str], trainer: GRPOTrainer) -> dict[str, list]:
+         task = curriculum.next_task()
+         results = pool.run_group(
+             task,
+             model,
+             tokenizer,
+             system_prompt,
+             max_turns,
+             max_total_tokens,
+             sampling,
+         )
+         # Pad / truncate to len(prompts) — defence in depth, group size should match.
+         if len(results) < len(prompts):
+             results.extend(results[-1:] * (len(prompts) - len(results)))
+         results = results[: len(prompts)]
+
+         group_rewards = [r["task_reward"] for r in results]
+         group_achieved = [r["task_achieved"] for r in results]
+         group_progress = [r["final_progress"] for r in results]
+
+         curriculum.record_result(
+             task,
+             achieved=any(group_achieved),
+             reward=float(sum(group_rewards) / len(group_rewards)) if group_rewards else 0.0,
+         )
+         log_episode(task, results)
+
+         return {
+             "prompt_ids": [r["prompt_ids"] for r in results],
+             "completion_ids": [r["completion_ids"] for r in results],
+             "logprobs": [r["logprobs"] for r in results],
+             "task_reward": group_rewards,
+             "task_achieved": [float(a) for a in group_achieved],
+             "final_progress": group_progress,
+         }
+
+     return rollout_func
+
+
+ # ============================================================
+ # CSV / JSONL logging + reward plotter
+ # ============================================================
+
+
+ class EpisodeLogger:
+     """Append-only CSV + JSONL writer for per-rollout episode rows."""
+
+     HEADER = [
+         "step",
+         "rollout_idx",
+         "task_id",
+         "difficulty",
+         "task_reward",
+         "task_achieved",
+         "final_progress",
+         "num_steps",
+         "tier",
+         "tier_success_rate",
+         "timestamp",
+     ]
+
+     def __init__(self, output_dir: Path) -> None:
+         self.output_dir = output_dir
+         output_dir.mkdir(parents=True, exist_ok=True)
+         self.csv_path = output_dir / "reward_log.csv"
+         self.jsonl_path = output_dir / "transcripts.jsonl"
+         if not self.csv_path.exists():
+             with open(self.csv_path, "w", newline="") as f:
+                 csv.writer(f).writerow(self.HEADER)
+         self._step_counter = 0
+
+     def log(self, task: Task, results: list[dict], curriculum: Curriculum) -> None:
+         self._step_counter += 1
+         stats = curriculum.get_stats()
+         ts = datetime.now().isoformat()
+         with open(self.csv_path, "a", newline="") as f:
+             writer = csv.writer(f)
+             for i, r in enumerate(results):
+                 writer.writerow(
+                     [
+                         self._step_counter,
+                         i,
+                         int(task.task_id),
+                         task.difficulty.value,
+                         f"{r['task_reward']:.4f}",
+                         int(bool(r["task_achieved"])),
+                         f"{r['final_progress']:.4f}",
+                         r["num_steps"],
+                         stats["tier"],
+                         stats["tier_success_rate"],
+                         ts,
+                     ]
+                 )
+         with open(self.jsonl_path, "a") as f:
+             for i, r in enumerate(results):
+                 f.write(
+                     json.dumps(
+                         {
+                             "step": self._step_counter,
+                             "rollout_idx": i,
+                             "task_id": int(task.task_id),
+                             "difficulty": task.difficulty.value,
+                             "task_reward": r["task_reward"],
+                             "task_achieved": bool(r["task_achieved"]),
+                             "final_progress": r["final_progress"],
+                             "num_steps": r["num_steps"],
+                             "tier": stats["tier"],
+                             "transcript": r["transcript"],
+                         }
+                     )
+                     + "\n"
+                 )
+
+         rewards = [r["task_reward"] for r in results]
+         achieved = [bool(r["task_achieved"]) for r in results]
+         logger.info(
+             "Step %d task=%d (%s) rewards=%s achieved=%d/%d tier=%s tier_rate=%.2f",
+             self._step_counter,
+             int(task.task_id),
+             task.difficulty.value,
+             [round(r, 2) for r in rewards],
+             sum(achieved),
+             len(achieved),
+             stats["tier"],
+             stats["tier_success_rate"],
+         )
+
+
+ def plot_rewards(csv_path: Path, out_path: Path) -> None:
+     """Per-step mean group reward + 10-step rolling avg + per-tier curves."""
+     import matplotlib
+
+     matplotlib.use("Agg")
+     import matplotlib.pyplot as plt
+
+     if not csv_path.exists():
+         logger.warning("No CSV at %s — skipping plot.", csv_path)
+         return
+
+     steps_data: dict[int, list[float]] = {}
+     tier_data: dict[str, list[tuple[int, float]]] = {}
+     with open(csv_path) as f:
+         reader = csv.DictReader(f)
+         for row in reader:
+             step = int(row["step"])
+             r = float(row["task_reward"])
+             tier = row["tier"]
+             steps_data.setdefault(step, []).append(r)
+             tier_data.setdefault(tier, []).append((step, r))
+
+     if not steps_data:
+         logger.warning("CSV at %s has no rows — skipping plot.", csv_path)
+         return
+
+     steps = sorted(steps_data.keys())
+     means = [sum(steps_data[s]) / len(steps_data[s]) for s in steps]
+
+     rolling = []
+     window = 10
+     for i in range(len(means)):
+         lo = max(0, i - window + 1)
+         rolling.append(sum(means[lo : i + 1]) / (i - lo + 1))
+
+     fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
+     ax1.plot(steps, means, label="mean group reward", alpha=0.5)
+     ax1.plot(steps, rolling, label=f"rolling avg (k={window})", linewidth=2)
+     ax1.set_xlabel("GRPO step")
+     ax1.set_ylabel("reward")
+     ax1.set_title("Group mean reward over training")
+     ax1.legend()
+     ax1.grid(alpha=0.3)
+
+     for tier, points in tier_data.items():
+         xs = [p[0] for p in points]
+         ys = [p[1] for p in points]
+         ax2.scatter(xs, ys, s=10, alpha=0.5, label=tier)
+     ax2.set_xlabel("GRPO step")
+     ax2.set_ylabel("reward")
+     ax2.set_title("Per-rollout reward by curriculum tier")
+     ax2.legend()
+     ax2.grid(alpha=0.3)
+
+     fig.tight_layout()
+     fig.savefig(out_path, dpi=120)
+     plt.close(fig)
+     logger.info("Reward plot written to %s", out_path)
+
+
+ # ============================================================
+ # Validation eval + Optuna search
+ # ============================================================
+
+
+ def pick_validation_task_ids(
+     curriculum: Optional[Curriculum] = None,
+     k_per_tier: int = 2,
+     seed: int = 42,
+ ) -> list[int]:
+     """Pick a frozen list of task ids — k per tier — for held-out validation."""
+     import random
+
+     rng = random.Random(seed)
+     cur = curriculum or Curriculum()
+     chosen: list[int] = []
+     for tier in TaskDifficulty:
+         try:
+             from server.services.curriculum import load_tier
+
+             tier_tasks = load_tier(tier, cur._tasks_dir)
+         except Exception as e:
+             logger.warning("Could not load tier %s for val: %s", tier.value, e)
+             continue
+         if not tier_tasks:
+             continue
+         sample = rng.sample(tier_tasks, k=min(k_per_tier, len(tier_tasks)))
+         chosen.extend(int(t.task_id) for t in sample)
+     return chosen
+
+
+ def evaluate_on_validation(
+     model,
+     tokenizer,
+     pool: MultiTurnEnvPool,
+     val_task_ids: list[int],
+     system_prompt: str,
+     max_turns: int,
+     max_total_tokens: int,
+     sampling: SamplingCfg,
+     curriculum: Optional[Curriculum] = None,
+ ) -> dict[str, float]:
+     """Run ONE rollout per val task on env[0] of the pool. Return aggregate metrics."""
+     cur = curriculum or Curriculum()
+     achieved_flags: list[float] = []
+     progresses: list[float] = []
+     rewards: list[float] = []
+
+     async def _eval_one(task: Task) -> dict:
+         env = pool._envs[0]
+         return await rollout_one_episode(
+             env,
+             task,
+             model,
+             tokenizer,
+             system_prompt,
+             max_turns,
+             max_total_tokens,
+             sampling,
+         )
+
+     for tid in val_task_ids:
+         try:
+             task = cur.get_task_by_id(TaskID(int(tid)))
+         except KeyError:
+             logger.warning("val task_id=%d not found — skipping", tid)
+             continue
+         fut = asyncio.run_coroutine_threadsafe(_eval_one(task), pool._loop)
+         try:
+             res = fut.result(timeout=pool.timeout_s * max(1, max_turns))
+         except Exception as e:
+             logger.warning("val rollout failed for task=%d: %s", tid, e)
+             continue
+         achieved_flags.append(float(res["task_achieved"]))
+         progresses.append(float(res["final_progress"]))
+         rewards.append(float(res["task_reward"]))
+
+     n = max(1, len(achieved_flags))
+     return {
+         "achieved_rate": sum(achieved_flags) / n,
+         "mean_progress": sum(progresses) / n,
+         "mean_reward": sum(rewards) / n,
+         "n_evaluated": float(len(achieved_flags)),
+     }
+
+
+ def _build_grpo_config(
+     output_dir: Path,
+     cfg: dict[str, Any],
+     max_steps: int,
+     max_completion_length: int,
+     max_prompt_length: int,
+     save_steps: int = 25,
+     save_strategy: str = "steps",
+     report_to: str = "none",
+ ) -> GRPOConfig:
+     return GRPOConfig(
+         output_dir=str(output_dir),
+         max_steps=max_steps,
+         learning_rate=float(cfg["learning_rate"]),
+         beta=float(cfg["beta"]),
+         num_generations=int(cfg["num_generations"]),
+         generation_batch_size=int(cfg["num_generations"]),
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=8,
+         gradient_checkpointing=True,
+         gradient_checkpointing_kwargs={"use_reentrant": False},
+         max_completion_length=max_completion_length,
+         max_prompt_length=max_prompt_length,
+         temperature=float(cfg["temperature"]),
+         top_p=float(cfg["top_p"]),
+         logging_steps=1,
+         save_strategy=save_strategy,
+         save_steps=save_steps,
+         save_total_limit=3,
+         report_to=report_to,
+         loss_type="dapo",
+         mask_truncated_completions=True,
+         warmup_ratio=0.05,
+         lr_scheduler_type="cosine",
+         max_grad_norm=1.0,
+         use_vllm=False,
+         remove_unused_columns=False,
+     )
+
+
+ def _build_dummy_dataset(num_rows: int) -> Dataset:
+     """A length-only dataset; the prompts are ignored by ``rollout_func``."""
+     return Dataset.from_dict({"prompt": ["solve"] * max(1, num_rows)})
+
+
+ def optuna_search(
+     n_trials: int,
+     trial_max_steps: int,
+     val_task_ids: list[int],
+     base_model: str,
+     sft_adapter: Optional[str],
+     env_url: str,
+     output_dir: Path,
+     max_total_tokens: int = 2048,
+     max_completion_length: int = 256,
+     max_prompt_length: int = 2048,
+     seed: int = 42,
+ ):
+     """TPE-sampled hyperparam search. Persists to ``output_dir/optuna.db``."""
+     import optuna
+
+     output_dir.mkdir(parents=True, exist_ok=True)
+     study = optuna.create_study(
+         direction="maximize",
+         study_name="aws-rl-grpo",
+         storage=f"sqlite:///{output_dir / 'optuna.db'}",
+         load_if_exists=True,
+         sampler=optuna.samplers.TPESampler(seed=seed),
+     )
+
+     def _objective(trial: optuna.Trial) -> float:
+         cfg = {
+             "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
+             "beta": trial.suggest_float("beta", 0.0, 0.1),
+             "num_generations": trial.suggest_categorical("num_generations", [4, 8]),
+             "temperature": trial.suggest_float("temperature", 0.7, 1.0),
+             "top_p": trial.suggest_float("top_p", 0.85, 0.98),
+             "lora_r": trial.suggest_categorical("lora_r", [8, 16, 32]),
+             "lora_alpha_mul": trial.suggest_categorical("lora_alpha_mul", [1, 2, 4]),
+             "max_turns": trial.suggest_categorical("max_turns", [4, 6, 8]),
+         }
+         trial_dir = output_dir / f"trial_{trial.number:03d}"
+         return _run_one_trial(
+             cfg=cfg,
+             trial_max_steps=trial_max_steps,
+             val_task_ids=val_task_ids,
+             base_model=base_model,
+             sft_adapter=sft_adapter,
+             env_url=env_url,
+             output_dir=trial_dir,
+             max_total_tokens=max_total_tokens,
+             max_completion_length=max_completion_length,
+             max_prompt_length=max_prompt_length,
+         )
+
+     study.optimize(_objective, n_trials=n_trials, gc_after_trial=True)
+
+     best_path = output_dir / "best_cfg.json"
+     payload = {"best_value": study.best_value, "best_params": dict(study.best_params)}
+     with open(best_path, "w") as f:
+         json.dump(payload, f, indent=2)
+     logger.info(
+         "Optuna study finished. best_value=%.4f best_params=%s -> %s",
+         study.best_value,
+         study.best_params,
+         best_path,
+     )
+     return study
+
+
+ def _run_one_trial(
+     cfg: dict[str, Any],
+     trial_max_steps: int,
+     val_task_ids: list[int],
+     base_model: str,
+     sft_adapter: Optional[str],
+     env_url: str,
+     output_dir: Path,
+     max_total_tokens: int,
+     max_completion_length: int,
+     max_prompt_length: int,
+ ) -> float:
+     """One Optuna trial: load → train → eval on val tasks → tear down → return objective."""
+     output_dir.mkdir(parents=True, exist_ok=True)
+     logger.info("Optuna trial cfg=%s -> %s", cfg, output_dir)
+
+     model = tokenizer = None
+     pool: Optional[MultiTurnEnvPool] = None
+     trainer: Optional[GRPOTrainer] = None
+     try:
+         model, tokenizer = load_policy(base_model, sft_adapter, trainable=True)
+
+         pool = MultiTurnEnvPool(env_url, size=int(cfg["num_generations"]))
+         pool.start()
+
+         curriculum = Curriculum()
+         sampling = SamplingCfg(
+             temperature=float(cfg["temperature"]),
+             top_p=float(cfg["top_p"]),
+             max_new_tokens=max_completion_length,
+             max_prompt_length=max_prompt_length,
+         )
+         ep_logger = EpisodeLogger(output_dir)
+         rollout_func = make_rollout_func(
+             curriculum=curriculum,
+             pool=pool,
+             model=model,
+             tokenizer=tokenizer,
+             system_prompt=SYSTEM_PROMPT,
+             max_turns=int(cfg["max_turns"]),
+             max_total_tokens=max_total_tokens,
+             sampling=sampling,
+             log_episode=lambda task, results: ep_logger.log(task, results, curriculum),
+         )
+
+         dataset = _build_dummy_dataset(trial_max_steps * int(cfg["num_generations"]))
+         grpo_cfg = _build_grpo_config(
+             output_dir=output_dir,
+             cfg=cfg,
+             max_steps=trial_max_steps,
+             max_completion_length=max_completion_length,
+             max_prompt_length=max_prompt_length,
+             save_strategy="no",
+             report_to="none",
+         )
+
+         trainer = GRPOTrainer(
+             model=model,
+             processing_class=tokenizer,
+             reward_funcs=[reward_task, reward_achieved, reward_progress],
+             train_dataset=dataset,
+             args=grpo_cfg,
+             rollout_func=rollout_func,
+             peft_config=None if sft_adapter else _lora_config(cfg),
+         )
+         trainer.train()
+
+         metrics = evaluate_on_validation(
+             model=trainer.model,
+             tokenizer=tokenizer,
+             pool=pool,
+             val_task_ids=val_task_ids,
+             system_prompt=SYSTEM_PROMPT,
+             max_turns=int(cfg["max_turns"]),
+             max_total_tokens=max_total_tokens,
+             sampling=sampling,
+             curriculum=curriculum,
+         )
+         objective = 0.7 * metrics["achieved_rate"] + 0.3 * metrics["mean_progress"]
+         with open(output_dir / "trial_metrics.json", "w") as f:
+             json.dump({"cfg": cfg, "metrics": metrics, "objective": objective}, f, indent=2)
+         logger.info("Trial done: metrics=%s objective=%.4f", metrics, objective)
+         return float(objective)
+     finally:
+         if trainer is not None:
+             try:
+                 del trainer
+             except Exception:
+                 pass
+         if model is not None:
+             free_model(model)
+         if pool is not None:
+             try:
+                 pool.close()
+             except Exception:
+                 logger.exception("Pool close error during trial cleanup")
+         gc.collect()
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+
+ def _lora_config(cfg: dict[str, Any]) -> LoraConfig:
+     r = int(cfg["lora_r"])
+     alpha_mul = int(cfg["lora_alpha_mul"])
+     return LoraConfig(
+         r=r,
+         lora_alpha=r * alpha_mul,
+         lora_dropout=0.05,
+         bias="none",
+         task_type="CAUSAL_LM",
+         target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+     )
+
+
+ # ============================================================
+ # Main training entrypoint (single training pass)
+ # ============================================================
+
+
+ def run_training(
+     cfg: dict[str, Any],
+     *,
+     base_model: str,
+     sft_adapter: Optional[str],
+     env_url: str,
+     output_dir: Path,
+     max_steps: int,
+     max_total_tokens: int = 4096,
+     max_completion_length: int = 256,
+     max_prompt_length: int = 2048,
+     push_to_hub: bool = False,
+     hub_repo: Optional[str] = None,
+ ) -> Path:
+     """Run a full GRPO training pass with the supplied config dict."""
+     output_dir.mkdir(parents=True, exist_ok=True)
+     logger.info("run_training cfg=%s -> %s", cfg, output_dir)
+
+     model, tokenizer = load_policy(base_model, sft_adapter, trainable=True)
+     pool = MultiTurnEnvPool(env_url, size=int(cfg["num_generations"]))
+     pool.start()
+
+     curriculum = Curriculum()
+     sampling = SamplingCfg(
+         temperature=float(cfg["temperature"]),
+         top_p=float(cfg["top_p"]),
+         max_new_tokens=max_completion_length,
+         max_prompt_length=max_prompt_length,
+     )
+     ep_logger = EpisodeLogger(output_dir)
+     rollout_func = make_rollout_func(
+         curriculum=curriculum,
+         pool=pool,
+         model=model,
+         tokenizer=tokenizer,
+         system_prompt=SYSTEM_PROMPT,
+         max_turns=int(cfg["max_turns"]),
+         max_total_tokens=max_total_tokens,
+         sampling=sampling,
+         log_episode=lambda task, results: ep_logger.log(task, results, curriculum),
+     )
+
+     dataset = _build_dummy_dataset(max_steps * int(cfg["num_generations"]))
+     grpo_cfg = _build_grpo_config(
+         output_dir=output_dir,
+         cfg=cfg,
+         max_steps=max_steps,
+         max_completion_length=max_completion_length,
+         max_prompt_length=max_prompt_length,
+     )
+
+     trainer = GRPOTrainer(
+         model=model,
+         processing_class=tokenizer,
+         reward_funcs=[reward_task, reward_achieved, reward_progress],
+         train_dataset=dataset,
+         args=grpo_cfg,
+         rollout_func=rollout_func,
+         peft_config=None if sft_adapter else _lora_config(cfg),
+     )
+
+     try:
+         trainer.train()
+     finally:
+         try:
+             pool.close()
+         except Exception:
+             logger.exception("Pool close error after training")
+         try:
+             plot_rewards(ep_logger.csv_path, output_dir / "reward_plot.png")
+         except Exception as e:
+             logger.warning("plot_rewards failed: %s", e)
+
+     trainer.save_model(str(output_dir))
+     logger.info("Adapter saved to %s", output_dir)
+
+     if push_to_hub and hub_repo:
+         trainer.push_to_hub(repo_id=hub_repo)
+         logger.info("Adapter pushed to https://huggingface.co/%s", hub_repo)
+
+     return output_dir
+
+
+ # ============================================================
+ # CLI
+ # ============================================================
+
+
+ def _parse_args() -> argparse.Namespace:
+     p = argparse.ArgumentParser(description=__doc__)
+     p.add_argument("--mode", choices=["train", "optuna", "full"], default="train")
+     p.add_argument("--base-model", default=PolicySpec.base_model)
+     p.add_argument("--sft-adapter", default=PolicySpec.sft_adapter,
+                    help="HF repo id of the SFT adapter (use empty string to disable)")
+     p.add_argument("--env-url", default="http://localhost:8000")
+     p.add_argument("--output-dir", default=None)
+
+     # Train-mode hyperparams (mirror DEFAULT_CFG keys)
+     p.add_argument("--num-generations", type=int, default=DEFAULT_CFG["num_generations"])
+     p.add_argument("--max-turns", type=int, default=DEFAULT_CFG["max_turns"])
+     p.add_argument("--max-steps", type=int, default=200)
+     p.add_argument("--learning-rate", type=float, default=DEFAULT_CFG["learning_rate"])
+     p.add_argument("--beta", type=float, default=DEFAULT_CFG["beta"])
+     p.add_argument("--temperature", type=float, default=DEFAULT_CFG["temperature"])
+     p.add_argument("--top-p", type=float, default=DEFAULT_CFG["top_p"])
+     p.add_argument("--lora-r", type=int, default=DEFAULT_CFG["lora_r"])
+     p.add_argument("--lora-alpha-mul", type=int, default=DEFAULT_CFG["lora_alpha_mul"])
+     p.add_argument("--max-prompt-length", type=int, default=2048)
+     p.add_argument("--max-completion-length", type=int, default=256)
+     p.add_argument("--max-total-tokens", type=int, default=4096)
+
+     # Optuna-specific
+     p.add_argument("--n-trials", type=int, default=6)
+     p.add_argument("--trial-max-steps", type=int, default=30)
+     p.add_argument("--val-tasks-per-tier", type=int, default=2)
+
+     p.add_argument("--push-to-hub", action="store_true")
+     p.add_argument("--hub-repo", default=None)
+     return p.parse_args()
+
+
+ def _resolve_output_dir(args: argparse.Namespace) -> Path:
+     if args.output_dir:
+         return Path(args.output_dir)
+     ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+     return Path("outputs") / f"aws-rl-grpo-{ts}"
+
+
+ def _cli_cfg(args: argparse.Namespace) -> dict[str, Any]:
+     return {
+         "learning_rate": args.learning_rate,
+         "beta": args.beta,
+         "num_generations": args.num_generations,
+         "temperature": args.temperature,
+         "top_p": args.top_p,
+         "lora_r": args.lora_r,
+         "lora_alpha_mul": args.lora_alpha_mul,
+         "max_turns": args.max_turns,
+     }
+
+
+ def main() -> None:
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s %(levelname)s %(name)s %(message)s",
+     )
+     args = _parse_args()
+     output_dir = _resolve_output_dir(args)
+     output_dir.mkdir(parents=True, exist_ok=True)
+     sft_adapter = args.sft_adapter or None
+
+     if args.mode in ("optuna", "full"):
+         val_ids = pick_validation_task_ids(k_per_tier=args.val_tasks_per_tier)
+         with open(output_dir / "val_task_ids.json", "w") as f:
+             json.dump(val_ids, f)
+         study = optuna_search(
+             n_trials=args.n_trials,
+             trial_max_steps=args.trial_max_steps,
+             val_task_ids=val_ids,
+             base_model=args.base_model,
+             sft_adapter=sft_adapter,
+             env_url=args.env_url,
+             output_dir=output_dir,
+             max_total_tokens=args.max_total_tokens,
+             max_completion_length=args.max_completion_length,
+             max_prompt_length=args.max_prompt_length,
+         )
+         if args.mode == "optuna":
+             return
+         cfg = {**DEFAULT_CFG, **dict(study.best_params)}
+     else:
+         cfg = _cli_cfg(args)
+
+     run_training(
+         cfg,
+         base_model=args.base_model,
+         sft_adapter=sft_adapter,
+         env_url=args.env_url,
+         output_dir=output_dir,
+         max_steps=args.max_steps,
+         max_total_tokens=args.max_total_tokens,
+         max_completion_length=args.max_completion_length,
+         max_prompt_length=args.max_prompt_length,
+         push_to_hub=args.push_to_hub,
+         hub_repo=args.hub_repo,
+     )
+
+
+ if __name__ == "__main__":
+     main()
train_grpo_lora_final.ipynb ADDED
The diff for this file is too large to render. See raw diff